# Pandas
used to work with tabular or heterogenous data<br>
Import convention for pandas

In [1]:
import pandas as pd

import `series` and `dataframe`. Both provide basis for all the applications

In [4]:
from pandas import Series, DataFrame

## Pandas Data Structure
### Series

one dimensional array like object contains
1. sequence of values
2. associated array of indexes
Shows values on the left and values on the right

In [2]:
import pandas as pd
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [3]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [4]:
obj.index

RangeIndex(start=0, stop=4, step=1)

we can also create a Series with an index identifying each data point with a label

In [5]:
obj2 = pd.Series([4, 7, -5, 3], index = ['d','b','a','c'])

In [6]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [8]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [9]:
obj2['a']

-5

In [10]:
obj2*2 

d     8
b    14
a   -10
c     6
dtype: int64

In [12]:
import numpy as np
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [13]:
'b' in obj2 # Can be treated like ordered dict

True

In [7]:
dictdata = {'Mumbai':35000, 'Delhi': 45000, 'Chennai': 34000, 'Kolkata': 29000}
obj3 = pd.Series(dictdata)

In [8]:
obj3

Mumbai     35000
Delhi      45000
Chennai    34000
Kolkata    29000
dtype: int64

You can change the Key Index by passing dict keys

In [10]:
cities = ['Mumbai', 'Chennai', 'Pune', 'Delhi']
obj4 = pd.Series(dictdata, index=cities)
obj4

Mumbai     35000.0
Chennai    34000.0
Pune           NaN
Delhi      45000.0
dtype: float64

It has mapped the values of three cities and marked Pune as `NaN` (Not a Number)<br>
`notnull` and `isnull` can detect missing values

In [11]:
pd.isnull(obj4)

Mumbai     False
Chennai    False
Pune        True
Delhi      False
dtype: bool

In [12]:
pd.notnull(obj4)

Mumbai      True
Chennai     True
Pune       False
Delhi       True
dtype: bool

Pandas Series align them selves to index label

In [13]:
obj3

Mumbai     35000
Delhi      45000
Chennai    34000
Kolkata    29000
dtype: int64

In [14]:
obj4

Mumbai     35000.0
Chennai    34000.0
Pune           NaN
Delhi      45000.0
dtype: float64

In [19]:
obj3+obj4

Chennai    68000.0
Delhi      90000.0
Kolkata        NaN
Mumbai     70000.0
Pune           NaN
dtype: float64

## DataFrame

DataFrame -  a rectangular table of data<br>
It has an ordered collection of columns<br>
Each column can be a different value type (numeric, string, boolean, etc.).<br>
The DataFrame has both a row and column index<br>
the data is stored as one or more two-dimensional blocks rather than a list, dict, or some other collection of
one-dimensional arrays

In [15]:
data = {'state':['Ohio', 'Ohio', 'Ohio', 'Nevada','Nevada','Nevada'],
       'year':[2000, 2001, 2003, 2000, 2001, 2002],
       'pop':[1.5, 1.6, 2.5, 3.9, 4.1, 6.7]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.6
2,Ohio,2003,2.5
3,Nevada,2000,3.9
4,Nevada,2001,4.1
5,Nevada,2002,6.7


In [16]:
discount=[1,2,3,4,5,7]
data['discount']=discount
pd.DataFrame(data)

Unnamed: 0,state,year,pop,discount
0,Ohio,2000,1.5,1
1,Ohio,2001,1.6,2
2,Ohio,2003,2.5,3
3,Nevada,2000,3.9,4
4,Nevada,2001,4.1,5
5,Nevada,2002,6.7,7


In [19]:
frame.head() #head method selects top 5 rows only

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.6
2,Ohio,2003,2.5
3,Nevada,2000,3.9
4,Nevada,2001,4.1


In [20]:
pd.DataFrame(data, columns=['year', 'state','pop']) #you can specify the sequence of columns

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.6
2,2003,Ohio,2.5
3,2000,Nevada,3.9
4,2001,Nevada,4.1
5,2002,Nevada,6.7


In [8]:
pd.DataFrame(data, columns=['year','state','pop','income']) #If a column is not in dict, NaN will appear

Unnamed: 0,year,state,pop,income
0,2000,Ohio,1.5,
1,2001,Ohio,1.6,
2,2003,Ohio,2.5,
3,2000,Nevada,3.9,
4,2001,Nevada,4.1,
5,2002,Nevada,6.7,


In [21]:
frame2 = pd.DataFrame(data, columns=['year','state','pop','income'], 
             index=['zero','one','two','three','four','five']) #to change the index column

In [22]:
frame2

Unnamed: 0,year,state,pop,income
zero,2000,Ohio,1.5,
one,2001,Ohio,1.6,
two,2003,Ohio,2.5,
three,2000,Nevada,3.9,
four,2001,Nevada,4.1,
five,2002,Nevada,6.7,


In [24]:
frame2['state'] #only one column can be retrieved

zero       Ohio
one        Ohio
two        Ohio
three    Nevada
four     Nevada
five     Nevada
Name: state, dtype: object

In [12]:
frame2.state #can be retrieved this way as well

zero       Ohio
one        Ohio
two        Ohio
three    Nevada
four     Nevada
five     Nevada
Name: state, dtype: object

In [13]:
frame2.year

zero     2000
one      2001
two      2003
three    2000
four     2001
five     2002
Name: year, dtype: int64

rows can also be retrieved using position

In [14]:
frame2.loc['three']

year        2000
state     Nevada
pop          3.9
income       NaN
Name: three, dtype: object

***
columns can be modified using assignments<br>

In [15]:
frame2['income'] = 18.5

In [16]:
frame2

Unnamed: 0,year,state,pop,income
zero,2000,Ohio,1.5,18.5
one,2001,Ohio,1.6,18.5
two,2003,Ohio,2.5,18.5
three,2000,Nevada,3.9,18.5
four,2001,Nevada,4.1,18.5
five,2002,Nevada,6.7,18.5


In [18]:
import numpy as np
frame2['income'] = np.arange(6.)

In [19]:
frame2

Unnamed: 0,year,state,pop,income
zero,2000,Ohio,1.5,0.0
one,2001,Ohio,1.6,1.0
two,2003,Ohio,2.5,2.0
three,2000,Nevada,3.9,3.0
four,2001,Nevada,4.1,4.0
five,2002,Nevada,6.7,5.0


In [25]:
val = pd.Series([1.2, 2.4, 3.6], index = ['one','three','five'])

In [26]:
frame2['income'] = val
frame2

Unnamed: 0,year,state,pop,income
zero,2000,Ohio,1.5,
one,2001,Ohio,1.6,1.2
two,2003,Ohio,2.5,
three,2000,Nevada,3.9,2.4
four,2001,Nevada,4.1,
five,2002,Nevada,6.7,3.6


Assigning a column that does not exists will create a new column

In [22]:
frame2['eastern'] = frame2['state'] == 'Ohio'
frame2

Unnamed: 0,year,state,pop,income,eastern
zero,2000,Ohio,1.5,,True
one,2001,Ohio,1.6,1.2,True
two,2003,Ohio,2.5,,True
three,2000,Nevada,3.9,2.4,False
four,2001,Nevada,4.1,,False
five,2002,Nevada,6.7,3.6,False


`del` method can delete the column

In [28]:
del frame2['eastern']

In [29]:
frame2

Unnamed: 0,year,state,pop,income
zero,2000,Ohio,1.5,
one,2001,Ohio,1.6,1.2
two,2003,Ohio,2.5,
three,2000,Nevada,3.9,2.4
four,2001,Nevada,4.1,
five,2002,Nevada,6.7,3.6


In [25]:
frame2.columns

Index(['year', 'state', 'pop', 'income'], dtype='object')

#### Nested Dict of Dicts
Outer dict key acts as column and nested dict as key

In [30]:
pop = {'Nevada':{2001: 2.4, 2002:5.6},
      'Ohio':{2000:4.5, 2001:3.4, 2002:6.7}}

In [31]:
pop

{'Nevada': {2001: 2.4, 2002: 5.6}, 'Ohio': {2000: 4.5, 2001: 3.4, 2002: 6.7}}

In [33]:
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,3.4
2002,5.6,6.7
2000,,4.5


In [34]:
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,5.6,
Ohio,3.4,6.7,4.5


If DataFrame index and columns has `name` attribute, that will be displayed 

In [35]:
frame3.index.name='year'
frame3.columns.name='state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,3.4
2002,5.6,6.7
2000,,4.5


In [36]:
frame3.values

array([[2.4, 3.4],
       [5.6, 6.7],
       [nan, 4.5]])

In [32]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.6, 1.2],
       [2003, 'Ohio', 2.5, nan],
       [2000, 'Nevada', 3.9, 2.4],
       [2001, 'Nevada', 4.1, nan],
       [2002, 'Nevada', 6.7, 3.6]], dtype=object)

### Index Objects

Any array or sequence of labels while contructing the Series or DataFrame is internally converted into index

In [36]:
obj = pd.Series(range(3), index=['a','b','c'])
obj

a    0
b    1
c    2
dtype: int64

In [37]:
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [38]:
index[1:]

Index(['b', 'c'], dtype='object')

Index are immutable cannot be modified

In [39]:
index[1]='d' # will give error

TypeError: Index does not support mutable operations

### Reindexing
`reindex` - Create a new object with the data conformed to a new index

In [42]:
obj = pd.Series([4.5, 6.7, 7.8, -5.8], index=['d','a','c','b'])

In [43]:
obj

d    4.5
a    6.7
c    7.8
b   -5.8
dtype: float64

In [44]:
obj2 = obj.reindex(['a','b','c','d','e'])

In [45]:
obj2

a    6.7
b   -5.8
c    7.8
d    4.5
e    NaN
dtype: float64

For timeseries data sometime it is important to interpolate the data or filling of the values while reindexing

In [46]:
obj3 = pd.Series(['blue','red','yellow'],index=[0,2,4])

In [47]:
obj3

0      blue
2       red
4    yellow
dtype: object

In [48]:
obj3.reindex(range(6),method='ffill')

0      blue
1      blue
2       red
3       red
4    yellow
5    yellow
dtype: object

In case of DataFrame, `reindex` can alter column, row or both

In [49]:
frame = pd.DataFrame(np.arange(9).reshape(3,3),
                     index=['a','c','d'],
                    columns=['Ohio','Texas','California'])

In [50]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [51]:
frame2 = frame.reindex(['a','b','c','d']) #reindex on row

In [52]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [54]:
states = ['Texas', 'California','Utah']
frame.reindex(columns=states)

Unnamed: 0,Texas,California,Utah
a,1,2,
c,4,5,
d,7,8,


### Dropping Entries
`drop` - use drop to return object with dropped values

In [55]:
obj = pd.Series(np.arange(5.), index=['a','b','c','d','e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [57]:
n_obj = obj.drop('c')
n_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [59]:
obj.drop(['b','d'])

a    0.0
c    2.0
e    4.0
dtype: float64

In [60]:
data = pd.DataFrame(np.arange(16).reshape(4,4), 
                    index=['Ohio','Utah','California','Navada'],
                   columns=['one','two','three','four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Utah,4,5,6,7
California,8,9,10,11
Navada,12,13,14,15


In [61]:
data.drop(['Utah','Navada'])

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
California,8,9,10,11


In [62]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Utah,4,6,7
California,8,10,11
Navada,12,14,15


In [63]:
data.drop(['two','four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Utah,4,6
California,8,10
Navada,12,14


### Indexing

Indexing in case of Pandas is similar to Numpy except you can use Series index value also

In [65]:
obj = pd.Series(np.arange(4.),index=['a','b','c','d'])
obj['a']

0.0

In [66]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [67]:
obj['b']

1.0

In [68]:
obj[1]

1.0

In [70]:
obj[[1,3]]

b    1.0
d    3.0
dtype: float64

In [71]:
obj[['b','a','d']]

b    1.0
a    0.0
d    3.0
dtype: float64

***
Slicing in label is different. In label, the end point is inclusive

In [72]:
obj['b':'d']

b    1.0
c    2.0
d    3.0
dtype: float64

#### Indexing in DataFrame
Used to retrieve one or more columns than the values only

In [3]:
import numpy as np
data = pd.DataFrame(np.arange(16.).reshape(4,4),
                   index=['Ohio','Colorado','Utah','California'],
                   columns=['one','two','three','four'])

In [75]:
data['two']

Ohio           1.0
Colorado       5.0
Utah           9.0
California    13.0
Name: two, dtype: float64

In [76]:
data[['three','one']]

Unnamed: 0,three,one
Ohio,2.0,0.0
Colorado,6.0,4.0
Utah,10.0,8.0
California,14.0,12.0


In [77]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0.0,1.0,2.0,3.0
Colorado,4.0,5.0,6.0,7.0


In [78]:
data[2:]

Unnamed: 0,one,two,three,four
Utah,8.0,9.0,10.0,11.0
California,12.0,13.0,14.0,15.0


In [83]:
data[data['three']>3]

Unnamed: 0,one,two,three,four
Colorado,4.0,5.0,6.0,7.0
Utah,8.0,9.0,10.0,11.0
California,12.0,13.0,14.0,15.0


### Loc and iLoc
in Pandas if you have an axis index containing integers, data selection will always be label<br>
For more precise handling, use `loc` (for labels) or `iloc` (for integers)

In [39]:
import numpy as np
ser = pd.Series(np.arange(3.))
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [40]:
ser[:1]

0    0.0
dtype: float64

In [41]:
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [133]:
ser.iloc[:1]

0    0.0
dtype: float64

### Arithmetic
When you are adding together objects, if any index pairs are not the same, the respective index in the result will be the union of the index pairs

In [87]:
data1 = pd.Series([7.5, -3.4, 5.3, -2.4], index=['a','b','c','d'])
data2 = pd.Series([3.6, -6.7, 4.9, -4.2, 5.7], index=['a','c','e','f','g'])

In [88]:
data1

a    7.5
b   -3.4
c    5.3
d   -2.4
dtype: float64

In [89]:
data2

a    3.6
c   -6.7
e    4.9
f   -4.2
g    5.7
dtype: float64

In [90]:
data1+data2

a    11.1
b     NaN
c    -1.4
d     NaN
e     NaN
f     NaN
g     NaN
dtype: float64

Gives missing values for the labels that do not overlap<br>
In case of DataFrames, alignment is done on rows and columns

In [42]:
df1 = pd.DataFrame(np.arange(9.).reshape(3,3), columns=list('bcd'),
                  index=['Ohio', 'Texas', 'Colorado'])
print(df1)
df2 = pd.DataFrame(np.arange(12.).reshape(4,3), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df2)

            b    c    d
Ohio      0.0  1.0  2.0
Texas     3.0  4.0  5.0
Colorado  6.0  7.0  8.0
          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0


In [93]:
df1+df2 # Returns the union of all the rows and columns in df1 and df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


### With Fill Values

In [94]:
df1.add(df2, fill_value=0)

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


In [97]:
df1.reindex(columns=df2.columns, fill_value=0) # During Reindexing also you can fill different values

Unnamed: 0,b,d,e
Ohio,0.0,2.0,0
Texas,3.0,5.0,0
Colorado,6.0,8.0,0


In [96]:
df2.add(df1, fill_value=0)

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


### Operations between DataFrame and Series

In [98]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [99]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [100]:
frame-series # Check broadcasting

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


If an index value is not found in either the DataFrame’s columns or the Series’s index, the objects will be reindexed to form the union

In [101]:
series2 = pd.Series(range(3), index=['b', 'e', 'f'])

In [102]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


### Function Application and Mapping

In [5]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [4]:
import pandas as pd
import numpy as np

In [6]:
frame

Unnamed: 0,b,d,e
Utah,-0.230615,-0.186721,-2.537894
Ohio,0.458077,-0.948036,0.844446
Texas,-0.378658,-0.230795,-0.955194
Oregon,1.828457,-1.034461,-1.121034


In [7]:
np.abs(frame) #Numpy ufunc

Unnamed: 0,b,d,e
Utah,0.230615,0.186721,2.537894
Ohio,0.458077,0.948036,0.844446
Texas,0.378658,0.230795,0.955194
Oregon,1.828457,1.034461,1.121034


`apply` - Apply function on One=Dimensional Array to each column or row

In [8]:
f = lambda x: x.max() - x.min()
frame.apply(f)

b    2.207115
d    0.847740
e    3.382339
dtype: float64

function is invoked once on each column. The result is a Series having the columns of frame as its index<br>
You can also invoke fuction once on each row also

In [9]:
frame.apply(f, axis = 'columns')

Utah      2.351172
Ohio      1.792482
Texas     0.724398
Oregon    2.949490
dtype: float64

Function can also return series of values

In [10]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.378658,-1.034461,-2.537894
max,1.828457,-0.186721,0.844446


### Sorting
use `sort_index` and `sort_value`

In [13]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In DataFrame, you can sort on either axis

In [14]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [15]:
frame.sort_index(axis=1) #Soritng on Column

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [16]:
frame.sort_index(axis=1, ascending=False) # Ascending / Descending Order

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


#### Sort by values
`sort_values`

In [17]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [44]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2]) # missing values in end
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

in DataFrame, you can use data in one or more columns as sort keys. You can use `by` option in `sort_values` methods

In [18]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [19]:
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [20]:
frame.sort_values(by=['a', 'b']) # use multiple column

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


### Axis Indexes with Duplicate Labels

In [122]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])

In [123]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [124]:
obj.index.is_unique

False

Indexing a label with multiple entries returns a Series, while single entries return a scalar value

In [125]:
obj['a']

a    0
a    1
dtype: int64

In [126]:
obj['c']

4

In [127]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])

In [128]:
df

Unnamed: 0,0,1,2
a,1.526193,0.907145,0.927441
a,-0.271271,0.740651,-0.677026
b,0.937741,0.597982,-1.898749
b,-1.636907,-0.10143,2.442544


In [129]:
df.loc['b']

Unnamed: 0,0,1,2
b,0.937741,0.597982,-1.898749
b,-1.636907,-0.10143,2.442544


## Loading Data

use `read_csv` `read_table`

In [11]:
!type C:\\python-program\\ex1.csv

The system cannot find the path specified.


In [1]:
import pandas as pd
import os
os.getcwd()

'C:\\Users\\LENOVO-PC'

In [2]:
df = pd.read_csv('D:\SAMATRIX\DAP_Dataset\NationalNames.csv')

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 23-24: malformed \N character escape (164005097.py, line 1)

In [5]:
df=pd.read_csv('D:\\SAMATRIX\\DAP_Dataset\\NationalNames.csv', sep=",")

In [6]:
df

Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746
...,...,...,...,...,...
1825428,1825429,Zykeem,2014,M,5
1825429,1825430,Zymeer,2014,M,5
1825430,1825431,Zymiere,2014,M,5
1825431,1825432,Zyran,2014,M,5


You can use `read_table` and specify delimiter

In [141]:
pd.read_table('C:\\python-program\\ex1.csv', sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [143]:
!type C:\\python-program\\ex2.csv #File without header

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo



C:\\python-program\\ex2.csv


The system cannot find the file specified.
Error occurred while processing: #File.
The system cannot find the file specified.
Error occurred while processing: without.
The system cannot find the file specified.
Error occurred while processing: header.


In [144]:
pd.read_csv('C:\\python-program\\ex2.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [3]:
pd.read_csv('C:\\python-program\\ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo
