In [1]:
import pandas as pd
import numpy as np

In [2]:
# The simplest inputs for building a Series are a plain Python list
# or a NumPy array holding the same values.
mylist = [5.4, 6.1, 1.7, 99.8]
myarray = np.array(mylist, dtype=float)

In [4]:
# Build one Series from the list and one from the array, then print both:
# the two results are identical.
myseries1 = pd.Series(mylist)
myseries2 = pd.Series(myarray)
print(myseries1)
print(myseries2)

0     5.4
1     6.1
2     1.7
3    99.8
dtype: float64
0     5.4
1     6.1
2     1.7
3    99.8
dtype: float64


In [5]:
# Access a single entry by its index label (the default labels are 0, 1, 2, ...).
print(myseries1.loc[0])


5.4


In [6]:
# Give the entries of a Series custom labels through the index argument.
mylabels = ['first', 'second', 'third', 'fourth']
myseries3 = pd.Series(mylist, index=mylabels)
print(myseries3)

first      5.4
second     6.1
third      1.7
fourth    99.8
dtype: float64


In [9]:
# Equivalent shorthand: data and index passed positionally instead of by keyword.
myseries4 = pd.Series(mylist,mylabels)
print(myseries4)

first      5.4
second     6.1
third      1.7
fourth    99.8
dtype: float64


In [10]:
# With custom labels in place, an entry is looked up by its label.
print(myseries4.loc['second'])


6.1


In [12]:
# Arithmetic between two Series matches entries by label, not by position.
# A label that is present in only one of the two Series gives NaN in the result.
myseries5 = pd.Series(
    data=[5.5, 1.1, 8.8, 1.6],
    index=['first', 'third', 'fourth', 'fifth'],
)
print(myseries5)
print()
print(myseries5 + myseries4)

first     5.5
third     1.1
fourth    8.8
fifth     1.6
dtype: float64

fifth       NaN
first      10.9
fourth    108.6
second      NaN
third       2.8
dtype: float64


In [13]:
# Concatenating two Series side by side (axis=1 / 'columns') produces a
# DataFrame with one column per Series; rows are matched by label and a
# label missing from one Series is left empty (NaN) in that column.
df1 = pd.concat([myseries4, myseries5], axis='columns', sort=False)
df1


Unnamed: 0,0,1
first,5.4,5.5
second,6.1,
third,1.7,1.1
fourth,99.8,8.8
fifth,,1.6


In [14]:
# axis=0 ('index') stacks one Series on top of the other; the result is
# still a Series, and labels present in both inputs appear twice.
zzdf1 = pd.concat([myseries4, myseries5], axis='index', sort=False)
zzdf1

first      5.4
second     6.1
third      1.7
fourth    99.8
first      5.5
third      1.1
fourth     8.8
fifth      1.6
dtype: float64

In [15]:
# pd.DataFrame builds a DataFrame directly from a 2-D array; with no labels
# given, rows and columns get the default integer labels 0..4.
# Seed NumPy's global RNG first so that Restart & Run All reproduces the same
# random numbers here and in the later cells that call np.random.randn.
np.random.seed(42)
df2 = pd.DataFrame(np.random.randn(5, 5))
df2

Unnamed: 0,0,1,2,3,4
0,0.130337,-0.499812,2.209784,0.040506,-0.597592
1,-0.290392,0.57914,-0.807361,0.590733,-0.111571
2,-1.19704,-0.120413,-1.021262,0.579281,-1.109757
3,-0.398302,0.630598,-0.266746,-0.359639,0.167183
4,-0.551412,0.139798,0.322534,-0.462264,0.496114


In [16]:
# Row labels are supplied through `index`, column labels through `columns`.
df3 = pd.DataFrame(
    data=np.random.randn(5, 5),
    index=['first row', 'second row', 'third row', 'fourth row', 'fifth row'],
    columns=['first col', 'second col', 'third col', 'fourth col', 'fifth col'],
)
df3

Unnamed: 0,first col,second col,third col,fourth col,fifth col
first row,-0.376889,1.442449,-0.550899,-0.424235,-0.200271
second row,-1.697397,-0.334572,1.184084,-0.172487,-1.987909
third row,0.962956,0.453435,0.352161,0.207854,-0.257387
fourth row,0.987294,-1.349073,-1.048137,-0.082712,0.191421
fifth row,0.901745,0.659924,0.270143,1.496105,0.069115


In [18]:
# Column access with square brackets.
# A single label selects a column (never a row) and returns it as a Series,
# because a DataFrame is a collection of Series, one per column.
print(df3['second col'])
print()
# A list of labels returns a DataFrame with the columns in the requested order.
df3[['third col', 'first col']]

first row     1.442449
second row   -0.334572
third row     0.453435
fourth row   -1.349073
fifth row     0.659924
Name: second col, dtype: float64



Unnamed: 0,third col,first col
first row,-0.550899,-0.376889
second row,1.184084,-1.697397
third row,0.352161,0.962956
fourth row,-1.048137,0.987294
fifth row,0.270143,0.901745


In [19]:
# .loc selects by label: a single row label returns that row as a Series
# whose index is the column names. Note that .loc is an indexer used with
# square brackets, not a method called with parentheses.
df3.loc['first row']

first col    -0.376889
second col    1.442449
third col    -0.550899
fourth col   -0.424235
fifth col    -0.200271
Name: first row, dtype: float64

In [20]:
# .iloc selects by integer position (0-based) rather than by label,
# so position 1 is the second row.
df3.iloc[1]


first col    -1.697397
second col   -0.334572
third col     1.184084
fourth col   -0.172487
fifth col    -1.987909
Name: second row, dtype: float64

In [21]:
# .loc with a list of row labels and a list of column labels selects a
# sub-table; rows and columns come back in the order they were requested.
df3.loc[['fourth row', 'first row'], ['second col', 'third col']]

Unnamed: 0,second col,third col
fourth row,-1.349073,-1.048137
first row,1.442449,-0.550899


In [22]:
# Logical conditions are applied element-wise: the result is a DataFrame of
# booleans with the same row and column labels as df3.
df3 > 0


Unnamed: 0,first col,second col,third col,fourth col,fifth col
first row,False,True,False,False,False
second row,False,False,True,False,False
third row,True,True,True,True,False
fourth row,True,False,False,False,True
fifth row,True,True,True,True,True


In [23]:
# Masking with a boolean DataFrame keeps the table structure: entries where
# the condition is False become NaN instead of being removed (compare this
# with filtering a plain matrix).
print(df3.where(df3 > 0))

            first col  second col  third col  fourth col  fifth col
first row         NaN    1.442449        NaN         NaN        NaN
second row        NaN         NaN   1.184084         NaN        NaN
third row    0.962956    0.453435   0.352161    0.207854        NaN
fourth row   0.987294         NaN        NaN         NaN   0.191421
fifth row    0.901745    0.659924   0.270143    1.496105   0.069115


In [24]:
# Add a column by assigning to a new column label.
# The value is a 1-D array with one entry per row; sizing it from len(df3)
# avoids hard-coding the row count.
df3['sixth col'] = np.random.randn(len(df3))
df3

Unnamed: 0,first col,second col,third col,fourth col,fifth col,sixth col
first row,-0.376889,1.442449,-0.550899,-0.424235,-0.200271,-0.956202
second row,-1.697397,-0.334572,1.184084,-0.172487,-1.987909,0.476543
third row,0.962956,0.453435,0.352161,0.207854,-0.257387,-0.088573
fourth row,0.987294,-1.349073,-1.048137,-0.082712,0.191421,0.454294
fifth row,0.901745,0.659924,0.270143,1.496105,0.069115,-0.508511


In [25]:
# Drop a column. drop() looks for the label among the rows by default
# (axis=0), so the column axis must be named explicitly: axis=1 / 'columns'.
df3.drop('first col', axis='columns')

Unnamed: 0,second col,third col,fourth col,fifth col,sixth col
first row,1.442449,-0.550899,-0.424235,-0.200271,-0.956202
second row,-0.334572,1.184084,-0.172487,-1.987909,0.476543
third row,0.453435,0.352161,0.207854,-0.257387,-0.088573
fourth row,-1.349073,-1.048137,-0.082712,0.191421,0.454294
fifth row,0.659924,0.270143,1.496105,0.069115,-0.508511


In [26]:
# drop() returned a new DataFrame; df3 itself still contains 'first col'.
df3


Unnamed: 0,first col,second col,third col,fourth col,fifth col,sixth col
first row,-0.376889,1.442449,-0.550899,-0.424235,-0.200271,-0.956202
second row,-1.697397,-0.334572,1.184084,-0.172487,-1.987909,0.476543
third row,0.962956,0.453435,0.352161,0.207854,-0.257387,-0.088573
fourth row,0.987294,-1.349073,-1.048137,-0.082712,0.191421,0.454294
fifth row,0.901745,0.659924,0.270143,1.496105,0.069115,-0.508511


In [27]:
# Keep the result by binding it to a name; df3 = df3.drop(...) works as well.
df4 = df3.drop('first col', axis='columns')

In [28]:
# df4 holds the result of the drop: 'first col' is no longer present.
df4

Unnamed: 0,second col,third col,fourth col,fifth col,sixth col
first row,1.442449,-0.550899,-0.424235,-0.200271,-0.956202
second row,-0.334572,1.184084,-0.172487,-1.987909,0.476543
third row,0.453435,0.352161,0.207854,-0.257387,-0.088573
fourth row,-1.349073,-1.048137,-0.082712,0.191421,0.454294
fifth row,0.659924,0.270143,1.496105,0.069115,-0.508511


In [35]:
# axis=0 / 'index' makes drop() remove a row by its label instead of a column.
df5 = df3.drop('second row', axis='index')
df5

Unnamed: 0,first col,second col,third col,fourth col,fifth col,sixth col
first row,-0.376889,1.442449,-0.550899,-0.424235,-0.200271,-0.956202
third row,0.962956,0.453435,0.352161,0.207854,-0.257387,-0.088573
fourth row,0.987294,-1.349073,-1.048137,-0.082712,0.191421,0.454294
fifth row,0.901745,0.659924,0.270143,1.496105,0.069115,-0.508511


In [30]:
# reset_index() moves the row labels into a regular column named 'index' and
# replaces them with the default 0..n-1 integer index. The result is still a
# DataFrame, and df5 itself is unchanged because the result is not assigned.
df5.reset_index()

Unnamed: 0,index,first col,second col,third col,fourth col,fifth col,sixth col
0,first row,-0.376889,1.442449,-0.550899,-0.424235,-0.200271,-0.956202
1,third row,0.962956,0.453435,0.352161,0.207854,-0.257387,-0.088573
2,fourth row,0.987294,-1.349073,-1.048137,-0.082712,0.191421,0.454294
3,fifth row,0.901745,0.659924,0.270143,1.496105,0.069115,-0.508511


In [36]:
# inplace=True applies the change to df5 itself instead of returning a new
# DataFrame; drop() accepts the same argument.
# NOTE(review): this cell mutates df5, so running it a second time resets the
# index again; restart and run all to reproduce the output shown below.
df5.reset_index(inplace=True)
df5

Unnamed: 0,index,first col,second col,third col,fourth col,fifth col,sixth col
0,first row,-0.376889,1.442449,-0.550899,-0.424235,-0.200271,-0.956202
1,third row,0.962956,0.453435,0.352161,0.207854,-0.257387,-0.088573
2,fourth row,0.987294,-1.349073,-1.048137,-0.082712,0.191421,0.454294
3,fifth row,0.901745,0.659924,0.270143,1.496105,0.069115,-0.508511


In [37]:
# Add a regular column holding the labels that are meant to become the new
# row index (one label per remaining row).
df5['new name'] = [
    'new first row',
    'new second row',
    'new third row',
    'new fourth row',
]

In [38]:
# Promote the 'new name' column to be the row index (modifying df5 in place),
# then display the result. Only the last expression of a cell is displayed,
# so a bare `df5` before the call would have no effect.
df5.set_index('new name', inplace=True)
df5

Unnamed: 0_level_0,index,first col,second col,third col,fourth col,fifth col,sixth col
new name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
new first row,first row,-0.376889,1.442449,-0.550899,-0.424235,-0.200271,-0.956202
new second row,third row,0.962956,0.453435,0.352161,0.207854,-0.257387,-0.088573
new third row,fourth row,0.987294,-1.349073,-1.048137,-0.082712,0.191421,0.454294
new fourth row,fifth row,0.901745,0.659924,0.270143,1.496105,0.069115,-0.508511
