In [1]:
import pandas as pd
import numpy as np
# randn is used below to build the 2-D NumPy array of random numbers that becomes the DataFrame data
from numpy.random import randn
# Excel I/O used below (to_excel / read_excel on .xlsx files) needs openpyxl: pip install openpyxl

<ol>
    <li>When we create a NumPy array, we pass data</li>
    <li>When we create a Series, we pass data and index</li>
    <li>When we create a DataFrame, we pass data, index, and columns</li>
</ol>

In [2]:
# Fix the seed of NumPy's global random state so every run draws the same numbers,
# then draw a 5x4 array of standard-normal samples to use as the DataFrame data.
np.random.seed(101)
data = np.random.randn(5, 4)
data

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

In [3]:
# Labels for the DataFrame built in the next cell:
# five row labels and four column labels.
index = [
    'A', 'B', 'C', 'D', 'E',
]
columns = [
    'W', 'X', 'Y', 'Z',
]

In [4]:
# Build the DataFrame from the random array, naming each argument so it is
# clear which one supplies the values, the row labels and the column labels.
mydataframe = pd.DataFrame(data=data, index=index, columns=columns)
mydataframe

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [5]:
# Now that we have the data, let us write it to an Excel file
# using the DataFrame method to_excel.
# The file is created relative to the current working directory.
mydataframe.to_excel('dummy-data.xlsx')

In [6]:
# read_excel is a pandas function that reads an Excel sheet
# and loads it into a pandas DataFrame.
# Note: to_excel also wrote the row labels (A..E) as the first sheet column, so here
# they come back as an ordinary 'Unnamed: 0' column next to a fresh 0..4 index;
# passing index_col=0 to read_excel would restore them as the index.
myfirstdataframe = pd.read_excel('dummy-data.xlsx')
myfirstdataframe

Unnamed: 0.1,Unnamed: 0,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [7]:
# Rebuild the frame from the same array and labels so the selection examples
# below start from a clean copy.
mydataframe = pd.DataFrame(data=data, index=index, columns=columns)
mydataframe

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### Selection and Indexing

In [8]:
# Select a single column by its label.
mydataframe['W']
# Selecting a single column with a plain label returns a Series (see "Name: W" in the output).

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [9]:
# Select more than one column by passing a list of labels.
mydataframe[['W', 'Y']]
# Selecting with a list of labels returns a DataFrame.

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077
C,-2.018168,0.528813
D,0.188695,-0.933237
E,0.190794,2.605967


In [10]:
# To select only one column but get a DataFrame instead of a Series,
# pass a one-element list of labels.
mydataframe[['W']]

Unnamed: 0,W
A,2.70685
B,0.651118
C,-2.018168
D,0.188695
E,0.190794


In [11]:
# Creating a new column: assign to dataframe['newcolumnname'].
# Here the new 'total' column is the row-wise sum of the four existing columns.
mydataframe['total'] = (
    mydataframe['W']
    + mydataframe['X']
    + mydataframe['Y']
    + mydataframe['Z']
)
mydataframe

Unnamed: 0,W,X,Y,Z,total
A,2.70685,0.628133,0.907969,0.503826,4.746778
B,0.651118,-0.319318,-0.848077,0.605965,0.089688
C,-2.018168,0.740122,0.528813,-0.589001,-1.338233
D,0.188695,-0.758872,-0.933237,0.955057,-0.548357
E,0.190794,1.978757,2.605967,0.683509,5.459028


In [12]:
# How to drop a particular column.
# - By default drop() looks for a ROW labelled 'total'; axis=1 tells it to drop a column.
# - drop() returns a new frame and leaves the original untouched unless inplace=True is passed.
# - Running this cell a second time (or dropping a column that was never created)
#   would normally raise an error because 'total' does not exist;
#   errors='ignore' suppresses that error.
# - Be careful: with errors='ignore' a misspelled label is silently ignored too.
mydataframe.drop('total', axis=1, inplace=True, errors='ignore')

In [13]:
# Display the frame to confirm the 'total' column is gone.
mydataframe

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [14]:
# How to retrieve rows.
# mydataframe['A'] does not work for this: the bracket syntax is reserved for column selection.
# To select a row we must use loc (by label) or iloc (by position), as with Series.
mydataframe.loc['A']
# A single row label returns a Series whose index is the column labels.

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [15]:
# Passing a list of row labels to loc returns a DataFrame with those rows.
mydataframe.loc[['A', 'C', 'E']]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


In [16]:
# To retrieve a single row as a DataFrame instead of a Series,
# pass a one-element list of labels.
mydataframe.loc[['A']]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826


In [17]:
# How to retrieve rows by integer position (0-based) instead of by label.
mydataframe.iloc[0]
# A single position returns a Series.

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [18]:
# Passing a list of positions to iloc returns a DataFrame with those rows.
mydataframe.iloc[[0, 2, 4]]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


In [19]:
# To retrieve a single row by position as a DataFrame instead of a Series,
# pass a one-element list of positions.
mydataframe.iloc[[0]]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826


In [20]:
# iloc accepts positional slices; as with normal Python slicing the end position is excluded,
# so 1:4 returns the rows at positions 1, 2 and 3.
# (loc can slice too, but by label and with BOTH ends included, e.g. mydataframe.loc['B':'D'].)
mydataframe.iloc[1:4]

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [21]:
# Display the frame as it stands before a 'total' row is added.
mydataframe

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [23]:
# Append a 'total' row holding the column-wise sums.
# Any existing 'total' row is removed first so that re-running this cell does not
# fold a previous total into the new sum; errors='ignore' makes the drop a no-op
# on the first run, when the row does not exist yet.
mydataframe.drop(index='total', inplace=True, errors='ignore')
mydataframe.loc['total'] = mydataframe.sum(axis=0)
mydataframe

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509
total,1.719289,2.268822,2.261436,2.159356


### Conditional Selection

In [24]:
# Display the frame, which now includes the 'total' row.
mydataframe

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509
total,1.719289,2.268822,2.261436,2.159356


In [28]:
# We want to retrieve only the positive values in the DataFrame.
# The comparison is applied element-wise and gives a same-shaped DataFrame of True / False.
mydataframe > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True
total,True,True,True,True


In [27]:
# Indexing with the boolean DataFrame keeps the original shape:
# values where the condition is False are replaced by NaN (shown as blanks in the output).
mydataframe[mydataframe > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509
total,1.719289,2.268822,2.261436,2.159356


In [30]:
# Now apply the positive-number test to column W only.
# A boolean Series selects whole rows: only rows where W > 0 are kept (row C is dropped).
mydataframe[mydataframe['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509
total,1.719289,2.268822,2.261436,2.159356


In [31]:
# Conditions can be combined with the element-wise operators & (and) and | (or).
# Each condition needs its own parentheses because & binds tighter than >.
mydataframe[(mydataframe['W'] > 0) & (mydataframe['Y'] > 0)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
E,0.190794,1.978757,2.605967,0.683509
total,1.719289,2.268822,2.261436,2.159356


In [32]:
# Conditional selection returns a DataFrame, so it can be chained with ordinary
# row/column access: filter the rows first, then keep only columns X and Z.
w_and_y_positive = (mydataframe['W'] > 0) & (mydataframe['Y'] > 0)
mydataframe[w_and_y_positive][['X', 'Z']]

Unnamed: 0,X,Z
A,0.628133,0.503826
E,1.978757,0.683509
total,2.268822,2.159356


In [39]:
# Is it possible to iterate over the entire DataFrame? Yes: iterrows() yields
# (row label, row as a Series) pairs.
# The loop variables are deliberately NOT called `index`: that name already holds
# the list of row labels used to build the frame, and overwriting it here would
# break any later re-run of the DataFrame-construction cells.
for row_label, row in mydataframe.iterrows():
    print(row_label)
    w, x, y, z = row  # unpack the four column values of this row
    print(w, x, y, z)

A
2.706849839399938 0.6281327087844596 0.9079694464765431 0.5038257538223936
B
0.6511179479432686 -0.31931804459303326 -0.8480769834036315 0.6059653494949336
C
-2.018168244037392 0.7401220570561068 0.5288134940893595 -0.5890005332865824
D
0.18869530944922425 -0.758872056210466 -0.9332372163009188 0.9550565092637361
E
0.19079432237171562 1.9787573241128278 2.60596727979128 0.6835088855389145
total
1.7192891751267547 2.268821989149895 2.2614360206526323 2.159355964833395


### Creating our own indexes

In [40]:
# Display the frame (still including the 'total' row) before changing its index.
mydataframe

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509
total,1.719289,2.268822,2.261436,2.159356


In [42]:
# Let us drop the 'total' row.
# errors='ignore' keeps the cell re-runnable: a second run (when the row is already
# gone) is a no-op instead of a KeyError, matching how the 'total' row is dropped
# before it is recomputed earlier in the notebook.
mydataframe.drop('total', axis=0, inplace=True, errors='ignore')
mydataframe

In [44]:
# Let us reset the index: the current labels move into a regular column named 'index'
# and the rows get a default integer index starting at 0.
# This returns a new frame and is not permanent; pass inplace=True to modify mydataframe itself.
mydataframe.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [46]:
# One state name per row of mydataframe; used as the new index below.
newindex_column = [
    'Selangor',
    'Perak',
    'Penang',
    'Kedah',
    'Johor',
]
newindex_column

['Selangor', 'Perak', 'Penang', 'Kedah', 'Johor']

In [47]:
# Add the list as a new 'state' column on the existing DataFrame.
# The list is assigned by position, so it must have exactly one entry per row.
mydataframe['state'] = newindex_column
mydataframe

Unnamed: 0,W,X,Y,Z,state
A,2.70685,0.628133,0.907969,0.503826,Selangor
B,0.651118,-0.319318,-0.848077,0.605965,Perak
C,-2.018168,0.740122,0.528813,-0.589001,Penang
D,0.188695,-0.758872,-0.933237,0.955057,Kedah
E,0.190794,1.978757,2.605967,0.683509,Johor


In [50]:
# Promote the 'state' column to be the row index, replacing the A..E labels
# (inplace=True modifies mydataframe itself).
# The column is moved into the index, so re-running this cell raises KeyError
# unless the 'state' column is added again first.
mydataframe.set_index('state', inplace=True)
mydataframe

Unnamed: 0_level_0,W,X,Y,Z
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Selangor,2.70685,0.628133,0.907969,0.503826
Perak,0.651118,-0.319318,-0.848077,0.605965
Penang,-2.018168,0.740122,0.528813,-0.589001
Kedah,0.188695,-0.758872,-0.933237,0.955057
Johor,0.190794,1.978757,2.605967,0.683509


In [51]:
# Rows can now be selected by state name with loc.
mydataframe.loc[['Selangor', 'Kedah']]

Unnamed: 0_level_0,W,X,Y,Z
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Selangor,2.70685,0.628133,0.907969,0.503826
Kedah,0.188695,-0.758872,-0.933237,0.955057


In [56]:
# Build a two-level (hierarchical) index: an outer group label paired with an inner number.
outside_index = ['G1'] * 3 + ['G2'] * 3
inner_index = [1, 2, 3] * 2
# Pair the two levels element-wise into (outer, inner) tuples ...
new_index = [(outer, inner) for outer, inner in zip(outside_index, inner_index)]
# ... and pass those tuples to MultiIndex.from_tuples to create the multi index.
multi_index = pd.MultiIndex.from_tuples(new_index)
multi_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [57]:
# Create a DataFrame with 6 rows (one per multi-index entry) and 2 columns.
# The values come from NumPy's global random state, so they depend on the seed
# set earlier and on how many numbers have been drawn since.
our_dataframe = pd.DataFrame(data=randn(6, 2), index=multi_index, columns=['A', 'B'])
our_dataframe

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [59]:
# Selecting an outer-level label returns the sub-frame for that group,
# indexed by the inner level only.
our_dataframe.loc['G1']

Unnamed: 0,A,B
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [61]:
# Select group G2 first, then pick inner-level labels 1 and 3 from that sub-frame.
our_dataframe.loc['G2'].loc[[1, 3]]

Unnamed: 0,A,B
1,0.166905,0.184502
3,0.638787,0.329646
