In [41]:
# Pandas - DataFrames
# Constructor signature:
# class pandas.DataFrame(data=None,
#                        index=None,
#                        columns=None,
#                        dtype=None,
#                        copy=False)
import pandas as pd
import numpy as np

In [42]:
# Column-oriented input: each key is a column label, each list holds that
# column's values (all lists have the same length).
data = dict(
    Country=['Belgium', 'India', 'Brazil'],
    Capital=['Brussels', 'New Delhi', 'Brasilia'],
    Population=[122020, 7774744, 3664838],
)
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,122020
1,India,New Delhi,7774744
2,Brazil,Brasilia,3664838


In [43]:
# Constructing a DataFrame from a dictionary.
# Example 1: two integer columns and one string column, two rows each.
data_dict = dict(col1=[1, 2], col2=[3, 4], col3=['a', 'b'])

In [44]:
# Build the DataFrame: the dictionary keys become the column labels.
df = pd.DataFrame.from_dict(data_dict)
df

Unnamed: 0,col1,col2,col3
0,1,3,a
1,2,4,b


In [45]:
# data type of each column
df.dtypes


col1     int64
col2     int64
col3    object
dtype: object

In [46]:
# shape of the dataframe
df.shape

(2, 3)

In [47]:
# Example 2: columns supplied as labelled Series whose indexes only partly
# overlap ('b' and 'c' are shared; the other labels appear in one Series only).
data_dict = {
    'one': pd.Series({'a': 1., 'b': 2., 'c': 3., 'e': 5.3}),
    'two': pd.Series({'s': 1., 'b': 2., 'c': 3., 'd': 4.}),
}

In [48]:
df = pd.DataFrame(data_dict)

In [49]:
df

Unnamed: 0,one,two
a,1.0,
b,2.0,2.0
c,3.0,3.0
d,,4.0
e,5.3,
s,,1.0


In [50]:
# index= picks the row labels to keep and fixes their order.
df = pd.DataFrame(data=data_dict, index=list('dba'))
df

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,


In [51]:
# Requesting column labels that do not exist in data_dict: the columns are
# created but hold only missing values.
df = pd.DataFrame(
    data=data_dict,
    index=list('dba'),
    columns=['PYTHON', 'SPARK'],
)
df

Unnamed: 0,PYTHON,SPARK
d,,
b,,
a,,


In [52]:
# columns= with existing keys selects those columns and sets their order
# ('two' before 'one' here).
df = pd.DataFrame(
    data=data_dict,
    index=list('dba'),
    columns=['two', 'one'],
)
df

Unnamed: 0,two,one
d,4.0,
b,2.0,2.0
a,,1.0


In [53]:
df.index

Index(['d', 'b', 'a'], dtype='object')

In [54]:
df.columns

Index(['two', 'one'], dtype='object')

In [55]:
# Constructing a DataFrame from numpy ndarrays / lists.
# Example 1: a dict of equal-length lists of floats.
data = dict(
    one=[1., 2., 3., 4.],
    two=[4., 3., 2., 1.],
)

In [56]:
pd.DataFrame(data)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [57]:
pd.DataFrame(data, index=['a', 'b', 'c', 'd'])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


In [60]:
# From a list of dicts: one dict per row; the second row carries an extra
# key 'c' that the first row lacks.
data = [
    dict(a=1, b=2),
    dict(a=5, b=10, c=20),
]

In [61]:
pd.DataFrame(data)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [62]:
pd.DataFrame(data, index=['first', 'second'])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [63]:
pd.DataFrame(data, columns=['a', 'b'], index=['first', 'second'])

Unnamed: 0,a,b
first,1,2
second,5,10


In [65]:
# Column selection, addition, deletion.
# Starting frame: 'one' has no value for label 'd', so that cell is NaN.
data_dict = {
    'one': pd.Series({'a': 1., 'b': 2., 'c': 3.}),
    'two': pd.Series({'a': 1., 'b': 2., 'c': 3., 'd': 4.}),
}
df = pd.DataFrame.from_dict(data_dict)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [66]:
# Column selection, addition, deletion

df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [67]:
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [68]:
# Add a derived column: element-wise product of 'one' and 'two'.
df['three'] = df['one'].mul(df['two'])

In [69]:
df

Unnamed: 0,one,two,three
a,1.0,1.0,1.0
b,2.0,2.0,4.0
c,3.0,3.0,9.0
d,,4.0,


In [70]:
# Boolean column: True where 'one' is greater than 2 (a missing value
# compares as False).
df['flag'] = df['one'].gt(2)
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [71]:
# Delete the 'three' column with the del statement; df itself is modified,
# so the display below no longer shows that column.
del df['three']
df

Unnamed: 0,one,two,flag
a,1.0,1.0,False
b,2.0,2.0,False
c,3.0,3.0,True
d,,4.0,False


In [72]:
# Assigning a scalar to a new column label broadcasts that value to every
# row of the column.
df['foo'] = 'bar'
df

Unnamed: 0,one,two,flag,foo
a,1.0,1.0,False,bar
b,2.0,2.0,False,bar
c,3.0,3.0,True,bar
d,,4.0,False,bar


In [73]:
# handling missing values


In [74]:
# Two Series with the same values but different labels; only 'c' and 'e'
# occur in both indexes.
series1 = pd.Series(data=[1, 2, 3, 4, 5], index=list('abcde'))
series2 = pd.Series(data=[1, 2, 3, 4, 5], index=list('cefgh'))

In [75]:
# Label-aligned addition: a label present in only one operand gives NaN.
sum_series = series1.add(series2)
sum_series

a    NaN
b    NaN
c    4.0
d    NaN
e    7.0
f    NaN
g    NaN
h    NaN
dtype: float64

In [76]:
sum_series.isnull()

a     True
b     True
c    False
d     True
e    False
f     True
g     True
h     True
dtype: bool

In [77]:
sum_series.dropna()

c    4.0
e    7.0
dtype: float64

In [78]:
sum_series

a    NaN
b    NaN
c    4.0
d    NaN
e    7.0
f    NaN
g    NaN
h    NaN
dtype: float64

In [79]:
# Keep only the non-missing entries in a new Series; sum_series is not modified.
dropped_na = sum_series[sum_series.notna()]
dropped_na

c    4.0
e    7.0
dtype: float64

In [80]:
sum_series.fillna(100)

a    100.0
b    100.0
c      4.0
d    100.0
e      7.0
f    100.0
g    100.0
h    100.0
dtype: float64

In [81]:
# Reading from an external file: the path is relative, so it is resolved
# against the current working directory.
dataset = "iris.csv"
df = pd.read_csv(filepath_or_buffer=dataset)


In [82]:
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [84]:
# Assigning new columns: assign() returns a new frame (df1) with the extra
# column; df is left as it was.
# NOTE(review): the column is called sepal_ratio but the divisor is
# 'petal.length', not 'sepal.length' — confirm which ratio is intended.
df1 = df.assign(
    sepal_ratio=lambda frame: frame['sepal.width'] / frame['petal.length']
)
df1.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,sepal_ratio
0,5.1,3.5,1.4,0.2,Setosa,2.5
1,4.9,3.0,1.4,0.2,Setosa,2.142857
2,4.7,3.2,1.3,0.2,Setosa,2.461538
3,4.6,3.1,1.5,0.2,Setosa,2.066667
4,5.0,3.6,1.4,0.2,Setosa,2.571429


In [85]:
# Another way to add a column: assign to a new label directly, which
# modifies df itself.
df['sepal.ratio1'] = df['sepal.width'].div(df['sepal.length'])

In [86]:
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,sepal.ratio1
0,5.1,3.5,1.4,0.2,Setosa,0.686275
1,4.9,3.0,1.4,0.2,Setosa,0.612245
2,4.7,3.2,1.3,0.2,Setosa,0.680851
3,4.6,3.1,1.5,0.2,Setosa,0.673913
4,5.0,3.6,1.4,0.2,Setosa,0.72


In [87]:
# Indexing / Selection 

In [88]:
df.loc[140]

sepal.length          6.7
sepal.width           3.1
petal.length          5.6
petal.width           2.4
variety         Virginica
sepal.ratio1     0.462687
Name: 140, dtype: object

In [89]:
# Data alignment and arithmetic

In [90]:
# Two random frames of different shapes for the alignment demo:
# df1 is 10 rows x 4 columns (A-D), df2 is 7 rows x 3 columns (A-C).
# Seeding the global NumPy generator first makes the values identical on
# every Restart & Run All; without it the frames change on each run.
np.random.seed(42)
df1 = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])

In [91]:
df1

Unnamed: 0,A,B,C,D
0,0.541573,-0.411726,-0.862064,0.325585
1,-0.24742,0.975497,0.398248,0.054182
2,-0.274666,-1.073315,-0.655038,0.397745
3,0.847149,0.883859,0.680061,-1.380975
4,0.52823,-0.728351,0.04766,2.752956
5,-1.58359,0.98896,0.299247,-0.073376
6,0.536448,-0.245262,0.327077,0.053927
7,-1.614727,-0.091774,1.270963,-2.454288
8,0.270479,-2.516682,0.696709,-1.029803
9,1.201773,0.216292,1.468683,0.104216


In [92]:
df2

Unnamed: 0,A,B,C
0,-0.369417,-0.585213,-1.043257
1,1.063684,0.488576,0.351627
2,0.084939,-1.035063,-0.021085
3,-0.142826,1.249822,-1.028223
4,-0.179214,1.627081,1.101896
5,-0.549481,-0.456153,0.704313
6,2.220305,-0.648085,-0.31818


In [93]:
# Addition aligns on both row and column labels; positions that exist in
# only one of the frames come out as NaN.
df1.add(df2)

Unnamed: 0,A,B,C,D
0,0.172156,-0.996939,-1.905321,
1,0.816264,1.464073,0.749875,
2,-0.189726,-2.108378,-0.676124,
3,0.704324,2.133681,-0.348162,
4,0.349015,0.89873,1.149557,
5,-2.133072,0.532807,1.003559,
6,2.756753,-0.893346,0.008897,
7,,,,
8,,,,
9,,,,


In [94]:
# Boolean operators.
# Two 3x2 frames built from 0/1 flags and converted to bool.
df1 = pd.DataFrame(dict(a=[1, 0, 1], b=[0, 1, 1])).astype(bool)
df2 = pd.DataFrame(dict(a=[0, 1, 1], b=[1, 1, 0])).astype(bool)

In [95]:
df1

Unnamed: 0,a,b
0,True,False
1,False,True
2,True,True


In [96]:
df2

Unnamed: 0,a,b
0,False,True
1,True,True
2,True,False


In [97]:
df1 & df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [98]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [99]:
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [100]:
# Element-wise logical NOT of the boolean frame, written with the inversion
# operator ~.
~df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


In [101]:
# Transposing

In [102]:
# 8x3 frame of standard-normal values with columns A, B, C for the transpose
# demo. Seeding first keeps the values the same on every run.
np.random.seed(0)
df = pd.DataFrame(np.random.randn(8, 3), columns=list('ABC'))

In [103]:
df

Unnamed: 0,A,B,C
0,-0.35597,0.114706,-1.694614
1,0.466469,-1.098946,1.606351
2,0.109063,3.663814,-1.51431
3,-0.904332,-1.4145,0.982715
4,1.00562,0.422432,1.409773
5,0.81859,1.814285,-2.027687
6,-0.083842,0.118467,1.980239
7,-1.344352,-1.053438,1.674619


In [104]:
# Transpose: rows become columns and columns become rows.
df.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7
A,-0.35597,0.466469,0.109063,-0.904332,1.00562,0.81859,-0.083842,-1.344352
B,0.114706,-1.098946,3.663814,-1.4145,0.422432,1.814285,0.118467,-1.053438
C,-1.694614,1.606351,-1.51431,0.982715,1.409773,-2.027687,1.980239,1.674619
