# Pandas


# -Series

In [18]:
import pandas as pd
from pandas import Series,DataFrame
import numpy as np

In [9]:
obj = pd.Series([5,4,3,2,1])

In [7]:
obj

0    5
1    4
2    3
3    2
4    1
dtype: int64

In [10]:
obj.values

array([5, 4, 3, 2, 1], dtype=int64)

In [11]:
obj.index

RangeIndex(start=0, stop=5, step=1)

In [14]:
obj2 = Series([4,5,-4.2,3],index=['a','b','c','d'])

In [15]:
obj2

a    4.0
b    5.0
c   -4.2
d    3.0
dtype: float64

In [21]:
sdata ={'ohio':35000,'Texas':71000,'oregon':16000,'utah':64000}

In [22]:
obj3 = Series(sdata)

In [23]:
obj3

ohio      35000
Texas     71000
oregon    16000
utah      64000
dtype: int64

In [31]:
obj3.name = 'population'
obj3.index.name = 'states'


In [32]:
obj3

states
ohio      35000
Texas     71000
oregon    16000
utah      64000
Name: population, dtype: int64

# DataFrame


In [33]:
data = {'State':['ohio','ohio','ohio','nevada','nevada'],
       'year':[2000,2001,2002,2001,2002],'pop':[1.2,1.7,3.6,2,9]}

In [34]:
df = DataFrame(data)

In [35]:
df

Unnamed: 0,State,year,pop
0,ohio,2000,1.2
1,ohio,2001,1.7
2,ohio,2002,3.6
3,nevada,2001,2.0
4,nevada,2002,9.0


In [36]:
DataFrame(data,columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,,1.2
1,2001,,1.7
2,2002,,3.6
3,2001,,2.0
4,2002,,9.0


In [37]:
data2 = {'nevada':{2001:2.5 , 2000:2.9},'ohio':{2000:1.5,2001:1.7,2002:3.6}}

In [38]:
df2 = DataFrame(data2)

In [39]:
df2

Unnamed: 0,nevada,ohio
2000,2.9,1.5
2001,2.5,1.7
2002,,3.6


In [40]:
DataFrame(data2,index=[2001,2000,2003])

Unnamed: 0,nevada,ohio
2001,2.5,1.7
2000,2.9,1.5
2003,,


In [43]:
df2.index.name= 'year'
df2.columns.name = 'states'

In [44]:
df2

states,nevada,ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,2.9,1.5
2001,2.5,1.7
2002,,3.6


# operation on df

In [1]:
import pandas as pd
import numpy as np

In [6]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0,10,4))
ser

0    6
1    3
2    7
3    4
dtype: int32

In [7]:
df = pd.DataFrame(rng.randint(0,10,(3,4)),columns=list('abcd'))
df

Unnamed: 0,a,b,c,d
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [8]:
A = pd.Series([2,4,6],index = [0,1,2])
b = pd.Series([1,3,5],index = [1,2,3])
A+b

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [9]:
A.add(b)

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [10]:
A.add(b,fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [11]:
A = pd.DataFrame(rng.randint(0,20,(2,2)),columns=list('ab'))
A

Unnamed: 0,a,b
0,1,11
1,5,1


In [12]:
B = pd.DataFrame(rng.randint(0,20,(3,3)),columns=list('bac'))
B

Unnamed: 0,b,a,c
0,0,11,11
1,16,9,15
2,14,14,18


In [13]:
A+B

Unnamed: 0,a,b,c
0,12.0,11.0,
1,14.0,17.0,
2,,,


In [14]:
fill = A.stack().mean()
fill

4.5

In [15]:
A.add(B,fill_value=fill)

Unnamed: 0,a,b,c
0,12.0,11.0,15.5
1,14.0,17.0,19.5
2,18.5,18.5,22.5


In [16]:
A = rng.randint(10,size=(3,4))
A

array([[6, 3, 8, 2],
       [4, 2, 6, 4],
       [8, 6, 1, 3]])

In [17]:
df = pd.DataFrame(A,columns=list('qrst'))
df

Unnamed: 0,q,r,s,t
0,6,3,8,2
1,4,2,6,4
2,8,6,1,3


In [25]:
df.iloc[0]

q    6
r    3
s    8
t    2
Name: 0, dtype: int32

In [26]:
df-df.iloc[0]

Unnamed: 0,q,r,s,t
0,0,0,0,0
1,-2,-1,-2,2
2,2,3,-7,1


In [27]:
df.sub(df['r'],axis=0)

Unnamed: 0,q,r,s,t
0,3,0,5,-1
1,2,0,4,2
2,2,0,-5,-3


In [30]:
df.sub(df['s'],axis=0)

Unnamed: 0,q,r,s,t
0,-2,-5,0,-6
1,-2,-4,0,-2
2,7,5,0,2


# operation on null values


In [33]:
data = pd.Series([1,np.nan,'hello',None])

In [34]:
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [35]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [36]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [37]:
data.dropna()

0        1
2    hello
dtype: object

In [38]:
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [40]:
df = pd.DataFrame([[1,2,np.nan],[4,5,6],[np.nan,7,8]])

In [45]:
df
df[3]=np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,2,,
1,4.0,5,6.0,
2,,7,8.0,


In [46]:
df.dropna(axis='columns',how='all')

Unnamed: 0,0,1,2
0,1.0,2,
1,4.0,5,6.0
2,,7,8.0


In [47]:
df.dropna(axis='rows',thresh=3)

Unnamed: 0,0,1,2,3
1,4.0,5,6.0,


In [48]:
data = pd.Series([1,np.nan,2,None,3],list('abcde'))

In [49]:
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [50]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [51]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2,3
0,1.0,2,,
1,4.0,5,6.0,
2,4.0,7,8.0,


In [52]:
df.fillna(method='bfill')

Unnamed: 0,0,1,2,3
0,1.0,2,6.0,
1,4.0,5,6.0,
2,,7,8.0,


In [61]:
valus = {'0':0,'1':1,'2':2}
df.fillna(0,axis='index',limit=2)

Unnamed: 0,0,1,2,3
0,1.0,2,0.0,0.0
1,4.0,5,6.0,0.0
2,0.0,7,8.0,


In [59]:
df

Unnamed: 0,0,1,2,3
0,1.0,2,,
1,4.0,5,6.0,
2,,7,8.0,


# concat and append

In [1]:
import pandas as pd
import numpy as np

In [20]:
def make_df(cols,ind):
    data = {c:[str(c)+str(i) for i in ind] for c in cols}
    return pd.DataFrame(data,ind)
    

In [21]:
df = make_df('ABC',range(3))
df

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [33]:
ser1 = pd.Series(['A','B','C'],index=[1,2,3])
ser2 = pd.Series(['D','E','F'],index=[4,5,6])


In [34]:
pd.concat([ser1,ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [35]:
ser2 = pd.Series(ser2.values,index=[1,2,3])
pd.concat([ser1,ser2],ignore_index=True)

0    A
1    B
2    C
3    D
4    E
5    F
dtype: object

In [32]:
ser1.values

array(['A', 'B', 'C'], dtype=object)

In [58]:
df1 = make_df('AB',[1,2])
df2 = make_df('AB',[3,4])

In [59]:
print(df1)
print(df2)

    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4


In [60]:
pd.concat([df1,df2],)

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [61]:
df3  = make_df('AB',[0,1])
df4 = make_df('CD',[0,1])

In [64]:
pd.concat([df3,df4],axis=1,ignore_index=False)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


In [65]:
x = make_df('AB',[0,1])
y = make_df('AB',[2,3])


In [68]:
y.index=x.index
pd.concat([x,y])

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A2,B2
1,A3,B3


In [74]:
pd.concat([x,y],keys=['x','y'])

Unnamed: 0,Unnamed: 1,A,B
x,0,A0,B0
x,1,A1,B1
y,0,A2,B2
y,1,A3,B3


# Concatenation with joins

In [80]:
df5 = make_df('ABC',[1,2])
df6 = make_df('BCD',[1,2])
pd.concat([df5,df6])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
1,,B1,C1,D1
2,,B2,C2,D2


In [81]:
pd.concat([df5,df6],join='inner',keys=['df5','df6'])

Unnamed: 0,Unnamed: 1,B,C
df5,1,B1,C1
df5,2,B2,C2
df6,1,B1,C1
df6,2,B2,C2


# append()

In [82]:
df5.append(df6)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
1,,B1,C1,D1
2,,B2,C2,D2


# merge and join