In [3]:
import numpy as np
import pandas as pd

### Creation of Series

In [4]:
mylist1 = [10,20,30,40]

In [5]:
#Series objects have indexed labels
pd.Series(data=mylist1)

0    10
1    20
2    30
3    40
dtype: int64

In [6]:
#for custom index labels
pd.Series(data=mylist1, index = ['a', 'b', 'c', 'd'])

a    10
b    20
c    30
d    40
dtype: int64

In [7]:
labels = ['a', 'b', 'c', 'd']
pd.Series(mylist1, labels)

a    10
b    20
c    30
d    40
dtype: int64

#### Creation of series using arrays

In [8]:
array1 = np.array(mylist1)

In [9]:
pd.Series(data=array1)

0    10
1    20
2    30
3    40
dtype: int32

#### Creation of series using dictionaries

In [10]:
mydict1 = {'a':1, 'b':2, 'c':3, 'd':4}

In [11]:
mydict1

{'a': 1, 'b': 2, 'c': 3, 'd': 4}

In [12]:
pd.Series(mydict1)

a    1
b    2
c    3
d    4
dtype: int64

In [13]:
#it automatically takes the key of dictionary as index in the series

In [14]:
series1 = pd.Series([10,20,30,40,50], index=['a', 'b', 'c', 'd','e'])
series1

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [15]:
series1['d']

40

In [16]:
series1['a']

10

In [17]:
from numpy.random import randint

In [18]:
df=pd.DataFrame(randint(10,100,20).reshape(5,4), ['A','B','C','D','E'], ['W','X','Y','Z'])

#after reshape it follows the format, rows, columns

In [19]:
df  #2D data frame

Unnamed: 0,W,X,Y,Z
A,54,77,63,28
B,54,28,12,10
C,95,88,70,18
D,54,64,56,14
E,52,75,49,13


In [20]:
df['Z']

A    28
B    10
C    18
D    14
E    13
Name: Z, dtype: int32

In [21]:
df[['Y', 'Z']]

Unnamed: 0,Y,Z
A,63,28
B,12,10
C,70,18
D,56,14
E,49,13


In [22]:
df[['X', 'Y', 'Z']]

Unnamed: 0,X,Y,Z
A,77,63,28
B,28,12,10
C,88,70,18
D,64,56,14
E,75,49,13


In [23]:
#whenever we are pulling a single column, it will always show an output as series
#whenever we pull up more than one colume, it will always show a data frame output

- Adding a column using existing column

In [24]:
df['R'] = df['X'] + df['Y']

In [25]:
df

Unnamed: 0,W,X,Y,Z,R
A,54,77,63,28,140
B,54,28,12,10,40
C,95,88,70,18,158
D,54,64,56,14,120
E,52,75,49,13,124


In [26]:
#creates a new column r whose values are the sum of values in column X and Y

- Adding a column using single and multiple values

In [27]:
df['S'] = 69

In [28]:
df

Unnamed: 0,W,X,Y,Z,R,S
A,54,77,63,28,140,69
B,54,28,12,10,40,69
C,95,88,70,18,158,69
D,54,64,56,14,120,69
E,52,75,49,13,124,69


In [29]:
# for just adding a column with single values, we just put the value as in this case 69
#if we need to add a column with different values, we input a list of values

In [30]:
df['T'] = [69,70,80,81,82]

In [31]:
df

Unnamed: 0,W,X,Y,Z,R,S,T
A,54,77,63,28,140,69,69
B,54,28,12,10,40,69,70
C,95,88,70,18,158,69,80
D,54,64,56,14,120,69,81
E,52,75,49,13,124,69,82


- deleting a column row and column wise

In [32]:
df.drop('S', axis = 1)

Unnamed: 0,W,X,Y,Z,R,T
A,54,77,63,28,140,69
B,54,28,12,10,40,70
C,95,88,70,18,158,80
D,54,64,56,14,120,81
E,52,75,49,13,124,82


In [33]:
df

Unnamed: 0,W,X,Y,Z,R,S,T
A,54,77,63,28,140,69,69
B,54,28,12,10,40,69,70
C,95,88,70,18,158,69,80
D,54,64,56,14,120,69,81
E,52,75,49,13,124,69,82


In [34]:
sf = df.drop('S', axis = 1) #this will delete the column S and then storre it in another data frame sf

In [35]:
sf

Unnamed: 0,W,X,Y,Z,R,T
A,54,77,63,28,140,69
B,54,28,12,10,40,70
C,95,88,70,18,158,80
D,54,64,56,14,120,81
E,52,75,49,13,124,82


In [36]:
#directly deleting a column permanently without storing it another data frame

In [37]:
df.drop('R', axis=1, inplace=True)

In [38]:
df

Unnamed: 0,W,X,Y,Z,S,T
A,54,77,63,28,69,69
B,54,28,12,10,69,70
C,95,88,70,18,69,80
D,54,64,56,14,69,81
E,52,75,49,13,69,82


In [39]:
#for dropping multiple columns

In [40]:
df.drop(['A', 'B']) #dropping row
#if we use inplace = True, it will permanently delete it instead of just displaying

Unnamed: 0,W,X,Y,Z,S,T
C,95,88,70,18,69,80
D,54,64,56,14,69,81
E,52,75,49,13,69,82


In [41]:
df.drop(['W', 'X'], axis=1)

Unnamed: 0,Y,Z,S,T
A,63,28,69,69
B,12,10,69,70
C,70,18,69,80
D,56,14,69,81
E,49,13,69,82


In [42]:
df

Unnamed: 0,W,X,Y,Z,S,T
A,54,77,63,28,69,69
B,54,28,12,10,69,70
C,95,88,70,18,69,80
D,54,64,56,14,69,81
E,52,75,49,13,69,82


In [43]:
#if we dont speicfy axis, it takes axis = 0 by default

### Accessing rows and columns:

- 1) .loc method

In [44]:
#labelled indexing

In [48]:
df.loc['A'] #accessing rows

W    54
X    77
Y    63
Z    28
S    69
T    69
Name: A, dtype: int64

In [49]:
df.loc['B']

W    54
X    28
Y    12
Z    10
S    69
T    70
Name: B, dtype: int64

- 2) i.loc

In [50]:
#numbered or integer indexing

In [52]:
df.iloc[0] #accessing first row

W    54
X    77
Y    63
Z    28
S    69
T    69
Name: A, dtype: int64

In [53]:
#for accessing more than one rows and columns

In [56]:
df.loc[['A', 'B'], ['X', 'Z']]

Unnamed: 0,X,Z
A,77,28
B,28,10


In [58]:
df.iloc[0:2,[1,3]]

Unnamed: 0,X,Z
A,77,28
B,28,10


In [57]:
df

Unnamed: 0,W,X,Y,Z,S,T
A,54,77,63,28,69,69
B,54,28,12,10,69,70
C,95,88,70,18,69,80
D,54,64,56,14,69,81
E,52,75,49,13,69,82


In [59]:
df.loc[['A', 'B'], ['W', 'X']]

Unnamed: 0,W,X
A,54,77
B,54,28


In [61]:
df.iloc[0:2,[0,1]]

Unnamed: 0,W,X
A,54,77
B,54,28


- Conditional Selection

In [65]:
newdf = df>50 #produces a df with values true and false accodring to the condition

In [63]:
newdf

Unnamed: 0,W,X,Y,Z,S,T
A,True,True,True,False,True,True
B,True,False,False,False,True,True
C,True,True,True,False,True,True
D,True,True,True,False,True,True
E,True,True,False,False,True,True


In [66]:
df[newdf] #this takes the true false values from the df and then displays the values corresponding to true

Unnamed: 0,W,X,Y,Z,S,T
A,54,77.0,63.0,,69,69
B,54,,,,69,70
C,95,88.0,70.0,,69,80
D,54,64.0,56.0,,69,81
E,52,75.0,,,69,82


In [67]:
df[df>35] #direct way to the above two cells

Unnamed: 0,W,X,Y,Z,S,T
A,54,77.0,63.0,,69,69
B,54,,,,69,70
C,95,88.0,70.0,,69,80
D,54,64.0,56.0,,69,81
E,52,75.0,49.0,,69,82


In [68]:
df[df['X'] > 35] #only applies to the provided columns

Unnamed: 0,W,X,Y,Z,S,T
A,54,77,63,28,69,69
C,95,88,70,18,69,80
D,54,64,56,14,69,81
E,52,75,49,13,69,82


In [77]:
df[df['X'] < 65] #display rows where in columnx X, values are less than 65

Unnamed: 0,W,X,Y,Z,S,T
B,54,28,12,10,69,70
D,54,64,56,14,69,81


In [81]:
df[df['X'] < 65][['Y', 'Z']]  #display rows of Y and Z where in columnx X,Y and Z values are less than 65

Unnamed: 0,Y,Z
B,12,10
D,56,14


In [80]:
df

Unnamed: 0,W,X,Y,Z,S,T
A,54,77,63,28,69,69
B,54,28,12,10,69,70
C,95,88,70,18,69,80
D,54,64,56,14,69,81
E,52,75,49,13,69,82


- Conditional selection (multiple)

In [82]:
df[(df['X']<65) & (df['Y']<40)]

Unnamed: 0,W,X,Y,Z,S,T
B,54,28,12,10,69,70


In [83]:
#we use & instead of and for series

In [84]:
df[(df['X']<65) | (df['Y']<40)]

Unnamed: 0,W,X,Y,Z,S,T
B,54,28,12,10,69,70
D,54,64,56,14,69,81


In [85]:
#we use | instead of or in series

- Resetting index

In [86]:
df.reset_index()

Unnamed: 0,index,W,X,Y,Z,S,T
0,A,54,77,63,28,69,69
1,B,54,28,12,10,69,70
2,C,95,88,70,18,69,80
3,D,54,64,56,14,69,81
4,E,52,75,49,13,69,82


In [87]:
#it just displays the data frame with the reset index and the older index in a seperate column and actually doesn't alter
#the data frame

In [88]:
#for permanent resetting of index, we use inplace = true

- Setting a new index

In [89]:
names = ['AA', 'BB', 'CC', 'DD', 'EE']
df['NewCol'] = names

In [90]:
df

Unnamed: 0,W,X,Y,Z,S,T,NewCol
A,54,77,63,28,69,69,AA
B,54,28,12,10,69,70,BB
C,95,88,70,18,69,80,CC
D,54,64,56,14,69,81,DD
E,52,75,49,13,69,82,EE


In [91]:
df.set_index('NewCol')

Unnamed: 0_level_0,W,X,Y,Z,S,T
NewCol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AA,54,77,63,28,69,69
BB,54,28,12,10,69,70
CC,95,88,70,18,69,80
DD,54,64,56,14,69,81
EE,52,75,49,13,69,82


### Treatment of Missing Values

In [92]:
df=pd.DataFrame({'A':[1,2,np.nan,4,7], 'B':[np.nan,3.5,4,5,8], 'C':[3,np.nan,2.5,8,9],
                'D':[4,np.nan,np.nan,np.nan,9], 'E':[9,8,6,6,4]})

In [93]:
df

Unnamed: 0,A,B,C,D,E
0,1.0,,3.0,4.0,9
1,2.0,3.5,,,8
2,,4.0,2.5,,6
3,4.0,5.0,8.0,,6
4,7.0,8.0,9.0,9.0,4


In [94]:
#np.nan stands for na values

In [95]:
df.dropna()

Unnamed: 0,A,B,C,D,E
4,7.0,8.0,9.0,9.0,4


In [96]:
#drops every row with missing values

In [97]:
#doesn't affect the original dataframe unless inplace = true

In [98]:
df.dropna(axis=1)

Unnamed: 0,E
0,9
1,8
2,6
3,6
4,4


In [99]:
#drops every column with even a single null value

In [100]:
df.dropna(thresh=4) #keep rows which has 4 values and drop every row which has less than 4 values or more than 1 null value

Unnamed: 0,A,B,C,D,E
0,1.0,,3.0,4.0,9
3,4.0,5.0,8.0,,6
4,7.0,8.0,9.0,9.0,4


In [102]:
df.dropna(thresh=4, axis=1) #keep rows which has 4 values and drop every column which has less than 4 values or more than 1 null value

Unnamed: 0,A,B,C,E
0,1.0,,3.0,9
1,2.0,3.5,,8
2,,4.0,2.5,6
3,4.0,5.0,8.0,6
4,7.0,8.0,9.0,4


#### Fill NA method

In [103]:
df.fillna(value = "My Value")

Unnamed: 0,A,B,C,D,E
0,1,My Value,3,4,9
1,2,3.5,My Value,My Value,8
2,My Value,4,2.5,My Value,6
3,4,5,8,My Value,6
4,7,8,9,9,4


In [104]:
#fills every null value with the value provided by you

In [105]:
df.fillna(value = 100)

Unnamed: 0,A,B,C,D,E
0,1.0,100.0,3.0,4.0,9
1,2.0,3.5,100.0,100.0,8
2,100.0,4.0,2.5,100.0,6
3,4.0,5.0,8.0,100.0,6
4,7.0,8.0,9.0,9.0,4


- Fillna using central values

In [106]:
df['A']

0    1.0
1    2.0
2    NaN
3    4.0
4    7.0
Name: A, dtype: float64

In [107]:
df['A'].fillna(value= df['A'].mean()) 

0    1.0
1    2.0
2    3.5
3    4.0
4    7.0
Name: A, dtype: float64

In [108]:
#we filled the missing values in the column A using the mean value of column A

In [111]:
#since we selected column A, rest columns wont be affected and even this is just for display

In [112]:
#unless we use incline = true, we cant permanently change the data frame

In [113]:
df['A'].fillna(value= df['A'].median()) 

0    1.0
1    2.0
2    3.0
3    4.0
4    7.0
Name: A, dtype: float64

In [114]:
df['A'].fillna(value= df['A'].mode()) 

0    1.0
1    2.0
2    4.0
3    4.0
4    7.0
Name: A, dtype: float64

- Fiilna using for loop

In [117]:
df.columns # this gives all column names

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [119]:
for x in df.columns:
    df[x]=df[x].fillna(value=df[x].mean())

In [120]:
df

Unnamed: 0,A,B,C,D,E
0,1.0,5.125,3.0,4.0,9
1,2.0,3.5,5.625,6.5,8
2,3.5,4.0,2.5,6.5,6
3,4.0,5.0,8.0,6.5,6
4,7.0,8.0,9.0,9.0,4


In [124]:
#this permanently changes the df everywhere where a null value is present

In [125]:
for x in df.columns:
    df[x]=df[x].fillna(value=df.mean())

In [126]:
df

Unnamed: 0,A,B,C,D,E
0,1.0,5.125,3.0,4.0,9
1,2.0,3.5,5.625,6.5,8
2,3.5,4.0,2.5,6.5,6
3,4.0,5.0,8.0,6.5,6
4,7.0,8.0,9.0,9.0,4
