## DATAFRAME

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.DataFrame(np.arange(20,0,-1).reshape(5,-1),
                  index= ['a','b','c','d','e'], columns = ['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
a,20,19,18,17
b,16,15,14,13
c,12,11,10,9
d,8,7,6,5
e,4,3,2,1


#### Replacing values in DataFrame

Syntax-

#### df.replace( toreplace = None, value = None, inplace = False, method = 'pad')

toreplace - str, list, dict, Series, int, float, None, regex

value - scalar, dict, list, str, regex, default = None

method - {'pad','ffill','bfill',None}

In [None]:
df.replace(15,5, inplace = True) 
# We haven't done inplace = True, so it will be temporary change till this particular command

In [None]:
df

Unnamed: 0,A,B,C,D
a,20,19,18,17
b,16,5,14,13
c,12,11,10,9
d,8,7,6,5
e,4,3,2,1


In [None]:
df.replace(5, 'Five', inplace = True) 

In [None]:
df['A']=100

In [None]:
df

Unnamed: 0,A,B,C,D
a,100,19,18,17
b,100,Five,14,13
c,100,11,10,9
d,100,7,6,Five
e,100,3,2,1


In [None]:
# List 'toreplace' & Scalar 'value'
df.replace([1,2,3],4)

Unnamed: 0,A,B,C,D
a,100,19,18,17
b,100,Five,14,13
c,100,11,10,9
d,100,7,6,Five
e,100,4,4,4


In [None]:
df['B'].replace('Five',500,inplace=True)
df

Unnamed: 0,A,B,C,D
a,100,19,18,17
b,100,500,14,13
c,100,11,10,9
d,100,7,6,Five
e,100,3,2,1


In [None]:
# List 'toreplace' & 'value'
df.replace([1,2,3,4,5],[11,12,13,14,55])

Unnamed: 0,A,B,C,D
a,20,19,18,17
b,16,Five,14,13
c,12,11,10,9
d,8,7,6,Five
e,14,13,12,11


In [None]:
df.iloc[1,1]=np.NaN
df.iloc[1,2]=np.NaN
df

Unnamed: 0,A,B,C,D
a,100,19.0,18.0,17
b,100,,,13
c,100,11.0,10.0,9
d,100,7.0,6.0,9
e,100,3.0,2.0,1


In [None]:
# fillna - use this method to treat nan data
df['B'].fillna(method='bfill',inplace= True)  # fillna(mean_value) , fillna(median), fillna(mode), fillna(custom)
# df['C'].fillna(method='ffill',inplace=True)
df

Unnamed: 0,A,B,C,D
a,100,19.0,18.0,17
b,100,11.0,,13
c,100,11.0,10.0,9
d,100,7.0,6.0,9
e,100,3.0,2.0,1


In [None]:
# Method - Backward fill (method = 'bfill')
print(df)
df.replace([17,2,7,14], method = 'bfill') # Value will be replaced with value present just below the element 

    A   B   C   D
a  20  19  18  17
b  16   5  14  13
c  12  11  10   9
d   8   7   6   5
e   4   3   2   1


Unnamed: 0,A,B,C,D
a,20,19,18,13
b,16,5,10,13
c,12,11,10,9
d,8,3,6,5
e,4,3,2,1


In [None]:
df['D'] = df['D'].replace('Five', method ='ffill')
df

Unnamed: 0,A,B,C,D
a,100,19.0,18.0,17
b,100,11.0,18.0,13
c,100,11.0,10.0,9
d,100,7.0,6.0,9
e,100,3.0,2.0,1


In [None]:
# Method - Foward fill (method = 'ffill')
print(df)
df.replace([5,2,7,14,19], method = 'ffill') # above values are taken for replacement

    A     B   C     D
a  20    19  18    17
b  16  Five  14    13
c  12    11  10     9
d   8     7   6  Five
e   4     3   2     1


Unnamed: 0,A,B,C,D
a,20,19,18,17
b,16,Five,18,13
c,12,11,10,9
d,8,11,6,Five
e,4,3,6,1


In [None]:
# Dict type 'toreplace'

print(df)
df.replace({1:'One',2:'Two',11:'Eleven',20:'Twenty',17 : 170})

    A   B   C   D
a  20  19  18  17
b  16  15  14  13
c  12  11  10   9
d   8   7   6   5
e   4   3   2   1


Unnamed: 0,A,B,C,D
a,Twenty,19,18,170
b,16,15,14,13
c,12,Eleven,10,9
d,8,7,6,5
e,4,3,Two,One


In [None]:
# Dict type 'toreplace', Scalar 'values'
# Replace values specific column wise
# Replacing in only columns A & B

print(df)
df.replace( {'D':['Five',13],'B':15},'NEW' )

    A     B   C     D
a  20    19  18    17
b  16  Five  14    13
c  12    11  10     9
d   8     7   6  Five
e   4     3   2     1


Unnamed: 0,A,B,C,D
a,20,19,18,17
b,16,Five,14,NEW
c,12,11,10,9
d,8,7,6,NEW
e,4,3,2,1


In [None]:
df[['A','C']].replace({20:'sdfdsgds',8:'eewre',18:156156313, 9:'0000000'})

Unnamed: 0,A,C
a,sdfdsgds,156156313
b,16,14
c,12,10
d,eewre,6
e,4,2


In [None]:
# Dict type 'toreplace', 'values'
# Replace values specific column wise
print(df)
df.replace({'A': {0:100,4:400,8:800},
           'B':{7:'Seven',3:'Three'}})

df.A.replace(20,21)

    A   B   C   D
a  20  19  18  17
b  16  15  14  13
c  12  11  10   9
d   8   7   6   5
e   4   3   2   1


Unnamed: 0,A,B,C,D
a,20,19,18,17
b,16,15,14,13
c,12,11,10,9
d,800,Seven,6,5
e,400,Three,2,1


#### NaN values are represented/specified with numpy as - np.nan

#### Check NaN values (Gives True/False)  -    .isnull() /   .isna()

In [None]:
# Replacing values with Nan
df5 = df.replace([20,15,8,1],np.nan)  #  nan values in python not to be given as - Nan, 'nan' , always use np.nan
print(df5)


# # check null value
df5.isna().sum()  # check Null Values - if any - will give True/False
df5.isnull().sum()
df5[['A','B']].isna().sum()

     A     B     C     D
a  100  19.0  18.0  17.0
b  100  11.0   NaN  13.0
c  100  11.0  10.0   9.0
d  100   7.0   6.0   9.0
e  100   3.0   2.0   NaN


A    0
B    0
C    1
D    1
dtype: int64

#### View the dataframe in form of array

In form of arrays , we can easily perform faster operations

In [None]:
df

Unnamed: 0,A,B,C,D
a,20,19,18,17
b,16,Five,14,13
c,12,11,10,9
d,8,7,6,Five
e,4,3,2,1


In [None]:
# View the dataframe in form of array
df.values

array([[20, 19, 18, 17],
       [16, 'Five', 14, 13],
       [12, 11, 10, 9],
       [8, 7, 6, 'Five'],
       [4, 3, 2, 1]], dtype=object)

#### Checking Dimension of DataFrame

In [None]:
# Getting Dimensions
df.ndim

2

##### DataFrame columns are in form of series

In [None]:
df['B']
df[['C','A','D']]

Unnamed: 0,C,A,D
a,18,20,17
b,14,16,13
c,10,12,9
d,6,8,5
e,2,4,1


In [None]:
df.B

a    19
b    15
c    11
d     7
e     3
Name: B, dtype: int32

In [None]:
# Checking datatype (of column or anything)
type(df['A'])

pandas.core.series.Series


#### Creating a new column

In [None]:
df['A'] / df['C'] # they both should have same length - df1 has 10 values , df2 should also have 10 values

a    1.111111
b    1.142857
c    1.200000
d    1.333333
e    2.000000
dtype: float64

In [None]:
len(df)

5

In [None]:
np.arange(5)

array([0, 1, 2, 3, 4])

In [None]:
# Creating new column by performing operations on columns
df['new_col'] = df['A'] / df['C']
df['rounded values'] = df['new_col'].round(2)
df['Arange_values'] = np.arange(5)
df

Unnamed: 0,A,B,C,D,new col,new_col,rounded values,Arange_values
a,20,19,18,17,1.111111,1.111111,1.11,0
b,16,Five,14,13,1.142857,1.142857,1.14,1
c,12,11,10,9,1.2,1.2,1.2,2
d,8,7,6,Five,1.333333,1.333333,1.33,3
e,4,3,2,1,2.0,2.0,2.0,4


In [None]:
df['NCC'] = df['new_col'].astype(int)
df

Unnamed: 0,A,B,C,D,new col,new_col,rounded values,Arange_values,NCC
a,20,19,18,17,1.111111,1.111111,1.11,0,1
b,16,Five,14,13,1.142857,1.142857,1.14,1,1
c,12,11,10,9,1.2,1.2,1.2,2,1
d,8,7,6,Five,1.333333,1.333333,1.33,3,1
e,4,3,2,1,2.0,2.0,2.0,4,2


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, a to e
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       5 non-null      int32  
 1   B       5 non-null      int32  
 2   C       5 non-null      int32  
 3   D       5 non-null      float64
 4   E       5 non-null      float64
 5   F       5 non-null      float64
 6   G       5 non-null      int32  
dtypes: float64(3), int32(4)
memory usage: 400.0+ bytes


In [None]:
df

Unnamed: 0,A,B,C,D,new col,new_col,rounded values,Arange_values,NCC
a,20,19,18,17,1.111111,1.111111,1.11,0,1
b,16,Five,14,13,1.142857,1.142857,1.14,1,1
c,12,11,10,9,1.2,1.2,1.2,2,1
d,8,7,6,Five,1.333333,1.333333,1.33,3,1
e,4,3,2,1,2.0,2.0,2.0,4,2


#### Removing a column

In [None]:
# print(df)
# Here 1 represents column axis = (0-Row, 1-Column)
# inplace = True : For permanent change


df.drop(['A','D'],axis = 1, inplace=True) # removing columns
df

Unnamed: 0,B,C,new col,new_col,rounded values,Arange_values,NCC
a,19,18,1.111111,1.111111,1.11,0,1
b,Five,14,1.142857,1.142857,1.14,1,1
c,11,10,1.2,1.2,1.2,2,1
d,7,6,1.333333,1.333333,1.33,3,1
e,3,2,2.0,2.0,2.0,4,2


#### Removing row

In [11]:
df = pd.DataFrame(np.arange(20,0,-1).reshape(5,-1),
                  index= ['a','b','c','d','e'], columns = ['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
a,20,19,18,17
b,16,15,14,13
c,12,11,10,9
d,8,7,6,5
e,4,3,2,1


In [16]:
# axis=0   ---> Rows
# axis=1    ---> columns  
# (row, column)

# dropping a column
# df.drop(['A','D'], axis=1) #  inplace = True


# By default axis = 0, i.e A row with index 'e' will be removed

#df.drop('e')  # Change not permanent as inplace = False by default

#df.drop(['b','a','d'], axis=0)

Unnamed: 0,A,B,C,D
c,12,11,10,9
e,4,3,2,1
