<b>This file will guide you:
<li>How to load data into pandas dataframe
<li>Handling missing values
<li>Data Cleaning
<li>handling Duplicates
</b>

In [1]:
import pandas as pd
import numpy as np

# Read the dataset

In [2]:
df = pd.read_csv('pandas_ex1.csv')

In [3]:
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


# Change the column name in the dataframe while reading from the file

In [4]:
# overriding the column names
df= pd.read_csv('pandas_ex1.csv', 
             names=['Col1', 'Col2', 'Col3', 'Col4', 'message'])
df

Unnamed: 0,Col1,Col2,Col3,Col4,message
0,a,b,c,d,message
1,1,2,3,4,hello
2,5,6,7,8,world
3,9,10,11,12,foo


# Donot consider first line as header

In [5]:
# No headers
df= pd.read_csv('pandas_ex1.csv', 
             header= None)
df

Unnamed: 0,0,1,2,3,4
0,a,b,c,d,message
1,1,2,3,4,hello
2,5,6,7,8,world
3,9,10,11,12,foo


# Setting index while reading

In [6]:
# Suppose you wanted the message column to be the index
names = ['Col1', 'Col2', 'Col3', 'Col4', 'message']
df= pd.read_csv('pandas_ex1.csv', 
                 names= names, index_col= 'message')
df

Unnamed: 0_level_0,Col1,Col2,Col3,Col4
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
message,a,b,c,d
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


# Handling missing values

<b>
<li>Missing data is usually either not present (empty string) or marked by some sentinel value.
<li>By default, pandas uses a set of commonly occurring sentinels, such as NA and NULL
</b>

In [7]:
df = pd.read_csv('pandas_ex2.csv')
df

Unnamed: 0,something,a,b,c,d,message
0,one,1.0,2.0,3.0,4,
1,two,5.0,6.0,,8,world
2,three,9.0,10.0,11.0,12,foo
3,four,,,4.0,Globe,
4,five,,,,,Earth


<b>Check if the value is null or not at every location in the whole dataframe </b>

In [8]:
pd.isnull(df)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False
3,False,True,True,False,False,True
4,False,True,True,True,True,False


<b>Check if the value is null or not in a particular column  </b>

In [9]:
pd.isnull(df['a'])

0    False
1    False
2    False
3     True
4     True
Name: a, dtype: bool

<b>Setting your own defination of NULL values along with pandas defaults</b>

In [10]:
# Different NA sentinels can be specified for each column in a dict:
sentinels = {'message': ['foo', 'NA'], 'something': ['two', 'three']}
df = pd.read_csv('pandas_ex3.csv', 
                 nrows=7)
df

Unnamed: 0,something,a,b,c,d,message
0,one,1.0,2.0,3.0,4,
1,two,5.0,6.0,,8,world
2,three,9.0,10.0,11.0,12,foo
3,four,,,4.0,Globe,
4,five,,,,,Earth


<b>Using na_values to tell which values can represent NULL</b>

In [11]:
# Different NA sentinels can be specified for each column in a dict:
sentinels = {'message': ['foo', 'NA'], 'something': ['two', 'three']}
df = pd.read_csv('pandas_ex3.csv', 
                 na_values=sentinels, nrows=7)
df

Unnamed: 0,something,a,b,c,d,message
0,one,1.0,2.0,3.0,4,
1,,5.0,6.0,,8,world
2,,9.0,10.0,11.0,12,
3,four,,,4.0,Globe,
4,five,,,,,Earth


# Data Cleaning and Preparation

In [12]:
from numpy import nan as NA
df = pd.DataFrame([[1., 6.5, 3.], 
                     [1.,NA ,NA ],
                     [NA,NA ,NA ], 
                     [NA, 6.5, 3.]])
df

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


<b> dropna by default drops any row containing a missing value </b>

In [13]:
df.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


<b> Passing <u>how='all'</u> will only drop rows that are all NA</b>

In [14]:
df.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [15]:
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,0.767005,-1.138071,-1.772745
1,0.905906,-0.819671,0.617198
2,-0.380238,-0.636597,-1.463878
3,-0.544569,1.669116,0.042881
4,2.156335,1.047597,-0.631609
5,1.846102,-0.342013,0.226454
6,-0.165501,1.474794,0.971966


<b>Set some values to NULL</b>

In [16]:
# upto first 3 rows and col  = 1
df.iloc[:4, 1] = NA
df

Unnamed: 0,0,1,2
0,0.767005,,-1.772745
1,0.905906,,0.617198
2,-0.380238,,-1.463878
3,-0.544569,,0.042881
4,2.156335,1.047597,-0.631609
5,1.846102,-0.342013,0.226454
6,-0.165501,1.474794,0.971966


In [17]:
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,0.767005,,
1,0.905906,,
2,-0.380238,,-1.463878
3,-0.544569,,0.042881
4,2.156335,1.047597,-0.631609
5,1.846102,-0.342013,0.226454
6,-0.165501,1.474794,0.971966


In [18]:
df.dropna()

Unnamed: 0,0,1,2
4,2.156335,1.047597,-0.631609
5,1.846102,-0.342013,0.226454
6,-0.165501,1.474794,0.971966


# Filling In Missing Data

In [19]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.767005,0.0,0.0
1,0.905906,0.0,0.0
2,-0.380238,0.0,-1.463878
3,-0.544569,0.0,0.042881
4,2.156335,1.047597,-0.631609
5,1.846102,-0.342013,0.226454
6,-0.165501,1.474794,0.971966


<b>fillna returns a new object, but you can modify the existing object in-place</b>

In [20]:
df

Unnamed: 0,0,1,2
0,0.767005,,
1,0.905906,,
2,-0.380238,,-1.463878
3,-0.544569,,0.042881
4,2.156335,1.047597,-0.631609
5,1.846102,-0.342013,0.226454
6,-0.165501,1.474794,0.971966


In [21]:
df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,0.767005,0.0,0.0
1,0.905906,0.0,0.0
2,-0.380238,0.0,-1.463878
3,-0.544569,0.0,0.042881
4,2.156335,1.047597,-0.631609
5,1.846102,-0.342013,0.226454
6,-0.165501,1.474794,0.971966


# Interpolation methods

In [22]:
df = pd.DataFrame(np.random.randn(6, 3))
df

Unnamed: 0,0,1,2
0,1.417462,0.270978,-0.505738
1,0.500109,1.495455,1.074341
2,0.366289,-0.644213,-0.09948
3,0.625595,-1.808797,-0.654506
4,-0.299085,-1.02113,0.764471
5,-0.263008,0.44369,0.507892


In [23]:
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,1.417462,0.270978,-0.505738
1,0.500109,1.495455,1.074341
2,0.366289,,-0.09948
3,0.625595,,-0.654506
4,-0.299085,,
5,-0.263008,,


In [24]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,1.417462,0.270978,-0.505738
1,0.500109,1.495455,1.074341
2,0.366289,1.495455,-0.09948
3,0.625595,1.495455,-0.654506
4,-0.299085,1.495455,-0.654506
5,-0.263008,1.495455,-0.654506


# Removing Duplicates

In [25]:
df = pd.DataFrame([[1,3],
                  [3,4],
                  [1,3],
                  [5,7],
                  [6,9],
                  [5,7],
                  [1,3]])
                
df

Unnamed: 0,0,1
0,1,3
1,3,4
2,1,3
3,5,7
4,6,9
5,5,7
6,1,3


In [26]:
df.duplicated()

0    False
1    False
2     True
3    False
4    False
5     True
6     True
dtype: bool

<b>drop_duplicates returns a DataFrame where the duplicated array is False</b>

In [27]:
df.drop_duplicates()

Unnamed: 0,0,1
0,1,3
1,3,4
3,5,7
4,6,9


<b>
<li>duplicated and drop_duplicates by default keep the first observed value combination.
<li>Passing keep='last' will return the last one

</b>

In [28]:
df.drop_duplicates(keep='last')

Unnamed: 0,0,1
1,3,4
4,6,9
5,5,7
6,1,3


# Thank You.Happy Coding !!!