# Data Cleaning And Preparation Part_1

## How To Handle Missing Values

In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.isnull?

In [None]:
pd.notnull?

In [None]:
pd.Series.fillna?

In [None]:
pd.Series.dropna?

In [None]:
ser_miss = pd.Series(['Python', 'Java', 'C', 'Ruby', np.nan])
ser_miss

In [None]:
ser_miss.isnull()

In [None]:
ser_miss[0] = None
ser_miss

In [None]:
ser_miss.isnull()

## Filtering the Missing Values

In [None]:
import pandas as pd
import numpy as np

In [None]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data

In [None]:
data.dropna()

In [None]:
data[data.notnull()]

In [None]:
# Small 4x4 frame with NaNs scattered through columns A-C (D is fully populated),
# written column by column so the missing cells of each column are easy to see.
data_df = pd.DataFrame(
    {
        'A': [np.nan, 3, np.nan, np.nan],
        'B': [2, 4, np.nan, 3],
        'C': [np.nan, 5, np.nan, np.nan],
        'D': [0, 1, 5, 4],
    }
)
data_df

In [None]:
data_df.dropna()

In [None]:
pd.DataFrame.dropna?

In [None]:
data_df.loc[2, 'D']= np.nan
data_df

In [None]:
data_df.dropna(how='all')

In [None]:
data_df['E'] = np.nan
data_df

In [None]:
data_df.dropna(how='all', axis=1)

In [None]:
print(data_df)
data_df.dropna(thresh=1)

In [None]:
print(data_df)
data_df.dropna(thresh=2)

In [None]:
print(data_df)
data_df.dropna(thresh=4)

## Filling The Missing Values

In [None]:
print(data_df)

In [None]:
data_df.fillna(50)

In [None]:
data_df.fillna({'A':20, 'B':50, 'C':40, 'D':60, 'E':70})

In [None]:
data_df.fillna({'A':20, 'B':50, 'C':40, 'D':60, 'E':70}, inplace=True)

In [None]:
data_rand = pd.DataFrame(np.random.randn(5, 4))
data_rand

In [None]:
data_rand.iloc[2:, 2]=np.nan
data_rand.iloc[:2, 3]=np.nan
data_rand

In [None]:
# Forward-fill: every NaN takes the last valid value above it in the same column;
# NaNs at the top of a column have nothing to copy from and stay NaN.
# `ffill()` is used instead of `fillna(method='ffill')`, whose `method=` keyword is deprecated.
data_rand.ffill()

In [None]:
# Backward-fill: every NaN takes the next valid value below it in the same column;
# NaNs at the bottom of a column have nothing to copy from and stay NaN.
# `bfill()` is used instead of `fillna(method='bfill')`, whose `method=` keyword is deprecated.
data_rand.bfill()

In [None]:
# Forward-fill, but propagate each valid value over at most 2 consecutive NaNs;
# any further NaNs in the same run are left untouched.
# `ffill(limit=2)` is used instead of the deprecated `fillna(method='ffill', limit=2)`.
data_rand.ffill(limit=2)

In [None]:
ser_miss = pd.Series([1, 2, np.nan, 4, np.nan])
ser_miss

In [None]:
ser_miss.fillna(ser_miss.mean())

# Data Cleaning And Preparation Part_2

## Data Transformation

#### How To Handle Duplicate Rows and Values?

In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.DataFrame.duplicated?

In [None]:
pd.DataFrame.drop_duplicates?

In [None]:
# Eight (key1, key2) pairs; the row at position 6 repeats the row at position 5,
# which gives duplicated()/drop_duplicates() something to find.
dup = pd.DataFrame(
    dict(
        key1=['one', 'three', 'one', 'three', 'one', 'three', 'three', 'two'],
        key2=[1, 1, 2, 3, 3, 4, 4, 5],
    )
)
dup

In [None]:
dup.duplicated()

In [None]:
dup.drop_duplicates()

In [None]:
dup['key3'] = np.arange(8)
dup

In [None]:
dup.drop_duplicates(['key2'])

In [None]:
dup.drop_duplicates(['key3'])

In [None]:
dup.drop_duplicates(['key1', 'key2'], keep='last')

## Data Transformation

#### How To Transform Data Using Functions and/or Mapping?

In [None]:
pd.Series.map?

In [None]:
# Names deliberately mix upper- and lower-case spellings ('vali' / 'Vali') so the
# mapping examples have to normalise case before looking a name up.
data_tr = pd.DataFrame(
    dict(
        Names=['Raja', 'vali', 'Salu', 'Balu', 'Vali', 'mali'],
        Score=[4, 3, 2, 6, 5, 1],
    )
)
data_tr

In [None]:
match_data_tr = {'raja':'Yellow', 'vali':'Red', 'salu':'Green', 'balu':'Green', 'mali':'Dark'}
match_data_tr

In [None]:
lower_str = data_tr['Names'].str.lower()
lower_str

In [None]:
data_tr['Color'] = lower_str.map(match_data_tr)
data_tr

In [None]:
data_tr['Color'] = data_tr['Names'].map(lambda x: match_data_tr[x.lower()])
data_tr

## Data Transformation

#### How To Replace Values?

In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.DataFrame.replace?

In [None]:
ser_data = pd.Series([1., -9., 2., -9., -1., 3.])
ser_data

In [None]:
ser_data.replace(-9, np.nan)

In [None]:
ser_data.replace([-9, -1], np.nan)

In [None]:
ser_data.replace([-9, -1], [100, 500])

In [None]:
ser_data.replace({-9:50, -1:60})

## Data Transformation

#### How To Rename Axis Indexes?

In [None]:
import pandas as pd
import numpy as np

In [None]:
# 3x4 table holding the integers 0..11, with fruit names as row labels and
# number words as column labels.
data_tor = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    columns=['one', 'two', 'three', 'four'],
    index=['Apple', 'Banana', 'Grapes'],
)
data_tor

In [None]:
def upper(x):
    """Return the first five characters of ``x`` converted to upper case."""
    return x[:5].upper()


# Preview the transformed labels; map() returns a new Index and leaves
# data_tor.index itself unchanged.
data_tor.index.map(upper)

In [None]:
data_tor.index = data_tor.index.map(upper)

In [None]:
data_tor

In [None]:
pd.DataFrame.rename?

In [None]:
data_tor.rename(index=str.lower, columns=str.title)

In [None]:
data_tor.rename(index={'APPLE': 'Appale'}, columns={'two':2})

## Data Transformation

#### How To Discretize and/or Bin The Data?

In [None]:
import pandas as pd
import numpy as np

In [None]:
 pd.cut?

In [None]:
 pd.qcut?

In [None]:
s_m = [21, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
s_m

In [None]:
bin_size = [20, 35, 45, 60, 100]

In [None]:
binned = pd.cut(s_m, bin_size)

In [None]:
print(s_m)
binned

In [None]:
binned.codes

In [None]:
binned.categories

In [None]:
binned.value_counts()

In [None]:
binned = pd.cut(s_m, bin_size, right=False)
print(binned.categories)
print(binned.value_counts())

In [None]:
rebinned = pd.cut(s_m, bin_size, labels=['Fail', 'Pass', 'SC', 'FCorFCD'], right=False)

In [None]:
print(rebinned.categories)
print(rebinned.value_counts())

In [None]:
rd = np.random.rand(10)
rd

In [None]:
pd.cut(rd, 4, precision=1)

In [None]:
ran_data = np.random.randn(1000) 
ran_data[:20]

In [None]:
quantiles_bins = pd.qcut(ran_data, 6)

In [None]:
quantiles_bins

In [None]:
quantiles_bins.value_counts()

## Data Transformation

#### How To Detect and Filter Outliers?

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Four 10-element integer columns; each column starts 5 higher than the previous
# one (A: 0-9, B: 5-14, C: 10-19, D: 15-24).
ug_data = pd.DataFrame({
    label: pd.Series(np.arange(first, first + 10))
    for label, first in zip('ABCD', range(0, 20, 5))
})
ug_data

In [None]:
ug_data.describe()

In [None]:
col = ug_data['A']
col[np.abs(col) > 4]

In [None]:
# Keep only the rows in which at least one column has an absolute value above 20.
# `axis` is passed by keyword: DataFrame.any() no longer accepts it positionally
# in pandas 2.0+, so the old `.any(1)` spelling raises TypeError there.
ug_data[(np.abs(ug_data) > 20).any(axis=1)]

## Data Transformation

#### How To Reorder and Select Randomly?

In [None]:
import pandas as pd
import numpy as np

In [None]:
np.random.permutation?

In [None]:
ran_df = pd.DataFrame(np.arange(20).reshape((5, 4)))
ran_df

In [None]:
sampler = np.random.permutation(5)
sampler

In [None]:
pd.DataFrame.take?

In [None]:
ran_df.take(sampler)

In [None]:
pd.DataFrame.sample?

In [None]:
ran_df.sample(n=2)

In [None]:
rep = ran_df.sample(n=10, replace=True)
rep

## Data Transformation

#### How To Compute Indicator/Dummy Variables?

In [None]:
import pandas as pd
import numpy as np

In [None]:
pd.get_dummies?

In [None]:
data_dummies = pd.DataFrame({'key': ['b', 'a', 'a', 'c', 'a'], 'data1': range(5)})
data_dummies

In [None]:
pd.get_dummies(data_dummies['key'])

In [None]:
pd.get_dummies(data_dummies['key'], prefix='Key')

In [None]:
data_dummies_df = data_dummies[['data1']].join(pd.get_dummies(data_dummies['key'], prefix='Key'))
data_dummies_df

In [None]:
np.random.seed(42)
v = np.random.rand(10)
v

In [None]:
bins = [0.1, 0.3, 0.5, 0.7, 1.0]

In [None]:
pd.get_dummies(pd.cut(v, bins))

# Data Cleaning and Preparation Part_3

## How To Manipulate Strings?

In [None]:
Python_sentence = 'python,Is, a programming, Language'
Python_sentence

In [None]:
str.split?

In [None]:
Python_sentence.split(sep=',')

In [None]:
cs = [x.strip() for x in Python_sentence.split(',')]
cs

In [None]:
str.join?

In [None]:
':'.join(cs)

In [None]:
one, two, three, four = cs
one

In [None]:
one + ':::' + two + '...>' + three + '#1' + four

In [None]:
'python' in cs

In [None]:
str.index?

In [None]:
str.find?

In [None]:
print(Python_sentence)
Python_sentence.index(',')

In [None]:
# str.index raises ValueError when the substring is absent (str.find returns -1
# instead). The error is caught and printed so the demonstration still shows it
# without aborting a "Run All" of the notebook.
try:
    Python_sentence.index(';')
except ValueError as err:
    print('ValueError:', err)

In [None]:
Python_sentence.find(',')

In [None]:
Python_sentence.find(';')

In [None]:
Python_sentence.count(',')

In [None]:
Python_sentence.replace(',', ':')

In [None]:
Python_sentence.replace(',', '')

## How To Use Regular Expressions?

In [None]:
import re

In [None]:
text = "python Is \ta programming\t Language"
text

In [None]:
re.split?

In [None]:
# Split on runs of one or more whitespace characters (spaces and tabs here).
# The pattern is a raw string: '\s' inside a normal literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
re.split(r'\s+', text)

In [None]:
# Pre-compile the "one or more whitespace characters" pattern for reuse.
# Written as a raw string so the backslash reaches the regex engine unchanged and
# Python does not warn about the invalid '\s' escape sequence.
regex = re.compile(r'\s+')

In [None]:
regex.split(text)

In [None]:
re.findall?

In [None]:
regex.findall(text)

In [None]:
# Note: To avoid unwanted escaping with \ in a regular expression, use raw string literals like r'E:\x' instead of the equivalent 'E:\\x'

In [None]:
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

In [None]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [None]:
text = """Dave dave@google.com 
Steve steve@gmail.com 
Rob rob@gmail.com 
Ryan ryan@yahoo.com"""
text

In [None]:
regex.findall(text)

In [None]:
regex.search?

In [None]:
regex.search(text)

In [None]:
regex.match?

In [None]:
print(regex.match(text))

In [None]:
regex.sub?

In [None]:
print(regex.sub('Python', text))

In [None]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [None]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [None]:
m = regex.match('Paru@hotmail.com')
m

In [None]:
m.groups()

In [None]:
regex.findall(text)

In [None]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

## How To Work With The Vectorized String Functions in Pandas?

In [None]:
import pandas as pd

In [None]:
maild = {'Pruthvi': 'pruthvi@google.com', 'Stella': 'stella@gmail.com', 'Roby': 'roby@gmail.com', 'Navar': np.nan}
maild

In [None]:
maild_s = pd.Series(maild)
maild_s

In [None]:
maild_s.isnull()

In [None]:
maild_s.str.contains('gmail')

In [None]:
pattern

In [None]:
pd.Series.str.findall?

In [None]:
maild_s.str.findall(pattern, flags=re.IGNORECASE)

In [None]:
matching = maild_s.str.match(pattern, flags=re.IGNORECASE)
matching

In [None]:
pd.Series.str.get?

In [None]:
print(maild_s)
maild_s.str.get(2)

In [None]:
maild_s.str[:4]