## Data Cleaning Techniques

In [3]:
import pandas as pd
import numpy as np

In [6]:
# Median-based anomaly detection


def median_outliers(values, threshold):
    """Return the items of `values` lying more than `threshold` from the median.

    Parameters
    ----------
    values : iterable of numbers (list, pandas Series, ...)
    threshold : maximum allowed absolute deviation from the median

    Returns
    -------
    list of the offending values, in iteration order.
    """
    center = np.median(values)
    return [v for v in values if abs(v - center) > threshold]


# NOTE(review): `2, 3` are two separate samples (six values in total), and the
# recorded outputs of the following cells depend on that -- confirm that a
# single 2.3 was not intended.
x = pd.Series([2.1, 2, 3, 4.5, 2.2, 2.4]).sort_values()

median = np.median(x)
print(median)
threshold = 2  # maximum allowed absolute deviation from the median
# Deviation is measured from the median, not from the threshold itself.
outliers = median_outliers(x, threshold)
print(outliers)

2.3
[4.5]


In [16]:
# Mean-based anomaly detection: a value further than one standard deviation
# (as returned by Series.std) from the mean is reported as an outlier.

mean = x.mean()
print(mean)
std = x.std()
print(std)
# Vectorised filter; .tolist() yields plain Python floats for printing.
outliers_std = x[(mean - x).abs() > std].tolist()
print(outliers_std)

2.6999999999999997
0.950789145920377
[4.5]


In [18]:
# Z-score based anomaly detection
# Uses the same sample `x` together with the `mean` and `std` already
# computed for it; a value is flagged when |value - mean| / std exceeds 1.7.

outliers_zscore = x[((x - mean).abs() / std) > 1.7].tolist()
outliers_zscore


[4.5]

In [21]:
# Interquartile Range (IQR) for anomaly detection, on the same sample `x`:
# values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR are reported as outliers.
Q1, Q3 = np.percentile(x, q=[25, 75])
print(Q1, Q3)
IQR = Q3 - Q1
outliers_iqr = x[(x < Q1 - 1.5 * IQR) | (x > Q3 + 1.5 * IQR)].tolist()
print(outliers_iqr)

2.125 2.85
[4.5]


In [22]:
# Missing values
# `NaN` is kept as a plain alias of np.nan so cells that reference the name
# keep working; `from numpy import NaN` fails on NumPy 2.0+, where that
# alias was removed.
NaN = np.nan
data_1 = {'Name': ['Edison', 'Edward', 'James', 'Neesham'],
          'Age': [28, 27, NaN, 36]}  # the third entry (James) has no age

In [23]:
# Turn the column dict into a DataFrame (one column per key) and display it.
data = pd.DataFrame.from_dict(data_1)
data

Unnamed: 0,Name,Age
0,Edison,28.0
1,Edward,27.0
2,James,
3,Neesham,36.0


In [24]:
# Boolean mask of the same shape as `data`: True where a cell is missing.
data.isna()

Unnamed: 0,Name,Age
0,False,False
1,False,False
2,False,True
3,False,False


In [26]:
# Count the missing cells per column (True sums as 1).
data.isna().sum()

Name    0
Age     1
dtype: int64

In [27]:
# Drop every row that contains at least one missing value.
# Reassigning instead of using inplace=True avoids mutating the frame object
# that earlier cells displayed, and keeps the cell safe to re-run.
data = data.dropna()
data

Unnamed: 0,Name,Age
0,Edison,28.0
1,Edward,27.0
3,Neesham,36.0


In [42]:
# fillna: impute the missing age with the mean of the known ages.
data = pd.DataFrame.from_dict(data_1)
# Restrict the fill to the 'Age' column: a frame-wide fillna would also write
# the age mean into missing cells of any other column (e.g. 'Name').
data = data.fillna({'Age': data['Age'].mean()})
data

Unnamed: 0,Name,Age
0,Edison,28.0
1,Edward,27.0
2,James,30.333333
3,Neesham,36.0


In [44]:
# Second example frame: five people, one of them with a missing age.
# np.nan is used directly so this cell does not depend on a `NaN` alias
# imported elsewhere (that alias no longer exists in NumPy 2.0+).
data_2 = {'Name': ['Edison', 'Edward', 'James', 'Neesham', 'Stuart'],
          'Age': [28, 27, np.nan, 36, 27]}
new_data = pd.DataFrame(data_2)
new_data

Unnamed: 0,Name,Age
0,Edison,28.0
1,Edward,27.0
2,James,
3,Neesham,36.0
4,Stuart,27.0


In [48]:
# Impute the missing age with the most frequent age.
mode = new_data['Age'].mode()  # Series of the modal value(s)
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# the selected column is a chained in-place update: it emits a warning on
# recent pandas and leaves `new_data` untouched under Copy-on-Write.
new_data['Age'] = new_data['Age'].fillna(mode.iloc[0])
new_data


Unnamed: 0,Name,Age
0,Edison,28.0
1,Edward,27.0
2,James,27.0
3,Neesham,36.0
4,Stuart,27.0


In [49]:
# Regular Expressions
import re

In [53]:
# findall returns every non-overlapping match of the pattern, left to right.
# NOTE: this rebinds `x`, which held the numeric Series in the outlier cells.
txt = "Python is my favorite programming language. I love Python."
x = re.compile('Python').findall(txt)
x

['Python', 'Python']

In [54]:
# ^ (Begins with)
# The anchor only matches at the start of the string; this sentence starts
# with "I", so the result is an empty list.
sentence = "I love Python."
y = re.compile('^Python').findall(sentence)
y

[]

In [56]:
text = "Python was released in 1991."
# Raw strings pass the backslash through to the regex engine unchanged; in a
# plain string literal, backslash-d is an invalid escape sequence and triggers
# a warning on current Python versions.
digits = re.findall(r'\d', text)    # one match per single digit
numbers = re.findall(r'\d+', text)  # one match per run of consecutive digits
print(digits)
print(numbers)


['1', '9', '9', '1']
['1991']


In [57]:
# A small Series of country names; 'Pakistan' appears twice on purpose.
textList = [
    'Pakistan',
    'Indonesia',
    'Jordan',
    'Pakistan',
]
text_pd = pd.Series(data=textList)
text_pd

0     Pakistan
1    Indonesia
2       Jordan
3     Pakistan
dtype: object

In [58]:
# Search the Series' printed text form (index labels included) for the word.
re.compile('Pakistan').findall(text_pd.to_string())

['Pakistan', 'Pakistan']

In [59]:
# search
# Returns a Match object for the first occurrence of the pattern, or None
# when the pattern does not occur in the string.
hello_text = "Hello World!"
match_object = re.compile('World').search(hello_text)
match_object

<re.Match object; span=(6, 11), match='World'>

In [60]:
# (start, end) offsets of the match within the searched string.
(match_object.start(), match_object.end())

(6, 11)

In [61]:
# sub
# Replaces every occurrence of the pattern 'C' in the text with 'Python'.
org_text = "C is my favorite programming language."
re.sub('C', 'Python', org_text)

'Python is my favorite programming language.'

## Feature Scaling

In [63]:
# Normalization
# min-max scaling: df = (df - df.min())/(df.max() - df.min())
# Example frame with two numeric columns on very different scales.
age_salary = {
    'Age': [28, 27, 30, 36, 27],
    'Salary ($)': [10000, 15000, 11000, 11000, 13000],
}
df = pd.DataFrame(data=age_salary)
df

Unnamed: 0,Age,Salary ($)
0,28,10000
1,27,15000
2,30,11000
3,36,11000
4,27,13000


In [65]:
# Min-max scaling, column by column: each column's minimum maps to 0 and its
# maximum to 1.
df_norm = df.sub(df.min()).div(df.max() - df.min())
df_norm

Unnamed: 0,Age,Salary ($)
0,0.111111,0.0
1,0.0,1.0
2,0.333333,0.2
3,1.0,0.2
4,0.0,0.6


In [67]:
# Standardization
# std_value = (original - mean) / std
# Applied per column: subtract the column mean, then divide by the column's
# standard deviation.

df_std = df.sub(df.mean()).div(df.std())
df_std

Unnamed: 0,Age,Salary ($)
0,-0.423109,-1.0
1,-0.687552,1.5
2,0.105777,-0.5
3,1.692435,-0.5
4,-0.687552,0.5


In [68]:
# Sanity check: after standardization every column's standard deviation is 1.
# ddof=1 is pandas' default, spelled out here for clarity.
df_std.std(ddof=1)

Age           1.0
Salary ($)    1.0
dtype: float64