# Data Preprocessing

In [1]:
import pandas as pd
data = pd.read_csv('data1.csv')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,,61000.0,No
4,Germany,,,Yes
5,France,,58000.0,Yes
6,Spain,,52000.0,No
7,France,,79000.0,Yes
8,Germany,,,
9,France,,67000.0,Yes


1. Missing Values

> Dropping

In [2]:
# Dropping columns in the data having higher than 60% threshold missing values 
data_drop = data[data.columns[data.isnull().mean() < 0.6]] 
data_drop

Unnamed: 0,Country,Salary,Purchased
0,France,72000.0,No
1,Spain,48000.0,Yes
2,Germany,54000.0,No
3,Spain,61000.0,No
4,Germany,,Yes
5,France,58000.0,Yes
6,Spain,52000.0,No
7,France,79000.0,Yes
8,Germany,,
9,France,67000.0,Yes


In [3]:
# Dropping rows in the data having higher than 60% threshold missing values
data_drop = data.loc[data.isnull().mean(axis=1) < 0.6] 
data_drop

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,,61000.0,No
4,Germany,,,Yes
5,France,,58000.0,Yes
6,Spain,,52000.0,No
7,France,,79000.0,Yes
9,France,,67000.0,Yes


In [4]:
#Dropping columns in the data higher than 60% threshold 
data_drop = data[data.columns[data.isnull().mean() < 0.6]] 
#Dropping rows in the data higher than 60% threshold 
data_drop = data.loc[data.isnull().mean(axis=1) < 0.6] 
data_drop


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,,61000.0,No
4,Germany,,,Yes
5,France,,58000.0,Yes
6,Spain,,52000.0,No
7,France,,79000.0,Yes
9,France,,67000.0,Yes


>Numerical Imputation

In [5]:
data.median(numeric_only=True)

Age          37.0
Salary    59500.0
dtype: float64

In [6]:
#For replacing missed values with median of columns 
data_med = data.fillna(data.median(numeric_only=True))
data_med

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,37.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,37.0,61000.0,No
4,Germany,37.0,59500.0,Yes
5,France,37.0,58000.0,Yes
6,Spain,37.0,52000.0,No
7,France,37.0,79000.0,Yes
8,Germany,37.0,59500.0,
9,France,37.0,67000.0,Yes


In [7]:
#For filling all the missed values as 0 
data_zero = data.fillna(0) 
data_zero

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,0.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,0.0,61000.0,No
4,Germany,0.0,0.0,Yes
5,France,0.0,58000.0,Yes
6,Spain,0.0,52000.0,No
7,France,0.0,79000.0,Yes
8,Germany,0.0,0.0,0
9,France,0.0,67000.0,Yes


> Categorical Imputation

In [8]:
# replacing the missed values in the data with the one which occurs the maximum number of times in the column
data['Age'].fillna(data['Age'].value_counts().idxmax())
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,,61000.0,No
4,Germany,,,Yes
5,France,,58000.0,Yes
6,Spain,,52000.0,No
7,France,,79000.0,Yes
8,Germany,,,
9,France,,67000.0,Yes


2. Outliers

In [9]:
data2 = pd.read_csv('data2.csv')
data2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,474.0,72000.0,No
1,Spain,2700.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,3840.0,61000.0,No
4,Germany,40.0,,Yes
5,France,325.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,5110.0,83000.0,No
9,France,3.0,67000.0,Yes


In [10]:
#For identifying the outliers with the standard deviation method 
outliers = [x for x in data2['Age'] if x < 20 or x > 80] 
print('Identified outliers: %d' % len(outliers)) 
#Remove outliers 
outliers_removed = [x for x in data2['Age'] if x >= 20 and x <= 80] 
print('Non-outlier observations: %d' % len(outliers_removed)) 

Identified outliers: 6
Non-outlier observations: 3


3. Overfitting

In [11]:
stocks_data = pd.DataFrame({'value': [102,300,107,470] ,'bin':[None,None,None,None]})
stocks_data['bin'] = pd.cut(stocks_data['value'], bins=[100,250,400,500], labels=["Lowest", "Mid", "High"])
stocks_data

Unnamed: 0,value,bin
0,102,Lowest
1,300,Mid
2,107,Lowest
3,470,High


4. Data with no numerical values

In [12]:
#Convert data into numerical values with mean 
Infected = [2, 4, 5, 6, 4, 3] 
Predictor = ['Delta', 'Lambda', 'Omicron', 'Lambda', 'Delta', 'Omicron'] 
Infected_df = pd.DataFrame(data={'Infected':Infected, 'Predictor':Predictor}) 
means = Infected_df.groupby('Predictor')['Infected'].mean() 
Infected_df['Predictor_encoded'] = Infected_df['Predictor'].map(means) 
Infected_df

Unnamed: 0,Infected,Predictor,Predictor_encoded
0,2,Delta,3.0
1,4,Lambda,5.0
2,5,Omicron,4.0
3,6,Lambda,5.0
4,4,Delta,3.0
5,3,Omicron,4.0


5. Different date formats 

In [13]:
df = pd.DataFrame(["5-1-2019","8-3-2019","3-3-2019","27-1-2019","8-2-2019"], columns=['Date'])
#Convert to datetime object 
df['Date'] = pd.to_datetime(df['Date'],dayfirst=True) 
#Decomposition 
df['Year'] = df['Date'].dt.year 
df['Month'] = df['Date'].dt.month 
df['Day'] = df['Date'].dt.day 
df[['Year','Month','Day']].head() 

Unnamed: 0,Year,Month,Day
0,2019,1,5
1,2019,3,8
2,2019,3,3
3,2019,1,27
4,2019,2,8


# Regression

Step 1: Import packages and classes 

In [14]:
import numpy as np 
from sklearn.linear_model import LinearRegression

Step 2: Provide data 

In [15]:
x = np.array([5, 15, 25, 35, 45, 55]).reshape((-1, 1)) 
y = np.array([5, 20, 14, 32, 22, 38])

In [16]:
x

array([[ 5],
       [15],
       [25],
       [35],
       [45],
       [55]])

In [17]:
y

array([ 5, 20, 14, 32, 22, 38])

Step 3: Create a model and fit it 

In [18]:
model = LinearRegression().fit(x, y)

Step 4: Get results

In [19]:
r_sq = model.score(x, y) 
print(f"coefficient of determination: {r_sq}\n") 

print(f"intercept: {model.intercept_}\n") 
 
print(f"slope: {model.coef_}\n") 

coefficient of determination: 0.7158756137479542

intercept: 5.633333333333329

slope: [0.54]



Step 5: Predict response 

In [20]:
y_pred = model.predict(x) 
print(f"predicted response:\n{y_pred}\n") 

y_pred = model.intercept_ + model.coef_ * x 
print(f"predicted response:\n{y_pred}") 

predicted response:
[ 8.33333333 13.73333333 19.13333333 24.53333333 29.93333333 35.33333333]

predicted response:
[[ 8.33333333]
 [13.73333333]
 [19.13333333]
 [24.53333333]
 [29.93333333]
 [35.33333333]]
