## Data Transformers
When working with machine learning models:
- It is often preferable to standardize the data so that each feature has a mean of 0 and a standard deviation of 1
- This is especially helpful when running linear regression

In [132]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [133]:
# Toy frame with two numeric columns that live on different scales.
X = pd.DataFrame(
    {
        "A": [1, 2, 2, 3],
        "B": [11, 8, 1, 3],
    }
)
X

Unnamed: 0,A,B
0,1,11
1,2,8
2,2,1
3,3,3


In [134]:
X.describe()

Unnamed: 0,A,B
count,4.0,4.0
mean,2.0,5.75
std,0.816497,4.573474
min,1.0,1.0
25%,1.75,2.5
50%,2.0,5.5
75%,2.25,8.75
max,3.0,11.0


In [135]:
# Fit the scaler on X; the statistics it learns per column are exposed as
# ss.mean_ and ss.var_ (inspected in the next cells).
ss = StandardScaler()
ss.fit(X)

In [136]:
ss.mean_

array([2.  , 5.75])

In [137]:
# var_ holds the per-column variance the scaler learned (= standard deviation squared).
# It is the population variance (divide by n): for A = [1, 2, 2, 3] that is 2 / 4 = 0.5,
# whereas X.describe() reports the sample std (divide by n - 1): sqrt(2 / 3) ~ 0.816.
ss.var_

array([ 0.5   , 15.6875])

In [138]:
# transform centers each column on the fitted mean and divides by the fitted
# (population) standard deviation, so each column ends up with a mean of 0 and
# a population standard deviation of 1
X_scaled = ss.transform(X)
X_scaled

array([[-1.41421356,  1.32550825],
       [ 0.        ,  0.56807496],
       [ 0.        , -1.19926937],
       [ 1.41421356, -0.69431384]])

In [139]:
# The mean is 0 as expected. std shows 1.154701 rather than 1 because describe()
# reports the sample standard deviation (divide by n - 1), while the scaler
# normalized with the population one: sqrt(4 / 3) ~ 1.1547 for these 4 rows.
pd.DataFrame(X_scaled).describe()

Unnamed: 0,0,1
count,4.0,4.0
mean,0.0,0.0
std,1.154701,1.154701
min,-1.414214,-1.199269
25%,-0.353553,-0.820553
50%,0.0,-0.063119
75%,0.353553,0.757433
max,1.414214,1.325508


## Data Preprocessing with Sklearn

In [140]:
import numpy as np
import pandas as pd


In [141]:
# Load the example dataset; the relative path is resolved against the notebook's
# working directory. The displayed frame has missing values in several columns.
df = pd.read_csv("datapreprocessing.csv")
df

Unnamed: 0,Color,Years,Strength,Height,Weight,Dangerous
0,Green,2.3,210.0,170.0,20 to 30 kg,Yes
1,Red,4.1,100.0,,10 to 20 kg,No
2,Blue,1.4,,412.0,0 to 10 kg,No
3,Green,,313.0,123.0,10 to 20 kg,Yes
4,,5.2,512.0,372.0,0 to 10 kg,Yes


In [142]:
from sklearn.impute import SimpleImputer

In [143]:
# Imputer that replaces NaN entries with the mean of their column.
imp = SimpleImputer(
    missing_values=np.nan,
    strategy="mean",
)

In [144]:
# Learn the column means of the three numeric features (stored in imp.statistics_)
imp.fit(df[["Years","Strength","Height"]])

In [145]:
imp.statistics_

array([  3.25, 283.75, 269.25])

In [146]:
# Fill the gaps in the numeric columns with the means learned by the imputer.
numeric_cols = ["Years", "Strength", "Height"]
df[numeric_cols] = imp.transform(df[numeric_cols])

In [147]:
df

Unnamed: 0,Color,Years,Strength,Height,Weight,Dangerous
0,Green,2.3,210.0,170.0,20 to 30 kg,Yes
1,Red,4.1,100.0,269.25,10 to 20 kg,No
2,Blue,1.4,283.75,412.0,0 to 10 kg,No
3,Green,3.25,313.0,123.0,10 to 20 kg,Yes
4,,5.2,512.0,372.0,0 to 10 kg,Yes


In [148]:
# Switch the existing imputer to "most frequent value" mode and refit it on the
# Color column, reshaped to a single 2-D feature as the estimator expects.
imp.set_params(strategy="most_frequent")

color_column = df["Color"].to_numpy().reshape(-1, 1)
imp.fit(color_column)

# transform returns an (n, 1) array; flatten it back to one value per row.
df["Color"] = imp.transform(color_column).ravel()

In [149]:
df

Unnamed: 0,Color,Years,Strength,Height,Weight,Dangerous
0,Green,2.3,210.0,170.0,20 to 30 kg,Yes
1,Red,4.1,100.0,269.25,10 to 20 kg,No
2,Blue,1.4,283.75,412.0,0 to 10 kg,No
3,Green,3.25,313.0,123.0,10 to 20 kg,Yes
4,Green,5.2,512.0,372.0,0 to 10 kg,Yes


# Encoding the Categorical Data

In [150]:
# LabelEncoder maps each distinct label to an integer; here "No" -> 0 and "Yes" -> 1
# (compare the Dangerous column before and after this cell).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["Dangerous"] = le.fit_transform(df["Dangerous"])
df

Unnamed: 0,Color,Years,Strength,Height,Weight,Dangerous
0,Green,2.3,210.0,170.0,20 to 30 kg,1
1,Red,4.1,100.0,269.25,10 to 20 kg,0
2,Blue,1.4,283.75,412.0,0 to 10 kg,0
3,Green,3.25,313.0,123.0,10 to 20 kg,1
4,Green,5.2,512.0,372.0,0 to 10 kg,1


In [151]:
# Overwrite the first row's weight band. The cell is addressed by column label
# rather than by position 4, so the assignment keeps hitting "Weight" even if
# columns are added or reordered.
df.loc[0, "Weight"] = '50 to 60 kg'
df

Unnamed: 0,Color,Years,Strength,Height,Weight,Dangerous
0,Green,2.3,210.0,170.0,50 to 60 kg,1
1,Red,4.1,100.0,269.25,10 to 20 kg,0
2,Blue,1.4,283.75,412.0,0 to 10 kg,0
3,Green,3.25,313.0,123.0,10 to 20 kg,1
4,Green,5.2,512.0,372.0,0 to 10 kg,1


In [152]:
# Encode the weight bands as integers (0 to 10 kg -> 0, 10 to 20 kg -> 1,
# 50 to 60 kg -> 2 in the frame shown below).
# NOTE(review): no explicit `categories` are passed, so the integer order is
# whatever the encoder infers from the strings; presumably a sorted order, which
# would misplace a band such as "100 to 110 kg" -- confirm and pass
# `categories=[...]` if a specific order matters.
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(dtype=int)
df[["Weight"]] = oe.fit_transform(df[["Weight"]])

In [153]:
df

Unnamed: 0,Color,Years,Strength,Height,Weight,Dangerous
0,Green,2.3,210.0,170.0,2,1
1,Red,4.1,100.0,269.25,1,0
2,Blue,1.4,283.75,412.0,0,0
3,Green,3.25,313.0,123.0,1,1
4,Green,5.2,512.0,372.0,0,1


In [154]:
# One-hot encode the Color column into a dense integer matrix with one 0/1
# column per colour.
# `sparse_output` is the current name of the former `sparse` argument, which
# newer scikit-learn releases no longer accept.
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(dtype=int, sparse_output=False)
color_encoded = ohe.fit_transform(df[["Color"]])



In [155]:
color_encoded


array([[0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [0, 1, 0]])

In [156]:
ohe.get_feature_names_out()

array(['Color_Blue', 'Color_Green', 'Color_Red'], dtype=object)

In [157]:
# Wrap the one-hot matrix in a DataFrame whose columns are named after the
# encoder's output features. Reusing df's index guarantees the rows line up when
# the two frames are concatenated column-wise, even if df's index is not the
# default 0..n-1 range.
df2 = pd.DataFrame(
    color_encoded,
    columns=ohe.get_feature_names_out(),
    index=df.index,
)
df2

Unnamed: 0,Color_Blue,Color_Green,Color_Red
0,0,1,0
1,0,0,1
2,1,0,0
3,0,1,0
4,0,1,0


In [158]:
# Append the one-hot colour columns to the right of the existing frame.
df = pd.concat([df, df2], axis="columns")
df

Unnamed: 0,Color,Years,Strength,Height,Weight,Dangerous,Color_Blue,Color_Green,Color_Red
0,Green,2.3,210.0,170.0,2,1,0,1,0
1,Red,4.1,100.0,269.25,1,0,0,0,1
2,Blue,1.4,283.75,412.0,0,0,1,0,0
3,Green,3.25,313.0,123.0,1,1,0,1,0
4,Green,5.2,512.0,372.0,0,1,0,1,0


In [159]:
# The one-hot columns now carry the colour information, so the original text
# column is dropped. Reassigning the result (instead of inplace=True) keeps the
# step explicit and chainable.
df = df.drop(columns=["Color"])

In [160]:
df

Unnamed: 0,Years,Strength,Height,Weight,Dangerous,Color_Blue,Color_Green,Color_Red
0,2.3,210.0,170.0,2,1,0,1,0
1,4.1,100.0,269.25,1,0,0,0,1
2,1.4,283.75,412.0,0,0,1,0,0
3,3.25,313.0,123.0,1,1,0,1,0
4,5.2,512.0,372.0,0,1,0,1,0


# Feature Scaling

In [161]:
# Transform features by scaling each feature to a given range (default is (0, 1)).
# All three continuous columns are scaled in this one call: MinMaxScaler rescales
# every column independently, and the frame displayed right after this cell
# already shows Years, Strength and Height in the [0, 1] range, so a
# top-to-bottom run now reproduces that output.
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
continuous_cols = ["Years", "Strength", "Height"]
df[continuous_cols] = mms.fit_transform(df[continuous_cols])

In [164]:
df

Unnamed: 0,Years,Strength,Height,Weight,Dangerous,Color_Blue,Color_Green,Color_Red
0,0.236842,0.26699,0.16263,2,1,0,1,0
1,0.710526,0.0,0.506055,1,0,0,0,1
2,0.0,0.445995,1.0,0,0,1,0,0
3,0.486842,0.51699,0.0,1,1,0,1,0
4,1.0,1.0,0.861592,0,1,0,1,0


In [163]:
# Rescale Strength and Height to [0, 1], refitting the same scaler on these two columns.
strength_height = ["Strength", "Height"]
df[strength_height] = mms.fit_transform(df[strength_height])

In [165]:
df

Unnamed: 0,Years,Strength,Height,Weight,Dangerous,Color_Blue,Color_Green,Color_Red
0,0.236842,0.26699,0.16263,2,1,0,1,0
1,0.710526,0.0,0.506055,1,0,0,0,1
2,0.0,0.445995,1.0,0,0,1,0,0
3,0.486842,0.51699,0.0,1,1,0,1,0
4,1.0,1.0,0.861592,0,1,0,1,0


# Pipeline

In [172]:
# Single-column frame with one missing value in the middle.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
data = pd.DataFrame([[1], [4], [np.nan], [8], [11]], columns=['A'])

In [173]:
data

Unnamed: 0,A
0,1.0
1,4.0
2,
3,8.0
4,11.0


In [167]:
from sklearn.pipeline import Pipeline

In [168]:
# Chain the two steps: mean imputation first, then min-max scaling of the
# imputed values.
pipe = Pipeline(
    steps=[
        ("imp", SimpleImputer(strategy="mean")),
        ("scaler", MinMaxScaler()),
    ]
)

In [169]:
pipe["imp"]

In [170]:
pipe["scaler"]

In [174]:
# Steps run in sequence: the NaN is first imputed with the column mean
# ((1 + 4 + 8 + 11) / 4 = 6), then everything is min-max scaled, so 6 -> 0.5
pipe.fit_transform(data)

array([[0. ],
       [0.3],
       [0.5],
       [0.7],
       [1. ]])

# ColumnTransformer

In [175]:
# Same single-column frame with one missing value, rebuilt for this section.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
data = pd.DataFrame([[1], [4], [np.nan], [8], [11]], columns=['A'])

In [176]:
data

Unnamed: 0,A
0,1.0
1,4.0
2,
3,8.0
4,11.0


In [177]:
# ColumnTransformer runs every transformer independently on the columns it is
# given from the original input (NOT on the previous transformer's result) and
# places the outputs side by side.
from sklearn.compose import ColumnTransformer
column_transformer = ColumnTransformer(
    transformers=[
        ("imp", SimpleImputer(strategy="mean"), ["A"]),
        ("scaler", MinMaxScaler(), ["A"]),
    ]
)

In [178]:
# The output has one column per transformer: column 0 is the imputed (unscaled) A,
# column 1 is the scaled A, which still contains the NaN because the scaler
# received the original column rather than the imputer's output.
column_transformer.fit_transform(data)

array([[ 1. ,  0. ],
       [ 4. ,  0.3],
       [ 6. ,  nan],
       [ 8. ,  0.7],
       [11. ,  1. ]])

# Evaluation Metrics in sklearn

In [49]:
# Classification metrics
# accuracy = fraction of predictions that match the true label (4 of 6 here)
from sklearn.metrics import accuracy_score
true = ['Cat', 'Cat', 'Dog', 'Dog', 'Cat', 'Dog']
pred = ['Cat', 'Cat', 'Cat', 'Dog', 'Cat', 'Cat']
score = accuracy_score(true,pred)
print(score)

0.6666666666666666


In [179]:
# precision_score = The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives.
# recall_score =    The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives.
# With "Dog" as the positive label: tp = 1, fp = 0, fn = 2, so precision = 1 / 1 and recall = 1 / 3.
from sklearn.metrics import precision_score,recall_score
true = ['Cat', 'Cat', 'Dog', 'Dog', 'Cat', 'Dog']
pred = ['Cat', 'Cat', 'Cat', 'Dog', 'Cat', 'Cat']

precision = precision_score(true,pred,pos_label="Dog")
recall = recall_score(true,pred,pos_label="Dog")
print(f"precision is {precision}")
print(f"recall is {recall}")

precision is 1.0
recall is 0.3333333333333333


In [180]:
# Regression metrics

# MSE  = average of the square of the errors
# RMSE = square root of the MSE (same units as the target)
# R^2  = 1.0 for a perfect fit and 0.0 for a model that always predicts the mean
#        of `true`; it is NOT bounded below by 0 and becomes negative when the
#        predictions are worse than that baseline
from sklearn.metrics import r2_score, mean_squared_error

pred = [2.1, 1.4, 5.6, 7.9]
true = [2.5, 1.6, 5.1, 6.8]

mse = mean_squared_error(true,pred)
# Take the root explicitly: the `squared=False` shortcut of mean_squared_error
# is deprecated and no longer accepted by newer scikit-learn releases.
rmse = np.sqrt(mse)

In [181]:
# R^2 (coefficient of determination) of the predictions against the true values
r2 = r2_score(true,pred)

In [182]:
# Print the three regression metrics, one per line, in the order MSE, RMSE, R^2.
for metric in (mse, rmse, r2):
    print(metric)

0.41500000000000026
0.6442049363362565
0.902696365767878
