In [1]:
import numpy as np
import pandas as pd
import copy

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

from sklearn.linear_model import LinearRegression

import transformers as trans

# Show every column when a wide DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

In [2]:
# Columns excluded from the feature matrix: the CSV's leftover index column,
# the row id, the target (Final_Score) and G1-G3.
# NOTE(review): G1-G3 are presumably dropped to avoid leaking the target — confirm.
# Defined once so the train and test feature sets cannot drift apart.
NON_FEATURE_COLS = ['Unnamed: 0', 'id', 'Final_Score', 'G1', 'G2', 'G3']

df_train = pd.read_csv('data/train-student-mat.csv')
X_train = df_train.drop(columns=NON_FEATURE_COLS)
y_train = df_train['Final_Score']

df_test = pd.read_csv('data/test-student-mat.csv')
X_test = df_test.drop(columns=NON_FEATURE_COLS)

In [3]:
# Preview the first rows of the training feature matrix.
X_train.head(n=5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4
1,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10
2,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4
3,GP,M,16,U,LE3,T,2,2,other,other,home,mother,1,2,0,no,no,no,no,yes,yes,yes,no,4,4,4,1,1,3,0
4,GP,M,15,U,LE3,A,3,2,services,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,2,2,1,1,1,0


### 1. One Hot Encoding

In [4]:
# Fit the one-hot encoder for the Mjob and Fjob columns on the training data.
ohe = trans.OneHotEncoderDF(columns=['Mjob', 'Fjob'])
ohe.fit(X_train)

OneHotEncoderDF(columns=['Mjob', 'Fjob'])

In [5]:
# Encode the test set with the encoder fitted on the training set.
df_trans = ohe.transform(X_test)
df_trans.head(n=3)

Unnamed: 0,Mjob_at_home,Mjob_health,Mjob_other,Mjob_services,Mjob_teacher,Fjob_at_home,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher,school,sex,age,address,famsize,Pstatus,Medu,Fedu,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,0,0,1,0,0,0,0,1,0,0,GP,M,17,U,GT3,T,2,1,home,mother,2,1,3,yes,yes,no,yes,yes,no,yes,no,4,5,1,1,1,3,2
1,1,0,0,0,0,0,0,0,1,0,MS,M,18,R,LE3,T,1,2,other,father,3,1,0,no,yes,yes,yes,yes,no,yes,yes,4,3,3,2,3,3,3
2,0,0,1,0,0,0,0,0,1,0,GP,M,18,R,LE3,T,3,3,course,mother,1,2,1,no,yes,no,no,yes,yes,yes,yes,4,3,3,1,3,5,8


### 2. Ordinal Encoding

In [6]:
# Fit the ordinal encoder for the Mjob and Fjob columns on the training data.
oe = trans.OrdinalEncoderDF(columns=['Mjob', 'Fjob'])
oe.fit(X_train)

OrdinalEncoderDF(columns=['Mjob', 'Fjob'])

In [7]:
# Encode the test set with the encoder fitted on the training set.
df_trans = oe.transform(X_test)
df_trans.head(n=3)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,GP,M,17,U,GT3,T,2,1,2,2,home,mother,2,1,3,yes,yes,no,yes,yes,no,yes,no,4,5,1,1,1,3,2
1,MS,M,18,R,LE3,T,1,2,0,3,other,father,3,1,0,no,yes,yes,yes,yes,no,yes,yes,4,3,3,2,3,3,3
2,GP,M,18,R,LE3,T,3,3,2,3,course,mother,1,2,1,no,yes,no,no,yes,yes,yes,yes,4,3,3,1,3,5,8


In [8]:
# Inspect the category -> integer mapping learned during fit, per column.
oe.get_transform_dic()

{'Mjob': {'at_home': 0, 'health': 1, 'other': 2, 'services': 3, 'teacher': 4},
 'Fjob': {'at_home': 0, 'health': 1, 'other': 2, 'services': 3, 'teacher': 4}}

### 3. Count of Frequency Encoding

In [9]:
# Fit the frequency encoder for the Mjob and Fjob columns on the training data.
cof = trans.CountOfFreqEncoder(columns=['Mjob', 'Fjob'])
cof.fit(X_train)

CountOfFreqEncoder(columns=['Mjob', 'Fjob'])

In [10]:
# Encode the test set with the encoder fitted on the training set.
df_trans = cof.transform(X_test)
df_trans.head(n=3)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,GP,M,17,U,GT3,T,2,1,0.33935,0.555957,home,mother,2,1,3,yes,yes,no,yes,yes,no,yes,no,4,5,1,1,1,3,2
1,MS,M,18,R,LE3,T,1,2,0.155235,0.259928,other,father,3,1,0,no,yes,yes,yes,yes,no,yes,yes,4,3,3,2,3,3,3
2,GP,M,18,R,LE3,T,3,3,0.33935,0.259928,course,mother,1,2,1,no,yes,no,no,yes,yes,yes,yes,4,3,3,1,3,5,8


In [11]:
# Inspect the category -> relative-frequency mapping learned during fit
# (the values within each column sum to 1).
cof.get_transform_dic()

{'Mjob': {'other': 0.33935018050541516,
  'services': 0.2743682310469314,
  'at_home': 0.1552346570397112,
  'teacher': 0.1444043321299639,
  'health': 0.08664259927797834},
 'Fjob': {'other': 0.555956678700361,
  'services': 0.259927797833935,
  'teacher': 0.06498194945848375,
  'at_home': 0.06498194945848375,
  'health': 0.05415162454873646}}

# Encoders that use the target

### 4. Ordered Integer Target Encoding
Ordering the categories according to the target means assigning each category
an integer from 0 to k-1, where k is the number of distinct categories
in the variable. The numbering follows the mean of the target for each
category, in ascending order: the category with the lowest target mean gets 0.

In [12]:
# Fit the ordered-integer target encoder; it needs the target as well as the features.
oit = trans.OrderedIntTargetEncoder(columns=['Mjob', 'Fjob'])
oit.fit(X_train, y_train)

OrderedIntTargetEncoder(columns=['Mjob', 'Fjob'])

In [13]:
# Encode the test set with the encoder fitted on the training set.
df_trans = oit.transform(X_test)
df_trans.head(n=3)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,GP,M,17,U,GT3,T,2,1,1,0,home,mother,2,1,3,yes,yes,no,yes,yes,no,yes,no,4,5,1,1,1,3,2
1,MS,M,18,R,LE3,T,1,2,0,1,other,father,3,1,0,no,yes,yes,yes,yes,no,yes,yes,4,3,3,2,3,3,3
2,GP,M,18,R,LE3,T,3,3,1,1,course,mother,1,2,1,no,yes,no,no,yes,yes,yes,yes,4,3,3,1,3,5,8


In [14]:
# Inspect the category -> integer rank mapping learned during fit, per column.
oit.get_transform_dic()

{'Mjob': {'at_home': 0, 'other': 1, 'services': 2, 'teacher': 3, 'health': 4},
 'Fjob': {'other': 0, 'services': 1, 'at_home': 2, 'health': 3, 'teacher': 4}}

### 5. Aggregation functions Target Encoding
Aggregation encoding replaces each category with an aggregate of the target values observed for that category.  
List of functions:
- mean(): Compute mean
- sum(): Compute sum
- size(): Compute group sizes
- count(): Compute count
- std(): Standard deviation
- var(): Compute variance
- sem(): Standard error of the mean
- first(): Compute first of group values
- last(): Compute last of group values
- nth(): Take nth value, or a subset if n is a list
- min(): Compute min
- max(): Compute max

In [15]:
# Fit a target encoder that maps each category to the mean of the target.
ate = trans.AggTargetEncoder(columns=['Mjob', 'Fjob'], agg_fun='mean')
ate.fit(X_train, y_train)

AggTargetEncoder(agg_fun='mean', columns=['Mjob', 'Fjob'])

In [16]:
# Encode the test set with the encoder fitted on the training set.
df_trans = ate.transform(X_test)
df_trans.head(n=3)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,GP,M,17,U,GT3,T,2,1,10.359574,10.533117,home,mother,2,1,3,yes,yes,no,yes,yes,no,yes,no,4,5,1,1,1,3,2
1,MS,M,18,R,LE3,T,1,2,9.804651,10.748611,other,father,3,1,0,no,yes,yes,yes,yes,no,yes,yes,4,3,3,2,3,3,3
2,GP,M,18,R,LE3,T,3,3,10.359574,10.748611,course,mother,1,2,1,no,yes,no,no,yes,yes,yes,yes,4,3,3,1,3,5,8


In [17]:
# Inspect the learned category -> mean(target) mapping for the Mjob column.
ate.get_transform_dic()['Mjob']

{'at_home': 9.804651162790698,
 'health': 12.637500000000001,
 'other': 10.359574468085105,
 'services': 10.831578947368422,
 'teacher': 11.25}

In [18]:
# Cross-check: a plain pandas group-by on the training frame yields the same
# per-category target means as the encoder's mapping.
df_train.groupby('Mjob')['Final_Score'].agg('mean')

Mjob
at_home      9.804651
health      12.637500
other       10.359574
services    10.831579
teacher     11.250000
Name: Final_Score, dtype: float64

##### The same but with another agg function

In [19]:
# Re-create the target encoder, this time mapping each category to the variance of the target.
ate = trans.AggTargetEncoder(columns=['Mjob', 'Fjob'], agg_fun='var')
ate.fit(X_train, y_train)

AggTargetEncoder(agg_fun='var', columns=['Mjob', 'Fjob'])

In [20]:
# Encode the test set with the encoder fitted on the training set.
df_trans = ate.transform(X_test)
df_trans.head(n=3)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,GP,M,17,U,GT3,T,2,1,12.456628,13.236478,home,mother,2,1,3,yes,yes,no,yes,yes,no,yes,no,4,5,1,1,1,3,2
1,MS,M,18,R,LE3,T,1,2,13.161883,11.746759,other,father,3,1,0,no,yes,yes,yes,yes,no,yes,yes,4,3,3,2,3,3,3
2,GP,M,18,R,LE3,T,3,3,12.456628,11.746759,course,mother,1,2,1,no,yes,no,no,yes,yes,yes,yes,4,3,3,1,3,5,8


In [21]:
# Inspect the learned category -> var(target) mapping for the Mjob column.
ate.get_transform_dic()['Mjob']

{'at_home': 13.161882613510516,
 'health': 11.213749999999994,
 'other': 12.456627773964769,
 'services': 15.526722807017547,
 'teacher': 12.412307692307694}

In [22]:
# Cross-check: a plain pandas group-by on the training frame yields the same
# per-category target variances as the encoder's mapping.
df_train.groupby('Mjob')['Final_Score'].agg('var')

Mjob
at_home     13.161883
health      11.213750
other       12.456628
services    15.526723
teacher     12.412308
Name: Final_Score, dtype: float64

# Pipeline

In [23]:
# Preview the raw training frame, which still contains the id, G1-G3 and
# Final_Score columns that were dropped to build X_train.
df_train.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,Final_Score
0,1,1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6,5.4
1,2,2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10,8.5
2,4,4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10,8.8
3,6,6,GP,M,16,U,LE3,T,2,2,other,other,home,mother,1,2,0,no,no,no,no,yes,yes,yes,no,4,4,4,1,1,3,0,12,12,11,11.6
4,8,8,GP,M,15,U,LE3,A,3,2,services,other,home,mother,1,2,0,no,yes,yes,no,yes,yes,yes,no,4,2,2,1,1,1,0,16,18,19,17.8


In [24]:
# Chain the encoders demonstrated above into a single sklearn Pipeline.
# Each step is given its own, non-overlapping list of categorical columns.
pipe = Pipeline(
    steps=[
        ("ohe", trans.OneHotEncoderDF(['famsup', 'schoolsup', 'famsize'])),
        ("oe", trans.OrdinalEncoderDF(['Mjob', 'Fjob', 'reason'])),
        ("cof", trans.CountOfFreqEncoder(['school', 'sex', 'Pstatus'])),
        ("ote", trans.OrderedIntTargetEncoder(['guardian', 'address'])),
        ("ate_var", trans.AggTargetEncoder(['activities', 'nursery'], 'var')),
        ("ate_mean", trans.AggTargetEncoder(['paid', 'higher', 'internet'], 'mean')),
    ]
)
# y_train is passed because the target-based steps are fitted against it.
# NOTE(review): despite its name, `transformed_df` is not a DataFrame.
# Pipeline.fit returns the fitted pipeline, so this is just another reference to `pipe`.
transformed_df = pipe.fit(X_train, y_train)

In [25]:
# Run the test set through every fitted step of the pipeline.
# `pipe` is used directly: Pipeline.fit fits in place and returns the pipeline
# itself, so this is the same object the earlier cell stored under the
# misleading name `transformed_df`.
df_trans = pipe.transform(X_test)
df_trans.head()

Unnamed: 0,famsup_no,famsup_yes,schoolsup_no,schoolsup_yes,famsize_GT3,famsize_LE3,school,sex,age,address,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,0,1,0,1,1,0,0.906137,0.454874,17,1,0.902527,2,1,2,2,1,2,2,1,3,10.19396,13.410439,14.046227,7.205882,10.907692,no,4,5,1,1,1,3,2
1,0,1,1,0,0,1,0.093863,0.454874,18,0,0.902527,1,2,0,3,2,1,3,1,0,11.351562,13.410439,14.046227,7.205882,10.907692,yes,4,3,3,2,3,3,3
2,0,1,1,0,0,1,0.906137,0.454874,18,0,0.902527,3,3,2,3,0,2,1,2,1,10.19396,13.91933,14.046227,10.959231,10.907692,yes,4,3,3,1,3,5,8
3,1,0,1,0,1,0,0.906137,0.545126,16,1,0.097473,2,1,2,2,2,2,1,2,0,11.351562,13.410439,14.046227,10.959231,10.907692,yes,5,3,4,1,1,2,8
4,0,1,1,0,0,1,0.093863,0.454874,20,1,0.097473,2,2,3,3,0,0,1,2,2,11.351562,13.91933,14.046227,10.959231,9.755814,no,5,5,4,4,5,4,11


In [26]:
# Look up the fitted ordinal-encoder step by name and show its learned mapping.
pipe.named_steps['oe'].get_transform_dic()

{'Mjob': {'at_home': 0, 'health': 1, 'other': 2, 'services': 3, 'teacher': 4},
 'Fjob': {'at_home': 0, 'health': 1, 'other': 2, 'services': 3, 'teacher': 4},
 'reason': {'course': 0, 'home': 1, 'other': 2, 'reputation': 3}}

In [27]:
# Look up the fitted mean-target-encoder step by name and show its learned mapping.
pipe.named_steps['ate_mean'].get_transform_dic()

{'paid': {'no': 10.193959731543625, 'yes': 11.3515625},
 'higher': {'no': 7.205882352941177, 'yes': 10.95923076923077},
 'internet': {'no': 9.755813953488373, 'yes': 10.907692307692308}}