# Read and Prepare the Data

In [1]:
# Common imports
import numpy as np
import pandas as pd
np.random.seed(42)

In [2]:
#We will predict the "Alc" i.e Weekend alcohol consumption in the data set:

alc = pd.read_csv("alcohol.csv")
alc.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,health,absences,gender,alc
0,18,2,1,4,2,0,5,4,2,5,2,M,1
1,18,4,3,1,0,0,4,4,2,3,9,M,1
2,15,4,3,2,3,0,5,3,4,5,0,F,0
3,15,3,3,1,4,0,4,3,3,3,10,F,0
4,17,3,2,1,2,0,5,3,5,5,2,M,1


In [3]:
#Creating Test Train Split
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(alc, test_size=0.3)

In [4]:
#Checking for Missing Values
train_set.isna().sum()
test_set.isna().sum()

age           0
Medu          0
Fedu          0
traveltime    0
studytime     0
failures      0
famrel        0
freetime      0
goout         0
health        0
absences      0
gender        0
alc           0
dtype: int64

# Data Prep

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

In [6]:
#Seperating the Target Variable
train_y = train_set['alc']
test_y = test_set['alc']

train_inputs = train_set.drop(['alc'], axis=1)
test_inputs = test_set.drop(['alc'], axis=1)

# Feature Engineering: Deriving new coloumn from Absences

In [7]:
def new_col(df):
    #Create a copy so that we don't overwrite the existing dataframe
    df1 = df.copy()
    
    df1['absences_binned'] = pd.cut(df1['absences'],
                                       bins=[0,1,5,10,20,50],
                                       labels=False, 
                                       include_lowest=True)
 
    return df1[['absences_binned']]

In [8]:
train_inputs.dtypes

age            int64
Medu           int64
Fedu           int64
traveltime     int64
studytime      int64
failures       int64
famrel         int64
freetime       int64
goout          int64
health         int64
absences       int64
gender        object
dtype: object

In [9]:
# Identifying the numerical columns
numeric_columns = train_inputs.select_dtypes(include=[np.number]).columns.to_list()

# Identifying the categorical columns
categorical_columns = train_inputs.select_dtypes('object').columns.to_list()

In [10]:
# Identifying the binary columns so we can pass them through without transforming
binary_columns = []

In [11]:
# Removing the binary columns from numerical columns.

for col in binary_columns:
    numeric_columns.remove(col)

## List of Column Types

In [12]:
binary_columns

[]

In [13]:
numeric_columns

['age',
 'Medu',
 'Fedu',
 'traveltime',
 'studytime',
 'failures',
 'famrel',
 'freetime',
 'goout',
 'health',
 'absences']

In [14]:
categorical_columns

['gender']

In [15]:
transformed_columns = ['absences']

# Pipeline

In [16]:
numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())])

In [17]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [18]:
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

In [19]:
my_new_column = Pipeline(steps=[('my_new_column', FunctionTransformer(new_col)),
                               ('scaler', StandardScaler())])

In [20]:
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns),
        ('trans', my_new_column, transformed_columns)],
        remainder='passthrough')


## Transform: fit_transform() for TRAIN

In [21]:
#Fit and transforming the train data
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[ 0.66643886,  0.96597412,  0.90362635, ...,  1.        ,
         0.        ,  0.77953176],
       [ 0.66643886, -0.93881619, -1.68666277, ...,  0.        ,
         1.        ,  2.59906354],
       [ 0.66643886,  0.33104402,  0.04019664, ...,  0.        ,
         1.        , -1.04000003],
       ...,
       [ 0.66643886, -2.20867639, -2.55009248, ...,  1.        ,
         0.        ,  1.68929765],
       [ 1.6195814 , -0.30388608, -1.68666277, ...,  0.        ,
         1.        ,  1.68929765],
       [ 1.6195814 , -0.30388608, -2.55009248, ...,  0.        ,
         1.        ,  0.77953176]])

In [22]:
train_x.shape

(23800, 14)

## Tranform: transform() for TEST

In [23]:
# Transforming the test data
test_x = preprocessor.transform(test_inputs)

test_x

array([[-1.23984621,  0.33104402,  1.76705606, ...,  1.        ,
         0.        , -0.13023413],
       [-1.23984621, -0.30388608,  0.04019664, ...,  0.        ,
         1.        , -1.04000003],
       [-0.28670367,  0.33104402,  0.04019664, ...,  0.        ,
         1.        , -1.04000003],
       ...,
       [ 0.66643886, -0.30388608,  0.04019664, ...,  1.        ,
         0.        , -0.13023413],
       [-1.23984621, -0.93881619,  0.04019664, ...,  0.        ,
         1.        , -1.04000003],
       [-1.23984621,  0.96597412,  0.04019664, ...,  1.        ,
         0.        , -1.04000003]])

In [24]:
test_x.shape

(10200, 14)

# Baseline:

In [25]:
train_y.value_counts()/len(train_y)

0    0.523487
1    0.476513
Name: alc, dtype: float64

# Hard voting classifier (should include at least two models)

In [26]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import SGDClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier


dtree_clf = DecisionTreeClassifier(max_depth=6, min_samples_split=5, min_samples_leaf=8)
log_clf_1 = LogisticRegression(multi_class='multinomial', solver = 'lbfgs', C=0.1, max_iter=10000)
log_clf_2 = LogisticRegression(multi_class='multinomial', solver = 'lbfgs', C=1, max_iter=10000)
sgd_clf = SGDClassifier(max_iter=10000, tol=1e-3, l1_ratio=0.8)

voting_clf = VotingClassifier(
            estimators=[('dt', dtree_clf), 
                        ('lr1', log_clf_1),
                        ('lr2', log_clf_2),
                        ('sgd', sgd_clf)],
            voting='hard')

voting_clf.fit(train_x, train_y)

VotingClassifier(estimators=[('dt',
                              DecisionTreeClassifier(max_depth=6,
                                                     min_samples_leaf=8,
                                                     min_samples_split=5)),
                             ('lr1',
                              LogisticRegression(C=0.1, max_iter=10000,
                                                 multi_class='multinomial')),
                             ('lr2',
                              LogisticRegression(C=1, max_iter=10000,
                                                 multi_class='multinomial')),
                             ('sgd',
                              SGDClassifier(l1_ratio=0.8, max_iter=10000))])

In [27]:
from sklearn.metrics import accuracy_score
#Train accuracy

train_y_pred = voting_clf.predict(train_x)

train_acc = accuracy_score(train_y, train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.8221848739495798


In [28]:
#Test accuracy

test_y_pred = voting_clf.predict(test_x)

test_acc = accuracy_score(test_y, test_y_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8192156862745098


# Soft voting classifier (should include at least two models)

In [29]:
voting_clf = VotingClassifier(
            estimators=[('dt', dtree_clf), 
                        ('lr1', log_clf_1),
                       ('lr2', log_clf_2)],
            voting='soft')

voting_clf.fit(train_x, train_y)

VotingClassifier(estimators=[('dt',
                              DecisionTreeClassifier(max_depth=6,
                                                     min_samples_leaf=8,
                                                     min_samples_split=5)),
                             ('lr1',
                              LogisticRegression(C=0.1, max_iter=10000,
                                                 multi_class='multinomial')),
                             ('lr2',
                              LogisticRegression(C=1, max_iter=10000,
                                                 multi_class='multinomial'))],
                 voting='soft')

In [30]:
#Train accuracy

train_y_pred = voting_clf.predict(train_x)

train_acc = accuracy_score(train_y, train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.8224789915966386


In [31]:
#Test accuracy

test_y_pred = voting_clf.predict(test_x)

test_acc = accuracy_score(test_y, test_y_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8203921568627451


# Bagging classifier

In [32]:
from sklearn.ensemble import BaggingClassifier 

bag_clf = BaggingClassifier( 
            SGDClassifier(), n_estimators=50, 
            max_samples=1000, bootstrap=True, n_jobs=-1) 

bag_clf.fit(train_x, train_y)

BaggingClassifier(base_estimator=SGDClassifier(), max_samples=1000,
                  n_estimators=50, n_jobs=-1)

In [33]:
#Train accuracy

train_y_pred = bag_clf.predict(train_x)

train_acc = accuracy_score(train_y, train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.8215546218487395


In [34]:
#Test accuracy

test_y_pred = bag_clf.predict(test_x)

test_acc = accuracy_score(test_y, test_y_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.821078431372549


# Random Patches and Random Subspaces

In [35]:
#Random Patches: see the max_features variable
bag_clf = BaggingClassifier( 
            SGDClassifier(), n_estimators=500, max_features=14,
            max_samples=10000, bootstrap= False, n_jobs=-1) 

bag_clf.fit(train_x, train_y)

BaggingClassifier(base_estimator=SGDClassifier(), bootstrap=False,
                  max_features=14, max_samples=10000, n_estimators=500,
                  n_jobs=-1)

In [36]:
#Train accuracy

train_y_pred = bag_clf.predict(train_x)

train_acc = accuracy_score(train_y, train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.821764705882353


In [37]:
#Test accuracy

test_y_pred = bag_clf.predict(test_x)

test_acc = accuracy_score(test_y, test_y_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8214705882352941


# Random forest classifier

In [38]:
from sklearn.ensemble import RandomForestClassifier 

rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, min_samples_split=4, min_samples_leaf=4,max_depth=14) 

rnd_clf.fit(train_x, train_y)

RandomForestClassifier(max_depth=14, min_samples_leaf=4, min_samples_split=4,
                       n_estimators=500, n_jobs=-1)

In [39]:
#Train accuracy

train_y_pred = rnd_clf.predict(train_x)

train_acc = accuracy_score(train_y, train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.8624789915966387


In [40]:
#Test accuracy

test_y_pred = rnd_clf.predict(test_x)

test_acc = accuracy_score(test_y, test_y_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8182352941176471


# AdaBoost Classifier

In [41]:
from sklearn.ensemble import AdaBoostClassifier 

#Create Adapative Boosting with Decision Stumps (depth=1)
ada_clf = AdaBoostClassifier( 
            DecisionTreeClassifier(max_depth=4, splitter="random", min_samples_leaf=7, max_leaf_nodes=6), n_estimators=500, 
            algorithm="SAMME.R", learning_rate=0.1) 

ada_clf.fit(train_x, train_y)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=4,
                                                         max_leaf_nodes=6,
                                                         min_samples_leaf=7,
                                                         splitter='random'),
                   learning_rate=0.1, n_estimators=500)

In [42]:
#Train accuracy

train_y_pred = ada_clf.predict(train_x)

train_acc = accuracy_score(train_y, train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.8375210084033613


In [43]:
#Test accuracy

test_y_pred = ada_clf.predict(test_x)

test_acc = accuracy_score(test_y, test_y_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8307843137254902


# Stochastic Gradient Boosting Classifier

In [74]:
#Train on 75% of the sample
from sklearn.ensemble import GradientBoostingClassifier
gbclf = GradientBoostingClassifier(max_depth=3, n_estimators=1000, 
                                   learning_rate=0.1, subsample=0.75) 

gbclf.fit(train_x, train_y)

GradientBoostingClassifier(n_estimators=1000, subsample=0.75)

In [75]:
#Train accuracy

train_y_pred = gbclf.predict(train_x)

train_acc = accuracy_score(train_y, train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.8516806722689075


In [76]:
#Test accuracy

test_y_pred = gbclf.predict(test_x)

test_acc = accuracy_score(test_y, test_y_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8318627450980393
