In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler

In [45]:
def plot_boundaries(model, X, y, formatter):
    model.fit(X.values, y)

    # plot the decision boundary
    x_min, x_max = X['Weight'].min() - 0.5, X['Weight'].max() + 0.5
    y_min, y_max = X['Height'].min() - 0.1, X['Height'].max() + 0.1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                            np.arange(y_min, y_max, 0.1))

    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, alpha=0.4)
    plt.scatter(X['Weight'], X['Height'], c=y, s=20, edgecolor='k')
    plt.colorbar(ticks=[0, 1, 2, 3], format=formatter)
    plt.show()

In [83]:
df = pd.read_csv('../data/train_data.csv')
numerical_features = ['Weight', 'Height', 'Age', 'Meal_Count', 'Phys_Act', 'Water_Consump']
categorical_features = ['Smoking', 'Alcohol_Consump', 'Transport']
target = 'Body_Level'

df = df[numerical_features + categorical_features + [target]]


In [84]:
# encode categorical features
df = pd.get_dummies(df, columns=categorical_features)

# normalize all features
for feature in list(df.columns):
    if feature == target:
        continue
    df[feature] = (df[feature] - df[feature].mean()) / df[feature].std()
df.head()

Unnamed: 0,Weight,Height,Age,Meal_Count,Phys_Act,Water_Consump,Body_Level,Smoking_no,Smoking_yes,Alcohol_Consump_Always,Alcohol_Consump_Frequently,Alcohol_Consump_Sometimes,Alcohol_Consump_no,Transport_Automobile,Transport_Bike,Transport_Motorbike,Transport_Public_Transportation,Transport_Walking
0,-0.128542,0.027982,1.416575,0.124275,1.318959,1.577032,Body Level 3,0.146997,-0.146997,-0.029099,-0.194303,-1.383628,1.506501,1.895333,-0.077185,-0.077185,-1.720633,-0.152896
1,0.732399,-0.981026,0.280407,0.380924,-1.21243,0.277153,Body Level 4,0.146997,-0.146997,-0.029099,-0.194303,0.722126,-0.663228,-0.527165,-0.077185,-0.077185,0.580689,-0.152896
2,-1.205054,-1.094015,-1.003109,1.648955,1.143733,-0.012223,Body Level 2,0.146997,-0.146997,-0.029099,-0.194303,0.722126,-0.663228,-0.527165,-0.077185,-0.077185,0.580689,-0.152896
3,1.237363,0.575239,1.019639,0.380924,0.055478,0.373144,Body Level 4,0.146997,-0.146997,-0.029099,-0.194303,0.722126,-0.663228,1.895333,-0.077185,-0.077185,-1.720633,-0.152896
4,0.953187,0.446082,2.607055,0.191922,-0.132113,-0.220097,Body Level 4,0.146997,-0.146997,-0.029099,-0.194303,-1.383628,1.506501,1.895333,-0.077185,-0.077185,-1.720633,-0.152896


In [85]:
X = df.drop('Body_Level', axis=1)
y = df['Body_Level']
y = y.map({'Body Level 1': 0, 'Body Level 2': 1, 'Body Level 3': 2, 'Body Level 4': 3})
formatter = plt.FuncFormatter(lambda val, loc: ['Body Level 1', 'Body Level 2', 'Body Level 3', 'Body Level 4'][val])


In [68]:
# hyperparameters search
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

for c in C:
    lr = LogisticRegression(C=c, max_iter=10000)
    cv_results = cross_validate(lr, X, y, cv=10, return_train_score=True)
    print(f"for C={c} the mean train score is {cv_results['train_score'].mean().round(2)} and the mean test score is {cv_results['test_score'].mean().round(2)}")

for C=0.001 the mean train score is 0.47 and the mean test score is 0.46
for C=0.01 the mean train score is 0.73 and the mean test score is 0.71
for C=0.1 the mean train score is 0.89 and the mean test score is 0.87
for C=1 the mean train score is 0.95 and the mean test score is 0.94
for C=10 the mean train score is 0.98 and the mean test score is 0.97
for C=100 the mean train score is 0.99 and the mean test score is 0.98
for C=1000 the mean train score is 0.99 and the mean test score is 0.98
for C=10000 the mean train score is 1.0 and the mean test score is 0.98


In [96]:
# get feature importance
lr = LogisticRegression(C=0.1, max_iter=10000)
lr.fit(X, y)
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': (abs(lr.coef_[0]) / np.abs(lr.coef_[0]).sum()).round(2)})
feature_importance.sort_values('importance', ascending=False, inplace=True)
feature_importance

Unnamed: 0,feature,importance
0,Weight,0.57
1,Height,0.15
2,Age,0.08
9,Alcohol_Consump_Frequently,0.05
13,Transport_Bike,0.03
3,Meal_Count,0.03
14,Transport_Motorbike,0.03
11,Alcohol_Consump_no,0.02
4,Phys_Act,0.01
5,Water_Consump,0.01


5.403087781447567