In [24]:
import pandas as pd

# sklearn
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import util

In [25]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np


class ThresholdClassifier(BaseEstimator, ClassifierMixin):
    """Rule-based classifier that bins the first feature column by fixed cut-offs.

    A sample is assigned the index of the first cut-off that is strictly
    greater than its feature value; a value that is not below any cut-off
    (including NaN) gets ``len(threshold)``. With the default three cut-offs
    this yields the classes 0, 1, 2 and 3.

    Parameters
    ----------
    threshold : sequence of float, default=(18.5, 24.9, 29.9)
        Cut-off values, checked in the order given. The defaults look like
        BMI category boundaries -- confirm against the label encoding.
        Any positive number of cut-offs is accepted.
    """

    def __init__(self, threshold=(18.5, 24.9, 29.9)):
        # Immutable default: a list default would be shared by every instance.
        self.threshold = threshold

    def fit(self, X, y):
        """Do nothing (the rule has no learned state) and return ``self``.

        Returning the estimator keeps it usable wherever scikit-learn chains
        ``fit`` with another call, e.g. ``clf.fit(X, y).predict(X)``.
        """
        return self

    def predict(self, X):
        """Return the class index for each row of ``X`` as a float array.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Only column 0 is inspected.

        Returns
        -------
        numpy.ndarray of shape (n_samples,), dtype float
        """
        X = np.asarray(X)
        n_samples = X.shape[0]
        if n_samples == 0:
            # Nothing to classify; also covers a plain empty list, which has no column 0.
            return np.zeros(0)

        values = X[:, 0]
        cutoffs = np.asarray(self.threshold)
        # below[i, j] is True when sample i lies strictly under cut-off j.
        below = values[:, None] < cutoffs[None, :]
        # argmax picks the first True per row; rows with no True fall through
        # to the last class, mirroring the trailing ``else`` of an if/elif chain.
        classes = np.where(below.any(axis=1), below.argmax(axis=1), cutoffs.shape[0])
        return classes.astype(float)


In [26]:
# Load features and labels without numerical scaling, so the derived ratio
# below is computed on the raw Weight and Height columns.
X, y = util.getData(scaleNumericalFeatures=False, path='../data/data.csv')
# Weight divided by height squared (the BMI formula if the columns are in
# kilograms and metres -- TODO confirm units).
X['weightOverHeightSquared'] = X['Weight'].div(X['Height'].pow(2))

In [4]:
# Number of samples (rows) in the feature table.
X.shape[0]

1477

In [5]:
# Baseline: classify every sample with the fixed cut-off rule on the
# weightOverHeightSquared feature alone.
y_threshold = ThresholdClassifier().predict(X[['weightOverHeightSquared']])
# threshold scores
# scikit-learn metrics take (y_true, y_pred). The order matters for the
# weighted F1, whose per-class weights come from the support of y_true.
print(f"Accuracy: {accuracy_score(y, y_threshold):0.3f}")
print(f"F1: {f1_score(y, y_threshold, average='weighted'):0.3f}")

Accuracy: 0.988
F1: 0.988


In [7]:
# Tuning logistic regression with a grid search on the weightOverHeightSquared feature.

# Candidate values for C: 1, 10, 100, 1000.
param_grid = {'C': [10 ** exponent for exponent in range(4)]}
# 10-fold cross-validation, candidates ranked by weighted F1.
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=10000),
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=10,
)
grid.fit(X[['weightOverHeightSquared']], y)

print('Best parameters: ', grid.best_params_)
print('Best score: ', grid.best_score_)

Best parameters:  {'C': 100}
Best score:  0.9858065775068725


In [8]:
# Agreement between the tuned logistic regression and the threshold rule,
# measured on the same samples the grid search was fitted on.
y_logistic = grid.predict(X[['weightOverHeightSquared']])
logistic_agreement = accuracy_score(y_logistic, y_threshold)
print(f"Accuracy: {logistic_agreement:0.3f}")

Accuracy: 0.991


In [28]:
# Tuning an SVM with a grid search on the weightOverHeightSquared feature.

# Value lists shared by several kernels in the grid below.
c_values = [100, 10, 1]
gamma_values = [0.1, 0.01, 0.001]

# One sub-grid per kernel; the poly kernel uses a shorter C list and adds `degree`.
param_grid = [
    {'C': c_values, 'kernel': ['linear']},
    {'C': c_values, 'kernel': ['rbf'], 'gamma': gamma_values},
    {'C': c_values, 'kernel': ['sigmoid'], 'gamma': gamma_values},
    {'C': [10, 1], 'kernel': ['poly'], 'degree': [2, 3, 4], 'gamma': gamma_values},
]
# 10-fold cross-validation, candidates ranked by weighted F1, 5 parallel jobs.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=10,
    n_jobs=5,
)

grid.fit(X[['weightOverHeightSquared']], y)

print('Best parameters: ', grid.best_params_)
print('Best score: ', grid.best_score_)

Best parameters:  {'C': 10, 'degree': 3, 'gamma': 0.1, 'kernel': 'poly'}
Best score:  0.9871789908190554


In [30]:
# Agreement between the tuned SVM and the threshold rule, measured on the
# same samples the grid search was fitted on.
y_svm = grid.predict(X[['weightOverHeightSquared']])
svm_agreement = accuracy_score(y_svm, y_threshold)
print(f"Accuracy: {svm_agreement:0.3f}")

Accuracy: 0.991


In [13]:
# Tuning a random forest with a grid search on the weightOverHeightSquared feature.

# Forest size and maximum tree depth to try.
param_grid = {'n_estimators': [100, 200, 300, 400], 'max_depth': [10, 20, 30, 40, 50]}
# Fix the forest's seed so the selected parameters and score are the same on
# every run; without it each re-run can report a different "best" model.
# 10-fold cross-validation, candidates ranked by weighted F1, 5 parallel jobs.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=10,
    scoring='f1_weighted',
    n_jobs=5,
)
grid.fit(X[['weightOverHeightSquared']], y)

print('Best parameters: ', grid.best_params_)
print('Best score: ', grid.best_score_)

Best parameters:  {'max_depth': 10, 'n_estimators': 100}
Best score:  0.9776777730153778


In [14]:
# Agreement between the tuned random forest and the threshold rule, measured
# on the same samples the grid search was fitted on.
y_rf = grid.predict(X[['weightOverHeightSquared']])
rf_agreement = accuracy_score(y_rf, y_threshold)
print(f"Accuracy: {rf_agreement:0.3f}")

Accuracy: 0.988
