# 1. Preprocessing and handling the data

Before applying boosting to the data, we preprocess it and separate the features from their labels.

In [20]:
# Cell for just the required import or other types of setup needed for the notebook
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
import matplotlib.pyplot as plt
from utils import gen_csv_from_pred

In [21]:
# Read the prediction set and the labeled training set.
test_data = pd.read_csv('data/test.csv')
df = pd.read_csv('data/train.csv')
# df['Lead'].replace(['Male', 'Female'], [0, 1], inplace=True)
# Separate the target column ('Lead') from the feature columns.
y, X = df['Lead'], df.drop(columns=['Lead'])  # Can any other columns be dropped?
# FIXME: (potentially) this can be unnecessary since we already have two data sets of unlabeled examples
# Hold out 20% of the labeled rows for a local accuracy estimate. The fixed
# random_state makes the split, and therefore the reported score, repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Using a default model on the data

In [22]:
# Fit AdaBoost with its default base estimator on the training split.
clf = AdaBoostClassifier()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
test_pred = clf.predict(test_data)

# Score against the held-out split: the fraction of predictions that equal
# the true label, computed element-wise instead of with a counting loop.
accuracy = float(np.mean(pred == y_test.to_numpy()))
print(accuracy)
gen_csv_from_pred(test_pred, "boosting")

0.8509615384615384


# 3. Testing multiple classifier methods

In [23]:
# Print the name of every scikit-learn estimator class that has a
# predict_proba attribute (noted here as a requirement for boosting).
# Adapted from: https://stackoverflow.com/questions/30056331/how-to-list-all-scikit-learn-classifiers-that-support-predict-proba
# CalibratedClassifierCV can also be used to give any classifier such a method.
from sklearn.utils import all_estimators

estimators = all_estimators()

proba_capable = [
    label
    for label, estimator_cls in estimators
    if hasattr(estimator_cls, 'predict_proba')
]
for label in proba_capable:
    print(label)

AdaBoostClassifier
BaggingClassifier
BayesianGaussianMixture
BernoulliNB
CalibratedClassifierCV
CategoricalNB
ClassifierChain
ComplementNB
DecisionTreeClassifier
DummyClassifier
ExtraTreeClassifier
ExtraTreesClassifier
GaussianMixture
GaussianNB
GaussianProcessClassifier
GradientBoostingClassifier
GridSearchCV
HalvingGridSearchCV
HalvingRandomSearchCV
HistGradientBoostingClassifier
KNeighborsClassifier
LabelPropagation
LabelSpreading
LinearDiscriminantAnalysis
LogisticRegression
LogisticRegressionCV
MLPClassifier
MultiOutputClassifier
MultinomialNB
NuSVC
OneVsRestClassifier
Pipeline
QuadraticDiscriminantAnalysis
RFE
RFECV
RadiusNeighborsClassifier
RandomForestClassifier
RandomizedSearchCV
SGDClassifier
SVC
SelfTrainingClassifier
StackingClassifier
VotingClassifier


In [24]:
from sklearn import linear_model
from sklearn import svm
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
import matplotlib.pyplot as plt
from utils import gen_csv_from_pred
# Read the labeled training data again and split off the 'Lead' target.
df = pd.read_csv('data/train.csv')
# df['Lead'].replace(['Male', 'Female'], [0, 1], inplace=True)
y, X = df['Lead'], df.drop(columns=['Lead'])

# Candidate base estimators; None stands for AdaBoost's default base estimator.
logreg = linear_model.LogisticRegression()
svc = svm.LinearSVC()
models = [None, logreg, svc]
model_names = ["Default AdaBoost",  "Logistic Regression", "Linear Support Vector Machine"]

# Value ranges shared by the grids of both linear models.
tol_values = [1e-5,1e-4,1e-3,1e-2]
c_values = [1e-4,1e-3,1e-2,1e-1,1]
max_iter_values = [100,1000,10000]

# Grid searched on the AdaBoost classifier itself.
adaboost_grid = {
    'n_estimators':[1,5,10,15,25,35,50,100,1000],
    'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
}

# Two sub-grids for LogisticRegression: one varying only the solver, one
# varying penalty together with the solvers listed alongside it.
logreg_grids = [
    {
        'tol':tol_values,
        'C':c_values,
        'solver':('saga','lbfgs'),
        'max_iter':max_iter_values
    },
    {
        'tol':tol_values,
        'C':c_values,
        'penalty':('l1','l2'),
        'solver':('liblinear','saga'),
        'max_iter':max_iter_values
    }
]

# Two sub-grids for LinearSVC: one varying the loss under an l2 penalty,
# one leaving penalty and loss at their defaults.
svc_grids = [
    {
        'tol':tol_values,
        'C':c_values,
        'penalty':['l2'],
        'loss':('hinge','squared_hinge'),
        'max_iter':max_iter_values
    },
    {
        'tol':tol_values,
        'C':c_values,
        'max_iter':max_iter_values
    }
]

# One entry per element of `models`, in the same order.
param_dicts = [adaboost_grid, logreg_grids, svc_grids]
# Filled by test() with one fitted GridSearchCV per model.
gridmodels = []
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
# Suppress the ConvergenceWarning messages raised by the models during fitting
@ignore_warnings(category=ConvergenceWarning)
def test(random_state=42):
    """Grid-search every candidate model and print a hold-out accuracy for each.

    For each entry of ``models`` (paired with ``model_names`` and
    ``param_dicts`` by position):

    * a ``GridSearchCV`` is fitted on the training split and appended to the
      module-level ``gridmodels`` list. For the ``None`` entry the search runs
      over the AdaBoost classifier itself; otherwise it runs over the bare
      base estimator.
    * an AdaBoost classifier with un-tuned settings (wrapping the base
      estimator with ``algorithm='SAMME'`` when one is given) is fitted on the
      same training split and its accuracy on the held-out split is printed.

    Parameters
    ----------
    random_state : int or None, default=42
        Seed passed to ``train_test_split``. Pass ``None`` for a different
        random split on every call.
    """
    # Split once, outside the loop, so every model is trained and scored on
    # the same rows and the printed accuracies are directly comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    y_true = y_test.to_numpy()

    for name, base_estimator, param_grid in zip(model_names, models, param_dicts):
        if base_estimator is None:
            clf = AdaBoostClassifier()  # default base estimator
            search_target = clf
        else:
            clf = AdaBoostClassifier(base_estimator, algorithm='SAMME')
            search_target = base_estimator

        # GridSearchCV works on a clone, so `clf` / `base_estimator` stay unfitted here.
        search = GridSearchCV(estimator=search_target, param_grid=param_grid)
        search.fit(X_train, y_train)
        gridmodels.append(search)

        clf.fit(X_train, y_train)
        accuracy = float(np.mean(clf.predict(X_test) == y_true))
        print(f"{name}: {accuracy}")


test()


Default AdaBoost: 0.8798076923076923
Logistic Regression: 0.8413461538461539
Linear Support Vector Machine: 0.8028846153846154


# 4. Using the results from GridSearchCV

In [25]:
# Report, for each fitted grid search, the winning estimator, its mean
# cross-validated score and the parameter combination that produced it.
for search in gridmodels:
    print("Best estimator was: ", search.best_estimator_)
    print("Best score was: ", search.best_score_)
    print("Best params was: ", search.best_params_)
    

Best estimator was:  AdaBoostClassifier(learning_rate=0.9)
Best score was:  0.8688622754491018
Best params was:  {'learning_rate': 0.9, 'n_estimators': 50}
Best estimator was:  LogisticRegression(C=0.1, max_iter=1000, tol=1e-05)
Best score was:  0.876055118678306
Best params was:  {'C': 0.1, 'max_iter': 1000, 'solver': 'lbfgs', 'tol': 1e-05}
Best estimator was:  LinearSVC(C=0.001, max_iter=10000, tol=1e-05)
Best score was:  0.812279056345141
Best params was:  {'C': 0.001, 'loss': 'squared_hinge', 'max_iter': 10000, 'penalty': 'l2', 'tol': 1e-05}


In [26]:
from sklearn import linear_model
from sklearn import svm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
import matplotlib.pyplot as plt
from utils import gen_csv_from_pred
# Read the labeled training data again and split off the 'Lead' target.
df = pd.read_csv('data/train.csv')
# df['Lead'].replace(['Male', 'Female'], [0, 1], inplace=True)
y, X = df['Lead'], df.drop(columns=['Lead'])

# Hand-picked hyperparameters for the base estimators and for AdaBoost below.
# NOTE(review): these values differ from the best params printed by the
# grid-search results cell (e.g. n_estimators=50 there vs. 25 here, and
# LogisticRegression C=0.1/lbfgs there vs. C=1/liblinear here) - confirm
# which run they were taken from.
logreg = linear_model.LogisticRegression(C=1, penalty='l1', solver='liblinear', tol=0.01,max_iter=10000)
svc = svm.LinearSVC(C=0.0001, max_iter=100000, tol=1e-05)
models = [None, logreg, svc]
model_names = ["Tuned AdaBoost",  "Tuned Logistic Regression", "Tuned Linear Support Vector Machine"]

# Split once with a fixed seed so all three models are scored on the same
# held-out rows and the numbers are repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_true = y_test.to_numpy()

for name, base_estimator in zip(model_names, models):
    if base_estimator is None:
        # AdaBoost with its default base estimator and tuned boosting settings.
        clf = AdaBoostClassifier(learning_rate=0.9, n_estimators=25)
    else:
        clf = AdaBoostClassifier(base_estimator, algorithm='SAMME')
    clf.fit(X_train, y_train)

    # Accuracy: fraction of held-out predictions equal to the true label.
    accuracy = float(np.mean(clf.predict(X_test) == y_true))
    print(f"{name}: {accuracy}")

Tuned AdaBoost: 0.8076923076923077
Tuned Logistic Regression: 0.7692307692307693
Tuned Linear Support Vector Machine: 0.7836538461538461
