In [1]:
# general:
import pandas as pd
import numpy as np
import catboost as cat
# self-written functions:
import sys
sys.path.append("../data_process/")
import ML_func as ml
import data_cleaner as dc

In [2]:
test = dc.load_py('../../clean_data/test.pkl')   # CatBoost & light GBM use categorical
train = dc.load_py('../../clean_data/train.pkl')
x_test_dummy = dc.load_py('../../clean_data/test_dummy.pkl')    # XGBoost use one-hot encoding
x_train_dummy = dc.load_py('../../clean_data/train_dummy.pkl')
x_test_dummy.pop('progress')   
x_train_dummy.pop('progress')
x_test = test.copy()
x_train = train.copy()
y_test = x_test.pop('progress')
y_train = x_train.pop('progress')
cat_features=['PTGENDER', 'PTETHCAT', 'PTMARRY', 'PTRACCAT']

In [3]:
base_models = {}  # to save all models

### Logistic & Lasso regression

In [4]:
from sklearn.linear_model import LogisticRegression
mod_log = LogisticRegression(penalty=None, max_iter=500,random_state=1)
mod_log.fit(x_train_dummy, y_train)
base_models['Logistic Regression'] = mod_log

In [5]:
np.mean(mod_log.predict(x_test_dummy) == y_test)

0.9196675900277008

In [6]:
mod_lasso = LogisticRegression(penalty='l2', max_iter=500,random_state=1)
mod_lasso.fit(x_train_dummy, y_train)
base_models['Lasso Regression'] = mod_lasso
np.mean(mod_lasso.predict(x_test_dummy) == y_test)

0.9196675900277008

### Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier
mod_rf = RandomForestClassifier(n_estimators=300, random_state=1)
mod_rf.fit(x_train_dummy, y_train)
base_models['Random Forest'] = mod_rf
np.mean(mod_rf.predict(x_test_dummy) == y_test)

0.925207756232687

### SVM

In [8]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
mod_svm = Pipeline([
    ("scale", StandardScaler() ),
    ('model', SVC(kernel='linear', probability=True,random_state=1)),
])
mod_svm.fit(x_train_dummy, y_train)
base_models['SVM'] = mod_svm
np.mean(mod_svm.predict(x_test_dummy) == y_test)

0.9168975069252078

### GBM

In [9]:
from sklearn.ensemble import HistGradientBoostingClassifier
mod_gbm = HistGradientBoostingClassifier(random_state=1)
mod_gbm.fit(x_train_dummy, y_train)
base_models['GBM'] = mod_gbm
np.mean(mod_gbm.predict(x_test_dummy) == y_test)

0.9085872576177285

In [10]:
dc.save_py(base_models, 'models/baseline5')