In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import xgboost as xgb


In [2]:
data = pd.read_csv('final_data.csv')

In [3]:
data.columns

Index(['Unnamed: 0', 'person', 'offer_id', 'successful', 'age', 'id',
       'became_member_on', 'income', 'age_group_1', 'age_group_2',
       'age_group_3', 'age_group_4', 'income_group_1', 'income_group_2',
       'income_group_3', 'income_group_4', 'gender_group_M', 'gender_group_O',
       'time', 'time_group_1', 'time_group_2', 'time_group_3', 'time_group_4'],
      dtype='object')

In [4]:
cols_to_keep = ['age_group_1', 'age_group_2','age_group_3', 'age_group_4', 
                'income_group_2', 'income_group_3', 'income_group_4', 'gender_group_M', 
                'gender_group_O',
                'time_group_1', 'time_group_2', 'time_group_3', 'time_group_4']

In [5]:
X = data[cols_to_keep]
y=data['successful']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Instantiate classifiers

In [7]:
rfc = RandomForestClassifier()
lr = LogisticRegression()
xgbc =  xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False)

### Train classifiers

In [None]:
# default parameters

In [46]:
rfc.fit(X_train, y_train)
lr.fit(X_train, y_train)
xgbc.fit(X_train, y_train, eval_metric='logloss')

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [49]:
rfc_y_pred = rfc.predict(X_test)
lr_y_pred = lr.predict(X_test)
xgbc_y_pred = xgbc.predict(X_test)

In [50]:
np.mean(rfc_y_pred == y_test), np.mean(lr_y_pred == y_test), np.mean(xgbc_y_pred == y_test)

(0.7247148288973384, 0.7095901985635826, 0.7298690325306295)

In [10]:
# find best parameters with the use of cross validation

param_rfc = {'bootstrap': [True,False],
             'ccp_alpha': [0.0, 0.5],
             'max_depth': [None, 3],
             'max_leaf_nodes': [None,3] ,
             'min_samples_split': [2,3],
             'n_estimators': [100,150]}

param_lr = {
             'l1_ratio': [None,0.2,0.5],
             'max_iter': [100,200],
             'penalty': ['l1', 'l2', 'elasticnet', 'none'],
             }

param_xgbc = {
                 'learning_rate': [0.2, 0.5],
                 'max_depth': [None, 3],
                 'n_estimators': [100,150],
                 'num_parallel_tree': [2,5],
             }

cv_rfc = GridSearchCV(rfc, param_rfc)
cv_lr = GridSearchCV(lr, param_lr)
cv_xgbc = GridSearchCV(xgbc, param_xgbc)

cv_rfc.fit(X_train, y_train)
cv_lr.fit(X_train, y_train)
cv_xgbc.fit(X_train, y_train)

cv_rfc.best_params_
cv_lr.best_params_
cv_xgbc.best_params_


Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\

  "(penalty={})".format(self.penalty))
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

  "(penalty={})".format(self.penalty))
  "(penalty={})".format(self.penalty))
  "(penalty={})".format(self.penalty))
  "(penalty={})".format(self.penalty))
  "(penalty={})".format(self.penalty))
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit

  "(penalty={})".format(self.penalty))
  "(penalty={})".format(self.penalty))
  "(penalty={})".format(self.penalty))








{'learning_rate': 0.5,
 'max_depth': 3,
 'n_estimators': 100,
 'num_parallel_tree': 2}

In [9]:
cv_rfc.best_params_

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'max_depth': None,
 'max_leaf_nodes': None,
 'min_samples_split': 2,
 'n_estimators': 100}

In [None]:
lr.get_params()

In [54]:
xgbc.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': False,
 'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'gpu_id': -1,
 'importance_type': 'gain',
 'interaction_constraints': '',
 'learning_rate': 0.300000012,
 'max_delta_step': 0,
 'max_depth': 6,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 100,
 'n_jobs': 12,
 'num_parallel_tree': 1,
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'subsample': 1,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None}

In [None]:
cv_predicted = cv.predict(X_test)

## Validation charts

In [None]:
probas_pred = cv_rfc.predict_probas(X_train)
precision_recall_curve(y_true, probas_pred)

In [None]:
probas_pred = cv_lr.predict_probas(X_train)
precision_recall_curve(y_true, probas_pred)

In [None]:
probas_pred = cv_xgbc.predict_probas(X_train)
precision_recall_curve(y_true, probas_pred)

## Accuraccy and F1-score

In [None]:
y_pred = cv_rfc.predict(X_train)
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

In [None]:
y_pred = cv_lr.predict(X_train)
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

In [None]:
y_pred = cv_xgbc.predict(X_train)
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

## TODO: feature selection