In [1]:
import joblib
from collections import Counter

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import fbeta_score, make_scorer

Finally, we fit the optimal model (logistic regression tuned by grid search) on all of the training data, use that final model to predict every instance in the test set, and save the predictions to a text file.

## Load data

In [2]:
# Both CSV files encode missing values as the literal string 'na'.
# The test file is read with its first column as the row index; the training file is not.
training_data = pd.read_csv(
    '../data/ida_2016_training_set_update.csv',
    na_values=['na'],
)
testing_data = pd.read_csv(
    '../data/ida_2016_test_set_update.csv',
    na_values=['na'],
    index_col=0,
)

In [3]:
# Separate the 'class' label column from the feature columns.
# Only the training labels are kept; the test 'class' column is simply discarded.
y_train = training_data['class']
X_train = training_data.drop(columns='class')
X_test = testing_data.drop(columns='class')

In [4]:
# Check that train and test really have the same feature columns.
# Printing the shapes only compares the column COUNT; the imputer and scaler
# loaded below are applied to both frames, so the column labels and their order
# must match as well. Fail early here rather than deep inside a transform call.
print(X_train.shape)
print(X_test.shape)

assert X_train.columns.equals(X_test.columns), (
    'Train and test feature columns differ (names or order); '
    f'names present in only one set: {sorted(set(X_train.columns) ^ set(X_test.columns))}'
)

(60000, 170)
(16000, 170)


### Data imputation

In [5]:
# Load the previously saved imputer from disk.
# Per the original note it is SimpleImputer(missing_values=np.nan, strategy='median').
# joblib files are pickle-based: only load artifacts that come from a trusted source.
with open('../models/imputer.pkl', 'rb') as imputer_file:
    imputer = joblib.load(imputer_file)

In [6]:
# Impute missing values with the saved imputer (transform only; it is not re-fitted here).
# The transformed values are wrapped back into DataFrames with the original column
# labels AND the original row index. Without `index=` the rebuilt frames get a fresh
# 0..n-1 index, which silently drops the test-set ids carried in X_test.index.
X_train_imputed = pd.DataFrame(
    imputer.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_imputed = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

### Scaling data

In [7]:
# Load the previously saved scaler from disk.
# Per the original note it is preprocessing.MinMaxScaler(feature_range = (0, 1)).
# joblib files are pickle-based: only load artifacts that come from a trusted source.
with open('../models/scaler.pkl', 'rb') as scaler_file:
    scaler = joblib.load(scaler_file)

In [8]:
# Apply the saved scaler to both imputed sets (transform only; it is not re-fitted here).
# Each call receives a copy of the imputed frame, so X_train_imputed / X_test_imputed
# are guaranteed to remain untouched whatever the scaler does with its input.
X_train_processed = scaler.transform(X_train_imputed.copy())
X_test_processed = scaler.transform(X_test_imputed.copy())

## Fitting the optimal model from the previous selection - grid search + logistic regression

In [9]:
# 5-fold stratified splitter used for cross-validation; shuffling is enabled and
# random_state is fixed so the fold assignment is the same on every run
fold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=3,
)

In [10]:
# Candidate values for the logistic-regression parameter 'C': 1e-5, 1e-4, ..., 1e4
params_grid = {'C': 10.0 ** np.arange(-5, 5)}

# Candidates are ranked by the F-beta score with beta=2, with 'pos' as the positive label
ftwo_scorer = make_scorer(fbeta_score, beta=2, pos_label='pos')

# Base estimator whose 'C' is tuned by the grid search
final_model = LogisticRegression(
    class_weight='balanced',
    max_iter=5000,
    random_state=3,
    verbose=1,
)

# Grid search over 'C', cross-validated on the splitter defined in `fold`
grid_search_cv_models = GridSearchCV(
    estimator=final_model,
    param_grid=params_grid,
    scoring=ftwo_scorer,
    cv=fold,
)

In [11]:
# Run the cross-validated search over 'C' on the processed training data
grid_search_cv_models.fit(X=X_train_processed, y=y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=3, shuffle=True),
             estimator=LogisticRegression(class_weight='balanced',
                                          max_iter=5000, random_state=3,
                                          verbose=1),
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04])},
             scoring=make_scorer(fbeta_score, beta=2, pos_label=pos))

In [12]:
# Report the best score found by the grid search and the parameters that achieved it
best_score = grid_search_cv_models.best_score_
best_params = grid_search_cv_models.best_params_
print(f'The best score during training: {best_score}, with params: {best_params}')

The best score during training: 0.722465281931871, with params: {'C': 10.0}


### Make prediction

In [13]:
# Predict a class label for every row of the processed test set with the fitted search object
y_pred = grid_search_cv_models.predict(X=X_test_processed)

In [14]:
# Sanity check: display how many predictions were produced for the test set
y_pred.shape

(16000,)

In [15]:
# Tally how many test instances were assigned to each predicted class
Counter(y_pred)

Counter({'neg': 15283, 'pos': 717})

In [18]:
# Save the predictions as comma-separated text: one row per test instance, holding
# its id (taken from the test set's row index) and the predicted class
results = pd.DataFrame(data={'id': X_test.index, 'class': y_pred})
results.to_csv('../results.txt', index=False)