In [1]:
import numpy as np
import pandas as pd
import joblib

from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

## Reading and splitting stratified sample
see [prev notebook](https://github.com/woldemarg/ds_tests/blob/master/machine_learning/company_3/task_solution/scripts/notebooks/get_sample.ipynb)

In [2]:
sample = (pd.read_csv("https://raw.githubusercontent.com/woldemarg/ds_tests/master/machine_learning/company_3/task_solution/derived/sample.csv")
          .drop(["id"], axis=1))

y_sample = sample["gb"]
X_sample = sample.copy()

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X_sample,
                                                    y_sample,
                                                    test_size=0.2,
                                                    random_state=1234,
                                                    stratify=y_sample)

## Engineering features

In [4]:
#thresholds to drop unavailing features
th_nans = 0.35            #share of nan's per column
th_high_cardinality = 10  #num of categories percolumn
th_low_variance = 0.1     #std of a given feature
th_corr = 0.05            #correlation of a given feature with target 

### * detecting columns with more than 35% of missing values

In [5]:
cols_nans = (X_train
            .columns[X_train.isna().mean() > th_nans]
            .tolist())

### * detecting categorical features with more than 10 categories 

In [None]:
cats = X_train.columns[X_train.columns.str.startswith("cat")]

X_train.loc[:, cats] = X_train.loc[:, cats].astype(str)
X_test.loc[:, cats] = X_test.loc[:, cats].astype(str)

In [8]:
cols_high_cardinality = (cats[X_train[cats]
                              .nunique() > th_high_cardinality]
                         .tolist())

In [9]:
cols_to_drop_init = list(set(cols_nans) | set(cols_high_cardinality))

In [10]:
X_train = X_train.drop(cols_to_drop_init, axis=1)
X_test = X_test.drop(cols_to_drop_init, axis=1)

### * imputing missing valuea for the rest of the columns
Both for train and test sets we use *means* for numbers and *modes* for strings calculated strictly on the **train** set to prevent data leakage

In [11]:
cats_left = X_train.select_dtypes(include="object").columns.tolist()
nums_left = X_train.select_dtypes(include=np.number).columns.tolist()

In [12]:
X_train.loc[:, cats_left] = (X_train.loc[:, cats_left]
                             .fillna(X_train.loc[:, cats_left].mode()))

X_test.loc[:, cats_left] = (X_test.loc[:, cats_left]
                            .fillna(X_train.loc[:, cats_left].mode()))

In [13]:
X_train.loc[:, nums_left] = (X_train.loc[: , nums_left]
                             .fillna(X_train.loc[:, nums_left].mean()))

X_test.loc[:, nums_left] = (X_test.loc[:, nums_left]
                            .fillna(X_train.loc[:, nums_left].mean()))

In [14]:
print(X_train.isna().sum().sum())
print(X_test.isna().sum().sum())

0
0


### * one-hot encoding categorical features with simple Pandas 

In [15]:
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

X_train, X_test = X_train.align(X_test,
                                join="left",
                                axis=1,
                                fill_value=0)

### * detecting fetures with low variance and low corretation with target 

In [16]:
cols_low_std = (X_train
                .columns[(X_train.std() < th_low_variance) &
                         (X_train.columns != "gb")]
                .tolist())

In [17]:
cols_low_corr = (X_train
                .columns[(X_train.corr().abs()["gb"] < th_corr) &
                         (X_train.columns != "gb")]
                .tolist())

In [18]:
cols_to_drop_final = list(set(cols_low_std) | set(cols_low_corr))
cols_to_drop_final.append("gb")

In [19]:
X_train = X_train.drop(cols_to_drop_final, axis=1)
X_test = X_test.drop(cols_to_drop_final, axis=1)

X_train.shape

(5364, 48)

So far we have managed to reduce number of features to *48* and leave only those ofthem to be considered significant from pure *technical* point of view (since we have nor column description neither can guess their meaning from context) 

## Building and evaluating simple baseline model

In [20]:
target = y_train.value_counts()
spw = target[0] / target[1]

xgb_model = XGBClassifier(random_state=1234,
                          objective="binary:logistic",
                          scale_pos_weight=spw,
                          n_jobs=-1)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

print("ROC-AUC on test: {}".format(roc_auc_score(y_test, y_pred)))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

ROC-AUC on test: 0.8570376016260162
Confusion matrix:
[[1243   69]
 [   7   23]]


Even though the ROC-AUC score is only **0.86** (possibly we can improve it further with hyper-parameters tuning), the Confusion matrix is worthy of special attention. With a highly imbalanced data, we've succeeded to *correctly predict* most of the values from minor class, hence we really don't know the business cost of FN and FP errors.

Now we can proceed with building a simple pipeline based on above-defined steps - see [next notebook](https://github.com/woldemarg/ds_tests/blob/master/machine_learning/company_3/task_solution/scripts/notebooks/model_pipeline.ipynb)

In [None]:
# joblib.dump(xgb_model, "machine_learning/company_3/task_solution/derived/base_model.sav")