In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
pd.options.display.max_columns = 60

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import forest, RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score, precision_recall_fscore_support

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

from scipy import linalg
import time

In [None]:
# Load the raw train and test tables from CSV files in the working directory.
train_raw = pd.read_csv('train_set.csv')
test_raw = pd.read_csv('test_set.csv')
# Feature description table; loading is currently disabled.
#descr = pd.read_csv('features_description.csv')

In [None]:
# (rows, columns) of each raw table.
train_raw.shape, test_raw.shape

In [None]:
# Distribution of the 'type' column (used as the target further down) in the train set.
train_raw['type'].value_counts()

In [None]:
# Same distribution for the test set.
test_raw['type'].value_counts()

Try removing rows that contain almost only NA values (rows with fewer than 2 non-NA fields are dropped)

In [None]:
train_no_row = train_raw.dropna(axis = 0,thresh = 2).copy()
test_no_row = test_raw.dropna(axis = 0,thresh = 2).copy()

In [None]:
print(train_raw.shape[0] - train_no_row.shape[0],'rows are removed from train dataset')

In [None]:
print(test_raw.shape[0] - test_no_row.shape[0],'rows are removed from test dataset')

In [None]:
print('Removed from train dataset by type:','\n','\n',
      train_raw['type'].value_counts() - train_no_row['type'].value_counts())

In [None]:
print('Removed from test dataset by type:','\n','\n',
      test_raw['type'].value_counts() - test_no_row['type'].value_counts())

In [None]:
train_no_row['type'].value_counts()

In [None]:
test_no_row['type'].value_counts()

In [None]:
def preprocess_data (df):
    """Clean a raw frame and encode it as numeric arrays.

    Steps:
      1. Drop columns that are entirely NA and a fixed list of constant,
         duplicated or (almost) perfectly correlated columns.
      2. Cast the known categorical columns and the target ``type`` to
         ``pd.Categorical`` with a fixed category order.
      3. Fill NAs in float64 columns (count columns with 0, the others with
         the column median) and replace every categorical feature by
         ``code + 1`` so that a missing or unknown value becomes 0, not -1.

    The input frame is left untouched; all work happens on a new frame.
    Medians and the set of all-NA columns are computed from ``df`` itself, so
    two frames processed separately are not guaranteed to yield the same
    columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the target column ``type``, every redundant
        column and every categorical column named below.

    Returns
    -------
    x : numpy.ndarray
        Feature matrix, columns ordered as ``features``.
    y : numpy.ndarray
        Target codes: 0 for 'Mobile phone', 1 for 'Machine', -1 for a missing
        or any other value.
    features : list of str
        Names of the columns of ``x``.
    features_cat : tuple of str
        Names of the categorical feature columns.

    Raises
    ------
    KeyError
        If a redundant, categorical or target column is absent from ``df``
        (this includes a categorical column that was dropped for being
        entirely NA).
    """
    # Firstly, clean the data.
    # The numbers in the notes are kept from the original analysis
    # (presumably positions of the columns in the raw table).
    redundant_cols = [
        'date_1d',                             # only 1 unique date (2017-02-05)
        'pct_data_consumption_eea',            # corr 1.0 with 21 (pct_data_consumption_1d)
        'sli_data_1d',                         # corr almost 1.0 with 24 (sli_1d)
        'sli_data_eea',                        # corr almost 1.0 with 65 (sli_eea)
        'sli_weight_Data_1d',                  # corr almost 1.0 with 79 (sli_weight_1d)
        'sli_cause_data_1d',                   # duplicate of 25 (sli_cause_1d)
        'sli_cause_top_data_eea',              # duplicate of 54 (sli_cause_top_eea)
        'sli_cause_detail_type_1d',            # duplicate of 25 (sli_cause_1d)
        'sli_cause_detail_type_top_eea',       # duplicate of 34 (sli_cause_top_eea)
        'sli_cause_detail_type_data_1d',       # duplicate of 25 (sli_cause_1d)
        'sli_cause_detail_type_top_data_eea',  # duplicate of 54 (sli_cause_top_eea)
        'file_sharing_count_dy_avg_eea',       # corr almost 1.0 with 81 (sli_weight_MMS_1d)
    ]
    all_na_cols = [c for c in df.columns if df[c].isna().sum() == len(df)]
    # A redundant column may also be entirely NA; list it once so that it is
    # not looked up again after it has already been removed.
    drop_cols = all_na_cols + [c for c in redundant_cols if c not in all_na_cols]
    # Non-inplace drop: the caller's frame is not modified.
    df = df.drop(columns = drop_cols)

    # Counters ('file_sharing_count_dy_avg_eea' is dropped above).
    is_counts = ['audio_count_1d', 'audio_count_dy_avg_eea',
                 'email_count_1d', 'email_count_dy_avg_eea',
                 'file_sharing_count_1d',
                 'gaming_count_1d', 'gaming_count_dy_avg_eea',
                 'im_count_1d', 'im_count_dy_avg_eea',
                 'social_count_1d', 'social_count_dy_avg_eea',
                 'video_count_1d', 'video_count_dy_avg_eea',
                 'web_count_1d', 'web_count_dy_avg_eea']

    # Secondly, manage categorical values.
    cat_hml = ('arpu_grp_eea', 'audio_count_grp_eea',
               'email_count_grp_eea', 'file_sharing_count_grp_eea',
               'gaming_count_grp_eea', 'im_count_grp_eea',
               'social_count_grp_eea', 'video_count_grp_eea',
               'web_count_grp_eea', 'sli_grp_eea') # [Low, Medium, High]

    cat_49_80_100 = ('pct_data_consumption_grp_eea', ) # [0-49, 80-100]

    cat_data_mms = ('sli_neg_impact_svc_1d', 'sli_neg_impact_svc_top_eea') # [data, MMS]

    cat_cic = ('sli_cause_1d', 'sli_cause_top_eea') # [Coverage, Internet, Congestion]

    cat_target = ('type', ) # [Mobile phone, Machine] - the target variable

    cat_dict = {cat_hml:['Low', 'Medium', 'High'],
                cat_49_80_100:['0-49', '80-100'],
                cat_data_mms:['data', 'MMS'],
                cat_cic:['Coverage', 'Internet', 'Congestion'],
                cat_target:['Mobile phone', 'Machine']
               }

    for cols, categories in cat_dict.items():
        for c in cols:
            # Values outside `categories` become NA (code -1).
            df[c] = pd.Categorical(df[c].values, categories=categories)

    features = [c for c in df.columns if c != 'type']
    features_cat = cat_hml + cat_49_80_100 + cat_data_mms + cat_cic

    # Thirdly, fix NAs.
    # The target is deliberately not part of `features`, so it stays
    # categorical and its codes can be read at the end.
    for name in features:
        ser = df[name]
        if name in features_cat:
            # Always encode, whether or not the column has NAs, so that X never
            # mixes integer codes with raw category strings.
            # +1 moves the NA code from -1 to 0.
            df[name] = ser.cat.codes + 1
        elif ser.dtype == 'float64' and ser.isna().any():
            # Counters: missing means zero; other measurements: use the median.
            df[name] = ser.fillna(0 if name in is_counts else ser.median())

    return df[features].values, df['type'].values.codes, features, features_cat

In [None]:
train_x, train_y, features, features_cat= preprocess_data(train_raw.copy())

In [None]:
test_x, test_y, _, _ = preprocess_data(test_raw.copy())

In [None]:
train_x.shape, test_x.shape

Random Forest with all rows

In [None]:
def set_batch(y, batch_size = 2, weight_of_true_labels = 0.5, seed = 213):
    """Draw row indices for a batch with a fixed share of positive labels.

    Indices are sampled with replacement, first from the rows where
    ``y == 0`` and then from the rows where ``y == 1``.

    A private ``RandomState`` is used instead of reseeding the global NumPy
    generator. The drawn indices are the same as with
    ``np.random.seed(seed)``, but the caller's random stream is not reset and
    concurrent calls (for example one per tree, from several threads) cannot
    interfere with each other.

    Parameters
    ----------
    y : array-like
        Labels; only the values 0 and 1 are sampled from.
    batch_size : int
        Total number of indices to return.
    weight_of_true_labels : float
        Share of the batch taken from rows with label 1; the count is
        ``int(batch_size * weight_of_true_labels)``.
    seed : int or None
        Seed of the private generator.

    Returns
    -------
    numpy.ndarray
        ``batch_size`` indices: the label-0 picks followed by the label-1 picks.

    Raises
    ------
    ValueError
        If samples are requested from a class that has no rows in ``y``.
    """
    rng = np.random.RandomState(seed)
    y = np.asarray(y)
    true_labels_qty = int(batch_size * weight_of_true_labels)
    false_labels_qty = batch_size - true_labels_qty
    # Order matters for reproducibility: negatives are drawn before positives.
    negatives = rng.choice(np.where(y == 0)[0], false_labels_qty, replace=True)
    positives = rng.choice(np.where(y == 1)[0], true_labels_qty, replace=True)
    return np.append(negatives, positives)

def set_rf_batches(y, batch_size = 2, weight_of_true_labels = 0.5):
    """Route the forest module's sample-index generator through ``set_batch``.

    Overwrites ``forest._generate_sample_indices`` with a function that
    ignores the requested number of samples and returns
    ``set_batch(y, batch_size, weight_of_true_labels, seed=rs)`` instead.

    The replacement is a module attribute, so it stays in effect until this
    function is called again with other arguments. ``y`` is captured here;
    presumably it must be the same label array that is later passed to
    ``fit`` - verify at the call site.

    Parameters
    ----------
    y : array-like
        Labels the batches are drawn from.
    batch_size : int
        Number of indices per batch.
    weight_of_true_labels : float
        Share of each batch taken from rows with label 1.
    """
    def _balanced_sample_indices(rs, n_samples):
        # `n_samples` is accepted only to match the replaced function's
        # signature; the batch size is fixed by `batch_size`.
        return set_batch(y=y,
                         batch_size = batch_size,
                         weight_of_true_labels = weight_of_true_labels,
                         seed = rs)

    forest._generate_sample_indices = _balanced_sample_indices

In [None]:
# Keyword arguments shared by every RandomForestClassifier in this notebook.
# 'min_impurity_split' is intentionally not listed: it was only ever passed at
# its default (None), and scikit-learn 1.0+ rejects the argument altogether.
params = {'n_estimators':300,
          'criterion':'gini', #entropy, gini
          'max_depth':None,
          'min_samples_split':2,
          'min_samples_leaf':1,
          'min_weight_fraction_leaf':0.0,
          'max_features':0.5,
          'max_leaf_nodes':None,
          'min_impurity_decrease':0.0,
          'bootstrap':True,
          'oob_score':False,
          'n_jobs':-1,
          'random_state':None,
          'verbose':0,
          'warm_start':False,
          'class_weight':None}

In [None]:
set_rf_batches(train_y, batch_size = 40000, weight_of_true_labels = 0.4)

In [None]:
rf = RandomForestClassifier(**params)

In [None]:
rf.fit(train_x, train_y)

In [None]:
preds = np.stack([t.predict_proba(test_x)[:,1] for t in rf.estimators_])

In [None]:
plt.plot([roc_auc_score(test_y, np.mean(preds[:i+1], axis = 0)) for i in range(len(preds))]);

In [None]:
plt.figure(figsize = (25,5))
plt.margins(x=0.01, y=0.1)
plt.plot(rf.feature_importances_, 'bo')
plt.xticks(np.arange(train_x.shape[1]), labels = features, fontsize = 'x-small', rotation = 90);

In [None]:
def print_metrics(model, t_x, v_x, t_y, v_y):
    """Print AUC, accuracy, F1 and confusion matrices for train and test data.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``predict`` and ``predict_proba``; column 1 of
        ``predict_proba`` is used as the score for the AUC.
    t_x, v_x : array-like
        Train and test feature matrices.
    t_y, v_y : array-like
        Train and test labels (0/1).

    Returns
    -------
    None
        Everything is written to stdout.
    """
    train_pred = model.predict(t_x)
    test_pred = model.predict(v_x)

    print('train auc:', roc_auc_score(t_y, model.predict_proba(t_x)[:,1]))
    print('test auc:', roc_auc_score(v_y, model.predict_proba(v_x)[:,1]))
    print("-"*15)
    print('train accuracy:', (t_y == train_pred).sum() / len(t_y))
    print('test accuracy:', (v_y == test_pred).sum()/ len(v_y))
    print("-"*15)
    print('train f1 score:', f1_score(t_y, train_pred))
    print('test f1 score:', f1_score(v_y, test_pred))
    print("-"*15)
    # `labels` is passed by keyword: newer scikit-learn releases no longer
    # accept it positionally. The matrix is transposed, so rows are predicted
    # classes and columns are true classes, both in the order [1, 0].
    print('train confusion matrix:','\n', confusion_matrix(t_y, train_pred, labels=[1,0]).T)
    print('test confusion matrix:','\n', confusion_matrix(v_y, test_pred, labels=[1,0]).T)

In [None]:
# Train/test metrics of the forest fitted on all rows.
print_metrics(rf, train_x, test_x, train_y, test_y)

In [None]:
def generate_new_machine_data(train_rows_x,train_rows_y,nrof_samples=5000,nrof_singular_values=3,seed=None):
    """Append synthetic label-1 rows built from the leading singular vectors.

    The rows with label 1 are transposed (features x rows) and decomposed by
    SVD. Each synthetic row is a random combination of the first
    ``nrof_singular_values`` left singular vectors: standard-normal
    coefficients scaled by the singular values divided by the largest one.

    Parameters
    ----------
    train_rows_x : numpy.ndarray
        Feature matrix, one row per sample.
    train_rows_y : numpy.ndarray
        Labels; rows where the label equals 1 are the source for the SVD.
    nrof_samples : int
        Number of synthetic rows to append.
    nrof_singular_values : int
        Number of leading singular vectors to combine.
    seed : int or None
        Seed for the random coefficients. ``None`` (default) draws from the
        global NumPy generator, as before.

    Returns
    -------
    train_rows_x_resampled : numpy.ndarray
        ``train_rows_x`` followed by the synthetic rows.
    train_rows_y_resampled : numpy.ndarray
        ``train_rows_y`` followed by ``nrof_samples`` ones.
    singular_values : numpy.ndarray
        All singular values divided by the largest one.

    Raises
    ------
    ValueError
        If there is no row with label 1, or if more singular vectors are
        requested than the decomposition provides.
    """
    machine_idx = (train_rows_y==1)
    train_rows_machine = train_rows_x[machine_idx]
    if train_rows_machine.shape[0] == 0:
        raise ValueError('no rows with label 1 to generate new data from')

    # Economy-size SVD: only U and the singular values are used, so the
    # (rows x rows) right factor of the full decomposition is never allocated.
    U, singular_values, _ = linalg.svd(train_rows_machine.T, full_matrices=False)
    if nrof_singular_values > len(singular_values):
        raise ValueError('nrof_singular_values=%d exceeds the %d available singular values'
                         % (nrof_singular_values, len(singular_values)))
    singular_values = singular_values/np.max(singular_values)

    rng = np.random if seed is None else np.random.RandomState(seed)
    coefficients = rng.randn(nrof_singular_values, nrof_samples)
    weighted_mat = np.matmul(np.diag(singular_values[0:nrof_singular_values]), coefficients)
    new_machine_data = np.dot(U[:,0:nrof_singular_values],weighted_mat)

    new_machine_data_label = np.ones((nrof_samples,))
    # Append the synthetic rows to the training set
    train_rows_x_resampled = np.append(train_rows_x,new_machine_data.T,axis=0)

    # Append the corresponding labels
    train_rows_y_resampled = np.append(train_rows_y,new_machine_data_label,axis=0)

    return train_rows_x_resampled, train_rows_y_resampled, singular_values


Random Forest with removed rows

In [None]:
# Same preprocessing as before, applied to the frames without the nearly-empty rows.
train_rows_x, train_rows_y, _, _ = preprocess_data(train_no_row.copy())
test_rows_x, test_rows_y, _, _ = preprocess_data(test_no_row.copy())

In [None]:
# Add 10000 synthetic label-1 rows to the training data; the test data is left as is.
train_rows_x_resam, train_rows_y_resam, _= generate_new_machine_data(train_rows_x, train_rows_y, nrof_samples=10000,nrof_singular_values=3)

In [None]:
train_rows_x_resam.shape, train_rows_y_resam.shape

In [None]:
# Re-point the forest's sample-index generation at the resampled labels:
# 20000 indices per call, 25% of them from rows labelled 1.
set_rf_batches(train_rows_y_resam, batch_size = 20000, weight_of_true_labels = 0.25)

In [None]:
rf_na_rows = RandomForestClassifier(**params)

In [None]:
# Fit and print the elapsed wall-clock time in seconds.
start = time.time()
rf_na_rows.fit(train_rows_x_resam, train_rows_y_resam)
print(time.time() - start)

In [None]:
# Probability of class 1 on the test set from every single tree (one row per tree).
preds_no_rows = np.stack([t.predict_proba(test_rows_x)[:,1] for t in rf_na_rows.estimators_])

In [None]:
# Test AUC of the averaged prediction as trees are added one at a time.
plt.plot([roc_auc_score(test_rows_y, np.mean(preds_no_rows[:i+1], axis = 0)) for i in range(len(preds_no_rows))])
plt.xlabel('Number of trees')
plt.ylabel('AUC');

In [None]:
# NOTE(review): the tick labels use `features` from the all-rows preprocessing;
# this assumes the filtered frames produce the same columns - confirm.
plt.figure(figsize = (25,5))
plt.margins(x=0.01, y=0.1)
plt.plot(rf_na_rows.feature_importances_, 'bo')
plt.xticks(np.arange(train_rows_x_resam.shape[1]), labels = features, fontsize = 'x-small', rotation = 90);

In [None]:
# The train metrics here are computed on the resampled data, synthetic rows included.
print_metrics(rf_na_rows, train_rows_x_resam, test_rows_x, train_rows_y_resam, test_rows_y)

In [None]:
# Per-class precision, recall, F-score and support on the test set.
precision_recall_fscore_support(test_rows_y, rf_na_rows.predict(test_rows_x))

Logistic Regression

In [None]:
def preprocess_data_regr (df):
    """Clean a raw frame and one-hot encode it for linear models.

    Steps:
      1. Drop columns that are entirely NA and a fixed list of constant,
         duplicated or (almost) perfectly correlated columns.
      2. One-hot encode the categorical columns and the count columns, with an
         extra indicator column for NA (``dummy_na=True``).
      3. Cast the target ``type`` to a categorical with the categories
         ['Mobile phone', 'Machine'].

    The input frame is left untouched. NAs in columns that are not one-hot
    encoded are not filled here. The dummy columns depend on the values
    present in ``df``, so two frames processed separately can yield different
    columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the target column ``type``, every redundant
        column and every column that is one-hot encoded.

    Returns
    -------
    x : numpy.ndarray
        Feature matrix, columns ordered as ``features``.
    y : pandas.Categorical
        Target values; use ``y.codes`` for 0 ('Mobile phone') / 1 ('Machine').
    features : list of str
        Names of the columns of ``x``.

    Raises
    ------
    KeyError
        If a redundant, one-hot encoded or target column is absent from ``df``.
    """
    # Firstly, clean the data.
    # The numbers in the notes are kept from the original analysis
    # (presumably positions of the columns in the raw table).
    redundant_cols = [
        'date_1d',                             # only 1 unique date (2017-02-05)
        'pct_data_consumption_eea',            # corr 1.0 with 21 (pct_data_consumption_1d)
        'sli_data_1d',                         # corr almost 1.0 with 24 (sli_1d)
        'sli_data_eea',                        # corr almost 1.0 with 65 (sli_eea)
        'sli_weight_Data_1d',                  # corr almost 1.0 with 79 (sli_weight_1d)
        'sli_cause_data_1d',                   # duplicate of 25 (sli_cause_1d)
        'sli_cause_top_data_eea',              # duplicate of 54 (sli_cause_top_eea)
        'sli_cause_detail_type_1d',            # duplicate of 25 (sli_cause_1d)
        'sli_cause_detail_type_top_eea',       # duplicate of 34 (sli_cause_top_eea)
        'sli_cause_detail_type_data_1d',       # duplicate of 25 (sli_cause_1d)
        'sli_cause_detail_type_top_data_eea',  # duplicate of 54 (sli_cause_top_eea)
        'file_sharing_count_dy_avg_eea',       # corr almost 1.0 with 81 (sli_weight_MMS_1d)
    ]
    all_na_cols = [c for c in df.columns if df[c].isna().sum() == len(df)]
    # A redundant column may also be entirely NA; list it once so that it is
    # not looked up again after it has already been removed.
    drop_cols = all_na_cols + [c for c in redundant_cols if c not in all_na_cols]
    # Non-inplace drop: the caller's frame is not modified.
    df = df.drop(columns = drop_cols)

    # Counters
    is_counts = ('audio_count_1d', 'audio_count_dy_avg_eea',
                 'email_count_1d', 'email_count_dy_avg_eea',
                 'file_sharing_count_1d',
                 'gaming_count_1d', 'gaming_count_dy_avg_eea',
                 'im_count_1d', 'im_count_dy_avg_eea',
                 'social_count_1d', 'social_count_dy_avg_eea',
                 'video_count_1d', 'video_count_dy_avg_eea',
                 'web_count_1d', 'web_count_dy_avg_eea')

    # Secondly, manage categorical values.
    cat_hml = ('arpu_grp_eea', 'audio_count_grp_eea',
               'email_count_grp_eea', 'file_sharing_count_grp_eea',
               'gaming_count_grp_eea', 'im_count_grp_eea',
               'social_count_grp_eea', 'video_count_grp_eea',
               'web_count_grp_eea', 'sli_grp_eea') # [Low, Medium, High]

    cat_49_80_100 = ('pct_data_consumption_grp_eea', ) # [0-49, 80-100]

    cat_data_mms = ('sli_neg_impact_svc_1d', 'sli_neg_impact_svc_top_eea') # [data, MMS]

    cat_cic = ('sli_cause_1d', 'sli_cause_top_eea') # [Coverage, Internet, Congestion]

    # Categories of the target variable 'type'.
    target_categories = ['Mobile phone', 'Machine']

    features_one_hot = list(cat_hml + cat_49_80_100 + cat_data_mms + cat_cic + is_counts)
    # dtype is pinned to uint8 (the historical pandas default) so the dummy
    # columns are numeric regardless of the installed pandas version.
    df = pd.get_dummies(df, prefix=None, prefix_sep='_', dummy_na=True,
                        columns=features_one_hot, sparse=False, drop_first=False,
                        dtype=np.uint8)
    # The categories must be the label values, not the column name; otherwise
    # every label is treated as unknown and becomes NA.
    df['type'] = pd.Categorical(df['type'].values, categories=target_categories)

    features = [c for c in df.columns if c != 'type']

    return df[features].values, df['type'].values, features

In [None]:
train_x_regr, train_y_regr, _ = preprocess_data_regr(train_raw.copy())

In [None]:
train_x_regr.shape

In [None]:
mm = MinMaxScaler().fit(train_x_regr)

In [None]:
train_x_minmax = mm.transform(train_x_oh)
test_x_minmax = mm.transform(train_x_oh)
train_x_minmax.shape, test_x_minmax.shape

In [None]:
# L2-penalised logistic regression (lbfgs solver, one-vs-rest) for the
# min-max scaled one-hot features.
logr_mm = LogisticRegression(penalty='l2', #‘newton-cg’, ‘sag’ and ‘lbfgs’ for l2. ‘liblinear’, ‘saga’ for l1.
                          dual=False,
                          tol=0.00001,
                          C=0.1,
                          fit_intercept=True,
                          intercept_scaling=1,
                          class_weight=None, #'balanced', None
                          random_state=None,
                          solver='lbfgs', #‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’
                          max_iter=3000, #For the newton-cg, sag and lbfgs solvers
                          multi_class='ovr',
                          verbose=0,
                          warm_start=False,
                          n_jobs=-1)

In [None]:
# Fit and print the elapsed wall-clock time in seconds.
# The labels are `train_y` from preprocess_data(train_raw), not `train_y_regr`;
# NOTE(review): this assumes both preprocessing paths keep the same row order - confirm.
start = time.time()
logr_mm.fit(train_x_minmax, train_y)
print(time.time() - start)

In [None]:
print_metrics(logr_mm, train_x_minmax, test_x_minmax, train_y, test_y)

Logistic Regression with removed rows

In [None]:
# Scaler fitted on the filtered training rows only (without the synthetic rows).
mm_rows = MinMaxScaler().fit(train_rows_x)

In [None]:
# The same fitted scaler is applied to both the train and the test matrix.
train_rows_x_minmax = mm_rows.transform(train_rows_x)
test_rows_x_minmax = mm_rows.transform(test_rows_x)
train_rows_x_minmax.shape, test_rows_x_minmax.shape

In [None]:
# Same configuration as `logr_mm` above, for the filtered rows.
logr_mm_rows = LogisticRegression(penalty='l2', #‘newton-cg’, ‘sag’ and ‘lbfgs’ for l2. ‘liblinear’, ‘saga’ for l1.
                          dual=False,
                          tol=0.00001,
                          C=0.1,
                          fit_intercept=True,
                          intercept_scaling=1,
                          class_weight=None, #'balanced', None
                          random_state=None,
                          solver='lbfgs', #‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’
                          max_iter=3000, #For the newton-cg, sag and lbfgs solvers
                          multi_class='ovr',
                          verbose=0,
                          warm_start=False,
                          n_jobs=-1)

In [None]:
# Fit and print the elapsed wall-clock time in seconds.
start = time.time()
logr_mm_rows.fit(train_rows_x_minmax, train_rows_y)
print(time.time() - start)

In [None]:
print_metrics(logr_mm_rows, train_rows_x_minmax, test_rows_x_minmax, train_rows_y, test_rows_y)

KNeighborsClassifier

In [None]:
# 5-nearest-neighbours classifier.
knn = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Fitted on the min-max scaled filtered rows (same inputs as logr_mm_rows).
knn.fit(train_rows_x_minmax, train_rows_y)

In [None]:
def print_metrics_knn(model, t_x, v_x, t_y, v_y):
    """Print accuracy, F1 and confusion matrices for train and test data.

    Only ``model.predict`` is used; no AUC is reported.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``predict``.
    t_x, v_x : array-like
        Train and test feature matrices.
    t_y, v_y : array-like
        Train and test labels (0/1).

    Returns
    -------
    None
        Everything is written to stdout.
    """
    train_pred = model.predict(t_x)
    test_pred = model.predict(v_x)

    print('train accuracy:', (t_y == train_pred).sum() / len(t_y))
    print('test accuracy:', (v_y == test_pred).sum()/ len(v_y))
    print("-"*15)
    print('train f1 score:', f1_score(t_y, train_pred))
    print('test f1 score:', f1_score(v_y, test_pred))
    print("-"*15)
    # `labels` is passed by keyword: newer scikit-learn releases no longer
    # accept it positionally. The matrix is transposed, so rows are predicted
    # classes and columns are true classes, both in the order [1, 0].
    print('train confusion matrix:','\n', confusion_matrix(t_y, train_pred, labels=[1,0]).T)
    print('test confusion matrix:','\n', confusion_matrix(v_y, test_pred, labels=[1,0]).T)

In [None]:
# Time the evaluation itself: it calls knn.predict on both the train and the test matrix.
start = time.time()
print_metrics_knn(knn, train_rows_x_minmax, test_rows_x_minmax, train_rows_y, test_rows_y)
print(time.time() - start)

LightGBM

In [None]:
train_data = lgb.Dataset(train_rows_x_resam, label = train_rows_y_resam, feature_name=features, categorical_feature='auto')
test_data = lgb.Dataset(test_rows_x, label = test_rows_y, feature_name=features, categorical_feature='auto')

In [None]:
lm_param = {'num_leaves':31,
            'objective':'binary',
            'is_unbalance':True,
            'num_iterations':100,
            'learning_rate':0.05,
            'feature_fraction':0.5,
            'early_stopping_round':10,
            'verbosity':1}
lm_param['metric'] = 'auc'

In [None]:
start = time.time()
lm = lgb.train(lm_param, train_data, valid_sets=test_data)
print(time.time() - start)

In [None]:
def print_metrics_lgb(model, t_x, v_x, t_y, v_y):
    """Print accuracy, F1 and confusion matrices for a model that predicts scores.

    ``model.predict`` is expected to return one score per row; scores above
    0.5 are counted as class 1. Predictions use ``model.best_iteration``, so
    the report always matches the model that was passed in.

    Parameters
    ----------
    model : fitted booster
        Must provide ``predict(x, num_iteration=...)`` and ``best_iteration``.
    t_x, v_x : array-like
        Train and test feature matrices.
    t_y, v_y : array-like
        Train and test labels (0/1).

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Use the iteration count of `model` itself, not of a notebook-level variable.
    best_iteration = model.best_iteration
    # Cast the boolean threshold result to 0/1 so that labels and predictions
    # have the same numeric kind.
    train_pred = (model.predict(t_x, num_iteration=best_iteration) > 0.5).astype(int)
    test_pred = (model.predict(v_x, num_iteration=best_iteration) > 0.5).astype(int)

    print('train accuracy:', (t_y == train_pred).sum() / len(t_y))
    print('test accuracy:', (v_y == test_pred).sum()/ len(v_y))
    print("-"*15)
    print('train f1 score:', f1_score(t_y, train_pred))
    print('test f1 score:', f1_score(v_y, test_pred))
    print("-"*15)
    # `labels` is passed by keyword: newer scikit-learn releases no longer
    # accept it positionally. The matrix is transposed, so rows are predicted
    # classes and columns are true classes, both in the order [1, 0].
    print('train confusion matrix:','\n', confusion_matrix(t_y, train_pred, labels=[1,0]).T)
    print('test confusion matrix:','\n', confusion_matrix(v_y, test_pred, labels=[1,0]).T)

In [None]:
# The train metrics here are computed on the resampled data, synthetic rows included.
print_metrics_lgb(lm, train_rows_x_resam, test_rows_x, train_rows_y_resam, test_rows_y)

Combined models

In [None]:
rf_stack = rf_na_rows.predict_proba(test_rows_x)
lm_stack = lm.predict(test_rows_x)
log_mm_stack = logr_mm_rows.predict_proba(test_rows_x_minmax)

print('rf_stack auc:', roc_auc_score(test_rows_y, rf_stack[:,1]))
print('lr_stack auc:', roc_auc_score(test_rows_y, log_mm_stack[:,1]))
print('lgb_stack auc:', roc_auc_score(test_rows_y, lm_stack))
print('-'*15)
print('average rf+lr auc:', roc_auc_score(test_rows_y, (rf_stack[:,1] + log_mm_stack[:,1]) / 2))
print('average rf+lgb auc:', roc_auc_score(test_rows_y, (rf_stack[:,1] + lm_stack) / 2))
print('average lr+lgb auc:', roc_auc_score(test_rows_y, (log_mm_stack[:,1] + lm_stack) / 2))
print('-'*15)
print('average rf+lr+lgb auc:', roc_auc_score(test_rows_y, (rf_stack[:,1] + log_mm_stack[:,1] + lm_stack) / 3))

In [None]:
confusion_matrix(test_rows_y, ((rf_stack[:,1] + log_mm_stack[:,1]) / 2) > 0.5, [1,0]).T