In [1]:
import pandas as pd
import sklearn as sklearn
import numpy as np
import time
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lg
import optuna
import xgboost as xgb
import pickle
import os
from datetime import date

from typing import List, TypeVar, Dict
import abc

from sklearn.pipeline import Pipeline
from pandas.core.frame import DataFrame
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import RepeatedKFold
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, roc_auc_score, precision_score, average_precision_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from numpy import argmax

## Helper functions

In [2]:
_СORRELATION_MATRIX_PATH_ = 'corr_matrix.pcl'
_СHURN_PATH_ = 'churn_model.pcl'
_CHURN_PATH_REDUCED_ = 'churn_model_reduced.pcl' 
_MAX_FEATURES_ = 100

In [3]:
# https://towardsdatascience.com/interpreting-roc-curve-and-roc-auc-for-classification-evaluation-28ec3983f077
# https://neptune.ai/blog/f1-score-accuracy-roc-auc-pr-auc
# https://www.statology.org/plot-roc-curve-python/
class Metrics:
    
    def roc_auc(y_true, predicted):
        return roc_auc_score(y_true, predicted)
    
    def auc(y_true, predicted):
        return average_precision_score(y_true, predicted)
    
    def plot_auc(y_true, predicted):
        fpr, tpr, _ = metrics.roc_curve(y_true, predicted)
        auc = metrics.roc_auc_score(y_true, predicted)
        plt.plot(fpr,tpr,label="AUC="+str(auc))
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.legend(loc=4)
        plt.show()
    
    def classification_report(y_true, predicted):
        return classification_report(y_true, predicted)
    
    def confusion_matrix(y_true, predicted):
        return confusion_matrix(y_true, predicted)

In [4]:
def get_float_cols(df:DataFrame) -> List[str]:
    return df.select_dtypes(include=float).columns.tolist()

def get_int_cols(df:DataFrame) -> List[str]:
    return df.select_dtypes(include=int).columns.tolist()

def get_number_cols(df:DataFrame) -> List[str]:
    return df.select_dtypes(np.number).columns.tolist()

def get_obj_cols(df:DataFrame) -> List[str]:
    return list(df.select_dtypes(include=object).columns)

def print_empty_values(df:DataFrame):
    col_names_with_na = list(df.isna().sum()[lambda x: x > 0].index)
    col_names_with_empty = list(df.isnull().sum()[lambda x: x > 0].index)
    result = set(col_names_with_na) | set(col_names_with_empty)    
    print('Columns with NA or empty: {0}'.format(result))
    
def get_empty_cols(df:DataFrame):
    return list(df.isnull().sum()[lambda x: x > 0].index)

In [5]:
def reduce_mem_usage(df, verbose=True) -> pd.DataFrame:
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [6]:
def split_test_train(df:DataFrame):
    test, train = df[df['ind'].eq('test')], df[df['ind'].eq('train')]
    test = test.drop(['ind'], axis=1)
    train = train.drop(['ind'], axis=1)
    return test, train
    
def combine_test_train(test:DataFrame, train:DataFrame):
    combine = pd.concat([test.assign(ind='test'), train.assign(ind='train')])
    target = train['target']
    test_ids = test['Id']
    return combine, target, test_ids

def combine_data(train_df, train_num, train_dpi, test_df, test_num, test_dpi):
    df_combine = pd.concat([train_df.assign(ind='train'), test_df.assign(ind='test')])
    df_combine_num = pd.concat([train_num.assign(ind='train'), test_num.assign(ind='test')])
    df_combine_dpi = pd.concat([train_dpi.assign(ind='train'), test_dpi.assign(ind='test')])
    return df_combine, df_combine_num, df_combine_dpi

In [7]:
def reduce_churn_data():

    if os.path.exists(_CHURN_PATH_REDUCED_) == False:
        if os.path.exists(_СHURN_PATH_) == True: 
    
            with open(_СHURN_PATH_, 'rb') as file:
                deserialized_object = pickle.load(file)

            deserialized_object = list(deserialized_object)
        
            # reduce size
            train_df= reduce_mem_usage(deserialized_object[1][1])
            train_num_reduced = reduce_mem_usage(deserialized_object[1][2])
            train_dpi_reduced = reduce_mem_usage(deserialized_object[1][3])

            test_df = reduce_mem_usage(deserialized_object[2][1])
            test_num_reduced = reduce_mem_usage(deserialized_object[2][2])
            test_dpi_reduced = reduce_mem_usage(deserialized_object[2][3])
            
            # dump data back
            deserialized_object = tuple([[train_df, train_num_reduced, train_dpi_reduced], [test_df, test_num_reduced, test_dpi_reduced]])
            pickle.dump(deserialized_object, open(_CHURN_PATH_REDUCED_, 'wb'))
    else:
        print(f'{_CHURN_PATH_REDUCED_} already exist')

In [8]:
def load_churn_data():

    with open(_СHURN_PATH_, 'rb') as file:
        deserialized_object = pickle.load(file)

    train_df = deserialized_object[1][1]
    train_num = deserialized_object[1][2]
    train_dpi = deserialized_object[1][3]

    test_df = deserialized_object[2][1]
    test_num = deserialized_object[2][2]
    test_dpi = deserialized_object[2][3]

    return train_df, train_num, train_dpi, test_df, test_num, test_dpi

In [9]:
def load_churn_reduced():

    with open(_CHURN_PATH_REDUCED_, 'rb') as file:
        deserialized_object = pickle.load(file)

    train_df = deserialized_object[0][0]
    train_num = deserialized_object[0][1]
    train_dpi = deserialized_object[0][2]

    test_df = deserialized_object[1][0]
    test_num = deserialized_object[1][1]
    test_dpi = deserialized_object[1][2]

    return train_df, train_num, train_dpi, test_df, test_num, test_dpi

In [10]:
def is_short_number(number:str) -> bool:
    if (number.isdigit() and len(number) <= 4):
        return True
    return False

def is_life(number:str) -> bool:
    if (len(number) == 12 and (number[2:5] in ['063', '093' ])):
        return True
    return False

def is_kyivstar(number:str) -> bool:
    if (len(number) == 12 and (number[2:5] in ['067', '097', '068', '098'])):
        return True
    return False

### Feature selectors

In [11]:
class FeatureHelper:
    
    def get_feature_correlation_df(corr_m, remove_duplicates=True, remove_self_correlations=True):
    
        corr_matrix_abs = corr_m.abs()
        corr_matrix_abs_us = corr_matrix_abs.unstack()
        sorted_correlated_features = corr_matrix_abs_us \
            .sort_values(kind="quicksort", ascending=False) \
            .reset_index()

        # Remove comparisons of the same feature
        if remove_self_correlations:
            sorted_correlated_features = sorted_correlated_features[
                (sorted_correlated_features.level_0 != sorted_correlated_features.level_1)
            ]

        # Remove duplicates
        if remove_duplicates:
            sorted_correlated_features = sorted_correlated_features.iloc[:-2:2]

        # Create meaningful names for the columns
        sorted_correlated_features.columns = ['f1', 'f2', 'corr']

        return sorted_correlated_features
    
    def get_correlation_matrix(df:DataFrame, method:str, save_path:str):
        if os.path.exists(save_path) == False:
            corr_matrix = df.corr(method = method, numeric_only = True)
            pickle.dump(corr_matrix, open(save_path, 'wb'))
        else:
            corr_matrix = pickle.load(open(save_path, 'rb'))

        return corr_matrix

    def remove_aggr_function(str_to_check:str) -> str:
        parts = str_to_check.split('_')
        
        if (len(parts) > 2):
            index_to_remove = len(parts) - 2
            
            # remove aggregation function
            if (parts[index_to_remove] in ['min', 'std', 'max', 'mea', 'td']):
                parts.remove(parts[index_to_remove])
                
            result = '_'.join(parts)
            return result
        else:
            return str_to_check    

        
    def get_heatmap_matrix(corr_matrix:DataFrame):
        heatmap_matrix = pd.DataFrame(corr_matrix['target'].abs())
        heatmap_matrix = heatmap_matrix.sort_values(by='target', ascending=False)
        heatmap_matrix = heatmap_matrix.drop(index=['target'])           
        return heatmap_matrix
    
    # index - column name
    # target - value
    def plot_heatmap(heatmap_matrix:DataFrame):
        plt.figure(figsize=(40, 120))
        heatmap = sns.heatmap(heatmap_matrix, vmin=-1, vmax=1, annot=True, cmap='BrBG')
        heatmap.set_title('Features Correlating with Churn Rate', fontdict={'fontsize':18}, pad=16);
        return heatmap_matrix
    
    def get_important_features(heatmap_matrix:DataFrame, use_groupping = False, num_of_features:int = -1):
        df_features = heatmap_matrix.reset_index()
        df_features = df_features.rename(columns = {'index':'feature'})
        
        # apply aggregation function for further groupping
        df_features['feature_group'] = df_features['feature'].apply(FeatureHelper.remove_aggr_function)
        df_features = df_features[['feature', 'feature_group', 'target']]
        sorted_features = df_features.sort_values(by=['feature_group', 'target'], ascending = [False, False])
        
        # take first item from the group
        if (use_groupping == True):
            important_features = sorted_features.groupby('feature_group').first()
        else:
            important_features = sorted_features

        # order by target
        important_features = important_features.sort_values(by='target', ascending=False)
           
        # take N first rows
        if (num_of_features != -1):
            important_features = important_features.head(num_of_features)
        
        # optimize for heatmap
        important_features = important_features.reset_index()
        important_features = important_features[['feature', 'target']]
        important_features.index = important_features['feature']
        important_features.index.name = None
        important_features = important_features[['target']]
        important_features = important_features[important_features['target'] > 0]
        
        return important_features
    
    def get_important_features_tuples(heatmap_matrix:DataFrame, num_of_features:int = -1):
        important_features = FeatureHelper.get_important_features(heatmap_matrix, num_of_features)
        
        if (num_of_features == -1):
            num_of_features = len(important_features)
        
        important_features_tuples = list(zip(important_features.index, 
                                             important_features.target, 
                                             list(range(0, num_of_features))))
        
        return important_features_tuples

In [12]:
class FeatureSelector(abc.ABC):

    @abc.abstractmethod
    def get_heatmap(self) -> pd.DataFrame:
        pass
    
    @abc.abstractmethod
    def plot_heatmap(self):
        pass

    @abc.abstractmethod
    # index - feature name, target
    def get_important_features(self) -> pd.DataFrame:
        pass
    
    @abc.abstractmethod
    # 1 - feature name, 2 - target, 3 - sorted number
    def get_important_features_tuples(self) -> List[tuple[str, float, int]]:
        pass

In [13]:
class CorrelationSelector(FeatureSelector):
    
    def __init__(self, data:pd.DataFrame, corr_method:str, num_of_features:int):
        self.data = data
        self.corr_method = corr_method
        self.file_prefix = corr_method
        self.num_of_features = num_of_features
        self.get_heatmap()
    
    def get_heatmap(self) -> pd.DataFrame:
        self.save_path = f'{self.file_prefix}_{_СORRELATION_MATRIX_PATH_}'
        self.corr_m = FeatureHelper.get_correlation_matrix(self.data, self.corr_method, self.save_path)
        self.heatmap_m = FeatureHelper.get_heatmap_matrix(self.corr_m)
        return self.heatmap_m 
    
    def get_important_features(self) -> pd.DataFrame:
        return FeatureHelper.get_important_features(self.heatmap_m, self.num_of_features)
    
    def plot_heatmap(self):
        FeatureHelper.plot_heatmap(self.get_important_features())
    
    def get_non_correlated_features(self, barrier_coef:float, do_log:bool) -> List[str]:
        
        important_tuples = FeatureHelper.get_important_features_tuples(self.heatmap_m, self.num_of_features)
        # f1, f2, corr
        features_corr = FeatureHelper.get_feature_correlation_df(self.corr_m)
        
        already_processed = set()
        all_features = [t[0] for t in important_tuples]

        for f in all_features:
            # get correlated features
            correlated = list(features_corr[(features_corr['f1']==f) & (features_corr['corr'] > barrier_coef)]['f2'])

            # if highly correlated features exist
            if (len(correlated)>0):

                for to_remove in correlated:
                    if (to_remove not in already_processed):
                        if (to_remove in all_features):
                            all_features.remove(to_remove)
                            if do_log: print(f'Removing: {to_remove} for {f}')            

        # remember initial feature
        already_processed.add(f)
        
        # return non-correlated features
        return all_features
    
    def get_important_noncorrelated_features_tuples(self, mutual_correlation:float, feature_importance:float):
        
        final = []
        important = self.get_important_features_tuples()
        non_correlated = self.get_non_correlated_features(mutual_correlation, False)
        
        for i in important:
            if (i[0] in non_correlated and i[1] > feature_importance):
                final.append(i)

        return final
                        
    def get_important_features_tuples(self) ->  List[tuple[str, float, int]]:
        return FeatureHelper.get_important_features_tuples(self.heatmap_m, self.num_of_features)
    
    def __str__(self):
        return type(self).__name__ + '_' + self.corr_method

In [14]:
# https://nitin9809.medium.com/lightgbm-binary-classification-multi-class-classification-regression-using-python-4f22032b36a2
# https://www.analyticsvidhya.com/blog/2020/10/feature-selection-techniques-in-machine-learning/
class LGBMSelector(FeatureSelector):
    
    model_file = 'LGBMSelector_simple.pcl'
    heatmap_calculated = False
    
    def __init__(self, data:pd.DataFrame, num_of_features:int):
        self.data = data
        self.num_of_features = num_of_features
        self.train_model()
    
    def train_model(self):
        test, train = split_test_train(self.data)

        self.y_train = train['target'].round(0).astype(int)
        self.y_test = test['target'].round(0).astype(int)

        train = train.drop(['target'], axis=1)
        test = test.drop(['target'], axis=1)
        
        # regressor
        if os.path.exists(self.model_file) == True: 
            with open(self.model_file, 'rb') as file:
                self.regressor = pickle.load(file)
        else:         
            self.regressor = lg.LGBMClassifier()
            self.regressor.fit(train, self.y_train)
        
        # predict
        self.predicted = self.regressor.predict(test)
        
        #save model
        pickle.dump(self.regressor, open(self.model_file, 'wb'))
    
    def get_feature_importance_raw(self):
        return self.regressor.feature_importances_
    
    def get_heatmap(self) -> pd.DataFrame:
        df_feature_importance = pd.DataFrame(list(zip(list(self.get_feature_importance_raw()), list(self.data.columns))))
        df_feature_importance = df_feature_importance.set_axis(['target', 'feature'], axis=1)
        df_feature_importance = df_feature_importance.sort_values(by=['target'], ascending=False)
        df_feature_importance = df_feature_importance.set_index('feature')
        df_feature_importance.index.name = None
        self.heatmap_m = df_feature_importance
        self.heatmap_calculated = True
        return self.heatmap_m
    
    def plot_heatmap(self):
        FeatureHelper.plot_heatmap(self.get_important_features())

    def get_important_features(self) -> pd.DataFrame:
        if (self.heatmap_calculated == False):
            self.get_heatmap()
            
        return FeatureHelper.get_important_features(self.heatmap_m, self.num_of_features)
    
    def get_important_features_tuples(self) -> List[tuple[str, float, int]]:
        if (self.heatmap_calculated == False):
            self.get_heatmap()
            
        features_tuples = FeatureHelper.get_important_features_tuples(self.heatmap_m, self.num_of_features)
        return features_tuples
        
    def get_ROCAUC(self):
        return Metrics.roc_auc(self.y_test, self.predicted)

    def get_confusion_matrix(self):
        return Metrics.confusion_matrix(self.y_test, self.predicted)
    
    def get_classification_report(self):
        return Metrics.classification_report(self.y_test, self.predicted)
    
    def __str__(self):
        return type(self).__name__

In [15]:
# https://neptune.ai/blog/lightgbm-parameters-guide
class EnhancedLGBMSelector(LGBMSelector):
    
    model_file = 'LGBM_model_800_features.pcl'
    
    def get_feature_importance_raw(self):
        return self.regressor.feature_importance()
    
    def train_model(self):        
        
        test, train = split_test_train(df_combine)

        self.y_train = train['target'].round(0).astype(int)
        self.y_test = test['target'].round(0).astype(int)

        x_train = train.drop(['target'], axis=1)
        x_test = test.drop(['target'], axis=1)

        # Specifying the parameter
        d_train = lg.Dataset(x_train, label=self.y_train)
        d_test = lg.Dataset(x_test, label=self.y_test, reference=d_train)

        # load model from disk
        if os.path.exists(self.model_file) == True: 
            with open(self.model_file, 'rb') as file:
                self.regressor = pickle.load(file)
        # Train model
        else:
            params={}
            params['boosting_type']='dart' 
            params['objective']='binary' 
            params['metric']='auc' 
            params['verbosity'] = 2

            #train the model 
            self.regressor=lg.train(params = params, 
                          train_set = d_train,
                          valid_sets = d_test, 
                          num_boost_round = 150, 
                          callbacks= [lg.early_stopping(stopping_rounds=20)])
        
        # make prediction
        self.predicted = self.regressor.predict(x_test)
        
        #save model
        pickle.dump(self.regressor, open(self.model_file, 'wb'))


In [16]:
class IterativeLGBMSelector(FeatureSelector):
    
    model_file = 'IterativeLGBMSelector.pcl'
    
    def __init__(self, 
                 data:pd.DataFrame, 
                 selector:EnhancedLGBMSelector,
                 base_line_auc:float,
                 do_log:bool):
        
        self.data = data
        self.trained_selector = selector
        self.base_line_auc = base_line_auc
        self.do_log = do_log
        
        # check pre-saved results
        if os.path.exists(self.model_file) == True:
            with open(self.model_file, 'rb') as file:
                self.result = pickle.load(file)
        else:
            self.result = self.train_model()
       
    def train_model(self):
        
        all_features = [f[0] for f in self.trained_selector.get_important_features_tuples()]
        do_log = True
        existing = ['target', 'ind']
        roc_auc = 0
        prev_roc_auc = 0
        increase_rate = 0
        result = []

        # for every column
        for c in list(all_features):
            if c != 'target':

                # add column
                existing.append(c)

                # new dataset
                df = df_combine[existing]

                # build model
                test, train = split_test_train(df)
                y_train = train['target'].round(0).astype(int)
                y_test = test['target'].round(0).astype(int)
                x_train = train.drop(['target'], axis=1)
                x_test = test.drop(['target'], axis=1)

                # create datasets
                d_train = lg.Dataset(x_train, label=y_train)
                d_test = lg.Dataset(x_test, label=y_test, reference=d_train)

                params={}
                params['boosting_type']='dart' 
                params['objective']='binary' 
                params['metric']='auc' 
                params['verbosity'] = 0

                #train the model 
                regressor=lg.train(params = params, 
                              train_set = d_train,
                              valid_sets = d_test, 
                              num_boost_round = 100)

                # calculate metric
                predicted = regressor.predict(x_test)

                try:
                    roc_auc = Metrics.roc_auc(y_test, predicted)
                except ValueError:
                    roc_auc = -1

                # save result
                diff = roc_auc - prev_roc_auc
                item = (len(existing), existing, roc_auc, diff)
                result.append(item)
                if self.do_log == True: print(f'======== {len(existing)} -> AUC: {roc_auc} -> DIFF: {diff}' )

                if (roc_auc > self.base_line_auc):
                    if self.do_log == True: print(f'======== {roc_auc} increased base line threshold with {len(existing) -2} features {existing}')
                    break

                # handle negative impact
                if (diff < -0.01):
                    if self.do_log == True: print(f'======== Feature {c} gives negative impact of {diff}. Removing it.')
                    existing.remove(c)
                else:
                    prev_roc_auc = roc_auc
        
        # save result
        pickle.dump(result, open(self.model_file, 'wb'))
        
        return result
    
    def get_heatmap(self) -> pd.DataFrame:
        pass
    
    def plot_heatmap(self):
        pass
    
    def get_important_features(self) -> pd.DataFrame:
        pass
    
    def get_important_features_tuples(self) -> List[tuple[str, float, int]]:
        return self.result[-1][1][2:]

## Data Transformation

In [17]:
class TransformPipe:
    
    def __init__(self, funcs, **kwargs):
        self.funcs = funcs
        self.kwargs = kwargs
    
    def transform(self, df:DataFrame) -> DataFrame:
        for f in self.funcs:
            df = f(df, **self.kwargs)
            
        return df

## Load Data

In [18]:
train_df, train_num, train_dpi, test_df, test_num, test_dpi = load_churn_reduced()
df_combine, df_combine_num, df_combine_dpi = combine_data(train_df, train_num, train_dpi, test_df, test_num, test_dpi)

### Feature importance

In [19]:
# https://towardsdatascience.com/deep-dive-on-ml-techniques-for-feature-selection-in-python-part-2-c258f8a2ac43
# https://www.kaggle.com/code/gomes555/tps-jun2021-feature-selection-lightgbm-tuner

pearsonSelector = CorrelationSelector(df_combine, 'pearson', -1)
spearmanSelector = CorrelationSelector(df_combine, 'spearman', -1) 

In [20]:
pearson_non_corr = pearsonSelector.get_important_noncorrelated_features_tuples(0.95, 0.1)
spearman_non_corr = spearmanSelector.get_important_noncorrelated_features_tuples(0.95, 0.1)

In [21]:
enhanced = EnhancedLGBMSelector(df_combine, -1)
normal = LGBMSelector(df_combine, -1)

In [22]:
normal.get_ROCAUC(), enhanced.get_ROCAUC()

(0.6585018863563808, 0.8964107384015783)

In [23]:
iterative = IterativeLGBMSelector(df_combine, enhanced, 0.89, True)

You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's auc: 0.647724
[2]	valid_0's auc: 0.647724
[3]	valid_0's auc: 0.647724
[4]	valid_0's auc: 0.647724
[5]	valid_0's auc: 0.647724
[6]	valid_0's auc: 0.647724
[7]	valid_0's auc: 0.647724
[8]	valid_0's auc: 0.647724
[9]	valid_0's auc: 0.647724
[10]	valid_0's auc: 0.647724
[11]	valid_0's auc: 0.647724
[12]	valid_0's auc: 0.647724
[13]	valid_0's auc: 0.647724
[14]	valid_0's auc: 0.647724
[15]	valid_0's auc: 0.647724
[16]	valid_0's auc: 0.647724
[17]	valid_0's auc: 0.647724
[18]	valid_0's auc: 0.647724
[19]	valid_0's auc: 0.647724
[20]	valid_0's auc: 0.647724
[21]	valid_0's auc: 0.647724
[22]	valid_0's auc: 0.647724
[23]	valid_0's auc: 0.647724
[24]	valid_0's auc: 0.647724
[25]	valid_0's auc: 0.647724
[26]	valid_0's auc: 0.647724
[27]	valid_0's auc: 0.647724
[28]	valid_0's auc: 0.647724
[29]	valid_0's auc: 0.647724
[30]	valid_0's auc: 0.647724
[31]	valid_0's auc: 0.647724
[32]	valid_0's auc: 0.647724
[33]	valid_0's auc: 0.6

[81]	valid_0's auc: 0.647724
[82]	valid_0's auc: 0.647724
[83]	valid_0's auc: 0.647724
[84]	valid_0's auc: 0.647724
[85]	valid_0's auc: 0.647724
[86]	valid_0's auc: 0.647724
[87]	valid_0's auc: 0.647724
[88]	valid_0's auc: 0.647724
[89]	valid_0's auc: 0.647724
[90]	valid_0's auc: 0.647724
[91]	valid_0's auc: 0.647724
[92]	valid_0's auc: 0.647724
[93]	valid_0's auc: 0.647724
[94]	valid_0's auc: 0.647724
[95]	valid_0's auc: 0.647724
[96]	valid_0's auc: 0.647724
[97]	valid_0's auc: 0.647724
[98]	valid_0's auc: 0.647724
[99]	valid_0's auc: 0.647724
[100]	valid_0's auc: 0.647724
You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's auc: 0.775195
[2]	valid_0's auc: 0.775653
[3]	valid_0's auc: 0.776778
[4]	valid_0's auc: 0.77802
[5]	valid_0's auc: 0.778145
[6]	valid_0's auc: 0.778398
[7]	valid_0's auc: 0.778502
[8]	valid_0's auc: 0.778555
[9]	valid_0's auc: 0.778656
[10]	valid_0's auc: 0.778531
[11]	valid_0's auc: 0.778598
[12]	valid_0's auc: 0.778632
[13]	valid_0's auc: 0.7

[95]	valid_0's auc: 0.806289
[96]	valid_0's auc: 0.806283
[97]	valid_0's auc: 0.806292
[98]	valid_0's auc: 0.806289
[99]	valid_0's auc: 0.806266
[100]	valid_0's auc: 0.806297
You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's auc: 0.838439
[2]	valid_0's auc: 0.846457
[3]	valid_0's auc: 0.850808
[4]	valid_0's auc: 0.853257
[5]	valid_0's auc: 0.854994
[6]	valid_0's auc: 0.855308
[7]	valid_0's auc: 0.856254
[8]	valid_0's auc: 0.856448
[9]	valid_0's auc: 0.85738
[10]	valid_0's auc: 0.857741
[11]	valid_0's auc: 0.85818
[12]	valid_0's auc: 0.858087
[13]	valid_0's auc: 0.85841
[14]	valid_0's auc: 0.862197
[15]	valid_0's auc: 0.862333
[16]	valid_0's auc: 0.863252
[17]	valid_0's auc: 0.864056
[18]	valid_0's auc: 0.865085
[19]	valid_0's auc: 0.86593
[20]	valid_0's auc: 0.866289
[21]	valid_0's auc: 0.866245
[22]	valid_0's auc: 0.866565
[23]	valid_0's auc: 0.866968
[24]	valid_0's auc: 0.867266
[25]	valid_0's auc: 0.867413
[26]	valid_0's auc: 0.86755
[27]	valid_0's auc: 0.86789

[56]	valid_0's auc: 0.874296
[57]	valid_0's auc: 0.874298
[58]	valid_0's auc: 0.874361
[59]	valid_0's auc: 0.874356
[60]	valid_0's auc: 0.874397
[61]	valid_0's auc: 0.87448
[62]	valid_0's auc: 0.874458
[63]	valid_0's auc: 0.874443
[64]	valid_0's auc: 0.874433
[65]	valid_0's auc: 0.874385
[66]	valid_0's auc: 0.874476
[67]	valid_0's auc: 0.874433
[68]	valid_0's auc: 0.874416
[69]	valid_0's auc: 0.874411
[70]	valid_0's auc: 0.874348
[71]	valid_0's auc: 0.874273
[72]	valid_0's auc: 0.874327
[73]	valid_0's auc: 0.874308
[74]	valid_0's auc: 0.874282
[75]	valid_0's auc: 0.874323
[76]	valid_0's auc: 0.874326
[77]	valid_0's auc: 0.874291
[78]	valid_0's auc: 0.874319
[79]	valid_0's auc: 0.874281
[80]	valid_0's auc: 0.874318
[81]	valid_0's auc: 0.874315
[82]	valid_0's auc: 0.874236
[83]	valid_0's auc: 0.874226
[84]	valid_0's auc: 0.87423
[85]	valid_0's auc: 0.87423
[86]	valid_0's auc: 0.874234
[87]	valid_0's auc: 0.874282
[88]	valid_0's auc: 0.874294
[89]	valid_0's auc: 0.874292
[90]	valid_0's au

[18]	valid_0's auc: 0.873512
[19]	valid_0's auc: 0.876363
[20]	valid_0's auc: 0.876933
[21]	valid_0's auc: 0.876975
[22]	valid_0's auc: 0.878034
[23]	valid_0's auc: 0.878264
[24]	valid_0's auc: 0.87852
[25]	valid_0's auc: 0.878798
[26]	valid_0's auc: 0.879124
[27]	valid_0's auc: 0.87948
[28]	valid_0's auc: 0.879561
[29]	valid_0's auc: 0.879631
[30]	valid_0's auc: 0.87979
[31]	valid_0's auc: 0.879767
[32]	valid_0's auc: 0.879819
[33]	valid_0's auc: 0.879829
[34]	valid_0's auc: 0.879975
[35]	valid_0's auc: 0.879909
[36]	valid_0's auc: 0.87994
[37]	valid_0's auc: 0.880001
[38]	valid_0's auc: 0.880179
[39]	valid_0's auc: 0.880356
[40]	valid_0's auc: 0.880488
[41]	valid_0's auc: 0.880486
[42]	valid_0's auc: 0.880661
[43]	valid_0's auc: 0.880663
[44]	valid_0's auc: 0.880936
[45]	valid_0's auc: 0.880942
[46]	valid_0's auc: 0.881005
[47]	valid_0's auc: 0.881025
[48]	valid_0's auc: 0.881118
[49]	valid_0's auc: 0.881134
[50]	valid_0's auc: 0.881126
[51]	valid_0's auc: 0.881218
[52]	valid_0's auc

[85]	valid_0's auc: 0.8864
[86]	valid_0's auc: 0.886379
[87]	valid_0's auc: 0.886382
[88]	valid_0's auc: 0.886367
[89]	valid_0's auc: 0.886354
[90]	valid_0's auc: 0.886325
[91]	valid_0's auc: 0.886387
[92]	valid_0's auc: 0.886431
[93]	valid_0's auc: 0.886406
[94]	valid_0's auc: 0.886375
[95]	valid_0's auc: 0.886378
[96]	valid_0's auc: 0.886348
[97]	valid_0's auc: 0.886281
[98]	valid_0's auc: 0.886299
[99]	valid_0's auc: 0.886351
[100]	valid_0's auc: 0.886386
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's auc: 0.839576
[2]	valid_0's auc: 0.852093
[3]	valid_0's auc: 0.858109
[4]	valid_0's auc: 0.85845
[5]	valid_0's auc: 0.862928
[6]	valid_0's auc: 0.863609
[7]	valid_0's auc: 0.86598
[8]	valid_0's auc: 0.866173
[9]	valid_0's auc: 0.867711
[10]	valid_0's auc: 0.868308
[11]	valid_0's auc: 0.869916
[12]	valid_0's auc: 0.869906
[13]	valid_0's auc: 0.870825
[14]	valid_0's auc: 0.871281
[15]	valid_0's auc: 

[41]	valid_0's auc: 0.885843
[42]	valid_0's auc: 0.886003
[43]	valid_0's auc: 0.885924
[44]	valid_0's auc: 0.886007
[45]	valid_0's auc: 0.886011
[46]	valid_0's auc: 0.886163
[47]	valid_0's auc: 0.886167
[48]	valid_0's auc: 0.886167
[49]	valid_0's auc: 0.886092
[50]	valid_0's auc: 0.886076
[51]	valid_0's auc: 0.886088
[52]	valid_0's auc: 0.88629
[53]	valid_0's auc: 0.886337
[54]	valid_0's auc: 0.886441
[55]	valid_0's auc: 0.886479
[56]	valid_0's auc: 0.886443
[57]	valid_0's auc: 0.886556
[58]	valid_0's auc: 0.886625
[59]	valid_0's auc: 0.886666
[60]	valid_0's auc: 0.886705
[61]	valid_0's auc: 0.886794
[62]	valid_0's auc: 0.886781
[63]	valid_0's auc: 0.886789
[64]	valid_0's auc: 0.886826
[65]	valid_0's auc: 0.88675
[66]	valid_0's auc: 0.886676
[67]	valid_0's auc: 0.886629
[68]	valid_0's auc: 0.886674
[69]	valid_0's auc: 0.886613
[70]	valid_0's auc: 0.886602
[71]	valid_0's auc: 0.886599
[72]	valid_0's auc: 0.886642
[73]	valid_0's auc: 0.886825
[74]	valid_0's auc: 0.88676
[75]	valid_0's au

[3]	valid_0's auc: 0.854807
[4]	valid_0's auc: 0.856069
[5]	valid_0's auc: 0.857521
[6]	valid_0's auc: 0.863791
[7]	valid_0's auc: 0.866639
[8]	valid_0's auc: 0.868665
[9]	valid_0's auc: 0.869892
[10]	valid_0's auc: 0.869994
[11]	valid_0's auc: 0.870753
[12]	valid_0's auc: 0.870582
[13]	valid_0's auc: 0.871179
[14]	valid_0's auc: 0.872627
[15]	valid_0's auc: 0.87452
[16]	valid_0's auc: 0.875126
[17]	valid_0's auc: 0.875886
[18]	valid_0's auc: 0.876424
[19]	valid_0's auc: 0.879609
[20]	valid_0's auc: 0.880719
[21]	valid_0's auc: 0.880707
[22]	valid_0's auc: 0.881746
[23]	valid_0's auc: 0.882055
[24]	valid_0's auc: 0.882822
[25]	valid_0's auc: 0.883273
[26]	valid_0's auc: 0.883688
[27]	valid_0's auc: 0.883931
[28]	valid_0's auc: 0.883915
[29]	valid_0's auc: 0.884217
[30]	valid_0's auc: 0.884509
[31]	valid_0's auc: 0.884453
[32]	valid_0's auc: 0.884741
[33]	valid_0's auc: 0.88496
[34]	valid_0's auc: 0.885139
[35]	valid_0's auc: 0.885062
[36]	valid_0's auc: 0.885061
[37]	valid_0's auc: 0.8

[71]	valid_0's auc: 0.888024
[72]	valid_0's auc: 0.887963
[73]	valid_0's auc: 0.888238
[74]	valid_0's auc: 0.888152
[75]	valid_0's auc: 0.888247
[76]	valid_0's auc: 0.888301
[77]	valid_0's auc: 0.888299
[78]	valid_0's auc: 0.888351
[79]	valid_0's auc: 0.888322
[80]	valid_0's auc: 0.888482
[81]	valid_0's auc: 0.888483
[82]	valid_0's auc: 0.888678
[83]	valid_0's auc: 0.888662
[84]	valid_0's auc: 0.888603
[85]	valid_0's auc: 0.888582
[86]	valid_0's auc: 0.888765
[87]	valid_0's auc: 0.888796
[88]	valid_0's auc: 0.888742
[89]	valid_0's auc: 0.888716
[90]	valid_0's auc: 0.888665
[91]	valid_0's auc: 0.888688
[92]	valid_0's auc: 0.888708
[93]	valid_0's auc: 0.888645
[94]	valid_0's auc: 0.888611
[95]	valid_0's auc: 0.888607
[96]	valid_0's auc: 0.888591
[97]	valid_0's auc: 0.888552
[98]	valid_0's auc: 0.888562
[99]	valid_0's auc: 0.888525
[100]	valid_0's auc: 0.888519
You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's auc: 0.841004
[2]	valid_0's auc: 0.848598
[3]	valid_0's a

[32]	valid_0's auc: 0.885989
[33]	valid_0's auc: 0.886353
[34]	valid_0's auc: 0.886689
[35]	valid_0's auc: 0.886611
[36]	valid_0's auc: 0.886594
[37]	valid_0's auc: 0.886962
[38]	valid_0's auc: 0.887392
[39]	valid_0's auc: 0.887588
[40]	valid_0's auc: 0.887749
[41]	valid_0's auc: 0.887741
[42]	valid_0's auc: 0.888065
[43]	valid_0's auc: 0.887974
[44]	valid_0's auc: 0.88814
[45]	valid_0's auc: 0.888168
[46]	valid_0's auc: 0.888276
[47]	valid_0's auc: 0.888463
[48]	valid_0's auc: 0.888595
[49]	valid_0's auc: 0.888479
[50]	valid_0's auc: 0.888508
[51]	valid_0's auc: 0.888751
[52]	valid_0's auc: 0.888892
[53]	valid_0's auc: 0.888968
[54]	valid_0's auc: 0.889052
[55]	valid_0's auc: 0.889055
[56]	valid_0's auc: 0.889028
[57]	valid_0's auc: 0.889213
[58]	valid_0's auc: 0.889336
[59]	valid_0's auc: 0.88937
[60]	valid_0's auc: 0.889317
[61]	valid_0's auc: 0.889429
[62]	valid_0's auc: 0.889473
[63]	valid_0's auc: 0.889503
[64]	valid_0's auc: 0.8895
[65]	valid_0's auc: 0.889427
[66]	valid_0's auc

In [24]:
enhanced.get_ROCAUC(), normal.get_ROCAUC()

(0.8964107384015783, 0.6585018863563808)

In [25]:
len(enhanced.get_important_features_tuples())

495

In [26]:
df_test = pd.DataFrame(enhanced.y_test)

In [27]:
len(df_test[df_test['target'] == 1].index), len(df_test[df_test['target'] == 0].index), 

(9403, 140597)

In [28]:
postive_ind = df_test[df_test['target'] == 1].index
negative_ind = df_test[df_test['target'] == 0].index

In [29]:
np.mean(enhanced.predicted[list(postive_ind)])

0.34545770931987535

In [30]:
np.mean(enhanced.predicted[list(negative_ind)])

0.055552250396799674

In [31]:
X = [r[0] for r in result]

NameError: name 'result' is not defined

In [None]:
Y = [r[2] for r in result]

In [None]:
plt.plot(X,Y,label="AUC")
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.show()

In [None]:
Y

In [None]:
enhanced.get_important_features_tuples()

### Check if dataset is balanced

In [None]:
churned = len(df_combine[df_combine['target'] == 1])
not_churned = len(df_combine[df_combine['target'] == 0])

In [None]:
y = [churned, not_churned]
x = ['Churned', 'Not churned']
plt.bar(x,y)
plt.show()

## Feature engineering
#### Check inbound calls from non-vodaphone number
#### Check outbound calls to non-vodaphone number
#### SMS from non-vodaphone number
#### SMS to non-vodaphone number

## Correlation matrix

## Explore numbers abonent had communication with + frequency

In [None]:
churned = df_combine[df_combine['target'] == 1]
churned_with_nums = pd.merge(churned, df_combine_num, on='abon_id', how='left')
churned_numbers = list(churned_with_nums['bnum'].unique())

non_churned = df_combine[df_combine['target'] == 0]
non_churned_with_nums = pd.merge(non_churned, df_combine_num, on='abon_id', how='left')
non_churned_numbers = list(non_churned_with_nums['bnum'].unique())

number_abon_had_communicated = (set(churned_numbers) - set(non_churned_numbers))
df_number_abon_had_communicated = pd.DataFrame(number_abon_had_communicated, columns= ['bnum'])

In [None]:
churned_with_nums[churned_with_nums['bnum'].isin(list(number_abon_had_communicated))]

In [None]:
churned_with_nums

## Telephone Feature

## Groupping

In [None]:
gr = train_num.groupby(['abon_id'])
gr.groups

In [None]:
# cor_matrix = train_df.corr()