In [1]:
import numpy as np
import pandas as pd
import pickle
import time
from random import randint

from sklearn import metrics, preprocessing, model_selection
from tabulate import tabulate

import catboost as cat
import lightgbm as lgb

np.random.seed(43)

In [2]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

In [3]:
train.head()

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,MRG,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN
0,dcf68cc2fb515ccad7d8b9b3bd80ee2a4b270063,SAINT-LOUIS,K > 24 month,17000.0,32.0,18000.0,6000.0,34.0,,97.0,355.0,6.0,,,NO,62,All-net 500F=2000F;5d,35.0,0
1,71c44b5ba328db5c4192a80f7cf8f244d9350ed0,,K > 24 month,4300.0,29.0,4427.0,1476.0,37.0,1764.0,8.0,3.0,0.0,,2.0,NO,40,"Data: 100 F=40MB,24H",22.0,0
2,ce46411b1526c94f20a383b8cb188f8d27f82a0a,TAMBACOUNDA,K > 24 month,1500.0,3.0,1500.0,500.0,3.0,,30.0,30.0,,,,NO,32,All-net 500F=2000F;5d,3.0,0
3,f467cdb6669818373c26c2bad44e01ba66f97d21,FATICK,K > 24 month,1500.0,3.0,2497.0,832.0,4.0,0.0,159.0,45.0,19.0,,,NO,18,On net 200F=Unlimited _call24H,3.0,0
4,ec45e1a1888a32b5dcce0954cfec20c6e037db31,FATICK,K > 24 month,,,498.0,166.0,3.0,1.0,1.0,3.0,,,,NO,50,,,0


In [4]:
train.isna().sum()

user_id                0
REGION            157520
TENURE                 0
MONTANT           140277
FREQUENCE_RECH    140277
REVENUE           134663
ARPU_SEGMENT      134663
FREQUENCE         134663
DATA_VOLUME       196854
ON_NET            145819
ORANGE            166317
TIGO              239386
ZONE1             368310
ZONE2             374487
MRG                    0
REGULARITY             0
TOP_PACK          167329
FREQ_TOP_PACK     167329
CHURN                  0
dtype: int64

In [5]:
class Preprocess():
    def __init__(self, data):
        self.data = data
        self.cat_col = ['REGION', 'TOP_PACK']
    def fillna(self):
        for col in self.data.columns:
            if col in self.cat_col:
                self.data[col].fillna('N/A', inplace=True)
            else:
                self.data[col].fillna(-1, inplace=True)
#     def encode(self):
#         for col in self.cat_col:
#             self.data[col] = pd.factorize(self.data[col])[0]
#     def mapper(self, col):
#         ten_map = { "K > 24 month": 24, 
#           "I 18-21 month": 18, 
#           "G 12-15 month": 12, 
#           "H 15-18 month": 15, 
#           "J 21-24 month": 21, 
#           "F 9-12 month": 9, 
#           "D 3-6 month": 3, 
#           "E 6-9 month": 6 }
#         self.data[col].replace(ten_map, inplace=True)
    def fit(self):
        self.data.drop(['user_id', 'TIGO', 'ZONE1', 'ZONE2', 'MRG'], inplace=True, axis=1)
#         self.mapper('TENURE')
        self.fillna()
#         self.encode()
        return self.data

In [6]:
train_preprocess = Preprocess(train)
train = train_preprocess.fit()
train.isna().sum()

REGION            0
TENURE            0
MONTANT           0
FREQUENCE_RECH    0
REVENUE           0
ARPU_SEGMENT      0
FREQUENCE         0
DATA_VOLUME       0
ON_NET            0
ORANGE            0
REGULARITY        0
TOP_PACK          0
FREQ_TOP_PACK     0
CHURN             0
dtype: int64

In [7]:
train.head()

Unnamed: 0,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN
0,SAINT-LOUIS,K > 24 month,17000.0,32.0,18000.0,6000.0,34.0,-1.0,97.0,355.0,62,All-net 500F=2000F;5d,35.0,0
1,,K > 24 month,4300.0,29.0,4427.0,1476.0,37.0,1764.0,8.0,3.0,40,"Data: 100 F=40MB,24H",22.0,0
2,TAMBACOUNDA,K > 24 month,1500.0,3.0,1500.0,500.0,3.0,-1.0,30.0,30.0,32,All-net 500F=2000F;5d,3.0,0
3,FATICK,K > 24 month,1500.0,3.0,2497.0,832.0,4.0,0.0,159.0,45.0,18,On net 200F=Unlimited _call24H,3.0,0
4,FATICK,K > 24 month,-1.0,-1.0,498.0,166.0,3.0,1.0,1.0,3.0,50,,-1.0,0


In [8]:
train.dtypes

REGION             object
TENURE             object
MONTANT           float64
FREQUENCE_RECH    float64
REVENUE           float64
ARPU_SEGMENT      float64
FREQUENCE         float64
DATA_VOLUME       float64
ON_NET            float64
ORANGE            float64
REGULARITY          int64
TOP_PACK           object
FREQ_TOP_PACK     float64
CHURN               int64
dtype: object

In [9]:
cat_feat = np.where(train.dtypes == np.object)[0]
cat_feat

array([ 0,  1, 11], dtype=int64)

In [10]:
X = train.drop(['CHURN'], axis=1)
y = train.CHURN
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, stratify=y, test_size=0.2, random_state=43)

In [11]:
class Estimator():
    def __init__(self, clf, name, x_train, y_train, x_test, y_test, VB=100, ESR=100):
        self.model = clf
        self.model_name = name
        self.x_tr = x_train
        self.y_tr = y_train
        self.x_ts = x_test
        self.y_ts = y_test
        self.vb = VB
        self.ESR = ESR
        
    def run(self):
        self.train()
        table_score = self.score()
        print(table_score)
        self.save_model()
        return self.model
        
    def train(self):
        print(f'[INFO] - TRAINING {self.model_name}\n')
        start = time.time() # training start time
        self.model.fit(self.x_tr, self.y_tr, eval_set=[(self.x_tr, self.y_tr), (self.x_ts, self.y_ts)],
                       verbose=100, early_stopping_rounds=100)
        self.tr_end = time.ctime(time.time() - start).split()[3] # training end time
        
    def score(self):
        start = time.time() # Prediction strart time
        self.tr_preds=self.model.predict_proba(self.x_tr)[:,1]
        self.ts_preds=self.model.predict_proba(self.x_ts)[:,1]
        # Prediction end time
        self.pr_end = time.ctime(time.time() - start).split()[3]
        
        scores = {
            'Train ACC': [],
            'Test ACC': [],
            'Train AUC': [],
            'Test AUC': [],
            'Train Logloss': [],
            'Test Logloss': []
        }
        
        tr_acc = self.model.score(self.x_tr, self.y_tr)
        ts_acc = self.model.score(self.x_ts, self.y_ts)
        tr_auc = metrics.roc_auc_score(self.y_ts, self.ts_preds)
        ts_auc = metrics.roc_auc_score(self.y_tr, self.tr_preds)
        tr_lls = metrics.log_loss(self.y_tr, self.tr_preds)
        ts_lls = metrics.log_loss(self.y_ts, self.ts_preds)
        
        scores['Train ACC'] = [tr_acc]
        scores['Test ACC'] = [ts_acc]
        scores['Train AUC'] = [tr_auc]
        scores['Test AUC'] = [ts_auc]
        scores['Train Logloss'] = [tr_lls]
        scores['Test Logloss'] = [ts_lls]
        scores['Training Time'] = [self.tr_end]
        scores['Prediction Time'] = [self.pr_end]
        
        table_score = tabulate(scores, headers='keys', tablefmt='fancy_grid')
        return table_score
    
    def save_model(self):
        print('[INFO] - Saving model...')
        filename = self.model_name.split(' ')[0] + '_' + hex(randint(0, 255))
        pickle.dump(self.model, open(filename, 'wb'))
        print(f"saved '{filename}'.")

In [12]:
cat_clf = cat.CatBoostClassifier(n_estimators=1000, bootstrap_type='MVS', 
                                 eval_metric='Logloss', max_depth=9,
                                 cat_features=cat_feat, random_seed= 23, use_best_model=True)
name = 'CATBOOST CLASSIFIER'
estimator = Estimator(cat_clf, name, x_train, y_train, x_test, y_test)
model = estimator.run()

[INFO] - TRAINING CATBOOST CLASSIFIER

Learning rate set to 0.131729
0:	learn: 0.5053694	test: 0.5053456	test1: 0.5065633	best: 0.5065633 (0)	total: 1.72s	remaining: 28m 33s
100:	learn: 0.2477841	test: 0.2483042	test1: 0.2543671	best: 0.2543477 (97)	total: 2m 9s	remaining: 19m 8s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2543476648
bestIteration = 97

Shrink model to first 98 iterations.
╒═════════════╤════════════╤═════════════╤════════════╤═════════════════╤════════════════╤═════════════════╤═══════════════════╕
│   Train ACC │   Test ACC │   Train AUC │   Test AUC │   Train Logloss │   Test Logloss │ Training Time   │ Prediction Time   │
╞═════════════╪════════════╪═════════════╪════════════╪═════════════════╪════════════════╪═════════════════╪═══════════════════╡
│    0.881353 │   0.876363 │    0.929277 │   0.932936 │        0.248382 │       0.254348 │ 16:04:28        │ 16:00:01          │
╘═════════════╧════════════╧═════════════╧════════════╧═══════════

In [13]:
test_preprocess = Preprocess(test)
test = test_preprocess.fit()
test.isna().sum()

REGION            0
TENURE            0
MONTANT           0
FREQUENCE_RECH    0
REVENUE           0
ARPU_SEGMENT      0
FREQUENCE         0
DATA_VOLUME       0
ON_NET            0
ORANGE            0
REGULARITY        0
TOP_PACK          0
FREQ_TOP_PACK     0
dtype: int64

In [18]:
pred = model.predict_proba(test)[:, 1]
pred

array([0.78250634, 0.67727455, 0.14465804, ..., 0.02012456, 0.01097813,
       0.04647731])

In [17]:
loaded_model = pickle.load(open('CATBOOST_0xcb', 'rb'))
pred = loaded_model.predict_proba(test)[:, 1]
pred

array([0.78250634, 0.67727455, 0.14465804, ..., 0.02012456, 0.01097813,
       0.04647731])

In [12]:
lgb_clf = lgb.LGBMClassifier(n_estimators=1000, random_seed= 10)
name = 'LIGHTGBM CLASSIFIER'
estimator = Estimator(lgb_clf, name, x_train, y_train, x_test, y_test)
estimator.run()

[INFO] - TRAINING LIGHTGBM CLASSIFIER





[100]	training's binary_logloss: 0.248523	valid_1's binary_logloss: 0.257336
╒═════════════╤════════════╤═════════════╤════════════╤═════════════════╤════════════════╤═════════════════╤═══════════════════╕
│   Train ACC │   Test ACC │   Train AUC │   Test AUC │   Train Logloss │   Test Logloss │ Training Time   │ Prediction Time   │
╞═════════════╪════════════╪═════════════╪════════════╪═════════════════╪════════════════╪═════════════════╪═══════════════════╡
│    0.880169 │     0.8762 │    0.929052 │   0.932496 │        0.249183 │       0.254824 │ 16:00:13        │ 16:00:01          │
╘═════════════╧════════════╧═════════════╧════════════╧═════════════════╧════════════════╧═════════════════╧═══════════════════╛
[INFO] - Saving model...
saved 'LIGHTGBM_0xff'.


In [30]:
columns = ['REGION', 'TENURE', 'MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT', 'FREQUENCE', 'DATA_VOLUME',
                   'ON_NET', 'ORANGE', 'REGULARITY', 'TOP_PACK', 'FREQ_TOP_PACK'] 
values = [[1, 2, 3, 4, 6, 6, 8, 5, 4, 37, 4, 9, 0]]

features = pd.DataFrame(values, columns=columns)
features

Unnamed: 0,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,REGULARITY,TOP_PACK,FREQ_TOP_PACK
0,1,2,3,4,6,6,8,5,4,37,4,9,0
