In [139]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns
import scipy.stats as stats
import statsmodels.stats.api as sms

from IPython.display import display
from IPython.display import Markdown as md
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 4)

def f():
    pd.set_option('display.max_rows', None)

def nf():
    pd.set_option('display.max_rows', 4)

    
import warnings
warnings.filterwarnings('ignore')

def dp(df, r = 5, c = None):
    from IPython.display import display
    with pd.option_context('display.max_rows', 4, 'display.max_columns', None):
        display(df)

def fg(w = 6, h = 4, dpi = 120):
    plt.rcParams['figure.figsize'] = (w,h)
    plt.rcParams['figure.dpi'] = dpi
fg()

# Carregar dados

In [19]:
data = pd.read_csv('data/data.csv', sep = ';')

In [34]:
data.dtypes

id          int64
periodo     int64
dummy      object
comprou     int64
dtype: object

In [35]:
dummy = pd.get_dummies(data)

In [36]:
dummy

Unnamed: 0,id,periodo,comprou,dummy_a,dummy_b,dummy_c
0,804591,1,1,1,0,0
1,262974,1,0,0,1,0
...,...,...,...,...,...,...
39,476296,6,0,0,0,1
40,508304,6,1,0,1,0


In [21]:
from sklearn.model_selection import train_test_split

# Primeiro a separação de periodo

In [37]:
in_ = dummy[dummy['periodo']<4]

In [38]:
out_ = dummy[dummy['periodo']>3]

In [39]:
Xin = in_.drop('comprou', axis = 1)

In [40]:
yin = in_['comprou']

### Treinamento com o primeiro período


In [41]:
X_train, X_test, y_train, y_test = train_test_split(Xin, yin, test_size = 0.3)

### Chamar modelo

In [42]:
from xgboost import XGBClassifier

xgboost = XGBClassifier()

In [74]:
class ml():
    
    '''classe para pipeline de ml'''
    def __init__(self, X_train, X_test, y_train, y_test, model, modelname, threshold = 0.5):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.model = model
        self.modelname = modelname
        self.threshold = threshold
        
        self.fit()
        
    def fit(self):
        self.model.fit(self.X_train, self.y_train)
        
    def probs(self, new_X_test = None):
        
        self.new_X_test = new_X_test
        
        '''Predict proba'''
        if new_X_test is None:
            return self.model.predict_proba(self.X_test)[:,1]
        else:
            return self.model.predict_proba(self.new_X_test)[:,1]
        
    def pred(self):
        import numpy as np
        return np.where(self.probs() > self.threshold, 1, 0)

    def matriz(self):
        from confusionmatrix import matriz_confusao as cm
        cm = cm(self.y_test, self.pred())
        return cm

    def roccurve(self):
        '''Curva roc'''
        from sklearn.metrics import roc_curve
        import matplotlib.pyplot as plt
        # Gerar os dados da diagonal (no skill classifier)
        
        ns_probs = [0 for item in range(len(self.y_test))]
        ns_fpr, ns_tpr, ns_thres = roc_curve(self.y_test, ns_probs)

        #Probabilidades da classe positiva
        fpr, tpr, thresholds = roc_curve(self.y_test, self.probs())
        
        plt.plot(fpr, tpr, marker = '.', label = self.modelname)
        plt.plot(ns_fpr, ns_tpr, linestyle = '--', label = 'Classificador base')
        plt.xlabel('Razão de Falsos Positivos')
        plt.ylabel('Razão de Verdadeiros Positivos')
        plt.title('ROC Curve')
        plt.legend()
        plt.show()

    def precisionrecall(self):
        '''Precision Recall Curve'''
        import matplotlib.pyplot as plt
        from sklearn.metrics import precision_recall_curve

        precision, recall, thresholds = precision_recall_curve(self.y_test, self.probs())
        no_skill = len(self.y_test[self.y_test ==1]) / len(self.y_test)

        plt.plot([0,1], [no_skill, no_skill], linestyle = '--', label = 'Classificador base')
        plt.plot(precision, recall, marker = '.', label =  self.modelname)
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall Curve')
        plt.legend()
        plt.show()
        
#     def f1():
#         return 
    
    def auc(self):
        import numpy as np
        '''AUC'''
        from sklearn.metrics import roc_auc_score
        return np.round(roc_auc_score(self.y_test, self.probs()),2)
    
    def gini(self):
        '''GINI'''
        return 2*self.auc() - 1

    def cvresults(self, cv = 10):
        '''Metricas gerais: 
        - acuracia
        - f1
        - recall
        - precision'''
        from sklearn.model_selection import cross_validate
        res = cross_validate(self.model, self.X, self.y, cv = cv, 
                                  scoring = ['accuracy','f1','recall','precision'],
                                  return_train_score = True)

        f1 = res['test_f1'].mean()
        precision = res['test_precision'].mean()
        recall = res['test_recall'].mean()
        acuracia = res['test_accuracy'].mean()

        cvresults = pd.DataFrame({'acuracia':[acuracia],
                                  'precisao':[precision],
                                  'recall':[recall],
                                  'f1':[f1]})

        return cvresults

In [75]:
X_train.shape
X_test.shape
y_train.shape
y_test.shape

(14, 5)

(6, 5)

(14,)

(6,)

In [76]:
ml = ml(X_train, X_test, y_train, y_test, xgboost, 'XGBoost')



In [77]:
y_predin = ml.probs()

In [78]:
y_predin

array([0.51794606, 0.9313615 , 0.8790327 , 0.89348716, 0.9313615 ,
       0.51794606], dtype=float32)

### Fazer previsão com o outro período

In [79]:
out_

Unnamed: 0,id,periodo,comprou,dummy_a,dummy_b,dummy_c
20,382573,4,1,1,0,0
21,365640,4,1,0,0,1
...,...,...,...,...,...,...
39,476296,6,0,0,0,1
40,508304,6,1,0,1,0


In [80]:
Xout = out_.drop('comprou', axis = 1)

yout = out_['comprou']

In [81]:
y_predout = ml.probs(new_X_test = Xout)

In [82]:
y_predout

array([0.8179283 , 0.36524346, 0.40090296, 0.74718934, 0.89348716,
       0.51794606, 0.40090296, 0.36524346, 0.35195655, 0.40090296,
       0.36524346, 0.8179283 , 0.40090296, 0.36524346, 0.8179283 ,
       0.36524346, 0.40090296, 0.458084  , 0.8179283 , 0.36524346,
       0.40090296], dtype=float32)

# Agora como plotar os dois? 

### Join X_test e y_predin

In [83]:
X_test

Unnamed: 0,id,periodo,dummy_a,dummy_b,dummy_c
19,771257,3,0,0,1
7,699881,2,1,0,0
...,...,...,...,...,...
0,804591,1,1,0,0
17,709564,3,0,0,1


In [84]:
y_test

19    0
7     1
     ..
0     1
17    0
Name: comprou, Length: 6, dtype: int64

In [94]:
predin = pd.concat([X_test.reset_index(),y_test.reset_index(),pd.DataFrame(y_predin, columns = ['prob'])], axis = 1)

In [95]:
predin

Unnamed: 0,index,id,periodo,dummy_a,dummy_b,dummy_c,index.1,comprou,prob
0,19,771257,3,0,0,1,19,0,0.517946
1,7,699881,2,1,0,0,7,1,0.931361
...,...,...,...,...,...,...,...,...,...
4,0,804591,1,1,0,0,0,1,0.931361
5,17,709564,3,0,0,1,17,0,0.517946


In [98]:
predout = pd.concat([Xout.reset_index(),yout.reset_index(),pd.DataFrame(y_predout, columns = ['prob'])], axis = 1)

In [100]:
Xout.shape
yout.shape
y_predout.shape

(21, 5)

(21,)

(21,)

In [99]:
predout

Unnamed: 0,index,id,periodo,dummy_a,dummy_b,dummy_c,index.1,comprou,prob
0,20,382573,4,1,0,0,20,1,0.817928
1,21,365640,4,0,0,1,21,1,0.365243
...,...,...,...,...,...,...,...,...,...
19,39,476296,6,0,0,1,39,0,0.365243
20,40,508304,6,0,1,0,40,1,0.400903


In [103]:
stack = pd.concat([predin,predout], axis = 0)

In [104]:
stack.shape

(27, 9)

In [105]:
stack

Unnamed: 0,index,id,periodo,dummy_a,dummy_b,dummy_c,index.1,comprou,prob
0,19,771257,3,0,0,1,19,0,0.517946
1,7,699881,2,1,0,0,7,1,0.931361
...,...,...,...,...,...,...,...,...,...
19,39,476296,6,0,0,1,39,0,0.365243
20,40,508304,6,0,1,0,40,1,0.400903


In [102]:
import plotnine as pn 
from plotnine import *

# Agrupar clusters de compradores

In [111]:
problist = [x for x in np.arange(0.0,1.0,0.1)]

In [112]:
problist

[0.0,
 0.1,
 0.2,
 0.30000000000000004,
 0.4,
 0.5,
 0.6000000000000001,
 0.7000000000000001,
 0.8,
 0.9]

In [129]:
conds = [
         (stack['prob'] > 0.0) & (stack['prob'] < 0.1),
         (stack['prob'] > 0.1) & (stack['prob'] < 0.2),
         (stack['prob'] > 0.2) & (stack['prob'] < 0.3),
         (stack['prob'] > 0.3) & (stack['prob'] < 0.4),
         (stack['prob'] > 0.4) & (stack['prob'] < 0.5),
         (stack['prob'] > 0.5) & (stack['prob'] < 0.6),
         (stack['prob'] > 0.6) & (stack['prob'] < 0.7),
         (stack['prob'] > 0.7) & (stack['prob'] < 0.8),
         (stack['prob'] > 0.8) & (stack['prob'] < 0.9),
         (stack['prob'] > 0.9) & (stack['prob'] < 1.0)
         ]

In [118]:
choices = ['C'+str(i) for i in range(0,10)]

In [119]:
choices

['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9']

In [136]:
stack['cluster'] = np.select(conds, choices)

# Plots

In [140]:
f()
stack
nf()

Unnamed: 0,index,id,periodo,dummy_a,dummy_b,dummy_c,index.1,comprou,prob,cluster
0,19,771257,3,0,0,1,19,0,0.517946,C5
1,7,699881,2,1,0,0,7,1,0.931361,C9
2,6,211090,2,1,0,0,6,1,0.879033,C8
3,16,742883,3,1,0,0,16,0,0.893487,C8
4,0,804591,1,1,0,0,0,1,0.931361,C9
5,17,709564,3,0,0,1,17,0,0.517946,C5
0,20,382573,4,1,0,0,20,1,0.817928,C8
1,21,365640,4,0,0,1,21,1,0.365243,C3
2,22,102210,4,0,1,0,22,0,0.400903,C4
3,23,558569,4,0,0,1,23,0,0.747189,C7


In [147]:
f()
stack.groupby(['cluster','periodo']).agg({'id':'count'}).reset_index().rename(columns = {'id':'count'})
nf()

Unnamed: 0,cluster,periodo,count
0,C3,4,1
1,C3,5,4
2,C3,6,2
3,C4,4,1
4,C4,5,3
5,C4,6,3
6,C5,3,2
7,C5,4,1
8,C7,4,1
9,C8,2,1
