# Prédiction des clients prospects 
Ce notebook est divisé sur 3 parties: 
   -données initiales
   -données initiales avec encours parrain 
   -données initiales avec insee (salaire net par code postal)

In [1]:
### Import
import sys
import csv
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, recall_score, classification_report, confusion_matrix, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors.nearest_centroid import NearestCentroid



In [18]:
### Load data
df_initial = pd.read_csv('/home/ubuntu/result.csv', sep=',')  ##Sans encours_p et insee
del df_initial['Unnamed: 0']
df_initial.head()

Unnamed: 0,contactid,annee_mois,campagne,campaign,civilite,code_postal,code_postal_naissance,country_connexion_name,csp,date_naissance,...,patrimoine,pays,pays_naissance,regime_matrimonial,revenus_annuels,service,timestamp,id_dim_personne,encours,cible
0,6546762003,,-1.0,Source URL non trouvée,,,59430.0,France,15.0,,...,-1.0,,FRA,2.0,2.0,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-09-21T01:28:13.601+0200,6546762003,5.0,0
1,6742433330,,456.0,REC_Liens textes autopromotion 456,0,31130.0,38000.0,France,2.0,11/04/1969,...,2.0,FRA,FRA,4.0,2.0,SFOL_AJOUTER_CB,2016-10-31T18:16:38.719+0100,6742433330,0.0,0
2,6856828837,201611.0,868.0,PAR_Onlin_Site _ECard,MR,92000.0,92150.0,France,2.0,11/05/1957,...,2.0,FRA,FRA,2.0,4.0,monprofil.PROSPECT/CREER_FICHE_PROSPECT,2016-11-21T12:01:44.736+0100,6856828837,8899.93,1
3,6754882186,,300.0,Intb_Banque_Autre,1,94800.0,93420.0,France,2.0,24/06/1988,...,0.0,FRA,FRA,2.0,2.0,SFOL_AJOUTER_CB,2016-11-04T10:46:58.447+0100,6754882186,10.0,0
4,6289817192,201611.0,300.0,Intb_Banque_Autre,MR,26170.0,84600.0,Switzerland,2.0,01/05/1994,...,0.0,FRA,FRA,0.0,2.0,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-11-21T17:36:53.976+0100,6289817192,173.53,0


In [19]:
df_initial.count()

contactid                 44548
annee_mois                30217
campagne                  44183
campaign                  44183
civilite                  37791
code_postal               38577
code_postal_naissance     44219
country_connexion_name    43753
csp                       44548
date_naissance            38577
debit_cb                  10847
firstnamesponsor          19033
flag_banque_principale    44548
flag_epargne              44511
id_dim_temps              41565
mail                      38577
mailing_accord            38577
namesponsor               19033
nature_cb                 10847
patrimoine                44548
pays                      38577
pays_naissance            44219
regime_matrimonial        44548
revenus_annuels           44548
service                   44548
timestamp                 44548
id_dim_personne           44548
encours                   43554
cible                     44548
dtype: int64

In [20]:
del df_initial['cible']

In [21]:
#Fixer encours des bons clients 
df_initial['cible_seuil_1200'] = df_initial['encours'].map(lambda x: 0 if x < 1200.00 else 1).astype(int)
df_initial[['encours','cible_seuil_1200']].head()

Unnamed: 0,encours,cible_seuil_1200
0,5.0,0
1,0.0,0
2,8899.93,1
3,10.0,0
4,173.53,0


In [22]:
df = df_initial[df_initial.id_dim_personne == 7501434192]
df.head()

Unnamed: 0,contactid,annee_mois,campagne,campaign,civilite,code_postal,code_postal_naissance,country_connexion_name,csp,date_naissance,...,patrimoine,pays,pays_naissance,regime_matrimonial,revenus_annuels,service,timestamp,id_dim_personne,encours,cible_seuil_1200
32050,7501434192,201703.0,868.0,PAR_Onlin_Site _ECard,MR,59800.0,80000.0,France,12.0,26/08/1995,...,0.0,FRA,FRA,0.0,0.0,monprofil.PROSPECT/CREER_FICHE_PROSPECT,2017-03-23T17:52:54.825+0100,7501434192,0.0,0


In [23]:
df_initial.head()

Unnamed: 0,contactid,annee_mois,campagne,campaign,civilite,code_postal,code_postal_naissance,country_connexion_name,csp,date_naissance,...,patrimoine,pays,pays_naissance,regime_matrimonial,revenus_annuels,service,timestamp,id_dim_personne,encours,cible_seuil_1200
0,6546762003,,-1.0,Source URL non trouvée,,,59430.0,France,15.0,,...,-1.0,,FRA,2.0,2.0,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-09-21T01:28:13.601+0200,6546762003,5.0,0
1,6742433330,,456.0,REC_Liens textes autopromotion 456,0,31130.0,38000.0,France,2.0,11/04/1969,...,2.0,FRA,FRA,4.0,2.0,SFOL_AJOUTER_CB,2016-10-31T18:16:38.719+0100,6742433330,0.0,0
2,6856828837,201611.0,868.0,PAR_Onlin_Site _ECard,MR,92000.0,92150.0,France,2.0,11/05/1957,...,2.0,FRA,FRA,2.0,4.0,monprofil.PROSPECT/CREER_FICHE_PROSPECT,2016-11-21T12:01:44.736+0100,6856828837,8899.93,1
3,6754882186,,300.0,Intb_Banque_Autre,1,94800.0,93420.0,France,2.0,24/06/1988,...,0.0,FRA,FRA,2.0,2.0,SFOL_AJOUTER_CB,2016-11-04T10:46:58.447+0100,6754882186,10.0,0
4,6289817192,201611.0,300.0,Intb_Banque_Autre,MR,26170.0,84600.0,Switzerland,2.0,01/05/1994,...,0.0,FRA,FRA,0.0,2.0,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-11-21T17:36:53.976+0100,6289817192,173.53,0


# DF sans parrain

In [24]:
### Data management
def data_management(df):
    
    df['code_postal'] = df['code_postal'].replace('',-2)
    df['code_postal'] = pd.to_numeric(df['code_postal'])
    
    df['code_postal_naissance'] = pd.to_numeric(df['code_postal_naissance'])
   
    df['country_connexion_name'] = df['country_connexion_name'].replace(['',None],-2)
    
    df['date_naissance'] = df['date_naissance'].replace('','01/01/1800')
    
    df['flag_epargne'] = pd.to_numeric(df['flag_epargne'])
    
    #df['mailing_accord'] = df['mailing_accord'].replace(['',None],-2)
    #df['mailing_accord'] = pd.to_numeric(df['mailing_accord'])
    del df['mailing_accord']
    
    df['nature_cb'] = df['nature_cb'].replace(['',None],-2)
    df['nature_cb'] = pd.to_numeric(df['nature_cb'])
    
    df['pays_naissance'] = df['pays_naissance'].replace(['',None],-2)
    
    ### get age
    get_age(df)

    ### Get flag parrain from namesponsor
    df['Parrain'] = df['namesponsor'].map(lambda x: 0 if pd.isnull(x) else 1).astype(int)
    del df['firstnamesponsor'] 
    del df['namesponsor'] 
    
    ### Get domain from mail
    df['mail'] = df.mail.replace ([np.nan], '')
    df['domain'] = df['mail'].map(lambda x: x.split("@")[1] if x else None)
    df['domain'] = df.domain.str.lower()
    del df['mail']

    ### Get Sex from Civility
    df['Sex'] = df['civilite'].map(get_sex)
    del df['civilite']
    
    ### Delete useless cols
    del df['annee_mois']
    del df['id_dim_temps']
    #del df['id_dim_personne']
    del df['encours']
    del df['timestamp']
    del df['service']
    del df['campaign']
    del df['contactid']
    del df['pays']
    del df['debit_cb']
    
    ### transform to numeric when possible
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
    df['country_connexion_name'] = df['country_connexion_name'].replace ([np.nan], '')
    df['domain'] = df['domain'].replace ([np.nan], '')
    df['pays_naissance'] = df['pays_naissance'].replace ([np.nan], '')
    df.loc[df['country_connexion_name'].value_counts()[df['country_connexion_name']].values < 40, 'country_connexion_name'] = 'other'
    df.loc[df['domain'].value_counts()[df['domain']].values < 40, 'domain'] = 'other'
    df.loc[df['pays_naissance'].value_counts()[df['pays_naissance']].values < 40, 'pays_naissance'] = 'other'
    
    
    ### Get country connexion name
    df['country_connexion'] = df['country_connexion_name'].map(process_country_connexion)
    del df['country_connexion_name']
    ### Get pays de naissance
    df['pays_de_naissance'] = df['pays_naissance'].map(process_pays_naissance)
    del df['pays_naissance']
    
    ### Process the domain
    df = process_domain (df)
    
    ### Drop NaN 
    df = df.replace ([np.nan,None,''], -2)
    
    features_df = df.drop('cible_seuil_1200', axis=1)
    #features_df = features_df.drop('id_dim_personne',axis=1)
    #features_df = features_df.drop('annee_mois',axis=1)
    target = df['cible_seuil_1200']
    return features_df, target, df


def get_sex(x):
    if x == "0":
        return 1
    elif (x == "1"):
        return 0
    elif (x == "2"):
        return 0
    elif (x == "MR"):
        return 1
    elif (x == "MLE"):
        return 0
    elif (x == "MME"):
        return 0
    else :
        return -1

def get_age(X):
    from datetime import datetime
    now = datetime.now()
    X['AGE'] = X['date_naissance'].dropna()
    X['AGE'] = pd.to_datetime(X['AGE'],errors='coerce')
    X['AGE'] = X['AGE'].map(lambda x : now.year-x.year  if now.month-x.month>0 else now.year - x.year -1 )
    del X['date_naissance']
    
def process_country_connexion(x):
    
    if x == 'France':
        return 1
    elif (x == 'United Kingdom'):
        return 2
    elif (x == 'other'):
        return 3
    elif (x == 'Germany'):
        return 4
    elif (x == 'Switzerland'):
        return 5
    elif (x == 'Netherlands'):
        return 6
    elif (x == 'Europe'):
        return 7
    elif (x == 'United States'):
        return 8
    elif (x == 'Reunion'):
        return 9
    elif (x==-2):
        return -2
    
def process_pays_naissance(x):

    if x == 'FRA':
        return 1
    elif (x == 'ITA'):
        return 2
    elif (x == 'other'):
        return 3
    elif (x == 'DZA'):
        return 4
    elif (x == 'BEN'):
        return 5
    elif (x == 'ESP'):
        return 6
    elif (x == 'VNM'):
        return 7
    elif (x == 'DEU'):
        return 8
    elif (x == 'MAR'):
        return 9
    elif (x=='CIV'):
        return 10
    elif (x=='CMR'):
        return 10
    elif (x=='GTO'):
        return 11
    elif (x=='SEN'):
        return 12
    elif (x=='BEL'):
        return 13
    elif (x=='CHN'):
        return 14
    elif (x=='ROU'):
        return 15
    elif (x=='BRA'):
        return 16
    elif (x=='MDG'):
        return 17
    elif (x=='PRT'):
        return 18
    elif (x=='GBR'):
        return 19
    elif (x=='LBN'):
        return 20
    elif (x=='TUR'):
        return 21
    elif (x=='IND'):
        return 22
    
def process_domain(df):
    """
    Process the domain features
    """
    dict_ = {'yahoo.fr':0, 'hotmail.fr':1, 'hotmail.com': 2, 'gmail.com': 3, 'orange.fr': 4, 'outlook.com': 5, 'free.fr': 6, 'laposte.net': 7, 'other': 8, 'neuf.fr': 9, 'wanadoo.fr': 10, 'me.com': 11, 'ymail.com': 12, 'sfr.fr': 13, 'live.fr': 14, 'bbox.fr': 15, 'outlook.fr': 16, 'msn.com': 17, 'yahoo.com': 18, 'aol.com': 19, 'icloud.com': 20, 'cegetel.net': 21, 'club-internet.fr': 22}
    df['domain'] = df['domain'].map(dict_)
    return df

In [25]:
X, y, data= data_management(df_initial)

In [26]:
data.id_dim_personne.dtype

dtype('int64')

In [27]:
#data = data.set_index('id_dim_personne')
#data = data.drop(['id_dim_personne'])
#data.reset_index(level=0, inplace=True)
#del data['index']
data.head()

Unnamed: 0,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,id_dim_personne,cible_seuil_1200,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance
0,-1.0,-2.0,59430.0,15.0,1.0,0.0,-2.0,-1.0,2.0,2.0,6546762003,0,-2.0,0,-2.0,-1,1.0,1.0
1,456.0,31130.0,38000.0,2.0,0.0,0.0,1.0,2.0,4.0,2.0,6742433330,0,47.0,0,0.0,1,1.0,1.0
2,868.0,92000.0,92150.0,2.0,1.0,0.0,-2.0,2.0,2.0,4.0,6856828837,1,59.0,0,6.0,1,1.0,1.0
3,300.0,94800.0,93420.0,2.0,0.0,0.0,1.0,0.0,2.0,2.0,6754882186,0,29.0,0,1.0,0,1.0,1.0
4,300.0,26170.0,84600.0,2.0,0.0,0.0,-2.0,0.0,0.0,2.0,6289817192,0,23.0,0,3.0,1,5.0,1.0


In [28]:
test = data[data.id_dim_personne == 7501434192]
test
#X = X.set_index('id_dim_personne')

Unnamed: 0,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,id_dim_personne,cible_seuil_1200,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance
32050,868.0,59800.0,80000.0,12.0,0.0,0.0,-2.0,0.0,0.0,0.0,7501434192,0,21.0,0,1.0,1,1.0,1.0


In [29]:
data.count()

campagne                  44548
code_postal               44548
code_postal_naissance     44548
csp                       44548
flag_banque_principale    44548
flag_epargne              44548
nature_cb                 44548
patrimoine                44548
regime_matrimonial        44548
revenus_annuels           44548
id_dim_personne           44548
cible_seuil_1200          44548
AGE                       44548
Parrain                   44548
domain                    44548
Sex                       44548
country_connexion         44548
pays_de_naissance         44548
dtype: int64

In [30]:
## Travailler avec nouv X et y ayant id_dim_personne comme index
data = data.set_index('id_dim_personne')
X = data.drop ('cible_seuil_1200', axis = 1)
y = data ['cible_seuil_1200']

In [32]:
#### Split fataframe (use only the ones which does not exist in kibana for the training)
df_train = pd.DataFrame()
df_test = pd.DataFrame()
if data.shape[0] > 10100: # len(df) > 10 would also work
    df_train = data[:10100]
    df_test = data[10100:]

In [135]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=54)
X_train = df_train.drop('cible_seuil_1200',axis=1)
X_test = df_test.drop('cible_seuil_1200',axis=1)
y_train = df_train['cible_seuil_1200']
y_test = df_test['cible_seuil_1200']
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(10100, 16)
(27070, 16)
(10100,)
(27070,)


In [136]:
X_test.head()

Unnamed: 0_level_0,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance
id_dim_personne,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
6805268703,868,79120,79000,16,0,0,0,0,2,1,37,1,0,0,1,1
6737254579,868,69003,12400,13,1,1,1,0,6,2,33,1,3,1,1,1
6965233196,868,94420,94500,2,0,0,-2,0,2,2,47,1,3,1,1,1
6742384047,868,84440,75015,3,1,1,-2,0,0,2,37,1,2,0,1,1
6952035876,868,78920,92150,2,0,0,-2,1,2,3,35,1,16,1,1,1


In [152]:
### Proba approach
def proba_approche(m0,X0_test,y0_test):
    proba0=pd.DataFrame(y0_test)
    P = m0.predict_proba(X0_test)[:, 1]
    proba0["probabilite"]= P

    decoup=pd.qcut(proba0['probabilite'],10,labels=[9,8,7,6,5,4,3,2,1,0])
    decoup
    U=pd.DataFrame(decoup)

    U.rename(columns={'probabilite':"decile"},inplace=True)
    decile=pd.concat([proba0,U],axis=1)
    T1=decile['probabilite'].groupby(decile['decile'])
    T2=pd.DataFrame(T1.min())
    seuil = T2.tail(1).probabilite
    seuil = float(seuil)
    proba0=pd.DataFrame(y0_test)
    P = m0.predict_proba(X0_test)[:, 1]

    proba0["probabilite"]= P
    proba0['y_pred'] = proba0['probabilite'].map(lambda x: 1 if x > seuil else 0)
    proba0
    print "Seuil de probabilité : "+str(seuil)
    print "Accuracy : "+str(accuracy_score(proba0.cible_seuil_1200, proba0.y_pred))
    print "Recall : "+str(recall_score(proba0.cible_seuil_1200,proba0.y_pred))
    print "Confusion matrix :\n %s " % confusion_matrix(proba0.cible_seuil_1200,proba0.y_pred)
    print('classification report:\n %s' % classification_report(proba0.cible_seuil_1200,proba0.y_pred))


In [138]:
# Modèle 3: nearest_centroid
from sklearn.neighbors.nearest_centroid import NearestCentroid
KNC = NearestCentroid(metric='euclidean', shrink_threshold = 1.5)
KNC.fit(X_train, y_train)

NearestCentroid(metric='euclidean', shrink_threshold=1.5)

In [139]:
# Concéténer X et y test en fct de l'id_dim_personne
df_total = pd.concat([X_test, y_test], axis=1)
df_total.head()

Unnamed: 0_level_0,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance,cible_seuil_1200
id_dim_personne,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
6805268703,868,79120,79000,16,0,0,0,0,2,1,37,1,0,0,1,1,0
6737254579,868,69003,12400,13,1,1,1,0,6,2,33,1,3,1,1,1,0
6965233196,868,94420,94500,2,0,0,-2,0,2,2,47,1,3,1,1,1,1
6742384047,868,84440,75015,3,1,1,-2,0,0,2,37,1,2,0,1,1,1
6952035876,868,78920,92150,2,0,0,-2,1,2,3,35,1,16,1,1,1,0


In [140]:
df_total["cible_predit_sans_encoursParrain_sans_insee"] = KNC.predict(X_test)
#df_total["cible_predit_sans_encoursParrain_sans_insee"] = KNC.predict(X)
df_total

Unnamed: 0_level_0,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance,cible_seuil_1200,cible_predit_sans_encoursParrain_sans_insee
id_dim_personne,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
6805268703,868,79120,79000,16,0,0,0,0,2,1,37,1,0,0,1,1,0,1
6737254579,868,69003,12400,13,1,1,1,0,6,2,33,1,3,1,1,1,0,0
6965233196,868,94420,94500,2,0,0,-2,0,2,2,47,1,3,1,1,1,1,1
6742384047,868,84440,75015,3,1,1,-2,0,0,2,37,1,2,0,1,1,1,1
6952035876,868,78920,92150,2,0,0,-2,1,2,3,35,1,16,1,1,1,0,1
6727806928,456,78930,75013,2,0,0,1,2,2,2,42,0,8,1,1,1,0,1
6925514520,868,75013,94410,4,1,1,-2,0,2,2,54,1,3,1,1,1,0,1
6720552356,300,1600,99,6,0,0,0,2,4,1,66,1,3,0,1,4,1,0
6901194811,1482,94000,99,7,1,1,-2,0,2,1,33,0,0,0,1,1,1,0
6445711388,868,77210,94130,12,0,1,-2,0,0,1,23,1,3,1,1,1,0,1


In [141]:
print(classification_report(y_test, df_total.cible_predit_sans_encoursParrain_sans_insee))
print ('accuracy score : '+ str(accuracy_score(y_test, df_total.cible_predit_sans_encoursParrain_sans_insee)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, df_total.cible_predit_sans_encoursParrain_sans_insee)))

             precision    recall  f1-score   support

          0       0.78      0.45      0.57     21069
          1       0.23      0.57      0.32      6001

avg / total       0.66      0.47      0.51     27070

accuracy score : 0.473254525305

 confussion matrix:
[[ 9399 11670]
 [ 2589  3412]]


In [142]:
df_total = df_total.reset_index()

In [143]:
test = df_total[df_total.id_dim_personne == 7501440616]
test

Unnamed: 0,id_dim_personne,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance,cible_seuil_1200,cible_predit_sans_encoursParrain_sans_insee
19541,7501440616,868,78800,78600,2,0,1,-2,0,1,2,35,1,3,1,1,1,0,1


In [154]:
### Modèle 1 : RDF
classifier= RandomForestClassifier(n_estimators=210,max_depth=210)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=210, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=210, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [145]:
df_total["cible_predit_sans_encoursParrain_sans_insee_2"] = classifier.predict(X_test)
df_total.head()

Unnamed: 0,id_dim_personne,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance,cible_seuil_1200,cible_predit_sans_encoursParrain_sans_insee,cible_predit_sans_encoursParrain_sans_insee_2
0,6805268703,868,79120,79000,16,0,0,0,0,2,1,37,1,0,0,1,1,0,1,0
1,6737254579,868,69003,12400,13,1,1,1,0,6,2,33,1,3,1,1,1,0,0,0
2,6965233196,868,94420,94500,2,0,0,-2,0,2,2,47,1,3,1,1,1,1,1,0
3,6742384047,868,84440,75015,3,1,1,-2,0,0,2,37,1,2,0,1,1,1,1,0
4,6952035876,868,78920,92150,2,0,0,-2,1,2,3,35,1,16,1,1,1,0,1,0


In [155]:
print ('accuracy score : '+ str(accuracy_score(y_test, df_total.cible_predit_sans_encoursParrain_sans_insee_2)))
print('classification_report:'+ str(classification_report(y_test, df_total.cible_predit_sans_encoursParrain_sans_insee_2)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, df_total.cible_predit_sans_encoursParrain_sans_insee_2)))

accuracy score : 0.762467676395
classification_report:             precision    recall  f1-score   support

          0       0.79      0.95      0.86     21069
          1       0.36      0.09      0.14      6001

avg / total       0.69      0.76      0.70     27070


 confussion matrix:
[[20098   971]
 [ 5459   542]]


In [156]:
proba_approche(classifier, X_test, y_test)

Seuil de probabilité : 0.461904761905
Accuracy : 0.751606944958
Recall : 0.147142142976
Confusion matrix :
 [[19463  1606]
 [ 5118   883]] 
classification report:
              precision    recall  f1-score   support

          0       0.79      0.92      0.85     21069
          1       0.35      0.15      0.21      6001

avg / total       0.69      0.75      0.71     27070



In [149]:
df_total = df_total.reset_index()

### Indexing in ES

In [150]:
__author__ = 'ubuntu'

# Cassandra metadata
dev_cassandra_host= 'dtl-cassandra01-d01'
dev_cassandra_port= 9200
dev_username= 'loaddata'
dev_password= 'DecujRiQuigByaibdednofVerr6Odij2'

# Elasticsearch hostname
dev_es_host= 'dtl-esmaster01-d01'

# Elasticsearch port
dev_es_port= 9200
es_login= 'dtl-spark'
es_password= 'taquivvukyuztAckufneglugfisipBio'
    
# Path to pickle one month 'encours' and csv matching files
path_to_one_month_pickle = "./MODEL/ENCOURS_1MOIS_v2/"

# One month model target name
one_month_target_name = 'ENCOURS_1MOIS_v2'

In [151]:
df_to_be_indexed = df_total[['id_dim_personne','cible_seuil_1200','cible_predit_sans_encoursParrain_sans_insee']]
print(df_to_be_indexed)

       id_dim_personne  cible_seuil_1200  \
0           6805268703                 0   
1           6737254579                 0   
2           6965233196                 1   
3           6742384047                 1   
4           6952035876                 0   
5           6727806928                 0   
6           6925514520                 0   
7           6720552356                 1   
8           6901194811                 1   
9           6445711388                 0   
10          6777014493                 0   
11          6852106646                 0   
12          6890024390                 0   
13          6546396365                 0   
14          6873076539                 0   
15          6965345065                 0   
16          6866867675                 0   
17          6810957433                 0   
18          6747582036                 0   
19          6914157143                 1   
20          6701521848                 0   
21          6888305368          

In [33]:
df_es = df_to_be_indexed[df_to_be_indexed.id_dim_personne == 7501471010]
df_es

Unnamed: 0,id_dim_personne,cible_seuil_1200,cible_predit_sans_encoursParrain_sans_insee
19039,7501471010,0,1


In [27]:
### Create ES Instance ###
from elasticsearch import Elasticsearch
import math

class ElasticsearchClient:

    def __init__(self):
        self.host = 'dtl-esmaster01-d01'
        self.port = 9200
        self.user = 'dtl-spark'
        self.secret = 'taquivvukyuztAckufneglugfisipBio'
        self.session = None

    def create_session(self):
        self.session = Elasticsearch(self.host,
                                     http_auth=(self.user, self.secret),
                                     use_ssl=True,
                                     verify_certs=False,
                                     sniff_on_start=True,
                                     sniff_on_connection_fail=True,
                                     sniffer_timeout=60)

    def index_purgatory(self,index):
        self.session.indices.delete(index=index)

    def index_table(self,table,index_name,doc_type,doc_id):
        for index, row in table.iterrows():
            data_dict = {}
            for i in range(len(row)):
                data_dict[table.columns[i]] = row[i]
            index_stmt = self.session.index(index=index_name, doc_type=doc_type, body=data_dict, id=data_dict[doc_id])

    def convert_float(self, number):
        if math.isnan(number) :
            return '201611'
        else :
            return str(int(number))

    def update_index(self,table,index_name,doc_type,doc_id):
        for index, row in table.iterrows():
            data_dict = {}
            row['annee_mois'] = self.convert_float(row['annee_mois'])
            #del row['annee_mois']
            for i in range(len(row)):
                data_dict[table.columns[i]] = row[i]
            update_stmt = self.session.update(index=index_name + row['annee_mois'], doc_type=doc_type, body = {"doc": data_dict, "doc_as_upsert":True} , id=data_dict[doc_id]) #, "index.mapping.ignore_malformed":True
            ###es.update.retry.on.conflict

    def update_index_retro(self,table,index_name,doc_type,doc_id):
        for index, row in table.iterrows():
            data_dict = {}
            for i in range(len(row)):
                data_dict[table.columns[i]] = row[i]
            update_stmt = self.session.update(index=index_name, doc_type=doc_type, body = {"doc": data_dict, "doc_as_upsert":True} , id=data_dict[doc_id]) #, "index.mapping.ignore_malformed":True
            ###es.update.retry.on.conflict

    def create_indices(self):
        try:
            self.session.indices.create(index='index', ignore=400)
        except elasticsearch.ElasticsearchException as es1:
            print('error es')





In [28]:
### Create ES Instance ###
es = ElasticsearchClient()
es.create_session()

  'Connecting to %s using SSL with verify_certs=False is insecure.' % host)
  'Connecting to %s using SSL with verify_certs=False is insecure.' % host)
  'Connecting to %s using SSL with verify_certs=False is insecure.' % host)
  'Connecting to %s using SSL with verify_certs=False is insecure.' % host)
  'Connecting to %s using SSL with verify_certs=False is insecure.' % host)
  'Connecting to %s using SSL with verify_certs=False is insecure.' % host)
  'Connecting to %s using SSL with verify_certs=False is insecure.' % host)


In [None]:
#Insert into es v2
es.update_index_retro(df_to_be_indexed, 'retro', 'retro', 'id_dim_personne')

  if param in SKIP_IN_PATH:
  quote_plus(_escape(p), b',*') for p in parts if p not in SKIP_IN_PATH)


In [43]:
#df = df_to_be_indexed[df_to_be_indexed.id_dim_personne [7500945441, 7500854058,7501227214] ]

KeyError: (7500945441, 7500854058, 7501227214)

In [130]:
### Timestamp to be indexde
df_init = pd.read_csv('result.csv', sep=',')
del df_init ["Unnamed: 0"]
df_timestamp = df_init[['annee_mois','contactid']]
df_timestamp.head()

Unnamed: 0,annee_mois,contactid
0,,6546762003
1,,6742433330
2,201611.0,6856828837
3,,6754882186
4,201611.0,6289817192


In [25]:
test = data[data.id_dim_personne== 7500964054]
test

Unnamed: 0,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,id_dim_personne,cible,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance,cible_predit_sans_encoursParrain_sans_insee,y_pred
41788,868,93600,91130,13,1,1,-2,1,1,2,7500964054,1,35,1,3,1,-2,1,1,1


In [35]:
es = ElasticsearchClient()

In [45]:
### Create ES Instance ###
from elasticsearch import Elasticsearch
import math

class ElasticsearchClient:

    def __init__(self):
        self.host = 'dtl-esmaster01-d01'
        self.port = 9200
        self.user = 'dtl-spark'
        self.secret = 'taquivvukyuztAckufneglugfisipBio'
        self.session = None

    def create_session(self):
        self.session = Elasticsearch(self.host,
                                     http_auth=(self.user, self.secret),
                                     use_ssl=True,
                                     verify_certs=False,
                                     sniff_on_start=True,
                                     sniff_on_connection_fail=True,
                                     sniffer_timeout=60)

    def index_purgatory(self,index):
        self.session.indices.delete(index=index)

    def index_table(self,table,index_name,doc_type,doc_id):
        for index, row in table.iterrows():
            data_dict = {}
            for i in range(len(row)):
                data_dict[table.columns[i]] = row[i]
            index_stmt = self.session.index(index=index_name, doc_type=doc_type, body=data_dict, id=data_dict[doc_id])

    def convert_float(self, number):
        if math.isnan(number) :
            return '201611'
        else :
            return str(int(number))

    def update_index(self,table,index_name,doc_type,doc_id):
        for index, row in table.iterrows():
            data_dict = {}
            row['annee_mois'] = self.convert_float(row['annee_mois'])
            #del row['annee_mois']
            for i in range(len(row)):
                data_dict[table.columns[i]] = row[i]
            update_stmt = self.session.update(index=index_name + row['annee_mois'], doc_type=doc_type, body = {"doc": data_dict, "doc_as_upsert":True} , id=data_dict[doc_id]) #, "index.mapping.ignore_malformed":True
            ###es.update.retry.on.conflict

    def update_index_retro(self,table,index_name,doc_type,doc_id):
        for index, row in table.iterrows():
            data_dict = {}
            for i in range(len(row)):
                data_dict[table.columns[i]] = row[i]
            update_stmt = self.session.update(index=index_name, doc_type=doc_type, body = {"doc": data_dict, "doc_as_upsert":True} , id=data_dict[doc_id]) #, "index.mapping.ignore_malformed":True
            ###es.update.retry.on.conflict

    def create_indices(self):
        try:
            self.session.indices.create(index='index', ignore=400)
        except elasticsearch.ElasticsearchException as es1:
            print('error es')

In [36]:
#Insert into es v2
es.update_index_retro(df_es, 'retro', 'retro', 'id_dim_personne')

# Avec encours parrain

In [38]:
# df_init_p = pd.read_csv('df_new_initial.csv', sep=',') 
df_init_p = pd.read_csv('/home/ubuntu/result.csv', sep=',') 
del df_init_p ['Unnamed: 0']
del df_init_p ['cible']
df_init_p.head()

Unnamed: 0,contactid,annee_mois,campagne,campaign,civilite,code_postal,code_postal_naissance,country_connexion_name,csp,date_naissance,...,nature_cb,patrimoine,pays,pays_naissance,regime_matrimonial,revenus_annuels,service,timestamp,id_dim_personne,encours
0,6546762003,,-1.0,Source URL non trouvée,,,59430.0,France,15.0,,...,,-1.0,,FRA,2.0,2.0,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-09-21T01:28:13.601+0200,6546762003,5.0
1,6742433330,,456.0,REC_Liens textes autopromotion 456,0,31130.0,38000.0,France,2.0,11/04/1969,...,1.0,2.0,FRA,FRA,4.0,2.0,SFOL_AJOUTER_CB,2016-10-31T18:16:38.719+0100,6742433330,0.0
2,6856828837,201611.0,868.0,PAR_Onlin_Site _ECard,MR,92000.0,92150.0,France,2.0,11/05/1957,...,,2.0,FRA,FRA,2.0,4.0,monprofil.PROSPECT/CREER_FICHE_PROSPECT,2016-11-21T12:01:44.736+0100,6856828837,8899.93
3,6754882186,,300.0,Intb_Banque_Autre,1,94800.0,93420.0,France,2.0,24/06/1988,...,1.0,0.0,FRA,FRA,2.0,2.0,SFOL_AJOUTER_CB,2016-11-04T10:46:58.447+0100,6754882186,10.0
4,6289817192,201611.0,300.0,Intb_Banque_Autre,MR,26170.0,84600.0,Switzerland,2.0,01/05/1994,...,,0.0,FRA,FRA,0.0,2.0,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-11-21T17:36:53.976+0100,6289817192,173.53


In [39]:
df_init_p.contactid.count()

44548

In [24]:
#Fixer encours des bons clients 
df_init_p['cible'] = df_init_p['encours'].map(lambda x: 0 if x < 1200.00 else 1).astype(int)
df_init_p[['encours','cible']].head()

Unnamed: 0,encours,cible
0,5.0,0
1,0.0,0
2,8899.93,1
3,10.0,0
4,173.53,0


In [25]:
### Data management
def data_management_p(df):
    
    df['code_postal'] = df['code_postal'].replace('',-2)
    df['code_postal'] = pd.to_numeric(df['code_postal'])
    
    df['code_postal_naissance'] = pd.to_numeric(df['code_postal_naissance'])
   
    df['country_connexion_name'] = df['country_connexion_name'].replace(['',None],-2)
    
    df['date_naissance'] = df['date_naissance'].replace('','01/01/1800')
    
    df['flag_epargne'] = pd.to_numeric(df['flag_epargne'])
    
    #df['mailing_accord'] = df['mailing_accord'].replace(['',None],-2)
    #df['mailing_accord'] = pd.to_numeric(df['mailing_accord'])
    del df['mailing_accord']
    
    df['nature_cb'] = df['nature_cb'].replace(['',None],-2)
    df['nature_cb'] = pd.to_numeric(df['nature_cb'])
    
    df['pays_naissance'] = df['pays_naissance'].replace(['',None],-2)
    
    ### get age
    get_age(df)

    ### Get flag parrain from namesponsor
    df['Parrain'] = df['namesponsor'].map(lambda x: 0 if pd.isnull(x) else 1).astype(int)
    del df['firstnamesponsor'] 
    del df['namesponsor'] 
    
    ### Get domain from mail
    df['mail'] = df.mail.replace ([np.nan], '')
    df['domain'] = df['mail'].map(lambda x: x.split("@")[1] if x else None)
    df['domain'] = df.domain.str.lower()
    del df['mail']

    ### Get Sex from Civility
    df['Sex'] = df['civilite'].map(get_sex)
    del df['civilite']
    
    ### Delete useless cols
    del df['annee_mois']
    del df['id_dim_temps']
    del df['id_dim_personne']
    del df['encours']
    del df['timestamp']
    del df['service']
    del df['campaign']
    #del df['contactid']
    del df['pays']
    del df['debit_cb']
    
    ### transform to numeric when possible
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
    df['country_connexion_name'] = df['country_connexion_name'].replace ([np.nan], '')
    df['domain'] = df['domain'].replace ([np.nan], '')
    df['pays_naissance'] = df['pays_naissance'].replace ([np.nan], '')
    df.loc[df['country_connexion_name'].value_counts()[df['country_connexion_name']].values < 40, 'country_connexion_name'] = 'other'
    df.loc[df['domain'].value_counts()[df['domain']].values < 40, 'domain'] = 'other'
    df.loc[df['pays_naissance'].value_counts()[df['pays_naissance']].values < 40, 'pays_naissance'] = 'other'
    
    
    ### Get country connexion name
    df['country_connexion'] = df['country_connexion_name'].map(process_country_connexion)
    del df['country_connexion_name']
    ### Get pays de naissance
    df['pays_de_naissance'] = df['pays_naissance'].map(process_pays_naissance)
    del df['pays_naissance']
    
    ### Process the domain
    df = process_domain (df)
    
    ### Drop NaN 
    df = df.dropna()
    
    #features_df = df.drop('cible_seuil_1200', axis=1)
    #features_df = features_df.drop('id_dim_personne',axis=1)
    #features_df = features_df.drop('annee_mois',axis=1)
    #target = df['cible_seuil_1200']
    return df


def get_sex(x):
    if x == "0":
        return 1
    elif (x == "1"):
        return 0
    elif (x == "2"):
        return 0
    elif (x == "MR"):
        return 1
    elif (x == "MLE"):
        return 0
    elif (x == "MME"):
        return 0
    else :
        return -1

def get_age(X):
    from datetime import datetime
    now = datetime.now()
    X['AGE'] = X['date_naissance'].dropna()
    X['AGE'] = pd.to_datetime(X['AGE'],errors='coerce')
    X['AGE'] = X['AGE'].map(lambda x : now.year-x.year  if now.month-x.month>0 else now.year - x.year -1 )
    del X['date_naissance']
    
def process_country_connexion(x):
    
    if x == 'France':
        return 1
    elif (x == 'United Kingdom'):
        return 2
    elif (x == 'other'):
        return 3
    elif (x == 'Germany'):
        return 4
    elif (x == 'Switzerland'):
        return 5
    elif (x == 'Netherlands'):
        return 6
    elif (x == 'Europe'):
        return 7
    elif (x == 'United States'):
        return 8
    elif (x == 'Reunion'):
        return 9
    elif (x==-2):
        return -2
    
def process_pays_naissance(x):

    if x == 'FRA':
        return 1
    elif (x == 'ITA'):
        return 2
    elif (x == 'other'):
        return 3
    elif (x == 'DZA'):
        return 4
    elif (x == 'BEN'):
        return 5
    elif (x == 'ESP'):
        return 6
    elif (x == 'VNM'):
        return 7
    elif (x == 'DEU'):
        return 8
    elif (x == 'MAR'):
        return 9
    elif (x=='CIV'):
        return 10
    elif (x=='CMR'):
        return 10
    elif (x=='GTO'):
        return 11
    elif (x=='SEN'):
        return 12
    elif (x=='BEL'):
        return 13
    elif (x=='CHN'):
        return 14
    elif (x=='ROU'):
        return 15
    elif (x=='BRA'):
        return 16
    elif (x=='MDG'):
        return 17
    elif (x=='PRT'):
        return 18
    elif (x=='GBR'):
        return 19
    elif (x=='LBN'):
        return 20
    elif (x=='TUR'):
        return 21
    elif (x=='IND'):
        return 22
    
    
def process_domain(df):
    """
    Process the domain features
    """
    dict_ = {'yahoo.fr':0, 'hotmail.fr':1, 'hotmail.com': 2, 'gmail.com': 3, 'orange.fr': 4, 'outlook.com': 5, 'free.fr': 6, 'laposte.net': 7, 'other': 8, 'neuf.fr': 9, 'wanadoo.fr': 10, 'me.com': 11, 'ymail.com': 12, 'sfr.fr': 13, 'live.fr': 14, 'bbox.fr': 15, 'outlook.fr': 16, 'msn.com': 17, 'yahoo.com': 18, 'aol.com': 19, 'icloud.com': 20, 'cegetel.net': 21, 'club-internet.fr': 22}
    df['domain'] = df['domain'].map(dict_)
    return df

In [26]:
data_p = data_management_p(df_init_p)

In [27]:
data_p.contactid.count()

37170

In [28]:
Parrain_df = pd.read_csv('parrain_totale_afteradd14.csv', sep=',')
del Parrain_df['Unnamed: 0']
Parrain_df.head()

Unnamed: 0,id_dim_personne,encours_parrain,id_parrain
0,1004250808,2475.93,224108008.0
1,1660202716,,
2,1715100837,2736.14,1697958997.0
3,1725735525,,
4,1748840201,1311.33,814632101.0


In [29]:
Parrain_df.id_dim_personne

0        1004250808
1        1660202716
2        1715100837
3        1725735525
4        1748840201
5        1766903833
6        1887361163
7        1989548372
8        2232679265
9        2260529045
10       2364644331
11       2381429055
12       2997328175
13       3294911605
14       3491258034
15       3705902288
16       3878342197
17       4141010655
18       4223249842
19       4252943190
20       4253839485
21       4406550122
22       4409310817
23       4428098087
24       4544429677
25       4571148440
26       4613670658
27       4881973309
28       4899596466
29       4900569192
            ...    
44523    7641545691
44524    7641546336
44525    7641551329
44526    7641552635
44527    7641555335
44528    7641557844
44529    7641568813
44530    7641570793
44531    7641570942
44532    7641571763
44533    7641573352
44534    7641573440
44535    7641574301
44536    7641574586
44537    7641576053
44538    7641576965
44539    7641577491
44540    7641577696
44541    7641579602


In [30]:
# Merge parrain id with DF initial selon l'id client
enriched_df = pd.merge(data_p, Parrain_df, right_on='id_dim_personne', left_on='contactid')
enriched_df.head()

Unnamed: 0,contactid,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,...,cible,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance,id_dim_personne,encours_parrain,id_parrain
0,6742433330,456,31130,38000,2,0,0,1,2,4,...,0,47,0,0,1,1,1,6742433330,19144.72,5516969255.0
1,6856828837,868,92000,92150,2,1,0,-2,2,2,...,1,59,0,6,1,1,1,6856828837,2966.95,6440631011.0
2,6754882186,300,94800,93420,2,0,0,1,0,2,...,0,28,0,1,0,1,1,6754882186,,
3,6289817192,300,26170,84600,2,0,0,-2,0,0,...,0,23,0,3,1,5,1,6289817192,25.02,2989870176.0
4,6748699778,307,73150,92290,2,0,0,1,3,2,...,0,54,0,2,1,1,1,6748699778,,


In [31]:
enriched_df.contactid

0        6742433330
1        6856828837
2        6754882186
3        6289817192
4        6748699778
5        6856855432
6        6821231675
7        6747566365
8        6888675097
9        6879474578
10       6821501450
11       6873248390
12       6901251821
13       6946289020
14       6879474016
15       6701524556
16       6499940544
17       6778087634
18       6867333056
19       7037628291
20       6738608856
21       6926814991
22       6978234890
23       5648776005
24       6866885731
25       6925436700
26       7042925233
27       6890442660
28       6805307194
29       6860706007
            ...    
37139    7625987083
37140    7423322073
37141    7317166394
37142    7323234198
37143    7626406149
37144    7239231161
37145    7640098885
37146    7610450855
37147    7273702731
37148    7365966767
37149    7611590755
37150    7423209388
37151    7388614691
37152    7590808167
37153    7318877117
37154    7372077047
37155    7387823123
37156    7492899162
37157    7439257049


In [32]:
### Data management parrain
def data_management_parrain(df):
    
    ### Get flag parrain from id parrain
    df['Parrain'] = df['id_parrain'].map(lambda x: 0 if pd.isnull(x) else 1).astype(int)
    del df['id_parrain']
    

    ### Get flag encours_parrain (1000) from encours_parrain 
    df['encours_p'] = df['encours_parrain'].map(lambda x : 0 if x<1500 or pd.isnull(x) else 1).astype(int)
    del df['encours_parrain']
    
    ### USELESS COLUMNS
    
    del df['contactid']
    #del df['id_dim_personne'] 
    
    #features_df = df.drop('cible', axis=1)
    #target = df['cible']
    
    #return features_df, target
    return df

In [33]:
#X_parrain, y_parrain = data_management_parrain(enriched_df)
df_parrain = data_management_parrain(enriched_df)

In [34]:
df_parrain.head()

Unnamed: 0,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,cible,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance,id_dim_personne,encours_p
0,456,31130,38000,2,0,0,1,2,4,2,0,47,1,0,1,1,1,6742433330,1
1,868,92000,92150,2,1,0,-2,2,2,4,1,59,1,6,1,1,1,6856828837,1
2,300,94800,93420,2,0,0,1,0,2,2,0,28,0,1,0,1,1,6754882186,0
3,300,26170,84600,2,0,0,-2,0,0,2,0,23,1,3,1,5,1,6289817192,0
4,307,73150,92290,2,0,0,1,3,2,2,0,54,0,2,1,1,1,6748699778,0


In [35]:
df_parrain.count()

campagne                  37169
code_postal               37169
code_postal_naissance     37169
csp                       37169
flag_banque_principale    37169
flag_epargne              37169
nature_cb                 37169
patrimoine                37169
regime_matrimonial        37169
revenus_annuels           37169
cible                     37169
AGE                       37169
Parrain                   37169
domain                    37169
Sex                       37169
country_connexion         37169
pays_de_naissance         37169
id_dim_personne           37169
encours_p                 37169
dtype: int64

In [36]:
df_parrain = df_parrain.set_index('id_dim_personne')

In [37]:
X_parrain = df_parrain.drop('cible', axis=1)
y_parrain = df_parrain['cible']

In [38]:
#### Split fataframe
df_train = pd.DataFrame()
df_test = pd.DataFrame()
if df_parrain.shape[0] > 10010: # len(df) > 10 would also work
    df_train = df_parrain[:10010]
    df_test = df_parrain[10010:]

In [42]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=54)
X_train = df_train.drop('cible',axis=1)
X_test = df_test.drop('cible',axis=1)
y_train = df_train['cible']
y_test = df_test['cible']
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(10010, 17)
(27159, 17)
(10010,)
(27159,)


In [41]:
#X02_train, X02_test, y02_train, y02_test = train_test_split (X_parrain,y_parrain,test_size=0.3,random_state=57)
#X02_train, X02_test, y02_train, y02_test = train_test_split (X_parrain,y_parrain,random_state=42)

In [44]:
# Modèle 3: nearest_centroid
from sklearn.neighbors.nearest_centroid import NearestCentroid
KNC2 = NearestCentroid(metric='euclidean', shrink_threshold = 1.5)
KNC2.fit(X_train, y_train)


NearestCentroid(metric='euclidean', shrink_threshold=1.5)

In [45]:
df_concat = pd.concat([X_test, y_test], axis=1)

In [46]:
df_concat['cible_predit_avec_encoursParrain'] = KNC2.predict(X_test)
df_concat 

Unnamed: 0_level_0,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance,encours_p,cible,cible_predit_avec_encoursParrain
id_dim_personne,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
5694140010,307,34070,7200,16,1,0,-2,1,0,1,56,1,3,0,1,1,1,0,0
7046030727,868,69420,69700,16,0,0,-2,1,0,1,28,1,1,0,1,1,0,0,1
6709991216,868,38500,38700,4,0,0,1,0,6,2,26,1,1,-1,1,1,1,0,0
6778235111,456,68210,68000,16,0,0,0,1,2,1,32,1,3,1,1,1,0,0,1
6772237991,868,67205,99,18,1,1,0,0,2,0,30,1,0,0,1,17,1,1,0
6821218312,868,97300,77000,4,0,1,1,1,6,3,28,1,2,0,1,1,1,1,1
6734366998,868,83500,56120,16,1,0,1,0,1,2,44,1,7,1,1,1,0,1,1
6888776275,456,69008,69150,2,1,1,-2,0,0,2,24,0,16,0,1,1,0,0,1
6742570369,868,49220,49500,4,0,0,0,0,0,1,25,1,1,1,1,1,1,0,0
6927021133,868,49610,49000,4,1,1,-2,1,2,1,37,1,6,1,1,1,1,1,0


In [47]:
print(classification_report(y_test, df_concat.cible_predit_avec_encoursParrain))
print ('accuracy score : '+ str(accuracy_score(y_test, df_concat.cible_predit_avec_encoursParrain)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, df_concat.cible_predit_avec_encoursParrain)))

             precision    recall  f1-score   support

          0       0.78      0.45      0.57     21129
          1       0.23      0.57      0.32      6030

avg / total       0.66      0.47      0.51     27159

accuracy score : 0.47380242277

 confussion matrix:
[[ 9429 11700]
 [ 2591  3439]]


In [48]:
df_concat.reset_index(level=0, inplace=True)

In [49]:
df_concat

Unnamed: 0,id_dim_personne,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance,encours_p,cible,cible_predit_avec_encoursParrain
0,5694140010,307,34070,7200,16,1,0,-2,1,0,1,56,1,3,0,1,1,1,0,0
1,7046030727,868,69420,69700,16,0,0,-2,1,0,1,28,1,1,0,1,1,0,0,1
2,6709991216,868,38500,38700,4,0,0,1,0,6,2,26,1,1,-1,1,1,1,0,0
3,6778235111,456,68210,68000,16,0,0,0,1,2,1,32,1,3,1,1,1,0,0,1
4,6772237991,868,67205,99,18,1,1,0,0,2,0,30,1,0,0,1,17,1,1,0
5,6821218312,868,97300,77000,4,0,1,1,1,6,3,28,1,2,0,1,1,1,1,1
6,6734366998,868,83500,56120,16,1,0,1,0,1,2,44,1,7,1,1,1,0,1,1
7,6888776275,456,69008,69150,2,1,1,-2,0,0,2,24,0,16,0,1,1,0,0,1
8,6742570369,868,49220,49500,4,0,0,0,0,0,1,25,1,1,1,1,1,1,0,0
9,6927021133,868,49610,49000,4,1,1,-2,1,2,1,37,1,6,1,1,1,1,1,0


In [50]:
df_to_be_indexed = df_concat[['id_dim_personne','cible_predit_avec_encoursParrain']]
print(df_to_be_indexed)

       id_dim_personne  cible_predit_avec_encoursParrain
0           5694140010                                 0
1           7046030727                                 1
2           6709991216                                 0
3           6778235111                                 1
4           6772237991                                 0
5           6821218312                                 1
6           6734366998                                 1
7           6888776275                                 1
8           6742570369                                 0
9           6927021133                                 0
10          2848208135                                 0
11          6890433090                                 1
12          6835093021                                 1
13          6975440850                                 0
14          6926805237                                 0
15          6890247395                                 0
16          6925129218         

In [51]:
#Insert into es v2
es.update_index_retro(df_to_be_indexed, 'retro', 'retro', 'id_dim_personne')

NameError: name 'es' is not defined

In [132]:
# Modèle 1 : RDF
forest_b= RandomForestClassifier(n_estimators=200,max_depth=201)
forest_b.fit(X02_train, y02_train)
y_pred_b = forest_b.predict(X02_test)
print ('accuracy score : '+ str(accuracy_score(y02_test, y_pred_b)))
print('\n classification_report:\n'+ str(classification_report(y02_test, y_pred_b)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y02_test, y_pred_b)))

accuracy score : 0.750398089172

 classification_report:
             precision    recall  f1-score   support

          0       0.77      0.96      0.85      1886
          1       0.50      0.11      0.18       626

avg / total       0.70      0.75      0.69      2512


 confussion matrix:
[[1815   71]
 [ 556   70]]


In [36]:
# Modèle 3: nearest_centroid
from sklearn.neighbors.nearest_centroid import NearestCentroid
KNC2 = NearestCentroid(metric='euclidean', shrink_threshold = 0.5)
KNC2.fit(X02_train, y02_train)

NearestCentroid(metric='euclidean', shrink_threshold=0.5)

In [105]:
## save the model to disk
import pickle
with open('Model_v2.pkl', 'wb') as f:
    pickle.dump(KNC2, f)

In [66]:
df_concat.reset_index(level=0, inplace=True)

In [68]:
df_concat.head()

Unnamed: 0,id_dim_personne,campagne,code_postal,code_postal_naissance,csp,debit_cb,flag_banque_principale,flag_epargne,nature_cb,patrimoine,...,revenus_annuels,AGE,domain,Sex,country_connexion,pays_de_naissance,Parrain,encours_p,cible,cible_predit_avec_encoursParrain
0,6827320758,1543,93240,93300,4,2,0,0,0,0,...,1,30,1,1,1,1,0,0,0,0
1,6716055371,868,94160,34000,12,2,0,1,0,0,...,1,28,3,-1,1,1,1,1,1,0
2,6814374196,307,3450,6000,6,5,1,1,0,0,...,0,66,14,1,1,1,0,0,0,1
3,6786451131,456,72100,85000,2,0,0,0,1,0,...,2,39,3,1,1,1,0,0,0,1
4,6727462381,300,69340,76130,15,0,1,0,1,0,...,2,28,1,1,1,1,0,0,1,1


In [55]:
df_to_be_indexed = df_concat[['id_dim_personne','cible_predit_avec_encoursParrain']]
print(df_to_be_indexed)

       id_dim_personne  cible_predit_avec_encoursParrain
0           5694140010                                 0
1           7046030727                                 1
2           6709991216                                 0
3           6778235111                                 1
4           6772237991                                 0
5           6821218312                                 1
6           6734366998                                 1
7           6888776275                                 1
8           6742570369                                 0
9           6927021133                                 0
10          2848208135                                 0
11          6890433090                                 1
12          6835093021                                 1
13          6975440850                                 0
14          6926805237                                 0
15          6890247395                                 0
16          6925129218         

In [None]:
df_es = df_to_be_indexed[df_to_be_indexed.id_dim_personne == 7501471010]
df_es

In [None]:
#Insert into es v2
es.update_index_retro(df_to_be_indexed, 'retro', 'retro', 'id_dim_personne')

  if param in SKIP_IN_PATH:
  quote_plus(_escape(p), b',*') for p in parts if p not in SKIP_IN_PATH)


In [151]:
# Modèle 3: nearest_centroid
from sklearn.neighbors.nearest_centroid import NearestCentroid
KNC2 = NearestCentroid(metric='euclidean', shrink_threshold = 0.5)
KNC2.fit(X, y)
y31_pred = KNC2.predict(X)

print(classification_report(y, y31_pred))
print ('accuracy score : '+ str(accuracy_score(y, y31_pred)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y, y31_pred)))

             precision    recall  f1-score   support

          0       0.80      0.42      0.55     28766
          1       0.24      0.64      0.35      8404

avg / total       0.68      0.47      0.51     37170

accuracy score : 0.469814366425

 confussion matrix:
[[12058 16708]
 [ 2999  5405]]


In [147]:
# Modèle 1 : RDF
forest_b= RandomForestClassifier(n_estimators=200,max_depth=201)
forest_b.fit(X, y)
y_pred_b = forest_b.predict(X)
print ('accuracy score : '+ str(accuracy_score(y, y_pred_b)))
print('\n classification_report:\n'+ str(classification_report(y, y_pred_b)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y, y_pred_b)))

accuracy score : 1.0

 classification_report:
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     28766
          1       1.00      1.00      1.00      8404

avg / total       1.00      1.00      1.00     37170


 confussion matrix:
[[28766     0]
 [    0  8404]]


## INSEE

In [12]:
### Load data
df_initial_insee = pd.read_csv('/home/ubuntu/Revenue.csv', sep=',')  
del df_initial_insee['Unnamed: 0']
df_initial_insee.head()

Unnamed: 0,contactid,annee_mois,campagne,campaign,civilite,code_postal,code_postal_naissance,country_connexion_name,csp,date_naissance,...,pays_naissance,regime_matrimonial,revenus_annuels,service,timestamp,id_dim_personne,encours,cible,Code_postal,Salaire_net_horaire_moyen_en_2014_euro
0,6546762003,,-1.0,Source URL non trouvée,,,59430.0,France,15.0,,...,FRA,2.0,2.0,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-09-21T01:28:13.601+0200,6546762003,5.0,0,,
1,6742433330,,456.0,REC_Liens textes autopromotion 456,0,31130.0,38000.0,France,2.0,11/04/1969,...,FRA,4.0,2.0,SFOL_AJOUTER_CB,2016-10-31T18:16:38.719+0100,6742433330,0.0,0,31130.0,17.6
2,6856828837,201611.0,868.0,PAR_Onlin_Site _ECard,MR,92000.0,92150.0,France,2.0,11/05/1957,...,FRA,2.0,4.0,monprofil.PROSPECT/CREER_FICHE_PROSPECT,2016-11-21T12:01:44.736+0100,6856828837,8899.93,1,92000.0,16.1
3,6754882186,,300.0,Intb_Banque_Autre,1,94800.0,93420.0,France,2.0,24/06/1988,...,FRA,2.0,2.0,SFOL_AJOUTER_CB,2016-11-04T10:46:58.447+0100,6754882186,10.0,0,94800.0,14.6
4,6289817192,201611.0,300.0,Intb_Banque_Autre,MR,26170.0,84600.0,Switzerland,2.0,01/05/1994,...,FRA,0.0,2.0,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-11-21T17:36:53.976+0100,6289817192,173.53,0,26170.0,11.7


In [13]:
df_initial_insee.count()

contactid                                 44548
annee_mois                                30217
campagne                                  44183
campaign                                  44183
civilite                                  37791
code_postal                               38577
code_postal_naissance                     44219
country_connexion_name                    43753
csp                                       44548
date_naissance                            38577
debit_cb                                  10847
firstnamesponsor                          19033
flag_banque_principale                    44548
flag_epargne                              44511
id_dim_temps                              41565
mail                                      38577
mailing_accord                            38577
namesponsor                               19033
nature_cb                                 10847
patrimoine                                44548
pays                                    

In [14]:
df_initial_insee = df_initial_insee.set_index('contactid')

In [15]:
del df_initial_insee['cible']

In [16]:
#Fixer encours des bons clients 
df_initial_insee['cible_seuil_1200'] = df_initial_insee['encours'].map(lambda x: 0 if x < 1200.00 else 1).astype(int)
df_initial_insee[['encours','cible_seuil_1200']].head()

Unnamed: 0_level_0,encours,cible_seuil_1200
contactid,Unnamed: 1_level_1,Unnamed: 2_level_1
6546762003,5.0,0
6742433330,0.0,0
6856828837,8899.93,1
6754882186,10.0,0
6289817192,173.53,0


In [17]:
df = df_initial_insee[df_initial_insee.id_dim_personne == 6598695066]
df.head()

Unnamed: 0_level_0,annee_mois,campagne,campaign,civilite,code_postal,code_postal_naissance,country_connexion_name,csp,date_naissance,debit_cb,...,pays_naissance,regime_matrimonial,revenus_annuels,service,timestamp,id_dim_personne,encours,Code_postal,Salaire_net_horaire_moyen_en_2014_euro,cible_seuil_1200
contactid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6598695066,,300.0,Intb_Banque_Autre,0,44320.0,67000.0,France,2.0,05/03/1978,0.0,...,FRA,2.0,2.0,SFOL_AJOUTER_CB,2016-11-06T21:16:32.985+0100,6598695066,5462.85,44320.0,11.7,1


In [18]:
### Data management
def data_management(df):
    
    ### Get flag parrain from namesponsor
    df['Parrain'] = df['namesponsor'].map(lambda x: 0 if pd.isnull(x) else 1).astype(int)
    del df['firstnamesponsor'] 
    del df['namesponsor'] 
    
    ### Get domain from mail
    df['mail'] = df.mail.replace ([np.nan], '')
    df['domain'] = df['mail'].map(lambda x: x.split("@")[1] if x else None)
    df['domain'] = df.domain.str.lower()   

    ### Get Sex from Civility
    df['Sexe'] = df['civilite'].map(get_sex)
    
    ### Delete useless cols
    del df['civilite']
    del df['mail']
    del df['annee_mois']
    del df['mailing_accord']
    del df['id_dim_temps']
    del df['id_dim_personne']
    del df['encours']
    del df['timestamp']
    del df['service']
    del df['campaign']
    #del df['contactid']
    del df['pays']
    del df['debit_cb']
    del df['code_postal']
    del df['nature_cb']
    
    ### Impute empty rows  
    df['date_naissance'] = df['date_naissance'].replace('','01/01/1800')
    df = df.replace ([np.nan,None,''], -2)
    
    ### Get age from birthdate
    get_age(df)

    ### transform to numeric when possible
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
    
    ### filter the most reccurent ones
    df.loc[df['country_connexion_name'].value_counts()[df['country_connexion_name']].values < 40, 'country_connexion_name'] = 'other'
    df.loc[df['domain'].value_counts()[df['domain']].values < 40, 'domain'] = 'other'
    df.loc[df['pays_naissance'].value_counts()[df['pays_naissance']].values < 40, 'pays_naissance'] = 'other'
     
    ### Get country connexion name
    df = process_country_connexion (df)
    
    ### Get pays de naissance
    df = process_pays_naissance(df)
    #df['pays_de_naissance'] = df['pays_naissance'].map(process_pays_naissance)
    #del df['pays_naissance']
    
    ### Process the domain
    df = process_domain (df)
    
    ### Drop NaN 
    #df = df.dropna()
    df = df.replace ([np.nan,None,''], -2)
    
    features_df = df.drop('cible_seuil_1200', axis=1)
    target = df['cible_seuil_1200']
    return features_df, target, df


def get_sex(x):
    if x == "0":
        return 1
    elif (x == "1"):
        return 0
    elif (x == "2"):
        return 0
    elif (x == "MR"):
        return 1
    elif (x == "MLE"):
        return 0
    elif (x == "MME"):
        return 0
    else :
        return -1

def get_age(X):
    from datetime import datetime
    now = datetime.now()
    X['AGE'] = X['date_naissance'].dropna()
    X['AGE'] = pd.to_datetime(X['AGE'],errors='coerce')
    X['AGE'] = X['AGE'].map(lambda x : now.year-x.year  if now.month-x.month>0 else now.year - x.year -1 )
    del X['date_naissance']

def process_country_connexion(df):
    """
    Process country_connexion
    """
    dict_ = {'France':1, 'United Kingdom':2, 'other': 3, 'Germany': 4, 'Switzerland': 5, 'Netherlands': 6, 'Europe': 7, 'United States': 8, 'Reunion': 9, -2: -2}
    df['country_connexion'] = df['country_connexion_name'].map(dict_)
    del df['country_connexion_name']
    return df
    
def process_pays_naissance(df):
    """
    Process pays naissance
    """
    dict_ = {'FRA':1, 'ITA':2, 'other': 3, 'DZA': 4, 'BEN': 5, 'ESP': 6, 'VNM': 7, 'DEU': 8, 'MAR': 9, 'CIV': 10, 'CMR': 11, 'GTO': 12,
            'SEN': 13,'BEL': 14, 'CHN': 15, 'ROU':16, 'BRA': 17, 'MDG': 18,
            'PRT': 19, 'GBR': 20, 'LBN': 21, 'TUR': 22, 'IND': 23}
    df['pays_de_naissance'] = df['pays_naissance'].map(dict_)
    del df['pays_naissance']
    return df
  
def process_domain(df):
    """
    Process the domain features
    """
    dict_ = {'yahoo.fr':0, 'hotmail.fr':1, 'hotmail.com': 2, 'gmail.com': 3, 'orange.fr': 4, 'outlook.com': 5, 'free.fr': 6, 'laposte.net': 7, 'other': 8, 'neuf.fr': 9, 'wanadoo.fr': 10, 'me.com': 11, 'ymail.com': 12, 'sfr.fr': 13, 'live.fr': 14, 'bbox.fr': 15, 'outlook.fr': 16, 'msn.com': 17, 'yahoo.com': 18, 'aol.com': 19, 'icloud.com': 20, 'cegetel.net': 21, 'club-internet.fr': 22}
    df['domain'] = df['domain'].map(dict_)
    return df

In [19]:
X_insee, y_insee, data_insee= data_management(df_initial_insee)

In [20]:
data_insee.count()

campagne                                  44548
code_postal_naissance                     44548
csp                                       44548
flag_banque_principale                    44548
flag_epargne                              44548
patrimoine                                44548
regime_matrimonial                        44548
revenus_annuels                           44548
Code_postal                               44548
Salaire_net_horaire_moyen_en_2014_euro    44548
cible_seuil_1200                          44548
Parrain                                   44548
domain                                    44548
Sexe                                      44548
AGE                                       44548
country_connexion                         44548
pays_de_naissance                         44548
dtype: int64

In [52]:
print ("X", X_insee.shape)
print ("Y", y_insee.shape)

('X', (44548, 16))
('Y', (44548,))


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_insee, y_insee, test_size=0.30, random_state=54)

In [23]:
# Modèle 1: nearest_centroid
from sklearn.neighbors.nearest_centroid import NearestCentroid
KNC = NearestCentroid(metric='euclidean', shrink_threshold = 0.1)
KNC.fit(X_train, y_train)
y_pred = KNC.predict(X_test)

In [24]:
print(classification_report(y_test, y_pred))
print ('accuracy score : '+ str(accuracy_score(y_test, y_pred)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, y_pred)))

             precision    recall  f1-score   support

          0       0.78      0.45      0.57     10387
          1       0.23      0.56      0.32      2978

avg / total       0.66      0.47      0.52     13365

accuracy score : 0.474074074074

 confussion matrix:
[[4673 5714]
 [1315 1663]]


# Save model

In [60]:
from sklearn.neighbors.nearest_centroid import NearestCentroid
KNC = NearestCentroid(metric='euclidean', shrink_threshold = 0.5)
KNC.fit(X_insee, y_insee)

NearestCentroid(metric='euclidean', shrink_threshold=0.5)

In [37]:
## save the model to disk
#import cPickle
#with open('Model_v2.pkl', 'wb') as f:
#    cPickle.dump(KNC, f)

# Pickle saved

In [25]:
# Modèle 2 : RDF
forest_b= RandomForestClassifier(n_estimators=200,max_depth=201)
forest_b.fit(X_train, y_train)
cible_predit_avec_INSEE = forest_b.predict(X_test)

In [26]:
print ('accuracy score : '+ str(accuracy_score(y_test, cible_predit_avec_INSEE)))
print('\n classification_report:\n'+ str(classification_report(y_test, cible_predit_avec_INSEE)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, cible_predit_avec_INSEE)))

accuracy score : 0.767377478489

 classification_report:
             precision    recall  f1-score   support

          0       0.78      0.97      0.87     10387
          1       0.36      0.06      0.10      2978

avg / total       0.69      0.77      0.70     13365


 confussion matrix:
[[10083   304]
 [ 2805   173]]


In [27]:
## Modele 3:
from sklearn.model_selection import GridSearchCV
from sklearn import grid_search
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier,ExtraTreesClassifier,GradientBoostingClassifier,BaggingClassifier
from sklearn.calibration import CalibratedClassifierCV
parameters = {'n_estimators':[150,200,300]}
rfc= RandomForestClassifier()
m0=grid_search.GridSearchCV(rfc,parameters,cv=6)
m0 = m0.fit(X_train,y_train)



In [28]:
y_pred = m0.predict(X_test)
print y_pred

[0 0 0 ..., 0 0 0]


In [29]:
print ('accuracy score : '+ str(accuracy_score(y_test, y_pred)))
print('\n classification_report:\n'+ str(classification_report(y_test, y_pred)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, y_pred)))

accuracy score : 0.768724279835

 classification_report:
             precision    recall  f1-score   support

          0       0.78      0.97      0.87     10387
          1       0.38      0.06      0.10      2978

avg / total       0.69      0.77      0.70     13365


 confussion matrix:
[[10098   289]
 [ 2802   176]]


In [71]:
## Modele 4
parameters = {'n_estimators':[300,400], 'n_jobs':[-1]}
ET= ExtraTreesClassifier()
m1=grid_search.GridSearchCV(ET,parameters,cv=6,n_jobs=-1)
m1 = m1.fit(X_train,y_train)
proba1=pd.DataFrame(y_test)
P = m1.predict_proba(X_test)[:, 1]

In [72]:
y_pred1 = m1.predict(X_test)

In [73]:
print ('accuracy score : '+ str(accuracy_score(y_test, y_pred1)))
print('\n classification_report:\n'+ str(classification_report(y_test, y_pred1)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, y_pred1)))

accuracy score : 0.761242050131

 classification_report:
             precision    recall  f1-score   support

          0       0.78      0.96      0.86     10387
          1       0.35      0.09      0.14      2978

avg / total       0.69      0.76      0.70     13365


 confussion matrix:
[[9920  467]
 [2724  254]]


In [74]:
### Modele 5: 
parameters = {'n_estimators':[100,120,150]}
GB= GradientBoostingClassifier(max_depth=7)
m2=grid_search.GridSearchCV(GB,parameters,cv=6)
m2 = m2.fit(X_train,y_train)

In [75]:
y_pred2=m2.predict(X_test)

In [76]:
print ('accuracy score : '+ str(accuracy_score(y_test, y_pred2)))
print('\n classification_report:\n'+ str(classification_report(y_test, y_pred2)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, y_pred2)))

accuracy score : 0.772989150767

 classification_report:
             precision    recall  f1-score   support

          0       0.78      0.98      0.87     10387
          1       0.42      0.05      0.09      2978

avg / total       0.70      0.77      0.70     13365


 confussion matrix:
[[10175   212]
 [ 2822   156]]


In [79]:
## Modele 6: 
from sklearn.feature_selection import RFE,SelectFromModel
from sklearn import linear_model,decomposition
from sklearn.pipeline import Pipeline
rdm= RandomForestClassifier(n_estimators=200, n_jobs=-1)
logistic = linear_model.LogisticRegression(solver='liblinear',C=100,n_jobs=-1)
select=SelectFromModel(logistic,threshold="median")
a0 = CalibratedClassifierCV(rdm, cv=6, method='isotonic')
m3 = Pipeline(steps=[('feature_selection', select),('regression', a0)])
m3 = m3.fit(X_train,y_train)

In [84]:
y_pred3

array([0, 0, 0, ..., 0, 0, 0])

In [80]:
y_pred3 = m3.predict(X_test)

In [81]:
print ('accuracy score : '+ str(accuracy_score(y_test, y_pred3)))
print('\n classification_report:\n'+ str(classification_report(y_test, y_pred3)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, y_pred3)))

accuracy score : 0.777029554807

 classification_report:
             precision    recall  f1-score   support

          0       0.78      1.00      0.87     10387
          1       0.00      0.00      0.00      2978

avg / total       0.60      0.78      0.68     13365


 confussion matrix:
[[10385     2]
 [ 2978     0]]


In [85]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit_transform(X_insee)

array([[ -1.00000000e+00,   5.94300000e+04,   1.50000000e+01, ...,
          4.70000000e+01,   1.00000000e+00,   1.00000000e+00],
       [  4.56000000e+02,   3.80000000e+04,   2.00000000e+00, ...,
          4.70000000e+01,   1.00000000e+00,   1.00000000e+00],
       [  8.68000000e+02,   9.21500000e+04,   2.00000000e+00, ...,
          5.90000000e+01,   1.00000000e+00,   1.00000000e+00],
       ..., 
       [  8.68000000e+02,   6.94000000e+04,   1.60000000e+01, ...,
          2.00000000e+01,   1.00000000e+00,   1.00000000e+00],
       [  3.07000000e+02,   9.90000000e+01,   6.00000000e+00, ...,
          6.50000000e+01,   1.00000000e+00,   4.00000000e+00],
       [  8.68000000e+02,   7.80000000e+04,   1.20000000e+01, ...,
          1.90000000e+01,   9.00000000e+00,   1.00000000e+00]])

In [86]:
X_insee.shape

(44548, 16)

In [88]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_insee, y_insee)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X_insee)


In [94]:
X_new = pd.DataFrame(X_new)

In [101]:
X_insee.head(1)

Unnamed: 0_level_0,campagne,code_postal_naissance,csp,flag_banque_principale,flag_epargne,patrimoine,regime_matrimonial,revenus_annuels,Code_postal,Salaire_net_horaire_moyen_en_2014_euro,Parrain,domain,Sexe,AGE,country_connexion,pays_de_naissance
contactid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
6546762003,-1.0,59430.0,15.0,1.0,0.0,-1.0,2.0,2.0,-2.0,-2.0,0,-2.0,-1,47.0,1.0,1.0


In [102]:
X_new.head(1) ## Code postal naissance and flag epargne removed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,-1.0,15.0,1.0,0.0,-1.0,2.0,2.0,-2.0,0.0,-2.0,-1.0,47.0,1.0,1.0


In [96]:
X_train, X0_test, y0_train, y0_test = train_test_split(X_new, y_insee, test_size=0.30, random_state=54)

In [97]:
# Modèle 1: nearest_centroid
from sklearn.neighbors.nearest_centroid import NearestCentroid
KNC = NearestCentroid(metric='euclidean', shrink_threshold = 0.1)
KNC.fit(X_train, y_train)

NearestCentroid(metric='euclidean', shrink_threshold=0.1)

In [98]:
Y = KNC.predict(X0_test)

In [99]:
print ('accuracy score : '+ str(accuracy_score(y_test, Y)))
print('\n classification_report:\n'+ str(classification_report(y_test, Y)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, Y)))

accuracy score : 0.572764683876

 classification_report:
             precision    recall  f1-score   support

          0       0.81      0.59      0.68     10387
          1       0.26      0.50      0.34      2978

avg / total       0.68      0.57      0.61     13365


 confussion matrix:
[[6157 4230]
 [1480 1498]]


In [None]:
## opon the pickle 
#with open('Model_v2.pkl', 'rb') as f:
    pickle.dump(KNC2, f)

In [180]:
### Grid search cv
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

In [None]:
scores = ['precision', 'recall']

In [186]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
parameters = {'kernel':('linear', 'rbf'), 'C':[10, 15, 20]}

In [None]:
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)

In [None]:
print ('accuracy score : '+ str(accuracy_score(y_test, y_pred3)))
print('\n classification_report:\n'+ str(classification_report(y_test, y_pred3)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, y_pred3)))

# Proba

In [None]:
u= cible
v="proba"
probabilite = "%s_%s" %(u,v)
predicted_target = "predicted_target_" + cible