# Prédiction des clients prospects 

In [118]:
### Import
import sys
import csv
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, recall_score, classification_report, confusion_matrix, f1_score, roc_auc_score
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors.nearest_centroid import NearestCentroid

In [119]:
### Load data
df_initial = pd.read_csv('result.csv', sep=',')  ##Sans encours_p et insee
del df_initial['Unnamed: 0']
df_initial.head()

Unnamed: 0,contactid,annee_mois,campagne,campaign,civilite,code_postal,code_postal_naissance,country_connexion_name,csp,date_naissance,...,patrimoine,pays,pays_naissance,regime_matrimonial,revenus_annuels,service,timestamp,id_dim_personne,encours,cible
0,6546762003,,-1,Source URL non trouvée,,,59430,France,15,,...,-1,,FRA,2,2,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-09-21T01:28:13.601+0200,6546762003,5.0,0
1,6742433330,,456,REC_Liens textes autopromotion 456,0,31130.0,38000,France,2,11/04/1969,...,2,FRA,FRA,4,2,SFOL_AJOUTER_CB,2016-10-31T18:16:38.719+0100,6742433330,0.0,0
2,6856828837,201611.0,868,PAR_Onlin_Site _ECard,MR,92000.0,92150,France,2,11/05/1957,...,2,FRA,FRA,2,4,monprofil.PROSPECT/CREER_FICHE_PROSPECT,2016-11-21T12:01:44.736+0100,6856828837,8899.93,1
3,6754882186,,300,Intb_Banque_Autre,1,94800.0,93420,France,2,24/06/1988,...,0,FRA,FRA,2,2,SFOL_AJOUTER_CB,2016-11-04T10:46:58.447+0100,6754882186,10.0,0
4,6289817192,201611.0,300,Intb_Banque_Autre,MR,26170.0,84600,Switzerland,2,01/05/1994,...,0,FRA,FRA,0,2,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-11-21T17:36:53.976+0100,6289817192,173.53,0


In [108]:
df_initial.count()

contactid                 44548
annee_mois                30217
campagne                  44183
campaign                  44183
civilite                  37791
code_postal               38577
code_postal_naissance     44219
country_connexion_name    43753
csp                       44548
date_naissance            38577
debit_cb                  10847
firstnamesponsor          19033
flag_banque_principale    44548
flag_epargne              44511
id_dim_temps              41565
mail                      38577
mailing_accord            38577
namesponsor               19033
nature_cb                 10847
patrimoine                44548
pays                      38577
pays_naissance            44219
regime_matrimonial        44548
revenus_annuels           44548
service                   44548
timestamp                 44548
id_dim_personne           44548
encours                   43554
cible                     44548
dtype: int64

In [109]:
del df_initial['cible']

In [110]:
#Fixer encours des bons clients 
df_initial['cible_seuil_1200'] = df_initial['encours'].map(lambda x: 0 if x < 1200.00 else 1).astype(int)
df_initial[['encours','cible_seuil_1200']].head()

Unnamed: 0,encours,cible_seuil_1200
0,5.0,0
1,0.0,0
2,8899.93,1
3,10.0,0
4,173.53,0


In [111]:
df = df_initial[df_initial.id_dim_personne == 6598695066]
df.head()

Unnamed: 0,contactid,annee_mois,campagne,campaign,civilite,code_postal,code_postal_naissance,country_connexion_name,csp,date_naissance,...,patrimoine,pays,pays_naissance,regime_matrimonial,revenus_annuels,service,timestamp,id_dim_personne,encours,cible_seuil_1200
16617,6598695066,,300,Intb_Banque_Autre,0,44320,67000,France,2,05/03/1978,...,0,FRA,FRA,2,2,SFOL_AJOUTER_CB,2016-11-06T21:16:32.985+0100,6598695066,5462.85,1


In [112]:
df_initial.head()

Unnamed: 0,contactid,annee_mois,campagne,campaign,civilite,code_postal,code_postal_naissance,country_connexion_name,csp,date_naissance,...,patrimoine,pays,pays_naissance,regime_matrimonial,revenus_annuels,service,timestamp,id_dim_personne,encours,cible_seuil_1200
0,6546762003,,-1,Source URL non trouvée,,,59430,France,15,,...,-1,,FRA,2,2,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-09-21T01:28:13.601+0200,6546762003,5.0,0
1,6742433330,,456,REC_Liens textes autopromotion 456,0,31130.0,38000,France,2,11/04/1969,...,2,FRA,FRA,4,2,SFOL_AJOUTER_CB,2016-10-31T18:16:38.719+0100,6742433330,0.0,0
2,6856828837,201611.0,868,PAR_Onlin_Site _ECard,MR,92000.0,92150,France,2,11/05/1957,...,2,FRA,FRA,2,4,monprofil.PROSPECT/CREER_FICHE_PROSPECT,2016-11-21T12:01:44.736+0100,6856828837,8899.93,1
3,6754882186,,300,Intb_Banque_Autre,1,94800.0,93420,France,2,24/06/1988,...,0,FRA,FRA,2,2,SFOL_AJOUTER_CB,2016-11-04T10:46:58.447+0100,6754882186,10.0,0
4,6289817192,201611.0,300,Intb_Banque_Autre,MR,26170.0,84600,Switzerland,2,01/05/1994,...,0,FRA,FRA,0,2,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-11-21T17:36:53.976+0100,6289817192,173.53,0


# DF sans parrain

In [113]:
### Data management
def data_management(df):
    
    df['code_postal'] = df['code_postal'].replace('',-2)
    df['code_postal'] = pd.to_numeric(df['code_postal'])
    
    df['code_postal_naissance'] = pd.to_numeric(df['code_postal_naissance'])
   
    df['country_connexion_name'] = df['country_connexion_name'].replace(['',None],-2)
    
    df['date_naissance'] = df['date_naissance'].replace('','01/01/1800')
    
    df['flag_epargne'] = pd.to_numeric(df['flag_epargne'])
    
    #df['mailing_accord'] = df['mailing_accord'].replace(['',None],-2)
    #df['mailing_accord'] = pd.to_numeric(df['mailing_accord'])
    del df['mailing_accord']
    
    df['nature_cb'] = df['nature_cb'].replace(['',None],-2)
    df['nature_cb'] = pd.to_numeric(df['nature_cb'])
    
    df['pays_naissance'] = df['pays_naissance'].replace(['',None],-2)
    
    ### get age
    get_age(df)

    ### Get flag parrain from namesponsor
    df['Parrain'] = df['namesponsor'].map(lambda x: 0 if pd.isnull(x) else 1).astype(int)
    del df['firstnamesponsor'] 
    del df['namesponsor'] 
    
    ### Get domain from mail
    df['mail'] = df.mail.replace ([np.nan], '')
    df['domain'] = df['mail'].map(lambda x: x.split("@")[1] if x else None)
    df['domain'] = df.domain.str.lower()
    del df['mail']

    ### Get Sex from Civility
    df['Sex'] = df['civilite'].map(get_sex)
    del df['civilite']
    
    ### Delete useless cols
    del df['annee_mois']
    del df['id_dim_temps']
    #del df['id_dim_personne']
    del df['encours']
    del df['timestamp']
    del df['service']
    del df['campaign']
    del df['contactid']
    del df['pays']
    del df['debit_cb']
    
    ### transform to numeric when possible
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
    df['country_connexion_name'] = df['country_connexion_name'].replace ([np.nan], '')
    df['domain'] = df['domain'].replace ([np.nan], '')
    df['pays_naissance'] = df['pays_naissance'].replace ([np.nan], '')
    df.loc[df['country_connexion_name'].value_counts()[df['country_connexion_name']].values < 40, 'country_connexion_name'] = 'other'
    df.loc[df['domain'].value_counts()[df['domain']].values < 40, 'domain'] = 'other'
    df.loc[df['pays_naissance'].value_counts()[df['pays_naissance']].values < 40, 'pays_naissance'] = 'other'
    
    
    ### Get country connexion name
    df['country_connexion'] = df['country_connexion_name'].map(process_country_connexion)
    del df['country_connexion_name']
    ### Get pays de naissance
    df['pays_de_naissance'] = df['pays_naissance'].map(process_pays_naissance)
    del df['pays_naissance']
    
    ### Process the domain
    df = process_domain (df)
    
    ### Drop NaN 
    df = df.dropna()
    
    features_df = df.drop('cible_seuil_1200', axis=1)
    #features_df = features_df.drop('id_dim_personne',axis=1)
    #features_df = features_df.drop('annee_mois',axis=1)
    target = df['cible_seuil_1200']
    return features_df, target, df


def get_sex(x):
    if x == "0":
        return 1
    elif (x == "1"):
        return 0
    elif (x == "2"):
        return 0
    elif (x == "MR"):
        return 1
    elif (x == "MLE"):
        return 0
    elif (x == "MME"):
        return 0
    else :
        return -1

def get_age(X):
    from datetime import datetime
    now = datetime.now()
    X['AGE'] = X['date_naissance'].dropna()
    X['AGE'] = pd.to_datetime(X['AGE'],errors='coerce')
    X['AGE'] = X['AGE'].map(lambda x : now.year-x.year  if now.month-x.month>0 else now.year - x.year -1 )
    del X['date_naissance']
    
def process_country_connexion(x):
    
    if x == 'France':
        return 1
    elif (x == 'United Kingdom'):
        return 2
    elif (x == 'other'):
        return 3
    elif (x == 'Germany'):
        return 4
    elif (x == 'Switzerland'):
        return 5
    elif (x == 'Netherlands'):
        return 6
    elif (x == 'Europe'):
        return 7
    elif (x == 'United States'):
        return 8
    elif (x == 'Reunion'):
        return 9
    elif (x==-2):
        return -2
    
def process_pays_naissance(x):

    if x == 'FRA':
        return 1
    elif (x == 'ITA'):
        return 2
    elif (x == 'other'):
        return 3
    elif (x == 'DZA'):
        return 4
    elif (x == 'BEN'):
        return 5
    elif (x == 'ESP'):
        return 6
    elif (x == 'VNM'):
        return 7
    elif (x == 'DEU'):
        return 8
    elif (x == 'MAR'):
        return 9
    elif (x=='CIV'):
        return 10
    elif (x=='CMR'):
        return 10
    elif (x=='GTO'):
        return 11
    elif (x=='SEN'):
        return 12
    elif (x=='BEL'):
        return 13
    elif (x=='CHN'):
        return 14
    elif (x=='ROU'):
        return 15
    elif (x=='BRA'):
        return 16
    elif (x=='MDG'):
        return 17
    elif (x=='PRT'):
        return 18
    elif (x=='GBR'):
        return 19
    elif (x=='LBN'):
        return 20
    elif (x=='TUR'):
        return 21
    elif (x=='IND'):
        return 22
    
def process_domain(df):
    """
    Process the domain features
    """
    dict_ = {'yahoo.fr':0, 'hotmail.fr':1, 'hotmail.com': 2, 'gmail.com': 3, 'orange.fr': 4, 'outlook.com': 5, 'free.fr': 6, 'laposte.net': 7, 'other': 8, 'neuf.fr': 9, 'wanadoo.fr': 10, 'me.com': 11, 'ymail.com': 12, 'sfr.fr': 13, 'live.fr': 14, 'bbox.fr': 15, 'outlook.fr': 16, 'msn.com': 17, 'yahoo.com': 18, 'aol.com': 19, 'icloud.com': 20, 'cegetel.net': 21, 'club-internet.fr': 22}
    df['domain'] = df['domain'].map(dict_)
    return df

In [114]:
X, y, data= data_management(df_initial)

In [115]:
data.id_dim_personne.count()

37170

In [11]:
#data = data.set_index('id_dim_personne')
#data = data.drop(['id_dim_personne'])
#data.reset_index(level=0, inplace=True)
#del data['index']
data.head()

Unnamed: 0,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,id_dim_personne,cible_seuil_1200,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance
1,456,31130,38000,2,0,0,1,2,4,2,6742433330,0,47,0,0,1,1,1
2,868,92000,92150,2,1,0,-2,2,2,4,6856828837,1,59,0,6,1,1,1
3,300,94800,93420,2,0,0,1,0,2,2,6754882186,0,28,0,1,0,1,1
4,300,26170,84600,2,0,0,-2,0,0,2,6289817192,0,23,0,3,1,5,1
5,307,73150,92290,2,0,0,1,3,2,2,6748699778,0,54,0,2,1,1,1


In [12]:
test = data[data.id_dim_personne == 6856828837]
test
#X = X.set_index('id_dim_personne')

Unnamed: 0,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,id_dim_personne,cible_seuil_1200,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance
2,868,92000,92150,2,1,0,-2,2,2,4,6856828837,1,59,0,6,1,1,1


In [13]:
## Travailler avec nouv X et y ayant id_dim_personne comme index
data = data.set_index('id_dim_personne')
X = data.drop ('cible_seuil_1200', axis = 1)
y = data ['cible_seuil_1200']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=54)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(26019, 16)
(11151, 16)
(26019,)
(11151,)


In [15]:
X_test.dtypes

campagne                  float64
code_postal               float64
code_postal_naissance     float64
csp                       float64
flag_banque_principale    float64
flag_epargne              float64
nature_cb                 float64
patrimoine                float64
regime_matrimonial        float64
revenus_annuels           float64
AGE                       float64
Parrain                     int64
domain                    float64
Sex                         int64
country_connexion         float64
pays_de_naissance         float64
dtype: object

In [16]:
# Modèle 3: nearest_centroid
from sklearn.neighbors.nearest_centroid import NearestCentroid
KNC = NearestCentroid(metric='euclidean', shrink_threshold = 0.5)
KNC.fit(X, y)

NearestCentroid(metric='euclidean', shrink_threshold=0.5)

In [28]:
# Concéténer X et y test en fct de l'id_dim_personne
df_total = pd.concat([X, y], axis=1)
df_total.head()

Unnamed: 0_level_0,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance,cible_seuil_1200
id_dim_personne,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
6742433330,456,31130,38000,2,0,0,1,2,4,2,47,0,0,1,1,1,0
6856828837,868,92000,92150,2,1,0,-2,2,2,4,59,0,6,1,1,1,1
6754882186,300,94800,93420,2,0,0,1,0,2,2,28,0,1,0,1,1,0
6289817192,300,26170,84600,2,0,0,-2,0,0,2,23,0,3,1,5,1,0
6748699778,307,73150,92290,2,0,0,1,3,2,2,54,0,2,1,1,1,0


In [29]:
df_total["nouvelle_cible_predit"] = KNC.predict(X)
#df_total["cible_predit_sans_encoursParrain_sans_insee"] = KNC.predict(X)
df_total

Unnamed: 0_level_0,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance,cible_seuil_1200,nouvelle_cible_predit
id_dim_personne,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
6742433330,456,31130,38000,2,0,0,1,2,4,2,47,0,0,1,1,1,0,1
6856828837,868,92000,92150,2,1,0,-2,2,2,4,59,0,6,1,1,1,1,1
6754882186,300,94800,93420,2,0,0,1,0,2,2,28,0,1,0,1,1,0,1
6289817192,300,26170,84600,2,0,0,-2,0,0,2,23,0,3,1,5,1,0,1
6748699778,307,73150,92290,2,0,0,1,3,2,2,54,0,2,1,1,1,0,1
6856855432,307,75015,75015,18,1,0,-2,0,0,1,24,0,3,1,1,1,0,1
6821231675,868,69001,69008,3,1,0,0,0,0,1,27,1,3,0,1,1,1,1
6747566365,868,75015,42000,2,0,1,1,0,0,2,28,1,3,0,1,1,0,0
6888675097,1543,59161,59300,12,0,0,-2,0,0,0,20,0,3,0,1,1,0,1
6879474578,300,17810,67000,16,0,0,-2,1,2,3,36,0,1,1,1,1,0,1


In [31]:
print(classification_report(y, df_total.nouvelle_cible_predit))
print ('accuracy score : '+ str(accuracy_score(y, df_total.nouvelle_cible_predit)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y, df_total.nouvelle_cible_predit)))

             precision    recall  f1-score   support

          0       0.78      0.37      0.50     28766
          1       0.23      0.65      0.34      8404

avg / total       0.66      0.43      0.46     37170

accuracy score : 0.429136400323

 confussion matrix:
[[10516 18250]
 [ 2969  5435]]


In [22]:
test = df_total[df_total.id_dim_personne == 7500964054]
test

AttributeError: 'DataFrame' object has no attribute 'id_dim_personne'

In [None]:
#################### Test sur un seul contact
data_t = data[data.id_dim_personne != 6856828837]
X_t = data_t.drop('cible' ,axis=1)
y_t = data_t.cible
data_test = data[data.id_dim_personne == 6856828837]
data_test = data_test.reset_index()
del data_test['index']
data_test
####################

In [331]:
### Modèle 1 : RDF
classifier= RandomForestClassifier(n_estimators=200,max_depth=210)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=210, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [332]:
df_total["cible_predit_sans_encoursParrain_sans_insee_2"] = classifier.predict(X_test)
df_total.head()

Unnamed: 0,level_0,index,id_dim_personne,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,...,revenus_annuels,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance,cible_seuil_1200,cible_predit_sans_encoursParrain_sans_insee_2,cible_predit_sans_encoursParrain_sans_insee
0,0,0,7024879754,456,75015,75014,2,0,1,-2,...,2,28,1,8,1,1,1,1,0,0
1,1,1,7439619676,300,77000,99,15,0,0,-2,...,1,26,0,3,1,1,12,0,0,0
2,2,2,6704851794,868,77176,93200,12,0,1,1,...,2,23,1,3,-1,1,1,0,1,0
3,3,3,7633259988,868,95110,27200,4,1,1,-2,...,1,25,1,3,0,1,1,0,0,0
4,4,4,6598695066,300,44320,67000,2,1,1,1,...,2,39,0,2,1,1,1,1,1,0


In [333]:
print ('accuracy score : '+ str(accuracy_score(y_test, df_total.cible_predit_sans_encoursParrain_sans_insee)))
print('classification_report:'+ str(classification_report(y_test, df_total.cible_predit_sans_encoursParrain_sans_insee)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, df_total.cible_predit_sans_encoursParrain_sans_insee)))

accuracy score : 0.767913191642
classification_report:             precision    recall  f1-score   support

          0       0.78      0.98      0.87      8607
          1       0.44      0.06      0.10      2544

avg / total       0.70      0.77      0.69     11151


 confussion matrix:
[[8415  192]
 [2396  148]]


In [23]:
df_total = df_total.reset_index()

### Indexing in ES

In [32]:
__author__ = 'ubuntu'

# Cassandra metadata
dev_cassandra_host= 'dtl-cassandra01-d01'
dev_cassandra_port= 9200
dev_username= 'loaddata'
dev_password= 'DecujRiQuigByaibdednofVerr6Odij2'

# Elasticsearch hostname
dev_es_host= 'dtl-esmaster01-d01'

# Elasticsearch port
dev_es_port= 9200
es_login= 'dtl-spark'
es_password= 'taquivvukyuztAckufneglugfisipBio'
    
# Path to pickle one month 'encours' and csv matching files
path_to_one_month_pickle = "./MODEL/ENCOURS_1MOIS_v2/"

# One month model target name
one_month_target_name = 'ENCOURS_1MOIS_v2'

In [33]:
df_total = df_total.reset_index()

In [35]:
df_to_be_indexed = df_total[['id_dim_personne','cible_seuil_1200','nouvelle_cible_predit']]
print(df_to_be_indexed)

       id_dim_personne  cible_seuil_1200  nouvelle_cible_predit
0           6742433330                 0                      1
1           6856828837                 1                      1
2           6754882186                 0                      1
3           6289817192                 0                      1
4           6748699778                 0                      1
5           6856855432                 0                      1
6           6821231675                 1                      1
7           6747566365                 0                      0
8           6888675097                 0                      1
9           6879474578                 0                      1
10          6821501450                 0                      0
11          6873248390                 1                      0
12          6901251821                 0                      1
13          6946289020                 0                      1
14          6879474016                 0

In [36]:
df_es = df_to_be_indexed[df_to_be_indexed.id_dim_personne == 7501471010]
df_es

Unnamed: 0,id_dim_personne,cible_seuil_1200,nouvelle_cible_predit
29049,7501471010,0,0


In [346]:
#Insert into es v2
es.update_index_retro(df_to_be_indexed, 'retro', 'retro', 'id_dim_personne')

NameError: name 'es' is not defined

In [43]:
#df = df_to_be_indexed[df_to_be_indexed.id_dim_personne [7500945441, 7500854058,7501227214] ]

KeyError: (7500945441, 7500854058, 7501227214)

In [130]:
### Timestamp to be indexde
df_init = pd.read_csv('result.csv', sep=',')
del df_init ["Unnamed: 0"]
df_timestamp = df_init[['annee_mois','contactid']]
df_timestamp.head()

Unnamed: 0,annee_mois,contactid
0,,6546762003
1,,6742433330
2,201611.0,6856828837
3,,6754882186
4,201611.0,6289817192


In [25]:
test = data[data.id_dim_personne== 7500964054]
test

Unnamed: 0,campagne,code_postal,code_postal_naissance,csp,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,id_dim_personne,cible,AGE,Parrain,domain,Sex,country_connexion,pays_de_naissance,cible_predit_sans_encoursParrain_sans_insee,y_pred
41788,868,93600,91130,13,1,1,-2,1,1,2,7500964054,1,35,1,3,1,-2,1,1,1


In [46]:
### Create ES Instance ###
from elasticsearch import Elasticsearch
import math

class ElasticsearchClient:

    def __init__(self):
        self.host = 'dtl-esmaster01-d01'
        self.port = 9200
        self.user = 'dtl-spark'
        self.secret = 'taquivvukyuztAckufneglugfisipBio'
        self.session = None

    def create_session(self):
        self.session = Elasticsearch(self.host,
                                     http_auth=(self.user, self.secret),
                                     use_ssl=True,
                                     verify_certs=False,
                                     sniff_on_start=True,
                                     sniff_on_connection_fail=True,
                                     sniffer_timeout=60)

    def index_purgatory(self,index):
        self.session.indices.delete(index=index)

    def index_table(self,table,index_name,doc_type,doc_id):
        for index, row in table.iterrows():
            data_dict = {}
            for i in range(len(row)):
                data_dict[table.columns[i]] = row[i]
            index_stmt = self.session.index(index=index_name, doc_type=doc_type, body=data_dict, id=data_dict[doc_id])

    def convert_float(self, number):
        if math.isnan(number) :
            return '201611'
        else :
            return str(int(number))

    def update_index(self,table,index_name,doc_type,doc_id):
        for index, row in table.iterrows():
            data_dict = {}
            row['annee_mois'] = self.convert_float(row['annee_mois'])
            #del row['annee_mois']
            for i in range(len(row)):
                data_dict[table.columns[i]] = row[i]
            update_stmt = self.session.update(index=index_name + row['annee_mois'], doc_type=doc_type, body = {"doc": data_dict, "doc_as_upsert":True} , id=data_dict[doc_id]) #, "index.mapping.ignore_malformed":True
            ###es.update.retry.on.conflict

    def update_index_retro(self,table,index_name,doc_type,doc_id):
        for index, row in table.iterrows():
            data_dict = {}
            for i in range(len(row)):
                data_dict[table.columns[i]] = row[i]
            update_stmt = self.session.update(index=index_name, doc_type=doc_type, body = {"doc": data_dict, "doc_as_upsert":True} , id=data_dict[doc_id]) #, "index.mapping.ignore_malformed":True
            ###es.update.retry.on.conflict

    def create_indices(self):
        try:
            self.session.indices.create(index='index', ignore=400)
        except elasticsearch.ElasticsearchException as es1:
            print('error es')





In [47]:
### Create ES Instance ###
es = ElasticsearchClient()
es.create_session()

  'Connecting to %s using SSL with verify_certs=False is insecure.' % host)
  'Connecting to %s using SSL with verify_certs=False is insecure.' % host)
  'Connecting to %s using SSL with verify_certs=False is insecure.' % host)
  'Connecting to %s using SSL with verify_certs=False is insecure.' % host)
  'Connecting to %s using SSL with verify_certs=False is insecure.' % host)
  'Connecting to %s using SSL with verify_certs=False is insecure.' % host)
  'Connecting to %s using SSL with verify_certs=False is insecure.' % host)


In [29]:
#Insert into es v2
es.update_index_retro(df_es, 'retro', 'retro', 'id_dim_personne')

  if param in SKIP_IN_PATH:
  quote_plus(_escape(p), b',*') for p in parts if p not in SKIP_IN_PATH)


In [74]:
es = ElasticsearchClient()


NameError: global name 'config' is not defined

# Avec parrain

In [120]:
# df_init_p = pd.read_csv('df_new_initial.csv', sep=',') 
df_init_p = pd.read_csv('result.csv', sep=',') 
del df_init_p ['Unnamed: 0']
del df_init_p ['cible']
df_init_p.head()

Unnamed: 0,contactid,annee_mois,campagne,campaign,civilite,code_postal,code_postal_naissance,country_connexion_name,csp,date_naissance,...,nature_cb,patrimoine,pays,pays_naissance,regime_matrimonial,revenus_annuels,service,timestamp,id_dim_personne,encours
0,6546762003,,-1,Source URL non trouvée,,,59430,France,15,,...,,-1,,FRA,2,2,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-09-21T01:28:13.601+0200,6546762003,5.0
1,6742433330,,456,REC_Liens textes autopromotion 456,0,31130.0,38000,France,2,11/04/1969,...,1.0,2,FRA,FRA,4,2,SFOL_AJOUTER_CB,2016-10-31T18:16:38.719+0100,6742433330,0.0
2,6856828837,201611.0,868,PAR_Onlin_Site _ECard,MR,92000.0,92150,France,2,11/05/1957,...,,2,FRA,FRA,2,4,monprofil.PROSPECT/CREER_FICHE_PROSPECT,2016-11-21T12:01:44.736+0100,6856828837,8899.93
3,6754882186,,300,Intb_Banque_Autre,1,94800.0,93420,France,2,24/06/1988,...,1.0,0,FRA,FRA,2,2,SFOL_AJOUTER_CB,2016-11-04T10:46:58.447+0100,6754882186,10.0
4,6289817192,201611.0,300,Intb_Banque_Autre,MR,26170.0,84600,Switzerland,2,01/05/1994,...,,0,FRA,FRA,0,2,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-11-21T17:36:53.976+0100,6289817192,173.53


In [121]:
df_init_p.contactid.count()

44548

In [122]:
#Fixer encours des bons clients 
df_init_p['cible'] = df_init_p['encours'].map(lambda x: 0 if x < 1200.00 else 1).astype(int)
df_init_p[['encours','cible']].head()

Unnamed: 0,encours,cible
0,5.0,0
1,0.0,0
2,8899.93,1
3,10.0,0
4,173.53,0


In [96]:
### Data management
def data_management_p(df):
    
    ### Fill empty value
    
    df['code_postal'] = df['code_postal'].replace('',-2)
    df['code_postal'] = pd.to_numeric(df['code_postal'])
    
    df['code_postal_naissance'] = pd.to_numeric(df['code_postal_naissance'])
   
    df['country_connexion_name'] = df['country_connexion_name'].replace(['',None],-2)
    
    df['date_naissance'] = df['date_naissance'].replace('','01/01/1800')
    
    df['flag_epargne'] = pd.to_numeric(df['flag_epargne'])
    
    #df['mailing_accord'] = df['mailing_accord'].replace(['',None],-2)
    #df['mailing_accord'] = pd.to_numeric(df['mailing_accord'])
    del df['mailing_accord']
    
    df['nature_cb'] = df['nature_cb'].replace(['',None],-2)
    df['nature_cb'] = pd.to_numeric(df['nature_cb'])
    
    df['pays_naissance'] = df['pays_naissance'].replace(['',None],-2)
    
    ### get age
    get_age(df)
    

    ### Get flag parrain from namesponsor
    #df['Parrain'] = df['namesponsor'].map(lambda x: 0 if pd.isnull(x) else 1).astype(int)
    del df['firstnamesponsor'] 
    del df['namesponsor'] 
    
    
    ### Get domain from mail
    df['mail'] = df.mail.replace ([np.nan], '')
    df['domain'] = df['mail'].map(lambda x: x.split("@")[1] if x else None)
    df['domain'] = df.domain.str.lower()
    del df['mail']

    ### Get Sex from Civility
    df['Sex'] = df['civilite'].map(get_sex)
    del df['civilite']
    
    ### Delete useless cols
    del df['annee_mois']
    del df['id_dim_temps']
    del df['id_dim_personne']
    del df['encours']
    del df['timestamp']
    del df['service']
    del df['campaign']

    del df['pays']
    
    ### transform to numeric when possible
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
    df['country_connexion_name'] = df['country_connexion_name'].replace ([np.nan], '')
    df['domain'] = df['domain'].replace ([np.nan], '')
    df['pays_naissance'] = df['pays_naissance'].replace ([np.nan], '')
    df.loc[df['country_connexion_name'].value_counts()[df['country_connexion_name']].values < 40, 'country_connexion_name'] = 'other'
    df.loc[df['domain'].value_counts()[df['domain']].values < 40, 'domain'] = 'other'
    df.loc[df['pays_naissance'].value_counts()[df['pays_naissance']].values < 40, 'pays_naissance'] = 'other'
    
    
    ### Get country connexion name
    df['country_connexion'] = df['country_connexion_name'].map(process_country_connexion)
    del df['country_connexion_name']
    ### Get pays de naissance
    df['pays_de_naissance'] = df['pays_naissance'].map(process_pays_naissance)
    del df['pays_naissance']
    
    ### Process the domain
    df = process_domain (df)
    
    ### Drop NaN 
    df = df.dropna()
    
    return df


def get_sex(x):
    if x == "0":
        return 1
    elif (x == "1"):
        return 0
    elif (x == "2"):
        return 0
    elif (x == "MR"):
        return 1
    elif (x == "MLE"):
        return 0
    elif (x == "MME"):
        return 0
    else :
        return -1

def get_age(X):
    from datetime import datetime
    now = datetime.now()
    X['AGE'] = X['date_naissance'].dropna()
    X['AGE'] = pd.to_datetime(X['AGE'],errors='coerce')
    X['AGE'] = X['AGE'].map(lambda x : now.year-x.year  if now.month-x.month>0 else now.year - x.year -1 )
    del X['date_naissance']
    
def process_country_connexion(x):
    
    if x == 'France':
        return 1
    elif (x == 'United Kingdom'):
        return 2
    elif (x == 'other'):
        return 3
    elif (x == 'Germany'):
        return 4
    elif (x == 'Switzerland'):
        return 5
    elif (x == 'Netherlands'):
        return 6
    elif (x == 'Europe'):
        return 7
    elif (x == 'United States'):
        return 8
    elif (x == 'Reunion'):
        return 9
    elif (x==-2):
        return -2
    
def process_pays_naissance(x):

    if x == 'FRA':
        return 1
    elif (x == 'ITA'):
        return 2
    elif (x == 'other'):
        return 3
    elif (x == 'DZA'):
        return 4
    elif (x == 'BEN'):
        return 5
    elif (x == 'ESP'):
        return 6
    elif (x == 'VNM'):
        return 7
    elif (x == 'DEU'):
        return 8
    elif (x == 'MAR'):
        return 9
    elif (x=='CIV'):
        return 10
    elif (x=='CMR'):
        return 10
    elif (x=='GTO'):
        return 11
    elif (x=='SEN'):
        return 12
    elif (x=='BEL'):
        return 13
    elif (x=='CHN'):
        return 14
    elif (x=='ROU'):
        return 15
    elif (x=='BRA'):
        return 16
    elif (x=='MDG'):
        return 17
    elif (x=='PRT'):
        return 18
    elif (x=='GBR'):
        return 19
    elif (x=='LBN'):
        return 20
    elif (x=='TUR'):
        return 21
    elif (x=='IND'):
        return 22
    
def process_domain(x):
    """
    Process the domain features
    """
    dict_ = {'yahoo.fr':0, 'hotmail.fr':1, 'hotmail.com': 2, 'gmail.com': 3, 'orange.fr': 4, 'outlook.com': 5, 'free.fr': 6, 'laposte.net': 7, 'other': 8, 'neuf.fr': 9, 'wanadoo.fr': 10, 'me.com': 11, 'ymail.com': 12, 'sfr.fr': 13, 'live.fr': 14, 'bbox.fr': 15, 'outlook.fr': 16, 'msn.com': 17, 'yahoo.com': 18, 'aol.com': 19, 'icloud.com': 20, 'cegetel.net': 21, 'club-internet.fr': 22}
    x['domain'] = x['domain'].map(dict_)
    return x

In [123]:
### Data management
def data_management(df):
    
    df['code_postal'] = df['code_postal'].replace('',-2)
    df['code_postal'] = pd.to_numeric(df['code_postal'])
    
    df['code_postal_naissance'] = pd.to_numeric(df['code_postal_naissance'])
   
    df['country_connexion_name'] = df['country_connexion_name'].replace(['',None],-2)
    
    df['date_naissance'] = df['date_naissance'].replace('','01/01/1800')
    
    df['flag_epargne'] = pd.to_numeric(df['flag_epargne'])
    
    #df['mailing_accord'] = df['mailing_accord'].replace(['',None],-2)
    #df['mailing_accord'] = pd.to_numeric(df['mailing_accord'])
    del df['mailing_accord']
    
    df['nature_cb'] = df['nature_cb'].replace(['',None],-2)
    df['nature_cb'] = pd.to_numeric(df['nature_cb'])
    
    df['pays_naissance'] = df['pays_naissance'].replace(['',None],-2)
    
    ### get age
    get_age(df)

    ### Get flag parrain from namesponsor
    df['Parrain'] = df['namesponsor'].map(lambda x: 0 if pd.isnull(x) else 1).astype(int)
    del df['firstnamesponsor'] 
    del df['namesponsor'] 
    
    ### Get domain from mail
    df['mail'] = df.mail.replace ([np.nan], '')
    df['domain'] = df['mail'].map(lambda x: x.split("@")[1] if x else None)
    df['domain'] = df.domain.str.lower()
    del df['mail']

    ### Get Sex from Civility
    df['Sex'] = df['civilite'].map(get_sex)
    del df['civilite']
    
    ### Delete useless cols
    del df['annee_mois']
    del df['id_dim_temps']
    del df['id_dim_personne']
    del df['encours']
    del df['timestamp']
    del df['service']
    del df['campaign']
    del df['contactid']
    del df['pays']
    del df['debit_cb']
    
    ### transform to numeric when possible
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
    df['country_connexion_name'] = df['country_connexion_name'].replace ([np.nan], '')
    df['domain'] = df['domain'].replace ([np.nan], '')
    df['pays_naissance'] = df['pays_naissance'].replace ([np.nan], '')
    df.loc[df['country_connexion_name'].value_counts()[df['country_connexion_name']].values < 40, 'country_connexion_name'] = 'other'
    df.loc[df['domain'].value_counts()[df['domain']].values < 40, 'domain'] = 'other'
    df.loc[df['pays_naissance'].value_counts()[df['pays_naissance']].values < 40, 'pays_naissance'] = 'other'
    
    
    ### Get country connexion name
    df['country_connexion'] = df['country_connexion_name'].map(process_country_connexion)
    del df['country_connexion_name']
    ### Get pays de naissance
    df['pays_de_naissance'] = df['pays_naissance'].map(process_pays_naissance)
    del df['pays_naissance']
    
    ### Process the domain
    df = process_domain (df)
    
    ### Drop NaN 
    #df = df.dropna()
    
    #features_df = df.drop('cible_seuil_1200', axis=1)
    #features_df = features_df.drop('id_dim_personne',axis=1)
    #features_df = features_df.drop('annee_mois',axis=1)
    #target = df['cible_seuil_1200']
    return df


def get_sex(x):
    if x == "0":
        return 1
    elif (x == "1"):
        return 0
    elif (x == "2"):
        return 0
    elif (x == "MR"):
        return 1
    elif (x == "MLE"):
        return 0
    elif (x == "MME"):
        return 0
    else :
        return -1

def get_age(X):
    from datetime import datetime
    now = datetime.now()
    X['AGE'] = X['date_naissance'].dropna()
    X['AGE'] = pd.to_datetime(X['AGE'],errors='coerce')
    X['AGE'] = X['AGE'].map(lambda x : now.year-x.year  if now.month-x.month>0 else now.year - x.year -1 )
    del X['date_naissance']
    
def process_country_connexion(x):
    
    if x == 'France':
        return 1
    elif (x == 'United Kingdom'):
        return 2
    elif (x == 'other'):
        return 3
    elif (x == 'Germany'):
        return 4
    elif (x == 'Switzerland'):
        return 5
    elif (x == 'Netherlands'):
        return 6
    elif (x == 'Europe'):
        return 7
    elif (x == 'United States'):
        return 8
    elif (x == 'Reunion'):
        return 9
    elif (x==-2):
        return -2
    
def process_pays_naissance(x):

    if x == 'FRA':
        return 1
    elif (x == 'ITA'):
        return 2
    elif (x == 'other'):
        return 3
    elif (x == 'DZA'):
        return 4
    elif (x == 'BEN'):
        return 5
    elif (x == 'ESP'):
        return 6
    elif (x == 'VNM'):
        return 7
    elif (x == 'DEU'):
        return 8
    elif (x == 'MAR'):
        return 9
    elif (x=='CIV'):
        return 10
    elif (x=='CMR'):
        return 10
    elif (x=='GTO'):
        return 11
    elif (x=='SEN'):
        return 12
    elif (x=='BEL'):
        return 13
    elif (x=='CHN'):
        return 14
    elif (x=='ROU'):
        return 15
    elif (x=='BRA'):
        return 16
    elif (x=='MDG'):
        return 17
    elif (x=='PRT'):
        return 18
    elif (x=='GBR'):
        return 19
    elif (x=='LBN'):
        return 20
    elif (x=='TUR'):
        return 21
    elif (x=='IND'):
        return 22
    
def process_domain(df):
    """
    Process the domain features
    """
    dict_ = {'yahoo.fr':0, 'hotmail.fr':1, 'hotmail.com': 2, 'gmail.com': 3, 'orange.fr': 4, 'outlook.com': 5, 'free.fr': 6, 'laposte.net': 7, 'other': 8, 'neuf.fr': 9, 'wanadoo.fr': 10, 'me.com': 11, 'ymail.com': 12, 'sfr.fr': 13, 'live.fr': 14, 'bbox.fr': 15, 'outlook.fr': 16, 'msn.com': 17, 'yahoo.com': 18, 'aol.com': 19, 'icloud.com': 20, 'cegetel.net': 21, 'club-internet.fr': 22}
    df['domain'] = df['domain'].map(dict_)
    return df

In [124]:
data_p = data_management_p(df_init_p)

In [125]:
data_p.contactid.count()

10047

In [90]:
Parrain_df = pd.read_csv('parrain_totale_afteradd14.csv', sep=',')
del Parrain_df['Unnamed: 0']
Parrain_df.head()

Unnamed: 0,id_dim_personne,encours_parrain,id_parrain
0,1004250808,2475.93,224108008.0
1,1660202716,,
2,1715100837,2736.14,1697958997.0
3,1725735525,,
4,1748840201,1311.33,814632101.0


In [81]:
Parrain_df.id_dim_personne

0        1004250808
1        1660202716
2        1715100837
3        1725735525
4        1748840201
5        1766903833
6        1887361163
7        1989548372
8        2232679265
9        2260529045
10       2364644331
11       2381429055
12       2997328175
13       3294911605
14       3491258034
15       3705902288
16       3878342197
17       4141010655
18       4223249842
19       4252943190
20       4253839485
21       4406550122
22       4409310817
23       4428098087
24       4544429677
25       4571148440
26       4613670658
27       4881973309
28       4899596466
29       4900569192
            ...    
44523    7641545691
44524    7641546336
44525    7641551329
44526    7641552635
44527    7641555335
44528    7641557844
44529    7641568813
44530    7641570793
44531    7641570942
44532    7641571763
44533    7641573352
44534    7641573440
44535    7641574301
44536    7641574586
44537    7641576053
44538    7641576965
44539    7641577491
44540    7641577696
44541    7641579602


In [82]:
# Merge parrain id with DF initial selon l'id client
enriched_df = pd.merge(data_p, Parrain_df, right_on='id_dim_personne', left_on='contactid')
enriched_df.head()

Unnamed: 0,contactid,campagne,code_postal,code_postal_naissance,csp,debit_cb,flag_banque_principale,flag_epargne,nature_cb,patrimoine,...,revenus_annuels,cible,AGE,domain,Sex,country_connexion,pays_de_naissance,id_dim_personne,encours_parrain,id_parrain
0,6742433330,456,31130,38000,2,0,0,0,1,2,...,2,0,47,0,1,1,1,6742433330,19144.72,5516969255.0
1,6754882186,300,94800,93420,2,0,0,0,1,0,...,2,0,28,1,0,1,1,6754882186,,
2,6748699778,307,73150,92290,2,0,0,0,1,3,...,2,0,54,2,1,1,1,6748699778,,
3,6821231675,868,69001,69008,3,2,1,0,0,0,...,1,1,27,3,0,1,1,6821231675,1028.36,2306432202.0
4,6747566365,868,75015,42000,2,0,0,1,1,0,...,2,0,28,3,0,1,1,6747566365,587.07,4942455827.0


In [83]:
enriched_df.contactid

0        6742433330
1        6754882186
2        6748699778
3        6821231675
4        6747566365
5        6701524556
6        6738608856
7        5648776005
8        6805307194
9        6727392192
10       6784931163
11       6799124220
12       6778238415
13       6499903453
14       6778096615
15       5933080292
16       6742606078
17       6761276433
18       6772470451
19       6704949188
20       6808456428
21       6774120999
22       5624898801
23       5943855768
24       6798952412
25       4813182747
26       6772184537
27       6778615240
28       6820536417
29       6833738674
            ...    
10017    6701946487
10018    6840341460
10019    6772440941
10020    6768667517
10021    6269876389
10022    6710291599
10023    6772130831
10024    6738472005
10025    6416515483
10026    6835174291
10027    6821421265
10028    6742440269
10029    6701483326
10030    6727682321
10031    6840403461
10032    6840400496
10033    6742724479
10034    6834976889
10035    6737168521


In [56]:
### Data management parrain
def data_management_parrain(df):
    
    ### Get flag parrain from id parrain
    df['Parrain'] = df['id_parrain'].map(lambda x: 0 if pd.isnull(x) else 1).astype(int)
    del df['id_parrain']
    

    ### Get flag encours_parrain (1000) from encours_parrain 
    df['encours_p'] = df['encours_parrain'].map(lambda x : 0 if x<1500 or pd.isnull(x) else 1).astype(int)
    del df['encours_parrain']
    
    ### USELESS COLUMNS
    
    del df['contactid']
    #del df['id_dim_personne'] 
    
    #features_df = df.drop('cible', axis=1)
    #target = df['cible']
    
    #return features_df, target
    return df


In [57]:
#X_parrain, y_parrain = data_management_parrain(enriched_df)
df_parrain = data_management_parrain(enriched_df)

In [58]:
df_parrain.head()

Unnamed: 0,campagne,code_postal,code_postal_naissance,csp,debit_cb,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,cible,AGE,domain,Sex,country_connexion,pays_de_naissance,id_dim_personne,Parrain,encours_p
0,456,31130,38000,2,0,0,0,1,2,4,2,0,47,0,1,1,1,6742433330,1,1
1,300,94800,93420,2,0,0,0,1,0,2,2,0,28,1,0,1,1,6754882186,0,0
2,307,73150,92290,2,0,0,0,1,3,2,2,0,54,2,1,1,1,6748699778,0,0
3,868,69001,69008,3,2,1,0,0,0,0,1,1,27,3,0,1,1,6821231675,1,0
4,868,75015,42000,2,0,0,1,1,0,0,2,0,28,3,0,1,1,6747566365,1,0


In [59]:
df_parrain = df_parrain.set_index('id_dim_personne')

In [60]:
X_parrain = df_parrain.drop('cible', axis=1)
y_parrain = df_parrain['cible']

In [61]:
X02_train, X02_test, y02_train, y02_test = train_test_split (X_parrain,y_parrain,test_size=0.3,random_state=57)
#X02_train, X02_test, y02_train, y02_test = train_test_split (X_parrain,y_parrain,random_state=42)

In [62]:
# Modèle 3: nearest_centroid
from sklearn.neighbors.nearest_centroid import NearestCentroid
KNC2 = NearestCentroid(metric='euclidean', shrink_threshold = 1.5)
KNC2.fit(X02_train, y02_train)


NearestCentroid(metric='euclidean', shrink_threshold=1.5)

In [63]:
df_concat = pd.concat([X02_test, y02_test], axis=1)

In [64]:
df_concat['cible_predit_avec_encoursParrain'] = KNC2.predict(X02_test)
df_concat 

Unnamed: 0_level_0,campagne,code_postal,code_postal_naissance,csp,debit_cb,flag_banque_principale,flag_epargne,nature_cb,patrimoine,regime_matrimonial,revenus_annuels,AGE,domain,Sex,country_connexion,pays_de_naissance,Parrain,encours_p,cible,cible_predit_avec_encoursParrain
id_dim_personne,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
6827320758,1543,93240,93300,4,2,0,0,0,0,2,1,30,1,1,1,1,0,0,0,0
6716055371,868,94160,34000,12,2,0,1,0,0,4,1,28,3,-1,1,1,1,1,1,0
6814374196,307,3450,6000,6,5,1,1,0,0,1,0,66,14,1,1,1,0,0,0,1
6786451131,456,72100,85000,2,0,0,0,1,0,4,2,39,3,1,1,1,0,0,0,1
6727462381,300,69340,76130,15,0,1,0,1,0,0,2,28,1,1,1,1,0,0,1,1
6084867253,868,94120,75014,2,0,0,1,1,0,0,2,25,8,1,4,1,1,1,1,0
6546341247,300,25640,25000,16,1,1,1,0,1,6,1,28,3,0,1,1,0,0,0,1
6826851957,1543,44690,16470,16,2,0,0,0,0,4,1,36,14,1,1,1,0,0,0,0
6742505207,456,35000,22000,13,0,1,1,1,2,2,2,60,6,1,1,1,0,0,1,1
6772285768,304,75019,75017,16,5,0,0,0,0,4,2,26,3,1,1,1,0,0,0,1


In [65]:
print(classification_report(y02_test, df_concat.cible_predit_avec_encoursParrain))
print ('accuracy score : '+ str(accuracy_score(y02_test, df_concat.cible_predit_avec_encoursParrain)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y02_test, df_concat.cible_predit_avec_encoursParrain)))

             precision    recall  f1-score   support

          0       0.81      0.66      0.72      2318
          1       0.29      0.48      0.36       697

avg / total       0.69      0.61      0.64      3015

accuracy score : 0.614593698176

 confussion matrix:
[[1521  797]
 [ 365  332]]


In [132]:
# Modèle 1 : RDF
forest_b= RandomForestClassifier(n_estimators=200,max_depth=201)
forest_b.fit(X02_train, y02_train)
y_pred_b = forest_b.predict(X02_test)
print ('accuracy score : '+ str(accuracy_score(y02_test, y_pred_b)))
print('\n classification_report:\n'+ str(classification_report(y02_test, y_pred_b)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y02_test, y_pred_b)))

accuracy score : 0.750398089172

 classification_report:
             precision    recall  f1-score   support

          0       0.77      0.96      0.85      1886
          1       0.50      0.11      0.18       626

avg / total       0.70      0.75      0.69      2512


 confussion matrix:
[[1815   71]
 [ 556   70]]


In [36]:
# Modèle 3: nearest_centroid
from sklearn.neighbors.nearest_centroid import NearestCentroid
KNC2 = NearestCentroid(metric='euclidean', shrink_threshold = 0.5)
KNC2.fit(X02_train, y02_train)

NearestCentroid(metric='euclidean', shrink_threshold=0.5)

In [105]:
## save the model to disk
import pickle
with open('Model_v2.pkl', 'wb') as f:
    pickle.dump(KNC2, f)

In [66]:
df_concat.reset_index(level=0, inplace=True)

In [68]:
df_concat.head()

Unnamed: 0,id_dim_personne,campagne,code_postal,code_postal_naissance,csp,debit_cb,flag_banque_principale,flag_epargne,nature_cb,patrimoine,...,revenus_annuels,AGE,domain,Sex,country_connexion,pays_de_naissance,Parrain,encours_p,cible,cible_predit_avec_encoursParrain
0,6827320758,1543,93240,93300,4,2,0,0,0,0,...,1,30,1,1,1,1,0,0,0,0
1,6716055371,868,94160,34000,12,2,0,1,0,0,...,1,28,3,-1,1,1,1,1,1,0
2,6814374196,307,3450,6000,6,5,1,1,0,0,...,0,66,14,1,1,1,0,0,0,1
3,6786451131,456,72100,85000,2,0,0,0,1,0,...,2,39,3,1,1,1,0,0,0,1
4,6727462381,300,69340,76130,15,0,1,0,1,0,...,2,28,1,1,1,1,0,0,1,1


In [69]:
df_to_be_indexed = df_concat[['id_dim_personne','cible_predit_avec_encoursParrain']]
print(df_to_be_indexed)

      id_dim_personne  cible_predit_avec_encoursParrain
0          6827320758                                 0
1          6716055371                                 0
2          6814374196                                 1
3          6786451131                                 1
4          6727462381                                 1
5          6084867253                                 0
6          6546341247                                 1
7          6826851957                                 0
8          6742505207                                 1
9          6772285768                                 1
10         6742381651                                 0
11         6835050168                                 0
12         6569819873                                 1
13         6727497338                                 1
14         6835079003                                 0
15         6834822846                                 1
16         6754071465                           

In [None]:
df_es = df_to_be_indexed[df_to_be_indexed.id_dim_personne == 7501471010]
df_es

In [48]:
#Insert into es v2
es.update_index_retro(df_to_be_indexed, 'retro', 'retro', 'id_dim_personne')

  if param in SKIP_IN_PATH:
  quote_plus(_escape(p), b',*') for p in parts if p not in SKIP_IN_PATH)


In [151]:
# Modèle 3: nearest_centroid
from sklearn.neighbors.nearest_centroid import NearestCentroid
KNC2 = NearestCentroid(metric='euclidean', shrink_threshold = 0.5)
KNC2.fit(X, y)
y31_pred = KNC2.predict(X)

print(classification_report(y, y31_pred))
print ('accuracy score : '+ str(accuracy_score(y, y31_pred)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y, y31_pred)))

             precision    recall  f1-score   support

          0       0.80      0.42      0.55     28766
          1       0.24      0.64      0.35      8404

avg / total       0.68      0.47      0.51     37170

accuracy score : 0.469814366425

 confussion matrix:
[[12058 16708]
 [ 2999  5405]]


In [147]:
# Modèle 1 : RDF
forest_b= RandomForestClassifier(n_estimators=200,max_depth=201)
forest_b.fit(X, y)
y_pred_b = forest_b.predict(X)
print ('accuracy score : '+ str(accuracy_score(y, y_pred_b)))
print('\n classification_report:\n'+ str(classification_report(y, y_pred_b)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y, y_pred_b)))

accuracy score : 1.0

 classification_report:
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     28766
          1       1.00      1.00      1.00      8404

avg / total       1.00      1.00      1.00     37170


 confussion matrix:
[[28766     0]
 [    0  8404]]


# Proba

In [None]:
u= cible
v="proba"
probabilite = "%s_%s" %(u,v)
predicted_target = "predicted_target_" + cible