NOTEBOOK DE TEST

WITHOUT CROSS VAL

In [5]:
### Import
import sys
import csv
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, recall_score, classification_report, confusion_matrix, f1_score, roc_auc_score
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors.nearest_centroid import NearestCentroid

In [6]:
### Load data
df_initial = pd.read_csv('result.csv', sep=',')  ##Sans encours_p et insee
del df_initial['Unnamed: 0']
df_initial.head()

Unnamed: 0,contactid,annee_mois,campagne,campaign,civilite,code_postal,code_postal_naissance,country_connexion_name,csp,date_naissance,...,patrimoine,pays,pays_naissance,regime_matrimonial,revenus_annuels,service,timestamp,id_dim_personne,encours,cible
0,6546762003,,-1,Source URL non trouvée,,,59430,France,15,,...,-1,,FRA,2,2,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-09-21T01:28:13.601+0200,6546762003,5.0,0
1,6742433330,,456,REC_Liens textes autopromotion 456,0,31130.0,38000,France,2,11/04/1969,...,2,FRA,FRA,4,2,SFOL_AJOUTER_CB,2016-10-31T18:16:38.719+0100,6742433330,0.0,0
2,6856828837,201611.0,868,PAR_Onlin_Site _ECard,MR,92000.0,92150,France,2,11/05/1957,...,2,FRA,FRA,2,4,monprofil.PROSPECT/CREER_FICHE_PROSPECT,2016-11-21T12:01:44.736+0100,6856828837,8899.93,1
3,6754882186,,300,Intb_Banque_Autre,1,94800.0,93420,France,2,24/06/1988,...,0,FRA,FRA,2,2,SFOL_AJOUTER_CB,2016-11-04T10:46:58.447+0100,6754882186,10.0,0
4,6289817192,201611.0,300,Intb_Banque_Autre,MR,26170.0,84600,Switzerland,2,01/05/1994,...,0,FRA,FRA,0,2,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-11-21T17:36:53.976+0100,6289817192,173.53,0


In [7]:
del df_initial['cible']

In [8]:
#Fixer encours des bons clients 
df_initial['cible_seuil_1200'] = df_initial['encours'].map(lambda x: 0 if x < 1200.00 else 1).astype(int)
df_initial[['encours','cible_seuil_1200']].head()

Unnamed: 0,encours,cible_seuil_1200
0,5.0,0
1,0.0,0
2,8899.93,1
3,10.0,0
4,173.53,0


In [9]:
### Data management
def data_management(df):
    
    df['code_postal'] = df['code_postal'].replace('',-2)
    df['code_postal'] = pd.to_numeric(df['code_postal'])
    
    df['code_postal_naissance'] = pd.to_numeric(df['code_postal_naissance'])
   
    df['country_connexion_name'] = df['country_connexion_name'].replace(['',None],-2)
    
    df['date_naissance'] = df['date_naissance'].replace('','01/01/1800')
    
    df['flag_epargne'] = pd.to_numeric(df['flag_epargne'])
    
    #df['mailing_accord'] = df['mailing_accord'].replace(['',None],-2)
    #df['mailing_accord'] = pd.to_numeric(df['mailing_accord'])
    del df['mailing_accord']
    
    df['nature_cb'] = df['nature_cb'].replace(['',None],-2)
    df['nature_cb'] = pd.to_numeric(df['nature_cb'])
    
    df['pays_naissance'] = df['pays_naissance'].replace(['',None],-2)
    
    ### get age
    get_age(df)

    ### Get flag parrain from namesponsor
    df['Parrain'] = df['namesponsor'].map(lambda x: 0 if pd.isnull(x) else 1).astype(int)
    del df['firstnamesponsor'] 
    del df['namesponsor'] 
    
    ### Get domain from mail
    df['mail'] = df.mail.replace ([np.nan], '')
    df['domain'] = df['mail'].map(lambda x: x.split("@")[1] if x else None)
    df['domain'] = df.domain.str.lower()
    del df['mail']

    ### Get Sex from Civility
    df['Sex'] = df['civilite'].map(get_sex)
    del df['civilite']
    
    ### Delete useless cols
    del df['annee_mois']
    del df['id_dim_temps']
    #del df['id_dim_personne']
    del df['encours']
    del df['timestamp']
    del df['service']
    del df['campaign']
    del df['contactid']
    del df['pays']
    del df['debit_cb']
    
    ### transform to numeric when possible
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
    df['country_connexion_name'] = df['country_connexion_name'].replace ([np.nan], '')
    df['domain'] = df['domain'].replace ([np.nan], '')
    df['pays_naissance'] = df['pays_naissance'].replace ([np.nan], '')
    df.loc[df['country_connexion_name'].value_counts()[df['country_connexion_name']].values < 40, 'country_connexion_name'] = 'other'
    df.loc[df['domain'].value_counts()[df['domain']].values < 40, 'domain'] = 'other'
    df.loc[df['pays_naissance'].value_counts()[df['pays_naissance']].values < 40, 'pays_naissance'] = 'other'
    
    
    ### Get country connexion name
    df['country_connexion'] = df['country_connexion_name'].map(process_country_connexion)
    del df['country_connexion_name']
    ### Get pays de naissance
    df['pays_de_naissance'] = df['pays_naissance'].map(process_pays_naissance)
    del df['pays_naissance']
    
    ### Process the domain
    df = process_domain (df)
    
    ### Drop NaN 
    df = df.dropna()
    
    features_df = df.drop('cible_seuil_1200', axis=1)
    #features_df = features_df.drop('id_dim_personne',axis=1)
    #features_df = features_df.drop('annee_mois',axis=1)
    target = df['cible_seuil_1200']
    return features_df, target, df


def get_sex(x):
    if x == "0":
        return 1
    elif (x == "1"):
        return 0
    elif (x == "2"):
        return 0
    elif (x == "MR"):
        return 1
    elif (x == "MLE"):
        return 0
    elif (x == "MME"):
        return 0
    else :
        return -1

def get_age(X):
    from datetime import datetime
    now = datetime.now()
    X['AGE'] = X['date_naissance'].dropna()
    X['AGE'] = pd.to_datetime(X['AGE'],errors='coerce')
    X['AGE'] = X['AGE'].map(lambda x : now.year-x.year  if now.month-x.month>0 else now.year - x.year -1 )
    del X['date_naissance']
    
def process_country_connexion(x):
    
    if x == 'France':
        return 1
    elif (x == 'United Kingdom'):
        return 2
    elif (x == 'other'):
        return 3
    elif (x == 'Germany'):
        return 4
    elif (x == 'Switzerland'):
        return 5
    elif (x == 'Netherlands'):
        return 6
    elif (x == 'Europe'):
        return 7
    elif (x == 'United States'):
        return 8
    elif (x == 'Reunion'):
        return 9
    elif (x==-2):
        return -2
    
def process_pays_naissance(x):

    if x == 'FRA':
        return 1
    elif (x == 'ITA'):
        return 2
    elif (x == 'other'):
        return 3
    elif (x == 'DZA'):
        return 4
    elif (x == 'BEN'):
        return 5
    elif (x == 'ESP'):
        return 6
    elif (x == 'VNM'):
        return 7
    elif (x == 'DEU'):
        return 8
    elif (x == 'MAR'):
        return 9
    elif (x=='CIV'):
        return 10
    elif (x=='CMR'):
        return 10
    elif (x=='GTO'):
        return 11
    elif (x=='SEN'):
        return 12
    elif (x=='BEL'):
        return 13
    elif (x=='CHN'):
        return 14
    elif (x=='ROU'):
        return 15
    elif (x=='BRA'):
        return 16
    elif (x=='MDG'):
        return 17
    elif (x=='PRT'):
        return 18
    elif (x=='GBR'):
        return 19
    elif (x=='LBN'):
        return 20
    elif (x=='TUR'):
        return 21
    elif (x=='IND'):
        return 22
    
def process_domain(df):
    """
    Process the domain features
    """
    dict_ = {'yahoo.fr':0, 'hotmail.fr':1, 'hotmail.com': 2, 'gmail.com': 3, 'orange.fr': 4, 'outlook.com': 5, 'free.fr': 6, 'laposte.net': 7, 'other': 8, 'neuf.fr': 9, 'wanadoo.fr': 10, 'me.com': 11, 'ymail.com': 12, 'sfr.fr': 13, 'live.fr': 14, 'bbox.fr': 15, 'outlook.fr': 16, 'msn.com': 17, 'yahoo.com': 18, 'aol.com': 19, 'icloud.com': 20, 'cegetel.net': 21, 'club-internet.fr': 22}
    df['domain'] = df['domain'].map(dict_)
    return df

In [10]:
X, y, data= data_management(df_initial)

  mask = arr == x


In [11]:
## Travailler avec nouv X et y ayant id_dim_personne comme index
data = data.set_index('id_dim_personne')
y = data ['cible_seuil_1200']
X = data.drop ('cible_seuil_1200', axis = 1)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=60)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(26019, 16)
(11151, 16)
(26019,)
(11151,)


In [13]:
######################### NEAREST CENTROID #########################
from sklearn.neighbors.nearest_centroid import NearestCentroid
KNC = NearestCentroid(metric='euclidean', shrink_threshold = 0.5).fit(X_train, y_train)
y_pred = KNC.predict(X_test)
print(classification_report(y_test, y_pred))
print ('accuracy score : '+ str(accuracy_score(y_test, y_pred)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, y_pred)))

             precision    recall  f1-score   support

          0       0.78      0.38      0.51      8587
          1       0.24      0.65      0.35      2564

avg / total       0.66      0.44      0.47     11151

accuracy score : 0.439601829432

 confussion matrix:
[[3226 5361]
 [ 888 1676]]


In [14]:
## NC GREADSEARCHCV ##


In [15]:
######################### RANDOM FOREST #########################
RF= RandomForestClassifier(n_estimators=200,max_depth=210).fit(X_train, y_train)
y_pred = RF.predict(X_test)
print(classification_report(y_test, y_pred))
print ('accuracy score : '+ str(accuracy_score(y_test, y_pred)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, y_pred)))

             precision    recall  f1-score   support

          0       0.78      0.98      0.86      8587
          1       0.40      0.05      0.09      2564

avg / total       0.69      0.76      0.69     11151

accuracy score : 0.764057035243

 confussion matrix:
[[8388  199]
 [2432  132]]


In [22]:
### Adaboost ###
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=300).fit(X_train, y_train)
y_pred = ada.predict(X_test)
print(classification_report(y_test, y_pred))
print ('accuracy score : '+ str(accuracy_score(y_test, y_pred)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, y_pred)))

             precision    recall  f1-score   support

          0       0.77      0.99      0.87      8587
          1       0.44      0.04      0.07      2564

avg / total       0.70      0.77      0.68     11151

accuracy score : 0.76773383553

 confussion matrix:
[[8461  126]
 [2464  100]]


In [23]:
### Gradient boost ###
from sklearn.ensemble import GradientBoostingClassifier
gradient = GradientBoostingClassifier(n_estimators=100,random_state=10).fit(X_train, y_train)
y_pred = gradient.predict(X_test)
print(classification_report(y_test, y_pred))
print ('accuracy score : '+ str(accuracy_score(y_test, y_pred)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, y_pred)))

             precision    recall  f1-score   support

          0       0.77      0.99      0.87      8587
          1       0.48      0.03      0.05      2564

avg / total       0.71      0.77      0.68     11151

accuracy score : 0.769527396646

 confussion matrix:
[[8516   71]
 [2499   65]]


In [75]:
### KNN ###
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred = KNN.predict(X_test)
print(classification_report(y_test, y_pred))
print ('accuracy score : '+ str(accuracy_score(y_test, y_pred)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, y_pred)))

             precision    recall  f1-score   support

          0       0.77      0.87      0.82      7544
          1       0.24      0.13      0.17      2303

avg / total       0.64      0.70      0.66      9847

accuracy score : 0.698385295014

 confussion matrix:
[[6572  972]
 [1998  305]]


# COMBINAISON DE CLASSIF

# INSEE 

In [51]:
### Load data
df_initial_insee = pd.read_csv('Revenue.csv', sep=',')  
del df_initial_insee['Unnamed: 0']
df_initial_insee.head()

Unnamed: 0,contactid,annee_mois,campagne,campaign,civilite,code_postal,code_postal_naissance,country_connexion_name,csp,date_naissance,...,pays_naissance,regime_matrimonial,revenus_annuels,service,timestamp,id_dim_personne,encours,cible,Code_postal,Salaire_net_horaire_moyen_en_2014_euro
0,6546762003,,-1,Source URL non trouvée,,,59430,France,15,,...,FRA,2,2,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-09-21T01:28:13.601+0200,6546762003,5.0,0,,
1,6742433330,,456,REC_Liens textes autopromotion 456,0,31130.0,38000,France,2,11/04/1969,...,FRA,4,2,SFOL_AJOUTER_CB,2016-10-31T18:16:38.719+0100,6742433330,0.0,0,31130.0,17.6
2,6856828837,201611.0,868,PAR_Onlin_Site _ECard,MR,92000.0,92150,France,2,11/05/1957,...,FRA,2,4,monprofil.PROSPECT/CREER_FICHE_PROSPECT,2016-11-21T12:01:44.736+0100,6856828837,8899.93,1,92000.0,16.1
3,6754882186,,300,Intb_Banque_Autre,1,94800.0,93420,France,2,24/06/1988,...,FRA,2,2,SFOL_AJOUTER_CB,2016-11-04T10:46:58.447+0100,6754882186,10.0,0,94800.0,14.6
4,6289817192,201611.0,300,Intb_Banque_Autre,MR,26170.0,84600,Switzerland,2,01/05/1994,...,FRA,0,2,monprofil.CONTACT_MANAGEMENT/amendContactInfor...,2016-11-21T17:36:53.976+0100,6289817192,173.53,0,26170.0,11.7


In [52]:
#df_initial_insee.cible.to
df_initial_insee = df_initial_insee.apply(lambda x: pd.to_numeric(x, errors='ignore'))

In [53]:
del df_initial_insee['cible']

In [54]:
#Fixer encours des bons clients 
df_initial_insee['cible_seuil_1200'] = df_initial_insee['encours'].map(lambda x: 0 if x < 1200.00 else 1).astype(int)
df_initial_insee[['encours','cible_seuil_1200']].head()

Unnamed: 0,encours,cible_seuil_1200
0,5.0,0
1,0.0,0
2,8899.93,1
3,10.0,0
4,173.53,0


In [None]:
### Data management
def data_management(df):
    
    df['code_postal'] = df['code_postal'].replace('',-2)
    df['code_postal'] = pd.to_numeric(df['code_postal'])
    
    df['code_postal_naissance'] = pd.to_numeric(df['code_postal_naissance'])
   
    df['country_connexion_name'] = df['country_connexion_name'].replace(['',None],-2)
    
    df['date_naissance'] = df['date_naissance'].replace('','01/01/1800')
    
    df['flag_epargne'] = pd.to_numeric(df['flag_epargne'])
    
    #df['mailing_accord'] = df['mailing_accord'].replace(['',None],-2)
    #df['mailing_accord'] = pd.to_numeric(df['mailing_accord'])
    del df['mailing_accord']

    df['nature_cb'] = df['nature_cb'].replace(['',None],-2)
    df['nature_cb'] = pd.to_numeric(df['nature_cb'])
    
    df['pays_naissance'] = df['pays_naissance'].replace(['',None],-2)
    
    ### get age
    get_age(df)

    ### Get flag parrain from namesponsor
    df['Parrain'] = df['namesponsor'].map(lambda x: 0 if pd.isnull(x) else 1).astype(int)
    del df['firstnamesponsor'] 
    del df['namesponsor'] 
    
    ### Get domain from mail
    df['mail'] = df.mail.replace ([np.nan], '')
    df['domain'] = df['mail'].map(lambda x: x.split("@")[1] if x else None)
    df['domain'] = df.domain.str.lower()
    del df['mail']

    ### Get Sex from Civility
    df['Sexe'] = df['civilite'].map(get_sex)
    del df['civilite']
    
    ### Delete useless cols
    del df['annee_mois']
    del df['id_dim_temps']
    #del df['id_dim_personne']
    del df['encours']
    del df['timestamp']
    del df['service']
    del df['campaign']
    del df['contactid']
    del df['pays']
    del df['debit_cb']
    del df['code_postal']
    del df['nature_cb']
    
    ### transform to numeric when possible
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
    df['country_connexion_name'] = df['country_connexion_name'].replace ([np.nan], '')
    df['domain'] = df['domain'].replace ([np.nan], '')
    df['pays_naissance'] = df['pays_naissance'].replace ([np.nan], '')
    df.loc[df['country_connexion_name'].value_counts()[df['country_connexion_name']].values < 40, 'country_connexion_name'] = 'other'
    df.loc[df['domain'].value_counts()[df['domain']].values < 40, 'domain'] = 'other'
    df.loc[df['pays_naissance'].value_counts()[df['pays_naissance']].values < 40, 'pays_naissance'] = 'other'
    
    
    ### Get country connexion name
    df['country_connexion'] = df['country_connexion_name'].map(process_country_connexion)
    del df['country_connexion_name']
    ### Get pays de naissance
    #df['pays_de_naissance'] = df['pays_naissance'].map(process_pays_naissance)
    del df['pays_naissance']
    
    ### Process the domain
    df = process_domain (df)
    
    ### Drop NaN 
    df = df.dropna()
    
    features_df = df.drop('cible_seuil_1200', axis=1)
    #features_df = features_df.drop('id_dim_personne',axis=1)
    #features_df = features_df.drop('annee_mois',axis=1)
    target = df['cible_seuil_1200']
    return features_df, target, df


def get_sex(x):
    if x == "0":
        return 1
    elif (x == "1"):
        return 0
    elif (x == "2"):
        return 0
    elif (x == "MR"):
        return 1
    elif (x == "MLE"):
        return 0
    elif (x == "MME"):
        return 0
    else :
        return -1

def get_age(X):
    from datetime import datetime
    now = datetime.now()
    X['AGE'] = X['date_naissance'].dropna()
    X['AGE'] = pd.to_datetime(X['AGE'],errors='coerce')
    X['AGE'] = X['AGE'].map(lambda x : now.year-x.year  if now.month-x.month>0 else now.year - x.year -1 )
    del X['date_naissance']
    
def process_country_connexion(x):
    
    if x == 'France':
        return 1
    elif (x == 'United Kingdom'):
        return 2
    elif (x == 'other'):
        return 3
    elif (x == 'Germany'):
        return 4
    elif (x == 'Switzerland'):
        return 5
    elif (x == 'Netherlands'):
        return 6
    elif (x == 'Europe'):
        return 7
    elif (x == 'United States'):
        return 8
    elif (x == 'Reunion'):
        return 9
    elif (x==-2):
        return -2
    
def process_pays_naissance(x):

    if x == 'FRA':
        return 1
    elif (x == 'ITA'):
        return 2
    elif (x == 'other'):
        return 3
    elif (x == 'DZA'):
        return 4
    elif (x == 'BEN'):
        return 5
    elif (x == 'ESP'):
        return 6
    elif (x == 'VNM'):
        return 7
    elif (x == 'DEU'):
        return 8
    elif (x == 'MAR'):
        return 9
    elif (x=='CIV'):
        return 10
    elif (x=='CMR'):
        return 11
    elif (x=='GTO'):
        return 12
    elif (x=='SEN'):
        return 13
    elif (x=='BEL'):
        return 14
    elif (x=='CHN'):
        return 15
    elif (x=='ROU'):
        return 16
    elif (x=='BRA'):
        return 17
    elif (x=='MDG'):
        return 18
    elif (x=='PRT'):
        return 19
    elif (x=='GBR'):
        return 20
    elif (x=='LBN'):
        return 21
    elif (x=='TUR'):
        return 22
    elif (x=='IND'):
        return 23
    
    
def process_domain(df):
    """
    Process the domain features
    """
    dict_ = {'yahoo.fr':0, 'hotmail.fr':1, 'hotmail.com': 2, 'gmail.com': 3, 'orange.fr': 4, 'outlook.com': 5, 'free.fr': 6, 'laposte.net': 7, 'other': 8, 'neuf.fr': 9, 'wanadoo.fr': 10, 'me.com': 11, 'ymail.com': 12, 'sfr.fr': 13, 'live.fr': 14, 'bbox.fr': 15, 'outlook.fr': 16, 'msn.com': 17, 'yahoo.com': 18, 'aol.com': 19, 'icloud.com': 20, 'cegetel.net': 21, 'club-internet.fr': 22}
    df['domain'] = df['domain'].map(dict_)
    return df

In [55]:
X_insee, y_insee, data_insee= data_management(df_initial_insee)

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X_insee, y_insee, test_size=0.30, random_state=60)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(22974, 19)
(9847, 19)
(22974,)
(9847,)


In [59]:
######################### NEAREST CENTROID #########################
from sklearn.neighbors.nearest_centroid import NearestCentroid
KNC = NearestCentroid(metric='euclidean', shrink_threshold = 0.5).fit(X_train, y_train)
y_pred = KNC.predict(X_test)
print(classification_report(y_test, y_pred))
print ('accuracy score : '+ str(accuracy_score(y_test, y_pred)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, y_pred)))

             precision    recall  f1-score   support

          0       0.79      0.41      0.54      7544
          1       0.25      0.65      0.36      2303

avg / total       0.67      0.47      0.50      9847

accuracy score : 0.466030263024

 confussion matrix:
[[3102 4442]
 [ 816 1487]]


In [60]:
######################### RANDOM FOREST #########################
RF= RandomForestClassifier(n_estimators=200,max_depth=210).fit(X_train, y_train)
y_pred = RF.predict(X_test)
print(classification_report(y_test, y_pred))
print ('accuracy score : '+ str(accuracy_score(y_test, y_pred)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, y_pred)))

             precision    recall  f1-score   support

          0       0.77      0.98      0.86      7544
          1       0.46      0.05      0.09      2303

avg / total       0.70      0.76      0.68      9847

accuracy score : 0.763785924647

 confussion matrix:
[[7402  142]
 [2184  119]]


In [61]:
### Adaboost ###
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=300).fit(X_train, y_train)
y_pred = ada.predict(X_test)
print(classification_report(y_test, y_pred))
print ('accuracy score : '+ str(accuracy_score(y_test, y_pred)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, y_pred)))

             precision    recall  f1-score   support

          0       0.77      0.98      0.87      7544
          1       0.49      0.05      0.09      2303

avg / total       0.71      0.77      0.69      9847

accuracy score : 0.765715446329

 confussion matrix:
[[7419  125]
 [2182  121]]


In [63]:
### Gradient boost ###
from sklearn.ensemble import GradientBoostingClassifier
gradient = GradientBoostingClassifier(n_estimators=100,random_state=10).fit(X_train, y_train)
y_pred = gradient.predict(X_test)
print(classification_report(y_test, y_pred))
print ('accuracy score : '+ str(accuracy_score(y_test, y_pred)))
print ('\n confussion matrix:\n'+ str(confusion_matrix(y_test, y_pred)))

             precision    recall  f1-score   support

          0       0.77      0.99      0.87      7544
          1       0.52      0.04      0.08      2303

avg / total       0.71      0.77      0.68      9847

accuracy score : 0.766730984056

 confussion matrix:
[[7456   88]
 [2209   94]]
