# DNN Classification with TensorFlow

## Prepare the data

### Load the dataset

In [1]:
import pandas as pd

In [2]:
# Read the churn dataset from a comma-separated CSV file.
# sep=',' is spelled out explicitly; it is also the pandas default separator.
df = pd.read_csv('Churn.csv',sep=',')

In [3]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Split into training & testing data

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Split the frame into the value to predict and the input data.
# By convention X holds the inputs and y the target, which here is the
# 'Exited' column; every other column stays in the inputs.
y_val = df['Exited']
x_data = df.drop('Exited', axis='columns')

In [6]:
# Hold out 30% of the rows for testing and keep the remaining 70% for training.
# The fixed random_state makes the split repeatable from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_val,
    test_size=0.3,
    random_state=101,
)

In [7]:
# Summary statistics of the numeric training columns, before any scaling.
X_train.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
count,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0
mean,5012.097857,15691270.0,651.217857,38.766714,5.010714,77146.268254,1.527857,0.709,0.510429,100066.450221
std,2885.513274,71793.97,96.670607,10.484941,2.89755,62317.167188,0.582961,0.454256,0.499927,57764.876058
min,1.0,15565710.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58
25%,2529.75,15628690.0,584.0,32.0,2.0,0.0,1.0,0.0,0.0,50910.6775
50%,5008.5,15691760.0,652.0,37.0,5.0,97572.74,1.0,1.0,1.0,100600.355
75%,7513.25,15753360.0,719.0,44.0,8.0,127894.435,2.0,1.0,1.0,149539.85
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199970.74


### Scale the data

In [8]:
# Numeric columns that get rescaled below.
# The 0/1 flags (HasCrCard, IsActiveMember) are deliberately not listed.
cols_to_scale = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
]

In [9]:
# The frames returned by train_test_split can be flagged by pandas as copies of
# `x_data`, which raises SettingWithCopyWarning when their columns are
# reassigned in the scaling step.
# see https://stackoverflow.com/questions/45090639/pandas-shows-settingwithcopywarning-after-train-test-split
#
# Take explicit copies so both frames are independent objects. The previous
# workaround (`X_train.is_copy = None`) relied on an attribute that pandas
# deprecated in 0.23 and removed in 1.0, so it no longer has any effect there.
X_train = X_train.copy()
X_test = X_test.copy()

In [10]:
# Preview the training rows before scaling.
X_train.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
803,804,15712825,Howells,511,Spain,Female,29,9,0.0,2,0,1,140676.98
1387,1388,15674526,Byrne,725,France,Male,66,4,86459.8,1,1,1,141476.56
921,922,15743411,Chiawuotu,609,Spain,Male,61,1,0.0,1,1,0,22447.85
5917,5918,15812184,Rose,674,France,Female,31,1,0.0,1,1,0,128954.05
9610,9611,15812534,Chiemenam,455,France,Male,40,1,0.0,3,0,1,129975.34


In [11]:
# Min-max scaling: (value - min) / (max - min), column by column.
# The minimum and the range are measured on the training set only and then
# reused for the test set, so both sets go through the same transformation.
# Test values outside the training range therefore land outside [0, 1].
train_min = X_train[cols_to_scale].min()
train_range = X_train[cols_to_scale].max() - train_min

X_train[cols_to_scale] = (X_train[cols_to_scale] - train_min) / train_range
X_test[cols_to_scale] = (X_test[cols_to_scale] - train_min) / train_range

In [12]:
# Preview the same training rows again to inspect the rescaled values.
X_train.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
803,804,15712825,Howells,0.322,Spain,Female,0.148649,0.9,0.0,0.333333,0,1,0.703471
1387,1388,15674526,Byrne,0.75,France,Male,0.648649,0.4,0.344601,0.0,1,1,0.707469
921,922,15743411,Chiawuotu,0.518,Spain,Male,0.581081,0.1,0.0,0.0,1,0,0.112204
5917,5918,15812184,Rose,0.648,France,Female,0.175676,0.1,0.0,0.0,1,0,0.644844
9610,9611,15812534,Chiemenam,0.21,France,Male,0.297297,0.1,0.0,0.666667,0,1,0.649952


In [14]:
# Summary statistics after scaling: on the training set, every column listed in
# cols_to_scale should now have a minimum of 0 and a maximum of 1.
X_train.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
count,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0
mean,5012.097857,15691270.0,0.602436,0.280631,0.501071,0.30748,0.175952,0.709,0.510429,0.500377
std,2885.513274,71793.97,0.193341,0.141688,0.289755,0.248376,0.19432,0.454256,0.499927,0.288883
min,1.0,15565710.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2529.75,15628690.0,0.468,0.189189,0.2,0.0,0.0,0.0,0.0,0.254547
50%,5008.5,15691760.0,0.604,0.256757,0.5,0.388894,0.0,1.0,1.0,0.503047
75%,7513.25,15753360.0,0.738,0.351351,0.8,0.509747,0.333333,1.0,1.0,0.747794
max,10000.0,15815690.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Create the TensorFlow estimator

In [15]:
import tensorflow as tf

### Create the 'feature columns'

In [16]:
# List the available column names before declaring the feature columns.
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [17]:
# Numeric feature columns: one per DataFrame column, built in the same order
# as the names on the left-hand side. The string is the DataFrame column the
# feature is read from.
(age, creditscore, tenure, balance,
 numproducts, hascard, isactive, estimated) = [
    tf.feature_column.numeric_column(column_name)
    for column_name in (
        'Age', 'CreditScore', 'Tenure', 'Balance',
        'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
    )
]

In [26]:
# Categorical feature columns.
# Each text column is mapped through a vocabulary made of the distinct values
# found in the full dataset (`df`, i.e. training and test rows together), then
# wrapped in an indicator column so the DNN receives a dense encoding.
geography = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        key='Geography',
        vocabulary_list=df['Geography'].unique(),
    )
)
gender = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        key='Gender',
        vocabulary_list=df['Gender'].unique(),
    )
)

In [27]:
# Every feature column handed to the classifier: the eight numeric ones
# followed by the two categorical ones.
feat_cols = [
    age, creditscore, tenure, balance,
    numproducts, hascard, isactive, estimated,
    geography, gender,
]

### Create the input function

In [28]:
# Training input: shuffled batches of 10 rows drawn from the training set,
# allowing up to 1000 passes over the data.
input_fn = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=10,
    num_epochs=1000,
    shuffle=True,
)

### Create the estimator and train it

In [37]:
# Binary classifier with a single hidden layer of 10 units.
# No model_dir is passed, so the estimator falls back to its default
# configuration (see the logged config for the directory it picked).
model = tf.estimator.DNNClassifier(
    feature_columns=feat_cols,
    hidden_units=[10],
    n_classes=2,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_summary_steps': 100, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_session_config': None, '_tf_random_seed': 1, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_model_dir': '/var/folders/2c/gpl2fhbx4kgft314vh0y0qq80000gn/T/tmphxsowu67', '_log_step_count_steps': 100}


In [38]:
# Train for 10000 steps, reading batches from the training input function.
model.train(input_fn=input_fn,steps=10000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/2c/gpl2fhbx4kgft314vh0y0qq80000gn/T/tmphxsowu67/model.ckpt.
INFO:tensorflow:step = 1, loss = 7.02812
INFO:tensorflow:global_step/sec: 221.486
INFO:tensorflow:step = 101, loss = 3.21322 (0.453 sec)
INFO:tensorflow:global_step/sec: 245.767
INFO:tensorflow:step = 201, loss = 5.41097 (0.407 sec)
INFO:tensorflow:global_step/sec: 253.437
INFO:tensorflow:step = 301, loss = 2.7585 (0.394 sec)
INFO:tensorflow:global_step/sec: 252.995
INFO:tensorflow:step = 401, loss = 6.56773 (0.402 sec)
INFO:tensorflow:global_step/sec: 164.44
INFO:tensorflow:step = 501, loss = 7.16744 (0.625 sec)
INFO:tensorflow:global_step/sec: 141.167
INFO:tensorflow:step = 601, loss = 5.3903 (0.698 sec)
INFO:tensorflow:global_step/sec: 198.908
INFO:tensorflow:step = 701, loss = 4.64249 (0.495 sec)
INFO:tensorflow:global_step/sec: 203.496
INFO:tensorflow:step = 801, loss = 3.30368 (0.495 sec)
INFO:tensorflow:global_step/sec

INFO:tensorflow:global_step/sec: 241.27
INFO:tensorflow:step = 8401, loss = 1.40768 (0.412 sec)
INFO:tensorflow:global_step/sec: 194.211
INFO:tensorflow:step = 8501, loss = 1.16342 (0.515 sec)
INFO:tensorflow:global_step/sec: 247.268
INFO:tensorflow:step = 8601, loss = 1.85934 (0.404 sec)
INFO:tensorflow:global_step/sec: 226.2
INFO:tensorflow:step = 8701, loss = 1.67892 (0.442 sec)
INFO:tensorflow:global_step/sec: 257.994
INFO:tensorflow:step = 8801, loss = 6.23688 (0.389 sec)
INFO:tensorflow:global_step/sec: 259.823
INFO:tensorflow:step = 8901, loss = 3.86459 (0.388 sec)
INFO:tensorflow:global_step/sec: 242.624
INFO:tensorflow:step = 9001, loss = 1.59913 (0.407 sec)
INFO:tensorflow:global_step/sec: 256.296
INFO:tensorflow:step = 9101, loss = 2.09019 (0.397 sec)
INFO:tensorflow:global_step/sec: 242.822
INFO:tensorflow:step = 9201, loss = 1.83232 (0.405 sec)
INFO:tensorflow:global_step/sec: 133.176
INFO:tensorflow:step = 9301, loss = 4.8368 (0.761 sec)
INFO:tensorflow:global_step/sec: 1

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x11818f2e8>

### Evaluate our estimator

In [39]:
# Prediction input: the test features only (no labels), read once.
# shuffle=False keeps the rows in the order of X_test, so the predictions can
# later be compared position by position with y_test.
pred_input_func = tf.estimator.inputs.pandas_input_fn(
      x=X_test,
      num_epochs=1,
      shuffle=False)

In [40]:
# Run the trained model on the test input, then materialize all predictions
# into a list so they can be iterated over (and re-read) afterwards.
predictions = model.predict(pred_input_func)
pred_list = list(predictions)

INFO:tensorflow:Restoring parameters from /var/folders/2c/gpl2fhbx4kgft314vh0y0qq80000gn/T/tmphxsowu67/model.ckpt-10000


In [41]:
# Keep only the predicted class id (first entry of 'class_ids') of each prediction.
y_pred = [prediction['class_ids'][0] for prediction in pred_list]

In [42]:
from sklearn.metrics import classification_report

In [43]:
# Per-class precision / recall / F1 on the test set.
# target_names follow the sorted label order: 0 (Exited == 0) -> 'stay', 1 -> 'quit'.
report = classification_report(y_test, y_pred, target_names=['stay', 'quit'])

In [44]:
print(report)
# recall    = number of correct predictions for a class / number of items in that class
# precision = number of correct predictions for a class / total number of predictions made for that class
# F1-score  = harmonic mean of (recall, precision) = 2 * precision * recall / (precision + recall)


             precision    recall  f1-score   support

       stay       0.86      0.96      0.90      2378
       quit       0.70      0.40      0.51       622

avg / total       0.83      0.84      0.82      3000



In [None]:
# Results of earlier runs, kept as comments for comparison with the report above.
# NOTE(review): the second entry is labelled "without scaling" with a 10,10,10
# network, whereas this notebook scales the data and trains hidden_units=[10];
# confirm these notes still describe the runs they came from.

#### linear regression with scaling (batch = 10, steps = 5000)
# stay          0.81      0.98      0.89      2378
# quit          0.62      0.13      0.22       622
# avg / total   0.77      0.80      0.75      3000

#### result without scaling (batch = 10, epochs = 1000, steps = 10000, network = 10,10,10)
# stay           0.86      0.95      0.90      2378
# quit           0.69      0.41      0.51       622
# avg / total    0.82      0.84      0.82      3000