## Read Dataset

In [None]:
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""

#%% 
# Read Dataset
# revoscalepy is imported here but first used in the later XDF / modelling
# cells; pandas handles all of the in-memory data preparation.
import revoscalepy
import pandas
# Load the churn CSV from a hard-coded, machine-specific Windows path;
# adjust the path when running on another machine.
dataset = pandas.read_csv('C://Users//davidchiu//Churn_Modelling.csv')
# Preview the first rows. The result is not assigned, so it is only visible
# when the cell is run interactively.
dataset.head()

## Data Preprocessing

In [None]:

#%%
# Data Preprocessing
# Keep every row but drop the first three columns by position
# (presumably identifier-like columns -- confirm against the CSV header).
# NOTE: the result is assigned back to `dataset`, so re-running this cell
# without reloading the CSV drops three more columns each time.
dataset = dataset.iloc[:,3:]
# Preview the trimmed frame (visible only when run interactively).
dataset.head()


## Feature Engineering

In [None]:
#%%

# Feature Engineering
# Peek at the text (object-dtype) columns, i.e. the ones that need encoding.
dataset.select_dtypes('object').head()

# One-hot encode the two categorical columns, leaving out one level of each
# ('France' and 'Female') so that the omitted level acts as the baseline.
geo = pandas.get_dummies(dataset['Geography']).drop('France', axis=1)
gender = pandas.get_dummies(dataset['Gender']).drop('Female', axis=1)

# Indicator columns come first, followed by the remaining dataset columns
# with the raw categorical columns removed.
customers = pandas.concat(
    [geo, gender, dataset.drop(['Geography', 'Gender'], axis=1)],
    axis=1
)
customers.head()



## Get Training and Testing Data

In [None]:
#%% 

# Get Training and Testing Data

from sklearn.model_selection import train_test_split

# Every column except the last one is a feature; the last column is the target.
X, Y = customers.iloc[:, :-1], customers.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, test_size=0.2, random_state=42
)



## Training Model

In [None]:
#%%
# Training Model
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Fit five classifiers with default hyper-parameters on the same training
# split. `fit` returns the estimator itself, so each name is bound to the
# fitted model.
clf1 = DecisionTreeClassifier().fit(train_X, train_Y)
clf2 = LogisticRegression().fit(train_X, train_Y)

# probability=True makes predict_proba available on the SVM; the ROC cell
# calls predict_proba on every model.
clf3 = SVC(probability=True).fit(train_X, train_Y)

clf4 = RandomForestClassifier().fit(train_X, train_Y)
clf5 = GradientBoostingClassifier().fit(train_X, train_Y)


## ROC Curve

In [None]:
#%%
# ROC Curve
# Draw the ROC curve of every fitted classifier on the held-out test split
# in a single figure, with each model's AUC shown in the legend.
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from matplotlib import pyplot as plt
models = [clf1, clf2, clf3, clf4, clf5]
labels = ['Decision Tree', 'Logistic Regression', 'SVM', 'Random Forest', 'Gradient Boosting']


plt.figure(figsize=[20, 10])
for title, clf in zip(labels, models):
    # Column 1 of predict_proba is the probability of the second class in
    # clf.classes_ (the positive class when the target is coded 0/1).
    probas_ = clf.predict_proba(test_X)
    fpr, tpr, thresholds = roc_curve(test_Y, probas_[:, 1])
    # Compute the AUC once and reuse it in the legend label.
    auc_score = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%s - AUC:%.2f' % (title, auc_score))

# Diagonal reference line: the ROC curve of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.title('Receiver operating characteristic example', fontsize=20)
plt.legend(loc="lower right", fontsize=20)
plt.show()



## Convert to XDF

In [None]:
#%%
# Convert to XDF
import os
# Remove any previous output so the data step starts from a clean file.
if os.path.exists('customers.xdf'):
    os.remove('customers.xdf')
# Write the engineered `customers` data frame (built in the feature
# engineering cell) to an XDF file. No variable named `df` exists in this
# script, so `customers` is the frame that must be passed here.
ds = revoscalepy.rx_data_step(customers, 'customers.xdf')
 


## Explore XDF file

In [None]:
#%%
## Explore XDF file
# `os` and `RxOptions` are imported but not used in this cell.
import os
from revoscalepy import rx_summary, RxOptions, RxXdfData
# Open the XDF file written by the "Convert to XDF" cell as a data source.
ds = RxXdfData("customers.xdf")
# NOTE(review): "." is passed as the formula -- presumably meaning
# "summarise all variables"; confirm that rx_summary accepts this form.
summary = rx_summary(".", ds)
print(summary)




## Build Model With RevoScalepy

In [None]:
#%%
## Build Model With RevoScalepy
# Fit a revoscalepy decision forest on the engineered `customers` frame and
# print the first predictions.
from revoscalepy import rx_dforest, rx_predict
# Response on the left, all engineered predictor columns on the right.
formula = "Exited ~ Germany + Spain + Male + CreditScore + Age + Tenure + Balance + NumOfProducts + HasCrCard + IsActiveMember + EstimatedSalary"
# Request a classification forest. This value has to be passed to
# rx_dforest explicitly, otherwise the library default method is used.
# NOTE(review): classification mode may require `Exited` to be a
# categorical column -- confirm, and convert with .astype('category')
# beforehand if rx_dforest rejects the numeric response.
method = "class"
model = rx_dforest(formula, customers, method=method)
# Predictions are made on the same rows the model was trained on, so they
# are in-sample and not an estimate of generalisation performance.
pred = rx_predict(model, data = customers)
print(pred.head())