## Get parameters for this Notebook

Update the widget parameters below before you run the code cells.

You need the Azure Blob Storage account name and account key.

In [2]:
# Create the notebook widgets (parameters) and read their current values.
#   dirpath         - folder name under /mnt used as the mount point
#   blobAccountName - storage account name
#   blobAccountKey  - storage account key (secret)
#   blobContainer   - container to mount

dbutils.widgets.text("dirpath", "workshop")
dbutils.widgets.text("blobAccountName", "")
dbutils.widgets.text("blobAccountKey", "")
dbutils.widgets.text("blobContainer", "ingest")

# dbutils.widgets.get returns the widget value; use it directly instead of
# discarding it and re-reading through the legacy getArgument helper.
dirpath = dbutils.widgets.get("dirpath")
blob = dbutils.widgets.get("blobAccountName")
blobkey = dbutils.widgets.get("blobAccountKey")
container = dbutils.widgets.get("blobContainer")

print(dirpath)
print(blob)
# Never echo the account key: cell output is saved with the notebook.
print("blobAccountKey provided:", bool(blobkey))
print(container)

In [3]:
# Build the path under /mnt from the dirpath widget and create that folder
mount_root = "/mnt/"
fullpath = mount_root + dirpath
dbutils.fs.mkdirs(fullpath)
print(fullpath)

In [4]:
# Mount the blob container at fullpath.
# dbutils.fs.mount raises if the mount point is already in use, so check the
# existing mounts first; this keeps the cell safe to re-run.
storage_host = blob + ".blob.core.windows.net"
source_url = "wasbs://" + container + "@" + storage_host

if any(mount.mountPoint == fullpath for mount in dbutils.fs.mounts()):
    print(fullpath + " is already mounted; skipping mount")
else:
    dbutils.fs.mount(
        source=source_url,
        mount_point=fullpath,
        extra_configs={"fs.azure.account.key." + storage_host: blobkey},
    )

In [5]:
%python

df = sqlContext.read.format('csv').options(header='true', inferSchema='true').load(fullpath+"/customerchurnsource.csv")
display(df)

In [6]:
%r
library(SparkR)

# Build the CSV path from the "dirpath" widget rather than hard-coding
# /mnt/workshop, so this cell follows the same parameter as the Python cells.
churn_csv_path <- paste0("/mnt/", dbutils.widgets.get("dirpath"), "/customerchurnsource.csv")

# Read the CSV with a header row and inferred column types.
dfr <- read.df(churn_csv_path, source = "csv", header="true", inferSchema = "true")

display(dfr)

In [7]:
%scala 

// Derive the mount folder from the "dirpath" widget rather than hard-coding
// /mnt/workshop, so this cell follows the same parameter as the Python cells.
val fullpath = "/mnt/" + dbutils.widgets.get("dirpath")

// Read the CSV with a header row and inferred column types.
val dfs = sqlContext.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load(fullpath+"/customerchurnsource.csv")

display(dfs)

In [8]:
%sql
-- Recreate tblchurn as a table backed by the mounted CSV file.
-- mode "FAILFAST" will abort file parsing with a RuntimeException if any malformed lines are encountered
-- The path is absolute (leading "/") so it points at the same /mnt/workshop
-- location that the other cells read from.
DROP TABLE IF EXISTS tblchurn;

CREATE TABLE IF NOT EXISTS tblchurn
  USING csv
  OPTIONS (path "/mnt/workshop/customerchurnsource.csv", header "true", mode "FAILFAST");
  
SELECT * FROM tblchurn

In [9]:
# Print the column names and inferred types of the DataFrame read from the CSV
df.printSchema()

In [10]:
# Render the DataFrame as a table before it is converted to pandas below
display(df)

## Save data as CSV

In [12]:
import pandas as pd
import numpy as np
import csv

In [13]:
# Convert the Spark DataFrame to a pandas DataFrame for the cleaning and
# scikit-learn steps below. The isinstance guard makes the cell safe to
# re-run: once df is already pandas it no longer has a toPandas() method.
if not isinstance(df, pd.DataFrame):
    df = df.toPandas()

In [14]:
# Basic cleaning: replace missing values with 0, remove duplicate rows, and
# drop the 'year' and 'month' columns.
df = df.fillna(0)
df = df.drop_duplicates()
# Pass axis by keyword: the positional form drop(labels, 1) was deprecated in
# pandas 1.3 and removed in pandas 2.0.
df = df.drop(['year','month'], axis=1)

In [15]:
# Turn the cleaned pandas frame back into a Spark DataFrame and render it
cleaned_sdf = spark.createDataFrame(df)
display(cleaned_sdf)

In [16]:
# df is a pandas DataFrame at this point (it was converted with toPandas()),
# and pandas frames have no .coalesce/.write, so convert back to Spark first.
# coalesce(1) collapses the data to a single partition before writing.
# The target uses fullpath so it follows the dirpath widget like the CSV read.
# NOTE: Spark's default save mode raises if the target already exists; add
# .mode("overwrite") if re-runs should replace the previous output.
(
    spark.createDataFrame(df)
    .coalesce(1)
    .write.format("com.databricks.spark.csv")
    .option("header", "true")
    .save(fullpath + "/azmlstudio.csv")
)

## Run Machine Learning with scikit-learn

In [18]:
# Customer Churn Prediction
import pickle

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
# The fallback keeps the notebook working on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder

In [19]:
# One-Hot Encoding
# Replace every categorical/object column with one indicator column per
# distinct value, named "<column>_<value>".
# pd.get_dummies(columns=...) builds those names itself, so it also works when
# a value is not a string (e.g. the 0 that fillna(0) puts into object columns),
# where manual `column + '_' + value` concatenation raises TypeError.
columns_to_encode = list(df.select_dtypes(include=['category','object']))
if columns_to_encode:
    df = pd.get_dummies(df, columns=columns_to_encode, prefix_sep='_')


In [20]:
# Train a Gaussian Naive Bayes classifier on a 70/30 train/test split and
# report its accuracy on the held-out rows. 'churn' is the label column.
model = GaussianNB()

random_seed = 42
train, test = train_test_split(df, random_state = random_seed, test_size = 0.3)

# Split label from features. axis is passed by keyword because the positional
# form drop(label, 1) was removed in pandas 2.0.
target = train['churn'].values
train = train.drop('churn', axis=1).values
model.fit(train, target)

# Convert the test features to a plain array as well, so fit and predict
# receive the same kind of input.
expected = test['churn'].values
test = test.drop('churn', axis=1).values
predicted = model.predict(test)
print("Naive Bayes Classification Accuracy", accuracy_score(expected, predicted))

In [21]:
# Fit a decision tree on the same training split and score it on the same
# held-out rows as the Naive Bayes model.
dt = DecisionTreeClassifier(random_state=99, min_samples_split=20)
dt.fit(train, target)
predicted = dt.predict(test)
dt_accuracy = accuracy_score(expected, predicted)
print("Decision Tree Classification Accuracy", dt_accuracy)

In [22]:
# Serialize the fitted Naive Bayes model to model.pkl.
# The with-block closes the file handle even if pickling fails.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)