In the previous session we trained a model for predicting churn and evaluated it. Now let's deploy

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
data = pd.read_csv("C:/Users/justine.o_kobo360/Desktop/ML ZoomCamp Tutorials/Python Scripts/Churn_data.zip")

data.columns = (data.columns
                .str.lower()
                .str.replace(" ", "_")
               )   
categorical_columns = list(data.dtypes[data.dtypes == 'object'].index)

for c in categorical_columns:
    data[c] = data[c].str.lower().str.replace(" ", "_")
    
    
data.totalcharges = pd.to_numeric(data.totalcharges, errors='coerce')
data.totalcharges = data.totalcharges.fillna(0)

data.churn = (data.churn == 'yes').astype(int)

In [3]:
df_full_train, df_test = train_test_split(data, test_size=0.2, random_state=1)

In [4]:
numerical = ['tenure', 'monthlycharges', 'totalcharges']


categorical = [
 'gender',
 'seniorcitizen',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 ]

In [5]:
def train(df_train, y_train, C=1.0):
    dicts = df_train[categorical + numerical].to_dict(orient='records')
    
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)
    
    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(X_train, y_train)
    
    return dv, model

In [6]:
def predict(df, dv, model):
    dicts = df[categorical + numerical].to_dict('records')
    
    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:, 1]
    
    
    return y_pred
    

In [7]:
C = 1.0
n_splits = 5

In [10]:
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    

scores = []
    
for train_idx, val_idx in (kfold.split(df_full_train)):
        df_train = df_full_train.iloc[train_idx]
        df_val = df_full_train.iloc[val_idx]
    
        y_train = df_train.churn.values
        y_val = df_val.churn.values
    
        dv, model = train(df_train, y_train, C=C)
        y_pred = predict(df_val, dv,  model)
    
        auc = roc_auc_score(y_val, y_pred)
        scores.append(auc)
    
print("C=%s %.3f +- %.3f" % (C, np.mean(scores), np.std(scores)))
    

C=1.0 0.841 +- 0.008


In [11]:
scores

[0.8420532062704587,
 0.8455854357038802,
 0.8324182219546458,
 0.8301724275756219,
 0.8526324872140818]

In [13]:
dv, model = train(df_full_train, df_full_train.churn.values, C=1.0)
y_pred = predict(df_test, dv,  model)
    
y_test = df_test.churn.values
auc = roc_auc_score(y_test, y_pred)
auc

0.8572386167896259

## Save The Model

In [14]:
import pickle

In [15]:
output_file = f'model_C=%s.bin' % C
output_file

'model_C=1.0.bin'

In [16]:
f_out = open(output_file, 'wb')
pickle.dump((dv, model), f_out)
f_out.close()

In [17]:
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)

## Load the Model

In [1]:
import pickle

In [2]:
model_file = 'model_C=1.0.bin'

In [3]:
with open(model_file, 'rb') as f_in:
    (dv, model) = pickle.load(f_in)

In [4]:
dv, model

(DictVectorizer(sparse=False), LogisticRegression(max_iter=1000))

In [14]:
customer = {
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'yes',
    'dependents': 'no',
    'phoneservice': 'no',
    'multiplelines': 'no_phone_service',
    'internetservice': 'no',
    'onlinesecurity': 'no',
    'onlinebackup': 'yes',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'no',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_clock',
    'tenure': 1,
    'monthlycharges': 29.85,
    'totalcharges': 29.85
}

In [18]:
X = dv.transform([customer])

In [19]:
model.predict_proba(X)[0, 1]

0.6235876075904482

Making requests(create a separate notebook for that)