In [37]:
#import dependencies :  
import pickle
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold 

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import roc_auc_score 


In [29]:

# Run configuration: regularization strength C, number of cross-validation
# folds, and the output path of the pickled artifact (C is embedded in the
# file name).

C, n_splits = 1.0, 5
output_file = f"model_C={C}.bin"

In [11]:
# Load the raw customer table from disk.
path_data = "data.csv"
df = pd.read_csv(path_data)

# Normalize the column names: lower case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(" ", "_")

# Apply the same normalization to the values of every object-dtype column.
categorical_columns = df.select_dtypes(include=["object"]).columns.tolist()
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(" ", "_")

# totalcharges: entries that cannot be parsed as numbers become NaN
# (errors="coerce") and are then replaced by 0.
df["totalcharges"] = pd.to_numeric(df["totalcharges"], errors="coerce").fillna(0)

# Encode the target as an integer: 1 where churn == "yes", 0 otherwise.
df["churn"] = (df["churn"] == "yes").astype(int)

In [12]:
# Hold out 20% of the rows as the test set; the remaining 80% is the
# "full train" split. The seed is fixed so the split is repeatable.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [12]:
# Columns used as model features, grouped by kind.

# Numerical features.
numerical = ["tenure", "monthlycharges", "totalcharges"]

# Categorical features.
categorical = [
    "gender", "seniorcitizen", "partner", "dependents",
    "phoneservice", "multiplelines", "internetservice",
    "onlinesecurity", "onlinebackup", "deviceprotection", "techsupport",
    "streamingtv", "streamingmovies",
    "contract", "paperlessbilling", "paymentmethod",
]

In [18]:
from sklearn.feature_extraction import DictVectorizer

# Toy example: fit a DictVectorizer on three small dicts with string values
# and print the input next to the resulting dense matrix.
data = [
    dict(Color="Red", Taste="Sweet"),
    dict(Color="Yellow", Taste="Sweet"),
    dict(Color="Yellow", Taste="Sour"),
]

vectorizer = DictVectorizer(sparse=False)
transformed_data = vectorizer.fit_transform(data)

print(data, transformed_data)

[{'Color': 'Red', 'Taste': 'Sweet'}, {'Color': 'Yellow', 'Taste': 'Sweet'}, {'Color': 'Yellow', 'Taste': 'Sour'}] [[1. 0. 0. 1.]
 [0. 1. 0. 1.]
 [0. 1. 1. 0.]]


In [20]:
# Preview the first rows, transposed so that each column is shown as a row.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [10]:
# Training helper: vectorize the feature columns and fit the classifier.
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic-regression model on a training frame.

    Args:
        df_train: DataFrame providing the columns named in the module-level
            ``categorical`` and ``numerical`` lists.
        y_train: Target values passed to ``model.fit`` together with the
            vectorized rows of ``df_train``.
        C: Value forwarded to ``LogisticRegression(C=...)``.

    Returns:
        Tuple ``(dv, model)``: the fitted DictVectorizer and the fitted
        LogisticRegression.
    """
    # One dict per row, restricted to the feature columns.
    feature_frame = df_train[categorical + numerical]
    records = feature_frame.to_dict(orient="records")

    # Learn the feature mapping and build the dense training matrix.
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(records)

    # fit() returns the estimator itself, so construction and fitting chain.
    model = LogisticRegression(C=C, max_iter=1000).fit(X_train, y_train)
    return dv, model

In [9]:
# Prediction helper: vectorize a frame with a fitted DictVectorizer and score it.
def predict(df, dv, model):
    """Return column 1 of ``model.predict_proba`` for every row of ``df``.

    Args:
        df: DataFrame providing the columns named in the module-level
            ``categorical`` and ``numerical`` lists.
        dv: Fitted vectorizer; only ``transform`` is called on it.
        model: Fitted classifier exposing ``predict_proba``.

    Returns:
        One-dimensional array with one probability per row of ``df``.
    """
    records = df[categorical + numerical].to_dict(orient="records")
    features = dv.transform(records)
    probabilities = model.predict_proba(features)
    return probabilities[:, 1]

In [34]:
#validation 

In [35]:
# Announce which value of C is about to be cross-validated.
print("doing the validation know with the C={}".format(C))

doing the validation know with the C=1.0


In [38]:
# K-fold cross-validation on the full training split: each fold trains on the
# remaining folds and is scored with ROC AUC on the held-out fold.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
scores = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(df_full_train)):
    # kfold.split yields positional indices, hence .iloc.
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    y_train = df_train.churn.values
    y_val = df_val.churn.values

    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

    print(f'auc on fold {fold} is {auc}')


print('validation results:')
print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))

# Train the final model on the whole training split and score it on the
# held-out test set.

print('training the final model')

# Use the configured C (not a literal 1.0): the cross-validation above and the
# output file name both depend on C, so the saved model must be trained with
# the same value.
dv, model = train(df_full_train, df_full_train.churn.values, C=C)
y_pred = predict(df_test, dv, model)

y_test = df_test.churn.values
auc = roc_auc_score(y_test, y_pred)

print(f'auc={auc}')


# Save the vectorizer and the model together as one pickled tuple so that both
# can be restored with a single load.

with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)

print(f'the model is saved to {output_file}')
 

auc on fold 0 is 0.8423828998908871
auc on fold 1 is 0.8455854357038802
auc on fold 2 is 0.8307405177603852
auc on fold 3 is 0.8301724275756219
auc on fold 4 is 0.8522677355589721
validation results:
C=1.0 0.840 +- 0.009
training the final model
auc=0.8572386167896259
the model is saved to model_C=1.0.bin


In [1]:
# load the model 

In [2]:
import pickle 

In [4]:
# Restore the pickled (DictVectorizer, model) tuple from disk.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
input_file = "model_C=1.0.bin"
with open(input_file, "rb") as in_file:
    dv, model = pickle.load(in_file)

# Display both objects to confirm what was loaded.
dv, model

(DictVectorizer(sparse=False), LogisticRegression(max_iter=1000))

In [22]:
# Sample customer used to check that the loaded model works. It is kept as a
# plain dict here and wrapped in a DataFrame before being scored.
customer = dict(
    customerid='8879-zkjof',
    gender='female',
    seniorcitizen=0,
    partner='no',
    dependents='no',
    tenure=41,
    phoneservice='yes',
    multiplelines='no',
    internetservice='dsl',
    onlinesecurity='yes',
    onlinebackup='no',
    deviceprotection='yes',
    techsupport='yes',
    streamingtv='yes',
    streamingmovies='yes',
    contract='one_year',
    paperlessbilling='yes',
    paymentmethod='bank_transfer_(automatic)',
    monthlycharges=79.85,
    totalcharges=3320.75,
)

In [6]:
# re-import pandas:

In [13]:
import pandas as pd 
# Wrap the single customer dict in a one-row DataFrame, since predict()
# selects its feature columns from a frame, then score it.
t_test = pd.DataFrame([customer])
y_predict_1 = predict(df=t_test, dv=dv, model=model)

In [15]:
# Show the first (and, for the one-row frame built from `customer`, only)
# predicted probability.
print(y_predict_1[0])

0.06224295541362429


In [17]:
def predict_single(customer):
    """Print whether a single customer is predicted to churn.

    The probability is computed by ``predict`` using the module-level ``dv``
    and ``model``. A probability below 0.5 is reported as false, anything else
    as true.

    Args:
        customer: Dict of raw feature values for one customer; it must provide
            every column that ``predict`` selects.
    """
    customer_frame = pd.DataFrame([customer])
    # predict() returns one probability per row; the frame has exactly one row,
    # so take the scalar. The threshold test and the message then work on a
    # plain number instead of a one-element array.
    churn_probability = predict(customer_frame, dv, model)[0]
    if churn_probability < 0.5:
        print(f"the result is false with  :{churn_probability}")
    else:
        print(f"the result is true with: {churn_probability}")
    

In [18]:
# Run the single-customer helper on the sample customer defined in this notebook.
predict_single(customer)

the result is false with  :[0.06224296]


In [27]:

# Vectorize the raw dict directly (no DataFrame) and read column 0 of
# predict_proba for the single row. predict() reads column 1 instead, so this
# value is the complement of the churn probability printed earlier.
X = dv.transform([customer])
model.predict_proba(X)[0, 0]

0.9377570445863757

In [28]:
# Print the current working directory via the shell.
!pwd

/home/yassine/mlzoomcamp/churn_prediction
