### 1. Data Prep

In [9]:
import pandas as pd
import numpy as np
import kagglehub
import matplotlib.pyplot as plt
# Download latest version
# dataset_download returns a local path, which is printed below and reused
# when loading the CSV.
path = kagglehub.dataset_download("blastchar/telco-customer-churn")

print("Path to dataset files:", path)

Path to dataset files: /home/codespace/.cache/kagglehub/datasets/blastchar/telco-customer-churn/versions/1


In [10]:
# Build the CSV location from the directory kagglehub returned rather than
# hard-coding one machine's cache path, so the notebook runs on any host.
df = pd.read_csv(f'{path}/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [11]:
# Normalise header names: lower-case them and replace spaces with underscores.
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

In [12]:
# Names of every column whose dtype compares equal to "object".
categorical_columns = [
    name for name, dtype in df.dtypes.items() if dtype == "object"
]

In [13]:
# Apply the same normalisation used for the headers to the cell values of
# every object-dtype column: lower-case, spaces -> underscores.
for c in categorical_columns:
    # Overwrites the column in place, so the raw casing is not kept.
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [14]:
# Parse totalcharges as numbers; errors='coerce' turns unparseable entries into NaN.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

In [15]:
# Replace missing totalcharges values with 0.
df['totalcharges'] = df['totalcharges'].fillna(0)

In [16]:
# Encode the target as integers: 'no' -> 0, every other value -> 1.
# Not idempotent: on a second run no value equals 'no', so every row becomes 1.
df['churn'] = (df['churn'] != 'no').astype(int)

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
import random
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
!pip install tqdm
from tqdm.auto import tqdm



In [18]:
# Hold out 20% of the rows as the test set; random_state fixes the split.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [19]:
# Split the remaining 80% again: 0.25 * 0.8 = 20% of all rows go to validation,
# which gives a 60/20/20 train/validation/test partition overall.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [20]:
# Discard the original row labels so every split is indexed 0..n-1.
# df_train is reset as well, keeping all four frames consistent.
df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [21]:
# Target vectors (0/1 churn labels) for each split, as NumPy arrays.
y_train, y_val, y_test = (
    split['churn'].to_numpy() for split in (df_train, df_val, df_test)
)

In [22]:
# Numeric feature columns selected by train() and predict().
numerical = [
    'totalcharges',
    'monthlycharges',
    'tenure',
]

In [23]:
# Categorical feature columns selected by train() and predict().
# NOTE(review): the sample row shown later has seniorcitizen as the integer 0,
# so DictVectorizer presumably treats it as a number rather than one-hot
# encoding it — confirm this is intended.
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

In [24]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic-regression model on one data split.

    Parameters
    ----------
    df_train : DataFrame
        Must contain every column listed in `categorical` and `numerical`.
    y_train : array-like
        Target labels, one per row of `df_train`.
    C : float, default 1.0
        Forwarded to ``LogisticRegression(C=...)``.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted DictVectorizer and the fitted
        LogisticRegression.
    """
    feature_columns = categorical + numerical
    records = df_train[feature_columns].to_dict(orient='records')

    # sparse=False makes the vectorizer return a dense array.
    dv = DictVectorizer(sparse=False)
    features = dv.fit_transform(records)

    model = LogisticRegression(C=C, max_iter=5000)
    model.fit(features, y_train)
    return dv, model

In [25]:
def predict(df, dv, model):
    """Return the positive-class probability for every row of `df`.

    Parameters
    ----------
    df : DataFrame
        Must contain every column listed in `categorical` and `numerical`.
    dv : DictVectorizer
        Vectorizer already fitted by ``train``; it is not modified here.
    model : fitted classifier exposing ``predict_proba``.

    Returns
    -------
    ndarray
        Column 1 of ``model.predict_proba``, one value per row of `df`.
    """
    dicts = df[categorical + numerical].to_dict(orient='records')
    # transform(), not fit_transform(): re-fitting would rebuild the feature
    # vocabulary from `df`, so the columns could differ from those the model
    # was trained on, and it would also mutate the caller's vectorizer.
    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:, 1]
    return y_pred

In [26]:
# 5-fold splitter with shuffling; random_state fixes the shuffle order.
# NOTE(review): no later cell visible here uses kfold — confirm it is still needed.
kfold = KFold(n_splits = 5, shuffle=True, random_state=1)

In [27]:
# Final model: fit on train+validation, then score once on the held-out test set.
C = 1.0
dv, model = train(df_full_train, df_full_train.churn.values, C=C)
y_predict = predict(df_test, dv, model)

# Stored as test_auc so the sklearn.metrics.auc function imported earlier
# is not overwritten by a float.
test_auc = roc_auc_score(y_test, y_predict)
test_auc

0.8583517501381259

In [None]:
# (Stray, never-executed expression removed: it referenced the undefined name
# `y` and would raise NameError under Restart & Run All.)

### Save the model

In [24]:
import pickle

In [26]:
# The file name embeds the value of C used for training (e.g. 'model_C=1.0.bin').
output_file = f'model_C={C}.bin'
output_file

'model_C=1.0.bin'

In [30]:
# Serialise the vectorizer and the model together as one tuple in a single
# binary file; the load step unpacks the same (dv, model) pair.
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)

### Load the model

In [4]:
import pickle

In [5]:
# Must match the file name written by the save step (output_file for C=1.0).
model_file = 'model_C=1.0.bin'

In [6]:
# Restore the (vectorizer, model) tuple written by the save step.
# SECURITY: pickle.load can execute arbitrary code — only load files you
# created yourself or otherwise trust.
with open(model_file, 'rb') as f_in:
    (dv, model) = pickle.load(f_in)

In [37]:
# Use the first test-set row (a pandas Series) as a sample customer.
# Depends on df_test from the data-prep section being defined in the kernel.
customer = df_test.iloc[0]
customer

customerid                         8879-zkjof
gender                                 female
seniorcitizen                               0
partner                                    no
dependents                                 no
tenure                                     41
phoneservice                              yes
multiplelines                              no
internetservice                           dsl
onlinesecurity                            yes
onlinebackup                               no
deviceprotection                          yes
techsupport                               yes
streamingtv                               yes
streamingmovies                           yes
contract                             one_year
paperlessbilling                          yes
paymentmethod       bank_transfer_(automatic)
monthlycharges                          79.85
totalcharges                          3320.75
churn                                       0
Name: 0, dtype: object

In [34]:
# Vectorise the single customer with the already-fitted vectorizer.
# NOTE(review): `customer` is a Series that still carries customerid and churn;
# this presumably works because DictVectorizer.transform ignores features it
# did not see during fit — confirm, or pass an explicit dict of feature columns.
X = dv.transform([customer])

In [35]:
# One row per input; the columns follow model.classes_, so with 0/1 labels
# this is [P(churn=0), P(churn=1)].
model.predict_proba(X)

array([[0.93236913, 0.06763087]])