## Customer Churn Prediction Model

Using the [Telco customer churn data](https://www.kaggle.com/code/mechatronixs/telco-churn-prediction-feature-engineering-eda/data) from Kaggle, train a machine learning model to predict customer churn.

In [None]:
import pandas as pd

In [None]:
# Training split. customerID is not a feature for training, so it is
# removed as soon as the file is read.
train = pd.read_csv("./data/training_data.csv").drop(columns="customerID")

# Validation split, loaded as-is.
val = pd.read_csv("./data/validation_data.csv")

In [None]:
# Record describing a single customer, keyed by dataset column name.
# NOTE(review): none of the right-hand names (gender, senior_citizen, ...)
# are assigned in the visible cells, so this cell raises NameError unless
# they are defined elsewhere -- confirm before running.
# NOTE(review): there is no 'Churn' key, so this presumably mirrors the
# shape of an inference-time input -- TODO confirm.
customer_row = {
    'gender': gender,
    'SeniorCitizen': senior_citizen, 
    'Partner': partner, 
    'Dependents': dependents,
    'tenure': tenure,
    'PhoneService': phone_service,
    'MultipleLines': multiple_lines,
    'InternetService': internet_service,
    'OnlineSecurity': online_security,
    'OnlineBackup': online_backup,
    'DeviceProtection': device_protection,
    'TechSupport': tech_support,
    'StreamingTV': streaming_tv,
    'StreamingMovies': streaming_movies,
    'Contract': contract,
    'PaperlessBilling': paperless_billing,
    'PaymentMethod': payment_method,
    'MonthlyCharges': monthly_charges,
    'TotalCharges': total_charges
}

In [None]:
from sklearn.preprocessing import LabelEncoder

# Single-column demonstration: fit an encoder on the 'gender' column of the
# training data, then encode that same column.
le = LabelEncoder().fit(train['gender'])

transformed_column = le.transform(train['gender'])



In [None]:
# Display the column labels of the training frame.
train.columns

In [None]:
# Columns to label-encode: the training column list with the numerical
# columns removed. 'Churn' is kept in the list so it is encoded as well.
categorical_columns = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'Churn',
]

In [None]:
# One fitted LabelEncoder per categorical column, keyed by column name.
# Encoders are only fitted here; the training frame itself is not modified.
column_mapper = {
    name: LabelEncoder().fit(train[name])
    for name in categorical_columns
}

In [None]:
def pre_process_data(df, label_encoder_dict):
    """Return a cleaned, label-encoded copy of ``df``.

    The input frame is never modified. The steps are:

    1. Replace blank (``" "``) cells with ``0``.
    2. Convert ``TotalCharges`` to a numeric column.
    3. Drop ``customerID`` when it is present.
    4. Encode each column that has an entry in ``label_encoder_dict``.

    Args:
        df: Raw customer data. Must contain a ``TotalCharges`` column.
        label_encoder_dict: Mapping of column name to a fitted encoder
            exposing ``transform``. Entries whose column is missing from
            ``df`` are skipped, so the same mapping can be used on frames
            that lack some encoded columns (e.g. no ``Churn`` column).

    Returns:
        A new DataFrame with the transformations above applied.

    Raises:
        KeyError: If ``df`` has no ``TotalCharges`` column.
        ValueError: If ``TotalCharges`` holds a value that cannot be parsed
            as a number. Errors raised by an encoder's ``transform`` are
            propagated unchanged.
    """
    # ``replace`` returns a new frame, so the caller's ``df`` stays untouched.
    df_out = df.replace(" ", 0)

    # Assign with ``df_out[col] = ...`` instead of ``df_out.loc[:, col] = ...``:
    # the ``.loc`` form writes into the existing column and can leave it with
    # ``object`` dtype, while plain assignment swaps in the new column together
    # with its numeric dtype.
    df_out["TotalCharges"] = pd.to_numeric(df_out["TotalCharges"])

    df_out = df_out.drop(columns="customerID", errors="ignore")

    for column, le in label_encoder_dict.items():
        # Skip encoders for columns this frame does not carry.
        if column not in df_out.columns:
            continue
        df_out[column] = le.transform(df_out[column])

    return df_out

In [None]:
# Run both splits through the same preprocessing with the same encoders,
# training split first, then show the processed validation frame.
train_processed, val_processed = (
    pre_process_data(split, column_mapper) for split in (train, val)
)
val_processed

In [None]:
# Split the processed training frame into the target ('Churn', cast to int)
# and the feature matrix (every other column).
# NOTE(review): the original cell carried the remark "error after changing
# to int" next to the cast -- confirm whether that is still an open issue.
y_train = train_processed['Churn'].astype(int)
x_train = train_processed.drop(columns='Churn')

y_train

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression (iteration cap raised to 1000) on the training
# features and target; the trailing expression displays the fitted model.
model = LogisticRegression(max_iter=1000).fit(x_train, y_train)
model

In [None]:
# Split the processed validation frame the same way as the training frame:
# 'Churn' (cast to int) is the target, every other column is a feature.
y_val = val_processed['Churn'].astype(int)
x_val = val_processed.drop(columns='Churn')

# Predict a label for every validation row and display the result.
predictions = model.predict(x_val)
predictions

In [None]:
# Tabulate the fitted model coefficients, using the validation feature
# names as the column headers.
pd.DataFrame(model.coef_, columns=x_val.columns)

In [None]:
from sklearn.metrics import accuracy_score

# Score the validation predictions against the true validation labels and
# report the result rounded to three decimals.
accuracy = accuracy_score(y_true=y_val, y_pred=predictions)
print(f"Classification accuracy: {round(accuracy, 3)}")

In [None]:
import pickle

# Persist the fitted model and the per-column label encoders, one pickle
# file each, so both can be reloaded together later.
for artifact_path, artifact in (
    ("./models/churn_prediction_model.pkl", model),
    ("./models/churn_prediction_label_encoder.pkl", column_mapper),
):
    with open(artifact_path, "wb") as pickler:
        pickle.dump(artifact, pickler)