In [None]:
#!pip install scikit-learn
#!pip install graphviz

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy.stats import wilcoxon
from scipy import stats
from sklearn.model_selection import train_test_split

# Shared settings: the Random Forest cells and the final deployment pipeline read
# global_n_estimators / global_max_depth; global_random_state seeds the splits and the forests.
# NOTE(review): the results summary further down names 24 estimators / max depth 9 as the chosen
# configuration, while the values below are what the deployment pipeline is built with - confirm.
global_n_estimators=30
global_max_depth=11
global_random_state=42

<h1>EDA</h1>

In [None]:
# Load the raw customer table (path is relative to the notebook's working directory).
df = pd.read_csv('churn.csv')

# Keep an independent snapshot of the raw data. A plain `original_df = df` would only be a second
# name for the same object, so any in-place change to df would silently alter the "original"
# that prep() consumes near the end of the notebook.
original_df = df.copy()
df.info()

In [None]:
# Dimensions of the raw table.
n_rows, n_columns = df.shape
print('Number of rows:', n_rows)
print('Number of columns:', n_columns)

In [None]:
# Per-column count of missing values in the freshly loaded table.
df.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
duplicated_rows = df.duplicated().sum()
print(f"The amount of duplicated numbers are {duplicated_rows}")

In [None]:
# Peek at the first 20 raw rows.
df.head(20)

In [None]:
# basic preparations of the data: work with lower-case column names from here on
df = df.rename(columns=str.lower)

# totalcharges is expected to arrive as text with blank entries; convert it to numbers and turn
# anything unparseable into NaN. Going through astype(str) keeps this cell re-runnable: on a
# second run the column is already numeric and the .str accessor alone would raise.
df['totalcharges'] = pd.to_numeric(df['totalcharges'].astype(str).str.strip(), errors='coerce')

# Report how many values could not be parsed before they are imputed (a bare expression in the
# middle of a cell is evaluated but never displayed).
print('Missing totalcharges after conversion:', df['totalcharges'].isna().sum())

# Replace the n/a with an estimation of the total charge: monthly charge x months of tenure
df['totalcharges'] = df['totalcharges'].fillna(df['monthlycharges'] * df['tenure'])

In [None]:
# Displaying the problematic rows.
# By the time this cell runs, totalcharges has already been imputed, so filtering df on
# df['totalcharges'].isna() always yields an empty frame. Re-derive the mask from the raw
# snapshot instead and show those rows with the values that were filled in.
raw_total_charges = original_df.rename(columns=str.lower)['totalcharges']
was_unparseable = pd.to_numeric(raw_total_charges.astype(str).str.strip(), errors='coerce').isna()

# The raw snapshot and df share the same row index, so the boolean mask aligns row by row.
problem_rows = df.loc[was_unparseable, ['customerid', 'totalcharges']]

print(problem_rows['customerid'].count(),problem_rows)

In [None]:
# Per-column count of missing values after totalcharges has been converted and imputed.
df.isnull().sum()

In [None]:
# Gap between what the customer actually paid in total and what the current monthly price
# would have added up to over the whole tenure.
# if the number is negative we assume the price raised during the tenure
# if the number is 0 there was no change in price during the tenure
# if the number is positive we assume the price dropped during the tenure
charge_gap = df['totalcharges'] - (df['monthlycharges']*df['tenure'])

df_diff = pd.DataFrame({
    'diff': charge_gap,
    'price_remain': (charge_gap == 0).astype(int),
    'price_raised': (charge_gap < 0).astype(int),
    'price_dropped': (charge_gap > 0).astype(int),
    'churned': df['churn'],
})

# Mean and spread of the gap for churned vs. retained customers.
df2 = df_diff.groupby('churned')['diff'].agg(['mean', 'std'])
df2

In [None]:
# Share of churned vs. retained customers.
churn_counts = df['churn'].value_counts()
ax = churn_counts.plot.pie(autopct='%1.1f%%', colors=['skyblue', 'lightcoral'])
ax.set_title('churn distribution')
ax.set_ylabel('')  # drop the automatic y label that pandas adds to pie charts
plt.show()


In [None]:
# 3. Distribution of the churn variable (absolute counts)
ax = sns.countplot(data=df, x='churn')
ax.set_title('churn Distribution')
ax.grid(axis='y', linestyle='--')
plt.show()

In [None]:
# Summary statistics of the tenure column.
df[['tenure']].describe()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# One histogram per panel: (column, panel title, x-axis label)
histogram_panels = [
    ('totalcharges', 'Distribution of totalcharges', 'totalcharges'),
    ('tenure', 'Distribution of Tenure', 'Tenure'),
]
for ax, (column, panel_title, x_label) in zip(axes, histogram_panels):
    ax.hist(df[column], bins=20, color='skyblue', edgecolor='black')
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

plt.show()

In [None]:
fig, axes = plt.subplots(3, 2, figsize=(12, 12))

# separation based on gender
# value_counts() orders its result by frequency, so the hard-coded labels would end up on the
# wrong wedges as soon as the group sizes flip. Pin the order to match the labels and colours.
gender_counts = df['gender'].value_counts().reindex(['Male', 'Female'], fill_value=0)
axes[0, 0].pie(gender_counts, labels=["Male", "Female"], autopct='%1.1f%%', colors=['skyblue', 'lightcoral'])
axes[0, 0].set_title('Segmentation by gender')

def get_churned_percentage(group_name, group_value):
    """Return the share of churned customers inside one group of the global `df`.

    The group is every row where df[group_name] == group_value; a row counts as churned when
    its 'churn' value is 'Yes'. Despite the name, the result is a fraction between 0 and 1,
    not a percentage. Raises ZeroDivisionError when no row belongs to the group.
    """
    group_rows = df[df[group_name] == group_value]
    churned_rows = group_rows[group_rows['churn'] == 'Yes']
    return len(churned_rows) / len(group_rows)

# Right-hand pies: each wedge is a group's churn rate relative to the sum of both rates, so the
# printed percentages compare the two groups; they are not the churn rates themselves.
group_a_percentage = get_churned_percentage('gender', 'Male')
group_b_percentage = get_churned_percentage('gender', 'Female')
axes[0, 1].pie([group_a_percentage,group_b_percentage], labels=["churned_male", "churned_female"], autopct='%1.1f%%', colors=['skyblue', 'lightcoral'])
axes[0, 1].set_title('Relative churn rate by gender')

# separation based on partner
# value_counts() sorts by frequency; pin the order so wedges, labels and colours always match.
partner_counts = df['partner'].value_counts().reindex(['No', 'Yes'], fill_value=0)
axes[1, 0].pie(partner_counts, labels=['Without a spouse','With a spouse'], autopct='%1.1f%%', colors=['lightgreen', 'lightpink'])
axes[1, 0].set_title('Segmentation by spouse')

# Same group order as the left-hand pie ('No' first) so each group keeps its colour across the row.
group_a_percentage = get_churned_percentage('partner', 'No')
group_b_percentage = get_churned_percentage('partner', 'Yes')
axes[1, 1].pie([group_a_percentage,group_b_percentage], labels=["churned_without_spouse", "churned_with_spouse"], autopct='%1.1f%%', colors=['lightgreen', 'lightpink'])
axes[1, 1].set_title('Relative churn rate by spouse')

# separation based on dependents
dependents_counts = df['dependents'].value_counts().reindex(['No', 'Yes'], fill_value=0)
axes[2, 0].pie(dependents_counts, labels=['Without dependents', 'With dependents'], autopct='%1.1f%%', colors=['lightgrey', 'lightyellow'])
axes[2, 0].set_title('Segmentation by dependents')

group_a_percentage = get_churned_percentage('dependents', 'No')
group_b_percentage = get_churned_percentage('dependents', 'Yes')
axes[2, 1].pie([group_a_percentage,group_b_percentage], labels=["churned_without_dependents", "churned_with_dependents"], autopct='%1.1f%%', colors=['lightgrey', 'lightyellow'])
axes[2, 1].set_title('Relative churn rate by dependents')

plt.show()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# One boxplot per panel: totalcharges on the left, tenure on the right.
boxplot_panels = (('totalcharges', 'Total Charges'), ('tenure', 'Tenure'))
for ax, (column, display_name) in zip(axes, boxplot_panels):
    df.boxplot(column=column, ax=ax)
    ax.set_title(f'Boxplot of {display_name}')
    ax.set_ylabel(display_name)

plt.show()

In [None]:
# monthlycharges: histogram with a kernel density estimate on top
fig_monthly, ax_monthly = plt.subplots(figsize=(12, 6))
sns.histplot(df['monthlycharges'], bins=30, kde=True, ax=ax_monthly)
ax_monthly.set_title('Monthly distribution')
ax_monthly.set_xlabel('Monthly Charges')
ax_monthly.grid(axis='y', linestyle='--')
plt.show()

# contract: number of customers per contract type
fig_contract, ax_contract = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='contract', ax=ax_contract)
ax_contract.set_title('contract Type counts')
ax_contract.grid(axis='y', linestyle='--')
plt.show()

<h1>Data Engineering / Prep </h1>

In [None]:
def create_dummies(data_frame, column_name):
    """Return a copy of `data_frame` with `column_name` replaced by integer indicator columns.

    Indicator columns are named '<column_name>_<category>'. The first category (in sorted
    order) is dropped, so k categories produce k-1 columns.
    """
    encoding_options = dict(columns=[column_name], prefix=column_name, drop_first=True, dtype=int)
    return pd.get_dummies(data_frame, **encoding_options)

In [None]:
df['charge_diff'] = df['totalcharges'] - (df['monthlycharges']*df['tenure'])
# if the number is negative we assume the price raised during the tenure
# if the number is 0 there was no change in price during the tenure
# if the number is positive we assume the price dropped during the tenure
df['price_remain'] = (df['charge_diff'] == 0).astype(int)
df['price_raised'] = (df['charge_diff'] < 0).astype(int)
df['price_dropped'] = (df['charge_diff'] > 0).astype(int)

# Two-valued text columns and the integer code of each label.
binary_encodings = {
    'churn': {'No': 0, 'Yes': 1},
    'gender': {'Male': 0, 'Female': 1},
    'partner': {'No': 0, 'Yes': 1},
    'dependents': {'No': 0, 'Yes': 1},
    'phoneservice': {'No': 0, 'Yes': 1},
    'paperlessbilling': {'No': 0, 'Yes': 1},
}
for column, encoding in binary_encodings.items():
    # map() builds a new numeric column instead of writing integers into a text column, and a
    # label missing from the encoding becomes NaN, which makes astype(int) fail loudly.
    df[column] = df[column].map(encoding).astype(int)

# Multi-valued categorical columns become indicator columns (first category dropped).
categorical_columns = [
    'contract', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup',
    'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'paymentmethod',
]
for column in categorical_columns:
    df = create_dummies(df, column)

customer_id = df['customerid']

# info() prints its report itself and returns None, so it must not be wrapped in print().
df.info()
df.head(20)

In [None]:
# Indicator columns that describe the internet subscription and its add-on services.
internet_feature_columns = [
    'internetservice_Fiber optic', 'internetservice_No',
    'onlinesecurity_No internet service', 'onlinesecurity_Yes',
    'onlinebackup_No internet service', 'onlinebackup_Yes',
    'deviceprotection_No internet service', 'deviceprotection_Yes',
    'techsupport_No internet service', 'techsupport_Yes',
    'streamingtv_No internet service', 'streamingtv_Yes',
    'streamingmovies_No internet service', 'streamingmovies_Yes',
]

# Fold the indicators left to right: bitwise OR for "any flag set", addition for "how many set".
any_flag_set = df[internet_feature_columns[0]]
sum_all_combined = df[internet_feature_columns[0]]
for column in internet_feature_columns[1:]:
    any_flag_set = any_flag_set | df[column]
    sum_all_combined = sum_all_combined + df[column]

df['combined'] = any_flag_set
# 1 when more than 6 of the 14 indicators are set.
df['combined_majority'] = (sum_all_combined > 6).astype(int)

# The individual indicators are replaced by the two summary columns.
df.drop(internet_feature_columns, axis=1, inplace=True)
df.head(5)

In [None]:
# Per-column count of missing values after the feature engineering above.
df.isnull().sum()

In [None]:
# low-resolution categorization
def categorize_tenure_set1(tenure):
    """Map a tenure in months to one of four coarse buckets.

    Returns '0-12', '13-36', '37-60' or '60+'. Only upper bounds are tested, so a fractional
    tenure such as 12.5 lands in the next bucket ('13-36') rather than falling through every
    range check into '60+'. Values that compare false against every bound (e.g. NaN) still
    return '60+'.
    """
    if tenure <= 12:
        return '0-12'
    if tenure <= 36:
        return '13-36'
    if tenure <= 60:
        return '37-60'
    return '60+'

# high-resolution categorization
def categorize_tenure_set2(tenure):
    """Map a tenure in months to a six-month bucket ('0-6', '7-12', ..., '55-60', '60+').

    Buckets are matched by upper bound only, so fractional tenures (e.g. 6.5) land in the next
    bucket instead of falling through every range check into '60+'. Values that compare false
    against every bound (e.g. NaN) still return '60+'.
    """
    # (inclusive upper bound, label), in ascending order
    buckets = (
        (6, '0-6'), (12, '7-12'), (18, '13-18'), (24, '19-24'), (30, '25-30'),
        (36, '31-36'), (42, '37-42'), (48, '43-48'), (54, '49-54'), (60, '55-60'),
    )
    for upper_bound, label in buckets:
        if tenure <= upper_bound:
            return label
    return '60+'

# medium-resolution categorization
def categorize_tenure_set3(tenure):
    """Map a tenure in months to a ten-month bucket ('0-10', '11-20', ..., '51-60', '60+').

    Buckets are matched by upper bound only, so fractional tenures (e.g. 10.5) land in the next
    bucket instead of falling through every range check into '60+'. Values that compare false
    against every bound (e.g. NaN) still return '60+'.
    """
    # (inclusive upper bound, label), in ascending order
    buckets = (
        (10, '0-10'), (20, '11-20'), (30, '21-30'),
        (40, '31-40'), (50, '41-50'), (60, '51-60'),
    )
    for upper_bound, label in buckets:
        if tenure <= upper_bound:
            return label
    return '60+'

# Add one bucket column per resolution (low / high / medium).
tenure_categorizers = {
    'tenure_category_set1': categorize_tenure_set1,
    'tenure_category_set2': categorize_tenure_set2,
    'tenure_category_set3': categorize_tenure_set3,
}
for category_column, categorizer in tenure_categorizers.items():
    df[category_column] = df['tenure'].apply(categorizer)

# Bucket labels of each set, listed in chronological order.
categories_set1 = ['0-12', '13-36', '37-60', '60+']
categories_set2 = [
    '0-6', '7-12', '13-18', '19-24', '25-30', '31-36',
    '37-42', '43-48', '49-54', '55-60', '60+',
]
categories_set3 = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '60+']

In [None]:
def _mean_monthly_charges_by(tenure_column):
    """Mean monthly charge per (churn, tenure bucket), as a frame with a single 'mean' column."""
    return df.groupby(['churn', tenure_column])['monthlycharges'].agg(['mean'])


def _average_of_bucket_means(bucket_stats):
    """Unweighted average, per churn value, of the bucket means held in `bucket_stats`."""
    return bucket_stats.groupby('churn')[['mean']].agg('mean')


stats_set1 = _mean_monthly_charges_by('tenure_category_set1')
stats_set2 = _mean_monthly_charges_by('tenure_category_set2')
stats_set3 = _mean_monthly_charges_by('tenure_category_set3')

# Despite the avg_std_* names these hold averages of the bucket means, not standard deviations.
avg_std_set1 = _average_of_bucket_means(stats_set1)
avg_std_set2 = _average_of_bucket_means(stats_set2)
avg_std_set3 = _average_of_bucket_means(stats_set3)

In [None]:
# Print each bucket table followed by its per-churn average.
for table in (stats_set1, avg_std_set1, stats_set2, avg_std_set2, stats_set3, avg_std_set3):
    print(table)

In [None]:
# stats_set1 is indexed by (churn, tenure bucket). unstack() moves the tenure bucket into the
# columns, so the x axis shows churn (0 = stayed, 1 = churned) with one bar per tenure group.
unstacked = stats_set1['mean'].unstack()
# Show the buckets in chronological instead of alphabetical order.
unstacked = unstacked[[group for group in categories_set1 if group in unstacked.columns]]
ax = unstacked.plot(kind='bar', figsize=(12, 6))

ax.set_title('churn segmentation by avg monthlycharges (Set 1)')
# Label the axes by what is drawn: churn on the x axis, tenure groups in the legend.
ax.set_xlabel('churn')
ax.set_ylabel('avg monthlycharges')
ax.legend(title='tenure groups')
ax.grid(axis='y', linestyle='--')
plt.tight_layout()
plt.show()

In [None]:
# stats_set2 is indexed by (churn, tenure bucket). unstack() moves the tenure bucket into the
# columns, so the x axis shows churn (0 = stayed, 1 = churned) with one bar per tenure group.
unstacked = stats_set2['mean'].unstack()
# Show the buckets in chronological order; alphabetical sorting would put '7-12' last.
unstacked = unstacked[[group for group in categories_set2 if group in unstacked.columns]]
ax = unstacked.plot(kind='bar', figsize=(12, 6))

ax.set_title('churn segmentation by avg monthlycharges (Set 2)')
# Label the axes by what is drawn: churn on the x axis, tenure groups in the legend.
ax.set_xlabel('churn')
ax.set_ylabel('avg monthlycharges')
ax.legend(title='tenure groups')
ax.grid(axis='y', linestyle='--')
plt.tight_layout()
plt.show()

In [None]:
# stats_set3 is indexed by (churn, tenure bucket). unstack() moves the tenure bucket into the
# columns, so the x axis shows churn (0 = stayed, 1 = churned) with one bar per tenure group.
unstacked = stats_set3['mean'].unstack()
# Show the buckets in chronological instead of alphabetical order.
unstacked = unstacked[[group for group in categories_set3 if group in unstacked.columns]]
ax = unstacked.plot(kind='bar', figsize=(12, 6))

ax.set_title('churn segmentation by avg monthlycharges (Set 3)')
# Label the axes by what is drawn: churn on the x axis, tenure groups in the legend.
ax.set_xlabel('churn')
ax.set_ylabel('avg monthlycharges')
ax.legend(title='tenure groups')
ax.grid(axis='y', linestyle='--')
plt.tight_layout()
plt.show()

In [None]:
# Number of customers per (churn, tenure bucket); counting 'partner' just counts the rows of
# each group that have a partner value.
data_set2 = df.groupby(['churn','tenure_category_set2'])['partner'].agg(['count'])
# unstack() moves the tenure bucket into the columns: churn (0 = stayed, 1 = churned) ends up
# on the x axis and every tenure group becomes one bar / legend entry.
unstacked = data_set2['count'].unstack()
# Show the buckets in chronological order; alphabetical sorting would put '7-12' last.
unstacked = unstacked[[group for group in categories_set2 if group in unstacked.columns]]
ax = unstacked.plot(kind='bar', figsize=(12, 6))

ax.set_title('churn segmentation by amount of people in tenure group (Set 2)')
ax.set_xlabel('churn')
ax.set_ylabel('number of customers')
ax.legend(title='tenure')
ax.grid(axis='y', linestyle='--')
plt.tight_layout()
plt.show()

In [None]:
# Number of customers per (churn, tenure bucket); counting 'partner' just counts the rows of
# each group that have a partner value.
data_set1 = df.groupby(['churn','tenure_category_set1'])['partner'].agg(['count'])
# unstack() moves the tenure bucket into the columns: churn (0 = stayed, 1 = churned) ends up
# on the x axis and every tenure group becomes one bar / legend entry.
unstacked = data_set1['count'].unstack()
# Show the buckets in chronological instead of alphabetical order.
unstacked = unstacked[[group for group in categories_set1 if group in unstacked.columns]]
ax = unstacked.plot(kind='bar', figsize=(12, 6))

ax.set_title('churn segmentation by amount of people in tenure group (Set 1)')
ax.set_xlabel('churn')
ax.set_ylabel('number of customers')
ax.legend(title='tenure')
ax.grid(axis='y', linestyle='--')
plt.tight_layout()
plt.show()

In [None]:
# Number of customers per (churn, tenure bucket); counting 'partner' just counts the rows of
# each group that have a partner value.
data_set3 = df.groupby(['churn','tenure_category_set3'])['partner'].agg(['count'])
# unstack() moves the tenure bucket into the columns: churn (0 = stayed, 1 = churned) ends up
# on the x axis and every tenure group becomes one bar / legend entry.
unstacked = data_set3['count'].unstack()
# Show the buckets in chronological instead of alphabetical order.
unstacked = unstacked[[group for group in categories_set3 if group in unstacked.columns]]
ax = unstacked.plot(kind='bar', figsize=(12, 6))

ax.set_title('churn segmentation by amount of people in tenure group (Set 3)')
ax.set_xlabel('churn')
ax.set_ylabel('number of customers')
ax.legend(title='tenure')
ax.grid(axis='y', linestyle='--')
plt.tight_layout()
plt.show()

In [None]:
# correlation calculation over the numeric columns only
corr_matrix = df.corr(numeric_only=True)

# heatmap presentation of the correlation
fig_corr, ax_corr = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax_corr)
ax_corr.set_title("Correlation Matrix Heatmap")
plt.show()

In [None]:
# Absolute correlation of every numeric column with churn, strongest first.
# (The variable keeps its historical name; it holds correlations with churn, not totalcharges.)
churn_correlations = corr_matrix[['churn']].abs()
corr_totalcharges = churn_correlations.sort_values(by='churn', ascending=False)

fig_churn, ax_churn = plt.subplots(figsize=(6, 8))
sns.heatmap(corr_totalcharges, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax_churn)
ax_churn.set_title("Correlation with Churn")
plt.show()

In [None]:
# Total charges against tenure and against monthly charges, one figure each.
for x_column, x_label in (('tenure', 'Tenure'), ('monthlycharges', 'Monthly Charges')):
    fig_scatter, ax_scatter = plt.subplots(figsize=(8, 5))
    sns.scatterplot(data=df, x=x_column, y='totalcharges', alpha=0.5, ax=ax_scatter)
    ax_scatter.set_title(f"Total Charges vs {x_label}")
    ax_scatter.set_xlabel(x_label)
    ax_scatter.set_ylabel("Total Charges")
    ax_scatter.grid()
    plt.show()

In [None]:
#final preps
# The tenure bucket columns were only needed for the exploratory plots; drop them before modelling.
tenure_bucket_columns = ['tenure_category_set1', 'tenure_category_set2', 'tenure_category_set3']
df = df.drop(columns=tenure_bucket_columns)

# Put the columns in alphabetical order.
df = df.reindex(columns=sorted(df.columns))

<h1> ML </h1>

In [None]:
from sklearn.model_selection import train_test_split

label = 'churn'
cus_id = 'customerid'

# Hold out 10% of the rows (given as an absolute row count) for testing.
test_size = round(df.index.size * 0.1)
train, test = train_test_split(df, test_size=test_size, random_state=global_random_state, shuffle=True)

# Neither the target nor the customer id may be used as a feature.
non_feature_columns = [label, cus_id]

x_train = train.drop(columns=non_feature_columns).astype(float)
y_train = train[label].astype(float)
cus_id_train = train[cus_id]

x_test = test.drop(columns=non_feature_columns).astype(float)
y_test = test[label].astype(float)
cus_id_test = test[cus_id]

In [None]:
# Training split, still including the target and customer id columns.
train

In [None]:
# Held-out split, still including the target and customer id columns.
test

In [None]:
# Training features (target and customer id removed, cast to float).
x_train

In [None]:
# Test features (target and customer id removed, cast to float).
x_test

In [None]:
# Training target: churn as 0.0 / 1.0.
y_train

In [None]:
# Test target: churn as 0.0 / 1.0.
y_test

In [None]:
# Customer ids of the training rows.
cus_id_train

In [None]:
# Customer ids of the test rows.
cus_id_test

In [None]:
# Shapes of all six pieces, to check that features, targets and ids line up within each split.
x_train.shape, y_train.shape, cus_id_train.shape, x_test.shape, y_test.shape, cus_id_test.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Keep the unscaled frames: the tree plot further down reads the feature names from them.
original_x_train, original_x_test = x_train, x_test

# Learn the scaling on the training rows only, then apply the same scaling to both splits.
# NOTE(review): running this cell a second time re-binds original_x_* to the already scaled
# arrays, because x_train / x_test are overwritten below.
scaler = StandardScaler().fit(original_x_train)
x_train = scaler.transform(original_x_train)
x_test = scaler.transform(original_x_test)

In [None]:
# Training features as they were before scaling.
original_x_train

In [None]:
# Training features after scaling (the output of scaler.fit_transform).
x_train

In [None]:
# Plotting the decision tree
# Importing the necessary libraries
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from IPython.display import SVG
from graphviz import Source
from IPython.display import display

# Reusable helper for drawing decision trees inside the notebook.
def plot_tree(tree, features, labels):
    """Render a fitted decision tree inline as an SVG image.

    tree     -- fitted tree estimator, passed to export_graphviz
    features -- feature names, in the column order the tree was trained on
    labels   -- class names shown in the nodes
    """
    dot_source = export_graphviz(tree, feature_names=features, class_names=labels, filled=True)
    svg_image = Source(dot_source).pipe(format='svg')
    display(SVG(svg_image))

<h2> Decision Tree </h2>

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Seed the tree: scikit-learn permutes the features at every split, so without a fixed
# random_state equally good splits can be resolved differently and the accuracy can change
# from run to run.
model = DecisionTreeClassifier(max_depth=8, random_state=global_random_state)
model.fit(x_train, y_train)

y_test_pred_DecisionTree = model.predict(x_test)

# Side-by-side view of the actual and the predicted churn per customer.
output = pd.DataFrame({cus_id: cus_id_test, 'churned_what_actualy_happened':y_test, 'churned_predicted_by_model': y_test_pred_DecisionTree})

test_acc = accuracy_score(y_test, y_test_pred_DecisionTree)

print("Accuracy = ", test_acc)
output.head(10)

In [None]:
# Sweep max_depth and record the test accuracy of each tree.
# NOTE(review): the depth is chosen on the same test set that is later used to report accuracy,
# so the reported figure is optimistic - consider a validation split or cross-validation.
iterations = list(range(1,30))
test_acc = []
fitted_trees = []
for max_dept in iterations:
    # Seeded so the sweep gives the same curve on every run.
    candidate = DecisionTreeClassifier(max_depth=max_dept, random_state=global_random_state)
    candidate.fit(x_train, y_train)
    y_test_pred_DecisionTree = candidate.predict(x_test)
    test_acc.append(accuracy_score(y_test, y_test_pred_DecisionTree))
    fitted_trees.append(candidate)

plt.plot(iterations, test_acc)
plt.title('Accuracy as higher max dept')
plt.xlabel('max dept')
plt.ylabel('Accuracy')
plt.grid(linestyle='--')
plt.tight_layout()
plt.show()

# Best accuracy; on ties the shallowest tree wins (list.index returns the first match).
best_accuracy = max(test_acc)
best_position = test_acc.index(best_accuracy)
# Leave `model` bound to the best tree rather than to the deepest one fitted last, so the
# tree plot in the next cell shows the selected model.
model = fitted_trees[best_position]
print(f"maximum accuracy at max dept:{iterations[best_position]} with accuracy={best_accuracy}")

In [None]:
# Draw the tree currently bound to `model` (set by the depth sweep cell above); the feature
# names come from the unscaled training frame.
plot_tree(model, original_x_train.columns, ['Not Churned', 'Churned'])

<h2> Random Forest model </h2>

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Forest configured from the notebook-wide settings; fit() returns the fitted estimator itself.
forest_settings = dict(n_estimators=global_n_estimators, max_depth=global_max_depth, random_state=global_random_state)
model = RandomForestClassifier(**forest_settings).fit(x_train, y_train)

y_test_pred_RandomForest = model.predict(x_test)

# Side-by-side view of the actual and the predicted churn per customer.
output = pd.DataFrame({
    cus_id: cus_id_test,
    'churned_what_actualy_happened': y_test,
    'churned_predicted_by_model': y_test_pred_RandomForest,
})

test_acc = accuracy_score(y_test, y_test_pred_RandomForest)
print("Accuracy = ", test_acc)
output.head(10)

In [None]:
# Sweep n_estimators (max_depth stays at global_max_depth) and record the test accuracy.
# NOTE(review): the setting is chosen on the same test set that is later used to report
# accuracy, so the reported figure is optimistic.
iterations = list(range(1,50))
test_acc = []
for n in iterations:
    model = RandomForestClassifier(n_estimators=n, max_depth=global_max_depth, random_state=global_random_state)
    model.fit(x_train, y_train)
    y_test_pred_RandomForest = model.predict(x_test)
    test_acc.append(accuracy_score(y_test, y_test_pred_RandomForest))

plt.plot(iterations, test_acc)
plt.title('Accuracy as higher n_estimators')
plt.xlabel('n_estimators')
plt.ylabel('Accuracy')
plt.grid(linestyle='--')
plt.tight_layout()
plt.show()

# Best accuracy; on ties the smallest forest wins (list.index returns the first match).
best_accuracy = max(test_acc)
best_n_estimators = iterations[test_acc.index(best_accuracy)]
print(f"maximum accuracy at n_estimators:{best_n_estimators} with accuracy={best_accuracy}")

In [None]:
# Sweep max_depth (n_estimators stays at global_n_estimators) and record the test accuracy.
# NOTE(review): the setting is chosen on the same test set that is later used to report
# accuracy, so the reported figure is optimistic.
iterations = list(range(1,50))
test_acc = []
for n in iterations:
    model = RandomForestClassifier(n_estimators=global_n_estimators, max_depth=n, random_state=global_random_state)
    model.fit(x_train, y_train)
    y_test_pred_RandomForest = model.predict(x_test)
    test_acc.append(accuracy_score(y_test, y_test_pred_RandomForest))

plt.plot(iterations, test_acc)
plt.title('Accuracy as higher max_depth')
plt.xlabel('max_depth')
plt.ylabel('Accuracy')
plt.grid(linestyle='--')
plt.tight_layout()
plt.show()

# Best accuracy; on ties the shallowest forest wins (list.index returns the first match).
best_accuracy = max(test_acc)
best_max_depth = iterations[test_acc.index(best_accuracy)]
print(f"maximum accuracy at max_depth:{best_max_depth} with accuracy={best_accuracy}")

<h2> KNN - K Nearest Neighbors </h2>

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# k-nearest-neighbours on the scaled features; fit() returns the fitted estimator itself.
model = KNeighborsClassifier(n_neighbors=26).fit(x_train, y_train)

y_test_pred_KNN = model.predict(x_test)

# Side-by-side view of the actual and the predicted churn per customer.
output = pd.DataFrame({
    cus_id: cus_id_test,
    'churned_what_actualy_happened': y_test,
    'churned_predicted_by_model': y_test_pred_KNN,
})

test_acc = accuracy_score(y_test, y_test_pred_KNN)
print("Accuracy = ", test_acc)
output.head(10)

In [None]:
# Sweep n_neighbors and record the test accuracy of each classifier.
# NOTE(review): the setting is chosen on the same test set that is later used to report
# accuracy, so the reported figure is optimistic.
iterations = list(range(1,50))
test_acc = []
for n in iterations:
    model = KNeighborsClassifier(n_neighbors=n)
    model.fit(x_train, y_train)
    y_test_pred_KNN = model.predict(x_test)
    test_acc.append(accuracy_score(y_test, y_test_pred_KNN))

plt.plot(iterations, test_acc)
plt.title('Accuracy as higher n_neighbors')
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.grid(linestyle='--')
plt.tight_layout()
plt.show()

# Best accuracy; on ties the smallest n_neighbors wins (list.index returns the first match).
best_accuracy = max(test_acc)
best_n_neighbors = iterations[test_acc.index(best_accuracy)]
print(f"maximum accuracy at n_neighbors:{best_n_neighbors} with accuracy={best_accuracy}")

In [None]:
import numpy as np
def get_benchmark_predictions(x, benchmark_value):
    """Return constant predictions: a NumPy array of len(x) entries, all equal to benchmark_value."""
    sample_count = len(x)
    return benchmark_value * np.ones(sample_count)

In [None]:
# Baseline: predict 0 ("no churn") for every test customer and measure how often that is right.
benchmark_value = 0

y_test_pred_Benchmark_0 = get_benchmark_predictions(x_test, benchmark_value)
test_acc = accuracy_score(y_true=y_test, y_pred=y_test_pred_Benchmark_0)
print("Accuracy = ", test_acc)

<h2> Summary of the results </h2>
<b>Decision Tree</b>
Maximum accuracy at max depth 6, with accuracy = 0.7802275960170697

<b>Random Forest</b>
Maximum accuracy at <b>n_estimators = 24 and max depth = 9, with accuracy = 0.8001422475106685</b>

<b>KNN</b>
maximum accuracy at n_neighbors:39 with accuracy=0.7837837837837838

<b>We chose Random Forest with 24 estimators and a max depth of 9.</b>

<h2>Build the model for deployment</h2>
Now we train the model on the entire dataset and save it for deployment.


In [None]:
from sklearn.model_selection import train_test_split

label = 'churn'
cus_id = 'customerid'

# Same 10% hold-out as in the modelling section, this time leaving the features unscaled.
# NOTE(review): y_train / y_test are reassigned by the final evaluation cell below - confirm
# whether this split is still needed.
test_size = round(df.index.size * 0.1)
train, test = train_test_split(df, test_size=test_size, random_state=global_random_state, shuffle=True)

non_feature_columns = [label, cus_id]

x_train = train.drop(columns=non_feature_columns)
y_train = train[label]
cus_id_train = train[cus_id]

x_test = test.drop(columns=non_feature_columns)
y_test = test[label]
cus_id_test = test[cus_id]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Start the deployment build from the raw table kept when the CSV was loaded.
customers = original_df

print(customers.shape)
customers.head(5)

In [None]:
def create_dummies(data_frame, column_name):
    """Return a copy of `data_frame` with `column_name` replaced by integer indicator columns.

    Same definition as in the data-preparation section of this notebook. Indicator columns are
    named '<column_name>_<category>'; the first category (in sorted order) is dropped, so k
    categories produce k-1 columns.
    """
    encoding_options = dict(columns=[column_name], prefix=column_name, drop_first=True, dtype=int)
    return pd.get_dummies(data_frame, **encoding_options)

def prep(df_features):
    """Turn the raw customers table into model-ready features.

    Repeats the preparation done step by step in the exploratory cells: lower-case column
    names, numeric totalcharges with missing values estimated as monthlycharges * tenure,
    price-change flags, 0/1 encoding of the two-valued columns, indicator columns for the
    multi-valued ones, and two summary flags replacing the internet-service indicators.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Raw customers table; column names may use any casing. Must contain every column
        referenced below, including 'churn' and 'customerid'. Every internet-related category
        has to be present in the data, otherwise the expected indicator columns are missing
        and a KeyError is raised.

    Returns
    -------
    X : pandas.DataFrame
        Float features with alphabetically ordered columns (no 'customerid', no 'churn').
    y : pandas.Series
        Target as float: 0.0 for 'No', 1.0 for 'Yes'.
    customer_id : pandas.Series
        The 'customerid' column, row-aligned with X and y.

    Raises
    ------
    ValueError
        If a two-valued column contains a label that has no encoding.
    """
    # basic preparations of the data; rename() returns a new frame, so the caller's table is
    # left untouched by the assignments below
    df_features = df_features.rename(columns=str.lower)

    # Convert to numeric values. Going through astype(str) makes this work both for the raw
    # text column (with blank entries) and for a column that is already numeric.
    total_charges = pd.to_numeric(df_features['totalcharges'].astype(str).str.strip(), errors='coerce')
    expected_charges = df_features['monthlycharges'] * df_features['tenure']

    # Replace the n/a with an estimation of the total charge
    df_features['totalcharges'] = total_charges.fillna(expected_charges)

    df_features['charge_diff'] = df_features['totalcharges'] - expected_charges
    # if the number is negative we assume the price raised during the tenure
    # if the number is 0 there was no change in price during the tenure
    # if the number is positive we assume the price dropped during the tenure
    df_features['price_remain'] = (df_features['charge_diff'] == 0).astype(int)
    df_features['price_raised'] = (df_features['charge_diff'] < 0).astype(int)
    df_features['price_dropped'] = (df_features['charge_diff'] > 0).astype(int)

    # Two-valued text columns and the integer code of each label.
    binary_encodings = {
        'churn': {'No': 0, 'Yes': 1},
        'gender': {'Male': 0, 'Female': 1},
        'partner': {'No': 0, 'Yes': 1},
        'dependents': {'No': 0, 'Yes': 1},
        'phoneservice': {'No': 0, 'Yes': 1},
        'paperlessbilling': {'No': 0, 'Yes': 1},
    }
    for column, encoding in binary_encodings.items():
        # map() builds a new numeric column instead of writing integers into a text column;
        # a label without an encoding becomes NaN, which makes astype(int) raise.
        df_features[column] = df_features[column].map(encoding).astype(int)

    # Multi-valued categorical columns become indicator columns (first category dropped).
    categorical_columns = (
        'contract', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup',
        'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'paymentmethod',
    )
    for column in categorical_columns:
        df_features = create_dummies(df_features, column)

    # Indicator columns that describe the internet subscription and its add-on services.
    internet_columns = [
        'internetservice_Fiber optic', 'internetservice_No',
        'onlinesecurity_No internet service', 'onlinesecurity_Yes',
        'onlinebackup_No internet service', 'onlinebackup_Yes',
        'deviceprotection_No internet service', 'deviceprotection_Yes',
        'techsupport_No internet service', 'techsupport_Yes',
        'streamingtv_No internet service', 'streamingtv_Yes',
        'streamingmovies_No internet service', 'streamingmovies_Yes',
    ]
    internet_flags = df_features[internet_columns]
    # 1 when at least one indicator is set.
    df_features['combined'] = internet_flags.any(axis=1).astype(int)
    # 1 when more than 6 of the 14 indicators are set.
    df_features['combined_majority'] = (internet_flags.sum(axis=1) > 6).astype(int)
    df_features = df_features.drop(columns=internet_columns)

    customer_id = df_features['customerid']

    # Save the target variable; converted to float because some modules warn against other types
    y = df_features['churn'].astype(float)

    # Drop the customer id and the churn target from the features
    X = df_features.drop(columns=['customerid', 'churn'])

    # Alphabetical column order, all values as float
    X = X.reindex(sorted(X.columns), axis=1).astype(float)

    return X, y, customer_id

In [None]:
# Build features, target and ids from the raw table, then show all three.
X, y, customer_id = prep(customers)

display(X, y, customer_id)

In [None]:
# Sanity check: per-column NaN count of the prepared features (expected to be all zeros).
X.isna().sum()

In [None]:
# Sanity check: every feature column should be float, as prep() casts X with astype(float).
X.dtypes

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Define the pipeline: scaling and the forest travel together, so the saved artefact applies
# the same preprocessing at prediction time.
final_classifier = RandomForestClassifier(
    n_estimators=global_n_estimators,
    max_depth=global_max_depth,
    random_state=global_random_state,
)
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', final_classifier),
])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split into training and test sets.
# The hold-out size is derived from X itself (10% of its rows) rather than from the
# exploratory frame `df`, so this cell only depends on the data it actually splits.
test_size = round(len(X) * 0.1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=global_random_state, shuffle=True)

# Train the pipeline
pipeline.fit(X_train, y_train)

# Predictions
y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Final Model Accuracy:", accuracy)

In [None]:
# Train the pipeline on the entire dataset (train + test) for deployment.
# After this refit no held-out rows remain, so the accuracy printed by the previous cell is the
# only performance estimate available for the saved model.
pipeline.fit(X, y)

In [None]:
import joblib

# Save the pipeline using joblib.
# The file is written to the current working directory and overwrites an existing file of the
# same name. joblib files are pickle-based: only load them from a trusted source.
filename = "customer_final_pipeline.pkl"
joblib.dump(pipeline, filename)

print(f"Pipeline saved as {filename}")