## Data Import

In [None]:
### Packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
### Data loading

# Read the four Telco churn workbooks, in this order: demographics, location,
# services, status. Paths are relative to the notebook's working directory.
df_demographics, df_location, df_services, df_status = map(
    pd.read_excel,
    [
        'Telco_customer_churn_demographics.xlsx',
        'Telco_customer_churn_location.xlsx',
        'Telco_customer_churn_services.xlsx',
        'Telco_customer_churn_status.xlsx',
    ],
)

In [None]:
### Merge function

def safe_merge(df_left, df_right, key='Customer ID'):
    """Outer-join two frames on ``key``, keeping the left copy of shared columns.

    Every column other than ``key`` that exists in both frames is removed from
    ``df_right`` before merging, so the result contains no ``_x`` / ``_y``
    suffixed duplicates. Neither input frame is modified.

    Parameters
    ----------
    df_left, df_right : pandas.DataFrame
        Frames to combine; both must contain ``key``.
    key : str, default 'Customer ID'
        Name of the join column.

    Returns
    -------
    pandas.DataFrame
        Outer merge of ``df_left`` with the de-duplicated ``df_right``.

    Raises
    ------
    KeyError
        If ``key`` is missing from either frame.
    """
    for side, frame in (('left', df_left), ('right', df_right)):
        if key not in frame.columns:
            raise KeyError(f"{key!r} not found in {side} frame")

    # Built as an ordered list (not a set expression) so the intent does not
    # hinge on `-` binding tighter than `&`, and the drop order is deterministic.
    overlap = [c for c in df_right.columns if c != key and c in df_left.columns]
    return df_left.merge(df_right.drop(columns=overlap), on=key, how='outer')

# Fold the remaining tables into the demographics frame one at a time,
# in the order location -> services -> status.
df_merged = df_demographics
for df_next in (df_location, df_services, df_status):
    df_merged = safe_merge(df_merged, df_next)

In [None]:
### Merged data

# Print a summary of the merged frame: index range, column names,
# non-null counts and dtypes.
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 55 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Customer ID                        7043 non-null   object 
 1   Count                              7043 non-null   int64  
 2   Gender                             7043 non-null   object 
 3   Age                                7043 non-null   int64  
 4   Under 30                           7043 non-null   object 
 5   Senior Citizen                     7043 non-null   object 
 6   Married                            7043 non-null   object 
 7   Dependents                         7043 non-null   object 
 8   Number of Dependents               7043 non-null   int64  
 9   Location ID                        7043 non-null   object 
 10  Country                            7043 non-null   object 
 11  State                              7043 non-null   objec

In [None]:
## Checking duplicates

# Count rows that exactly repeat an earlier row across *all* columns.
# This does not check whether 'Customer ID' alone is unique.
df_merged.duplicated().sum()

np.int64(0)

In [None]:
## Target distribution

# Share of each 'Churn Label' value (proportions rather than raw counts).
df_merged['Churn Label'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Churn Label,Unnamed: 1_level_1
No,0.73463
Yes,0.26537


In [None]:
## Checking nulls

# Number of missing values in each column.
df_merged.isna().sum()

Unnamed: 0,0
Customer ID,0
Count,0
Gender,0
Age,0
Under 30,0
Senior Citizen,0
Married,0
Dependents,0
Number of Dependents,0
Location ID,0


In [None]:
### Filling null values

# Replace missing entries in these four columns with the literal
# category 'No', one column at a time.
for col in ('Offer', 'Internet Type', 'Churn Category', 'Churn Reason'):
    df_merged[col] = df_merged[col].fillna('No')

In [None]:
### Dropping columns

# Columns removed from the merged frame before encoding.
# NOTE(review): 'Churn Category' and 'Churn Reason' are not listed here, so
# they stay in the frame; presumably they are only populated for churned
# customers, which would leak the target -- confirm before modelling.
cols_to_drop = [
    'Customer ID', 'Count', 'Under 30', 'Dependents',
    'Location ID', 'Country', 'State', 'Lat Long', 'City',
    'Zip Code', 'Latitude', 'Longitude', 'Service ID',
    'Quarter', 'Referred a Friend', 'Internet Service',
    'Status ID', 'Customer Status', 'Churn Label',
]

# Rebinds df_merged; re-running this cell after it has succeeded raises
# KeyError because the listed columns are already gone.
df_merged = df_merged.drop(cols_to_drop, axis='columns')

In [None]:
### Official data

# Summary of the frame after the column drop: remaining columns,
# non-null counts and dtypes.
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 36 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             7043 non-null   object 
 1   Age                                7043 non-null   int64  
 2   Senior Citizen                     7043 non-null   object 
 3   Married                            7043 non-null   object 
 4   Number of Dependents               7043 non-null   int64  
 5   Number of Referrals                7043 non-null   int64  
 6   Tenure in Months                   7043 non-null   int64  
 7   Offer                              7043 non-null   object 
 8   Phone Service                      7043 non-null   object 
 9   Avg Monthly Long Distance Charges  7043 non-null   float64
 10  Multiple Lines                     7043 non-null   object 
 11  Internet Type                      7043 non-null   objec

In [None]:
### Copied Data

# Take a copy so that the encoding steps that follow start from df_copied
# rather than from df_merged itself.
df_copied = df_merged.copy()

## Data Manipulation

### One-hot Encoding

In [None]:
### One-hot Encoding

# Expand every object-dtype column into indicator columns. drop_first=True
# omits the first level of each column, so k categories yield k-1 indicators.
df_encoded_v1 = pd.get_dummies(
    df_copied,
    columns=list(df_copied.select_dtypes(include='object')),
    drop_first=True,
)

df_encoded_v1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 67 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   Age                                                     7043 non-null   int64  
 1   Number of Dependents                                    7043 non-null   int64  
 2   Number of Referrals                                     7043 non-null   int64  
 3   Tenure in Months                                        7043 non-null   int64  
 4   Avg Monthly Long Distance Charges                       7043 non-null   float64
 5   Avg Monthly GB Download                                 7043 non-null   int64  
 6   Monthly Charge                                          7043 non-null   float64
 7   Total Charges                                           7043 non-null   float64
 8   Total Refunds                         

### Label Encoding

In [None]:
### Label Encoding

from sklearn.preprocessing import LabelEncoder

df_encoded_v2 = df_copied.copy()

# Keep one fitted encoder per column, keyed by column name, so each integer
# code can be mapped back to its category (inverse_transform) or re-applied
# to new data. Re-fitting a single shared encoder only retains the mapping
# of the last column processed.
label_encoders = {}
le = LabelEncoder()  # stays defined even when there are no object columns

for col in df_encoded_v2.select_dtypes(include='object'):
    le = LabelEncoder()
    df_encoded_v2[col] = le.fit_transform(df_encoded_v2[col])
    label_encoders[col] = le

# After the loop `le` is the encoder of the last object column, as before.
# NOTE(review): scikit-learn documents LabelEncoder for target labels; the
# integer codes impose an arbitrary order on feature categories. Consider
# OrdinalEncoder or the one-hot frame for order-sensitive models.
df_encoded_v2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 36 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             7043 non-null   int64  
 1   Age                                7043 non-null   int64  
 2   Senior Citizen                     7043 non-null   int64  
 3   Married                            7043 non-null   int64  
 4   Number of Dependents               7043 non-null   int64  
 5   Number of Referrals                7043 non-null   int64  
 6   Tenure in Months                   7043 non-null   int64  
 7   Offer                              7043 non-null   int64  
 8   Phone Service                      7043 non-null   int64  
 9   Avg Monthly Long Distance Charges  7043 non-null   float64
 10  Multiple Lines                     7043 non-null   int64  
 11  Internet Type                      7043 non-null   int64