## 1. Load Data and Libraries

In [None]:
%matplotlib inline
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the telecom customer churn dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists. Consider a project-relative path.
csv_path = '/Users/tamielnaicker/Desktop/Telecom Customer Churn/telecom_customer_churn.csv'
data = pd.read_csv(csv_path)

# Preview the first rows (rendered as the cell output).
data.head()

## 2. All-In-One EDA (Pandas Profiling)

In [None]:
# Build a pandas-profiling report over the full dataset with the
# explorative option switched on, then embed it in the notebook output.
report_options = {"explorative": True}
profile = ProfileReport(data, **report_options)
profile.to_notebook_iframe()

In [None]:
# Inspect the dtype of every column to see which ones may need encoding
# before modelling.
data_types = data.dtypes  # Series mapping column name -> dtype
print(data_types)


## 3. Exploratory Data Analysis (EDA)

In [None]:
# Visualise the distribution of the target variable ('Customer Status'):
# one bar per status value, bar height = number of customers.
plt.figure(figsize=(8, 6))
sns.countplot(x='Customer Status', data=data)
plt.title('Distribution of Customer Status')  # title typo fixed ('Distribtuion')
plt.xlabel('Customer Status')
plt.ylabel('Count')
plt.show()

In [None]:
# Explore correlations between numerical features.
plt.figure(figsize=(12, 8))

# Restrict to numeric/bool columns before computing correlations. Calling
# DataFrame.corr() directly on a frame that still contains string columns
# raises ValueError on pandas >= 2.0 (older versions silently dropped them).
# Selecting the columns explicitly behaves the same on every pandas version.
numeric_data = data.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_data.corr()

# Annotated heatmap: each cell shows the pairwise correlation to two decimals.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Explore relationships between the most strongly correlated feature pairs.
# Each entry is (x column, y column); the trailing comment is the correlation
# noted for that pair in the original analysis.
scatter_pairs = [
    ('Total Charges', 'Total Revenue'),                  # 1. 0.97
    ('Tenure in Months', 'Total Revenue'),               # 2. 0.85
    ('Total Charges', 'Tenure in Months'),               # 3. 0.83
    ('Total Long Distance Charges', 'Total Revenue'),    # 4. 0.78
    ('Total Long Distance Charges', 'Tenure in Months'), # 5. 0.67
]

for x_col, y_col in scatter_pairs:
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=data, x=x_col, y=y_col)
    plt.title(f'Scatter Plot: {x_col} vs {y_col}')
    # plt.show must be *called*; the bare name `plt.show` is a no-op
    # expression and never flushes the figure.
    plt.show()


## 5. Data Splitting

In [None]:
# Build the feature matrix (X) and target vector (y), then split 80/20.

# Target: the customer's status label.
y = data['Customer Status']

# Features: every int64/float64 column except the target, plus the two
# categorical columns 'Gender' and 'Married' (appended in that order).
X = (
    data.drop(columns='Customer Status')
    .select_dtypes(include=['int64', 'float64'])
    .assign(Gender=data['Gender'], Married=data['Married'])
)

# Hold out 20% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the sizes of the two partitions.
print("Training set size: ", len(X_train))
print("Testing set size: ", len(X_test))

## 6. Feature Encoding

In [None]:
# Categorical features that are label-encoded to integers.
binary_features = ['Gender', 'Married']
# NOTE: 'Payment Method' and 'Churn Category' were once listed here as
# multi-class features but are not encoded by this cell.

# One encoder object is reused: it is re-fitted on the training column of
# each feature, then applied to both the train and test columns, so the
# test set never influences the learned mapping.
label_encoder = LabelEncoder()
for feature in binary_features:
    label_encoder.fit(X_train[feature])
    X_train[feature] = label_encoder.transform(X_train[feature])
    X_test[feature] = label_encoder.transform(X_test[feature])

## 7. Target Encoding

In [None]:
# One-hot encode the target ('Customer Status') into a dense array.
# scikit-learn renamed the `sparse` argument to `sparse_output` in 1.2 and
# removed `sparse` in 1.4, so try the new name first and fall back to the
# old one on older installations.
try:
    encoder_y = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    encoder_y = OneHotEncoder(sparse=False)

# Fit on the training labels only; reshape to the 2-D (n_samples, 1) layout
# the encoder expects.
y_train_encoded = encoder_y.fit_transform(y_train.values.reshape(-1, 1))

# Apply the same fitted encoder to the test labels.
# NOTE: `handle_unknown` is left at its default, so a label that did not
# appear in the training split raises an error rather than being ignored.
y_test_encoded = encoder_y.transform(y_test.values.reshape(-1, 1))


## 8. Scale Data

In [None]:
# Standardise the numeric features.
from sklearn.preprocessing import StandardScaler

# Columns to scale are taken from X's int64/float64 columns, so any column
# that is not numeric in X is left out of scaling.
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns

# Learn the scaling parameters from the training split only.
scaler = StandardScaler()
scaler.fit(X_train[numeric_features])

# Work on copies so X_train / X_test keep their unscaled values.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[numeric_features] = scaler.transform(X_train[numeric_features])
X_test_scaled[numeric_features] = scaler.transform(X_test[numeric_features])

In [None]:
# Report which training columns contain missing values, and how many.
missing_values = X_train_scaled.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

# Fill the gaps in 'Avg Monthly Long Distance Charges' and
# 'Avg Monthly GB Download' with the column mean. The means are learned from
# the training split and reused for the test split.
columns_to_impute = ["Avg Monthly Long Distance Charges", "Avg Monthly GB Download"]
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train_scaled[columns_to_impute])

X_train_scaled[columns_to_impute] = imputer.transform(X_train_scaled[columns_to_impute])
X_test_scaled[columns_to_impute] = imputer.transform(X_test_scaled[columns_to_impute])

## 9. Build Model
Random Forest is a robust ensemble learning algorithm that works well for classification tasks such as churn prediction.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters for the forest: 100 trees, depth capped at 10, a node may
# be split once it holds at least 2 samples, fixed seed for reproducibility.
rf_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "min_samples_split": 2,
    "random_state": 42,
}

# Initialise the model.
rf_model = RandomForestClassifier(**rf_params)

# Train on the scaled/imputed training features and the raw string labels.
rf_model.fit(X_train_scaled, y_train)

## 10. Performance Metrics

In [None]:
# Predict the labels on the test set.
y_pred = rf_model.predict(X_test_scaled)

# Overall accuracy: fraction of test rows whose predicted label matches.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", accuracy)

# Confusion matrix (rows = true labels, columns = predicted labels).
# Printed on its own lines: the array is multi-line, so putting it on the
# same line as the label shifts the first row out of alignment.
conf_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", conf_mat, sep="\n")

# Per-class precision / recall / F1. The report is a pre-formatted table, so
# it must start at column 0 for its header to line up with the rows.
class_report = classification_report(y_test, y_pred)
print("Classification Report:", class_report, sep="\n")