In [7]:
# import libaries 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from scipy import stats

In [8]:
# loaded dataset 
df = pd.read_csv("Churn.csv")
print("Dataset Loaded Successfully\n")
print(df.head())

Dataset Loaded Successfully

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV Streami

In [9]:
# basic information & missing values 
print("\n--- Dataset Info ---")
print(df.info())
print("\n--- Missing Values ---")
print(df.isnull().sum())


--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  70

In [11]:
# Convert TotalCharges to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Impute missing numeric values with median
imputer = SimpleImputer(strategy='median')
df['TotalCharges'] = imputer.fit_transform(df[['TotalCharges']])

# Check missing values again
print("\nMissing Values After Imputation:\n", df.isnull().sum())


Missing Values After Imputation:
 customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [14]:
# outlier & detection & removal 
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
z_scores = np.abs(stats.zscore(df[numeric_cols]))
df = df[(z_scores < 3).all(axis=1)]
print(f"\n✅ Outliers Removed — New Shape: {df.shape}")



✅ Outliers Removed — New Shape: (7043, 21)


In [15]:
# encoding target variable first 
le = LabelEncoder()
df['Churn'] = le.fit_transform(df['Churn'])  # Yes=1, No=0

# Binary encoding (Yes/No) for other columns
for col in df.columns:
    if df[col].dtype == 'object' and df[col].nunique() == 2:
        df[col] = le.fit_transform(df[col])

# One-hot encode remaining categorical features
df = pd.get_dummies(df, drop_first=True)
print("\n✅ Encoding Completed — New Shape:", df.shape)


✅ Encoding Completed — New Shape: (7043, 7073)


In [16]:
# feature scaling 
scaler = StandardScaler()
num_cols = df.select_dtypes(include=np.number).columns
df[num_cols] = scaler.fit_transform(df[num_cols])
print("\n✅ Feature Scaling Done")


✅ Feature Scaling Done


In [17]:
# feature engineering 
service_cols = [col for col in df.columns if 'Yes' in col or 'InternetService' in col]
if len(service_cols) > 0:
    df['TotalServices'] = df[service_cols].sum(axis=1)

In [18]:
# average monthly charge 
df['AvgMonthlyCharge'] = df['TotalCharges'] / (df['tenure'] + 1)
df['AvgMonthlyCharge'] = df['AvgMonthlyCharge'].fillna(0)

print("\n✅ New Features Created Successfully")


✅ New Features Created Successfully


In [19]:
# train_test_split 
X = df.drop('Churn', axis=1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\n✅ Data Split Successful")
print("Train Shape:", X_train.shape, " | Test Shape:", X_test.shape)


✅ Data Split Successful
Train Shape: (5634, 7074)  | Test Shape: (1409, 7074)


In [20]:
# final check 
print("\n Data cleaning + feature engineering done suceesfully - no errors found!")


 Data cleaning + feature engineering done suceesfully - no errors found!
