<a href="https://colab.research.google.com/github/thamadhi/telco-customer-churn-CM2604/blob/main/Notebooks/Preprocessing_Task_2_Scaled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/thamadhi/telco-customer-churn-CM2604.git


Cloning into 'telco-customer-churn-CM2604'...
remote: Enumerating objects: 100, done.[K
remote: Counting objects: 100% (100/100), done.[K
remote: Compressing objects: 100% (97/97), done.[K
remote: Total 100 (delta 53), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (100/100), 2.77 MiB | 7.78 MiB/s, done.
Resolving deltas: 100% (53/53), done.


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

In [3]:
# Load data
data = 'telco-customer-churn-CM2604/data/Telco-Customer-Churn-unprocessed.csv'
df = pd.read_csv(data)

In [4]:
#remove empty strings
df = df[df['TotalCharges'] != ' ']
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])

In [5]:
# Convert Yes/No columns to 1/0
yes_no_cols = ['Partner','Dependents','PhoneService','MultipleLines','OnlineSecurity',
               'OnlineBackup','DeviceProtection','TechSupport','StreamingTV',
               'StreamingMovies','PaperlessBilling','Churn','gender']
for col in yes_no_cols:
    if col in df.columns:
        df[col] = df[col].replace({'Yes': 1, 'No': 0, 'Female': 1, 'Male': 0})

  df[col] = df[col].replace({'Yes': 1, 'No': 0, 'Female': 1, 'Male': 0})


In [6]:
# Fix the columns that still have these values
service_cols = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport', 'StreamingTV', 'StreamingMovies']

for col in service_cols:
    if col in df.columns:
        df[col] = df[col].replace(['No internet service', 'No phone service'], 0)

  df[col] = df[col].replace(['No internet service', 'No phone service'], 0)


In [7]:
#Drop customer ID
df = df.drop(columns=['customerID'])

In [8]:
# Separate X, y
X = df.drop('Churn', axis=1)
y = df['Churn']

In [9]:
# Identify true categorical columns (non-numeric, not already encoded)
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [10]:
# Get object columns and check data types
categorical_features = []
binary_features = []

for col in X.columns:
    if col not in numeric_features:
        # Check if column is object type (strings) or has more than 2 unique values
        if X[col].dtype == 'object':
            categorical_features.append(col)
        elif X[col].nunique() > 2:
            categorical_features.append(col)
        elif X[col].nunique() == 2:
            binary_features.append(col)

print(f"Numeric features: {numeric_features}")
print(f"Categorical features to encode: {categorical_features}")
print(f"Binary features (already encoded): {binary_features}")

Numeric features: ['tenure', 'MonthlyCharges', 'TotalCharges']
Categorical features to encode: ['InternetService', 'Contract', 'PaymentMethod']
Binary features (already encoded): ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling']


In [11]:
# Check data types in categorical features
print("\nChecking data types in categorical features:")
for col in categorical_features:
    print(f"{col}: dtype={X[col].dtype}, unique values={X[col].unique()[:5]}")


Checking data types in categorical features:
InternetService: dtype=object, unique values=['DSL' 'Fiber optic' 'No']
Contract: dtype=object, unique values=['Month-to-month' 'One year' 'Two year']
PaymentMethod: dtype=object, unique values=['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']


In [12]:
#Preprocessor: one-hot for categoricals, scale numerics
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

if categorical_features:
    transformers = [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
else:
    transformers = [
        ('num', numeric_transformer, numeric_features)
    ]

preprocessor = ColumnTransformer(
    transformers=transformers,
    remainder='passthrough',
    verbose_feature_names_out=False
)

In [13]:
# Fit and transform
preprocessor.fit(X)
X_processed = preprocessor.transform(X)

In [14]:
# Get feature names
feature_names = preprocessor.get_feature_names_out()
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)
X_processed_df['Churn'] = y.values


In [15]:

#this file is saved in the git repo under data folder
X_processed_df.to_csv('telco-customer-churn-Scale-PROCESSED.csv', index=False)

from google.colab import files
files.download('telco-customer-churn-Scale-PROCESSED.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>