In [1]:
#Importing required libraries

import pandas as pd
import numpy as np

# Load the training data from train.csv (path is relative to the working directory)
train_df = pd.read_csv("train.csv")

### **Data Preprocessing:** *Encoding Categorical variables*
> *Machine learning models prefer numerical data. Categorical data, often in text format, needs conversion to numerical form for compatibility. This allows the model to understand and process the data for effective learning.*

In [22]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, Normalizer
import pandas as pd

# Categorical columns to be one-hot encoded
categorical_columns = [
    'PaymentMethod', 'PaperlessBilling',
    'ContentType', 'MultiDeviceAccess', 'DeviceRegistered',
    'GenrePreference', 'Gender', 'ParentalControl', 'SubtitlesEnabled'
]

# One-hot encode the categorical columns.
# sparse_output=False makes the encoder return a dense array that can be
# wrapped directly in a DataFrame.
onehot_encoder = OneHotEncoder(sparse_output=False)
encoded_categorical = onehot_encoder.fit_transform(train_df[categorical_columns])

# Wrap the encoded array in a DataFrame with the generated feature names.
# index=train_df.index keeps every encoded row labelled like its source row:
# without it the frame gets a fresh 0..n-1 index, and the later
# pd.concat(..., axis=1) (which aligns on index) would misalign rows and
# introduce NaNs whenever train_df does not carry a default RangeIndex.
encoded_categorical_df = pd.DataFrame(
    encoded_categorical,
    columns=onehot_encoder.get_feature_names_out(),
    index=train_df.index,
)

In [23]:
# Subscription tiers, listed in the order the encoder should rank them
subscription_type_ordering = ['Basic', 'Standard', 'Premium']

# Each tier is mapped to its position in the list above (Basic=0, Standard=1, Premium=2)
ordinal_encoder = OrdinalEncoder(categories=[subscription_type_ordering])
train_df['SubscriptionTypeEncoded'] = ordinal_encoder.fit_transform(
    train_df[['SubscriptionType']]
)

# Remove the raw categorical columns (their encoded versions replace them),
# then attach the one-hot encoded columns side by side
train_df_new = train_df.drop(columns=[*categorical_columns, 'SubscriptionType'])
train_df_encoded = pd.concat([train_df_new, encoded_categorical_df], axis='columns')

### **Numerical Scaling (Data Transformation):** Normalization is used because the data does not follow a Gaussian distribution.
>  *Feature scaling in machine learning prevents features with larger values from dominating the model. This improves model training by creating a smoother learning process and reducing the risk of overfitting.*


In [24]:
# Numerical columns to be rescaled
numerical_columns = [
    'AccountAge',
    'MonthlyCharges',
    'TotalCharges',
    'ViewingHoursPerWeek',
    'AverageViewingDuration',
    'ContentDownloadsPerMonth',
    'UserRating',
    'SupportTicketsPerMonth',
    'WatchlistSize',
]

# NOTE(review): sklearn's Normalizer rescales each *row* (sample) to unit norm
# (L2 by default); it does not scale each column independently. If per-feature
# scaling was intended, MinMaxScaler or StandardScaler is the usual choice.
# Confirm intent, and keep any consumer of the exported data in sync before changing.
normalizer = Normalizer()

# Overwrite the numerical columns with their normalized values
train_df_encoded[numerical_columns] = normalizer.fit_transform(
    train_df_encoded[numerical_columns]
)

In [25]:
# Check the class balance of the target: rows (non-null AccountAge) per Churn value
(
    train_df
    .groupby('Churn')["AccountAge"]
    .count()
)

Churn
0    199605
1     44182
Name: AccountAge, dtype: int64

In [26]:
# CustomerID has no relevance to the target variable, so it is excluded
transformed_df = train_df_encoded.drop(columns=['CustomerID'])

In [27]:
# Split into the feature matrix X and the target vector Y.
# The target is selected by label instead of DataFrame.pop(): pop() removes
# 'Churn' from transformed_df in place, so re-running this cell would fail
# with a KeyError on the drop below.
X = transformed_df.drop(columns=['Churn'])
Y = transformed_df['Churn']

In [28]:
from imblearn.over_sampling import ADASYN

# Create the ADASYN oversampler; random_state fixes the seed so the resampling is reproducible
ada = ADASYN(random_state=42)

# Fit and resample the data
# NOTE(review): no train/test split is visible before this point. If evaluation
# data is later split from the resampled set, synthetic rows may leak into the
# test set; confirm how the exported data is consumed.
X_resampled, y_resampled = ada.fit_resample(X, Y)

# Count the rows in each class after resampling to verify the classes are now balanced
class_counts = y_resampled.value_counts()
print(class_counts)

Churn
0    199605
1    192261
Name: count, dtype: int64


In [29]:
# Rebuild a single frame from the resampled features and attach the resampled target
balanced_df = (
    pd.DataFrame(X_resampled, columns=X.columns)
    .assign(Churn=y_resampled)
)

In [34]:
import os
from pathlib import Path

# Destination folder for the balanced dataset. The default is the original
# machine-specific location, so behaviour is unchanged when nothing is set;
# set the CHURN_OUTPUT_DIR environment variable to run the notebook elsewhere.
output_dir = Path(
    os.environ.get(
        "CHURN_OUTPUT_DIR",
        "A:/DA_DS_BA/Projects/Churn Prediction Challenge/Churn_Streamlit",
    )
)

# index=False: the row index carries no information and is not written to the file
balanced_df.to_csv(output_dir / "balanced_df.csv", index=False)