# 5) Data Preprocessing and Model Training

In [None]:
# importing package- pandas, numpy, matplotlib, seaborn and warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import warnings

Models used :
1. Linear Regression
2. Lasso
3. Ridge
4. K Neighbors Regressor
5. Decision Tree
6. Random Forest Regressor
7. XG Boost Regressor
8. Catboosting Regressor
9. Adaboost Regressor

In [None]:
# Modelling packages
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV

Reading excel data as pandas dataframe

In [None]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.5


In [7]:
df = pd.read_excel('customer_churn_large_dataset.xlsx')
df.head()

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0


Preparing X and Y variable

In [None]:
X = df.iloc[:, 2:8]
X.head()

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB
0,63,Male,Los Angeles,17,73.36,236
1,62,Female,New York,1,48.76,172
2,24,Female,Los Angeles,5,85.47,460
3,36,Female,Miami,3,97.94,297
4,46,Female,Miami,19,58.14,266


In [None]:
print("Categories in 'Gender' variable:    ",end=" " )
print(df['Gender'].unique())

print("Categories in 'Location' variable:  ",end=" ")
print(df['Location'].unique())

Categories in 'Gender' variable:     ['Male' 'Female']
Categories in 'Location' variable:   ['Los Angeles' 'New York' 'Miami' 'Chicago' 'Houston']


In [None]:
Y = df.iloc[:, 8:9]
Y.head()

Unnamed: 0,Churn
0,0
1,0
2,0
3,1
4,0


In [None]:
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
cat_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", cat_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features)
    ]
)

In [None]:
X = preprocessor.fit_transform(X)

In [None]:
X.shape

(100000, 11)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
X_train.shape, X_test.shape

((80000, 11), (20000, 11))

Metrics Evaluation Function

In [None]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [None]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, Y_train)             #train the model

    #make predictions
    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)

    #evaluating model
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(Y_train, Y_train_pred)
    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(Y_test, Y_test_pred)

    #pritning the results
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('-'*35)

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')




Linear Regression
Model performance for Training set
- Root Mean Squared Error: 0.5000
- Mean Absolute Error: 0.4999
- R2 Score: 0.0001
-----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5000
- Mean Absolute Error: 0.5000
- R2 Score: -0.0002


Lasso
Model performance for Training set
- Root Mean Squared Error: 0.5000
- Mean Absolute Error: 0.5000
- R2 Score: 0.0000
-----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5000
- Mean Absolute Error: 0.5000
- R2 Score: -0.0000


Ridge
Model performance for Training set
- Root Mean Squared Error: 0.5000
- Mean Absolute Error: 0.4999
- R2 Score: 0.0001
-----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5000
- Mean Absolute Error: 0.5000
- R2 Score: -0.0001


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 0.4472
- Mean Absolute Error: 0.3997
- R2 Score: 0.1999
-----------------

  return fit_method(estimator, *args, **kwargs)


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 0.1915
- Mean Absolute Error: 0.1839
- R2 Score: 0.8533
-----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5125
- Mean Absolute Error: 0.5001
- R2 Score: -0.0508




  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


XGBRegressor
Model performance for Training set
- Root Mean Squared Error: 0.4681
- Mean Absolute Error: 0.4603
- R2 Score: 0.1237
-----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5081
- Mean Absolute Error: 0.5003
- R2 Score: -0.0325


CatBoosting Regressor
Model performance for Training set
- Root Mean Squared Error: 0.4833
- Mean Absolute Error: 0.4803
- R2 Score: 0.0657
-----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5033
- Mean Absolute Error: 0.5006
- R2 Score: -0.0135


AdaBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 0.5001
- Mean Absolute Error: 0.4998
- R2 Score: -0.0004
-----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5004
- Mean Absolute Error: 0.5001
- R2 Score: -0.0016




  y = column_or_1d(y, warn=True)


In [None]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)


Unnamed: 0,Model Name,R2_Score
1,Lasso,-1.9e-05
2,Ridge,-0.000104
0,Linear Regression,-0.000211
8,AdaBoost Regressor,-0.001564
7,CatBoosting Regressor,-0.013473
6,XGBRegressor,-0.032525
5,Random Forest Regressor,-0.050777
3,K-Neighbors Regressor,-0.199443
4,Decision Tree,-1.001525
