# Modeling

### Import Libaries

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import cross_validate,cross_val_score, train_test_split, GridSearchCV, learning_curve
from sklearn.decomposition import PCA
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,accuracy_score, recall_score, precision_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import datetime
from sklearn.compose import ColumnTransformer
import time
from datetime import datetime

In [2]:
### Load Data

In [3]:
datafilepath = "../data/interim/train_test_split.pkl"

with open(datafilepath, 'rb') as file:
    X,y,X_train, X_test, y_train, y_test = pickle.load(file)

In [4]:
X_train

Unnamed: 0,CustomerID,MonthlyRevenue,MonthlyMinutes,TotalRecurringCharge,DirectorAssistedCalls,OverageMinutes,RoamingCalls,CustomerCareCalls,IncomeGroup,Cluster,...,CityCode_SEW,CityCode_SFR,CityCode_SFU,CityCode_SHE,CityCode_SLC,CityCode_SLU,CityCode_STL,CityCode_VAH,ChildrenInHH_No,ChildrenInHH_Yes
39797,3317186,34.71,123.0,33.0,0.00,0.0,0.0,0.0,6,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
31981,3254802,35.60,852.0,33.0,0.00,27.0,0.0,7.7,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
31916,3254278,60.09,96.0,60.0,2.23,0.0,5.4,2.7,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
39880,3317810,50.31,772.0,60.0,0.00,0.0,0.3,0.0,4,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23187,3182418,12.68,619.0,10.0,1.98,3.0,0.0,2.7,6,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,3088986,10.25,83.0,10.0,0.25,0.0,0.0,0.0,9,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
44732,3354122,49.99,362.0,60.0,0.00,0.0,0.0,4.3,7,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
38158,3304174,34.99,204.0,45.0,0.00,0.0,0.0,0.0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
860,3006874,181.75,1578.0,110.0,0.00,232.0,0.5,2.0,4,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [5]:
### Initiation

In [6]:
#Calculate the mean of `y_train`
train_mean = y_train.mean()
train_mean

0.2891740333521072

In [7]:
#Calculate the medium of `y_train`
X_defaults_median = X_train.median()
X_defaults_median

CustomerID               3201114.00
MonthlyRevenue                48.46
MonthlyMinutes               366.00
TotalRecurringCharge          45.00
DirectorAssistedCalls          0.25
                            ...    
CityCode_SLU                   0.00
CityCode_STL                   0.00
CityCode_VAH                   0.00
ChildrenInHH_No                1.00
ChildrenInHH_Yes               0.00
Length: 80, dtype: float64

#### Impute NaN Data

In [8]:
# Check for NaN values
print("NaN values in X_test:", X_test.isna().sum().sum())

# Replace NaN values with the mean (you can choose other methods)
X_test = X_test.fillna(X_test.mean())

# Check for infinity values
print("Infinity values in X_test:", np.isinf(X_test).sum().sum())

# Replace infinity values
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(X_test.mean())

NaN values in X_test: 0
Infinity values in X_test: 0


#### Normalize or Scale your data

In [9]:
scaler = StandardScaler()
X_test = scaler.fit_transform(X_test)
X_test

array([[-0.74307993, -0.53855413, -0.3950575 , ..., -0.02970297,
         0.57772725, -0.57772725],
       [-1.38364243, -0.63785923, -0.69964258, ..., -0.02970297,
         0.57772725, -0.57772725],
       [-1.21120949,  0.05951812, -0.36857184, ..., -0.02970297,
        -1.73092059,  1.73092059],
       ...,
       [ 1.10260631,  0.28076898, -0.1302009 , ..., -0.02970297,
        -1.73092059,  1.73092059],
       [ 0.50453914,  0.23391849,  0.30681248, ..., -0.02970297,
        -1.73092059,  1.73092059],
       [ 1.21146652,  0.58406424,  1.41921018, ..., -0.02970297,
         0.57772725, -0.57772725]])