<a href="https://colab.research.google.com/github/vixxypsycho/stage-C/blob/main/stage_c_quiz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as pyo
import plotly.express as px

import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# read data
df = pd.read_csv('/content/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
df.shape

(7043, 21)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [10]:
#Check the number of missing values for "TotalCharges" column
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [11]:
# Change datatypes for TotalCharges
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors='coerce')


In [12]:
df.isna().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [13]:
#Fill the missing values with 0
df["TotalCharges"].fillna(0, inplace=True)

#Check to confirm that there are no missing values for "TotalCharges" column
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [14]:
#Select "Churn" column
df.Churn = df.drop(columns=['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
'MonthlyCharges', 'TotalCharges'])
df.Churn.head()

0     No
1     No
2    Yes
3     No
4    Yes
Name: Churn, dtype: object

In [15]:
#Convert the elements in the "Churn" column to binary values
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df.Churn = le.fit_transform(df.Churn)

In [16]:
#Check
df.Churn

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64

In [17]:
#Select the feature and target variables
X = df.drop(columns = 'Churn')
y = df.Churn

In [18]:
#split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
y_train.value_counts()

Churn
0    3589
1    1341
Name: count, dtype: int64

In [19]:
#Categorical features
categorical_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']
categorical_df = df[categorical_cols]
categorical_df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod
0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check
1,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check
2,Male,0,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check
3,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic)
4,Female,0,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check
7039,Female,0,Yes,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic)
7040,Female,0,Yes,Yes,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check
7041,Male,1,Yes,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check


In [20]:
#Numerical features
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
numerical_df = df[numerical_cols]
numerical_df

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,1,29.85,29.85
1,34,56.95,1889.50
2,2,53.85,108.15
3,45,42.30,1840.75
4,2,70.70,151.65
...,...,...,...
7038,24,84.80,1990.50
7039,72,103.20,7362.90
7040,11,29.60,346.45
7041,4,74.40,306.60


In [21]:
numerical_df.columns

Index(['tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')

In [22]:
#The numerical features should be scaled using StandardScaler,
#convert the output back to a dataframe and put back the column names.

#Standard scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
normalized_numerical_df = scaler.fit_transform(numerical_df)
normalized_numerical_df_new = pd.DataFrame(normalized_numerical_df, columns = numerical_df.columns)

normalized_numerical_df_new.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,-1.277445,-1.160323,-0.992611
1,0.066327,-0.259629,-0.172165
2,-1.236724,-0.36266,-0.958066
3,0.514251,-0.746535,-0.193672
4,-1.236724,0.197365,-0.938874


In [23]:
#The categorical features are one-hot encoded using OneHotEncoder(set sparse_output to false),
#convert the output back to a dataframe and put back the column names.

from sklearn.preprocessing import OneHotEncoder

# Initialize the OneHotEncoder with sparse_output=False
encoder = OneHotEncoder(sparse=False)

# Fit and transform the categorical features
one_hot_encoded = encoder.fit_transform(df[categorical_cols])

# Create a new DataFrame with the one-hot encoded result
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(input_features = categorical_cols))

print(one_hot_df)


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.



      gender_Female  gender_Male  SeniorCitizen_0  SeniorCitizen_1  \
0               1.0          0.0              1.0              0.0   
1               0.0          1.0              1.0              0.0   
2               0.0          1.0              1.0              0.0   
3               0.0          1.0              1.0              0.0   
4               1.0          0.0              1.0              0.0   
...             ...          ...              ...              ...   
7038            0.0          1.0              1.0              0.0   
7039            1.0          0.0              1.0              0.0   
7040            1.0          0.0              1.0              0.0   
7041            0.0          1.0              0.0              1.0   
7042            0.0          1.0              1.0              0.0   

      Partner_No  Partner_Yes  Dependents_No  Dependents_Yes  PhoneService_No  \
0            0.0          1.0            1.0             0.0              1.0 

In [24]:
#Combine scaled numerical and one-hot encoded categorical features into train and test set dataframes (use pd.concat)

# Combine the DataFrames for train and test sets
train_combined_df = pd.concat([normalized_numerical_df_new, one_hot_df], axis=1)
train_combined_df.head()


#test_combined = pd.concat([test_scaled_df, test_encoded_df], axis=1)


# Top event actors
#top_actors = pd.concat([df['actor1'], df['actor2']]).value_counts().head(10)
#print(top_actors)

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-1.277445,-1.160323,-0.992611,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.066327,-0.259629,-0.172165,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-1.236724,-0.36266,-0.958066,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.514251,-0.746535,-0.193672,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,-1.236724,0.197365,-0.938874,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [25]:
train_combined_df.shape

(7043, 46)

In [26]:
X = train_combined_df

In [27]:
X.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-1.277445,-1.160323,-0.992611,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.066327,-0.259629,-0.172165,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-1.236724,-0.36266,-0.958066,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.514251,-0.746535,-0.193672,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,-1.236724,0.197365,-0.938874,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [28]:
# Split train_combined_df into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [29]:
#Question 14: What is the accuracy on the test set using the random forest classifier?

from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(random_state=1)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)



In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8507    0.8756    0.8630      1061
           1     0.5836    0.5316    0.5564       348

    accuracy                         0.7906      1409
   macro avg     0.7172    0.7036    0.7097      1409
weighted avg     0.7848    0.7906    0.7873      1409



In [33]:
#Question 15: What is the accuracy on the test set using the xgboost classifier?
#xgboost
from xgboost import XGBClassifier
extreme = XGBClassifier(random_state=1)
extreme.fit(X_train, y_train)
y_pred = extreme.predict(X_test)


In [34]:
#Classification
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8571    0.8709    0.8640      1061
           1     0.5861    0.5575    0.5714       348

    accuracy                         0.7935      1409
   macro avg     0.7216    0.7142    0.7177      1409
weighted avg     0.7902    0.7935    0.7917      1409



In [35]:
#Question 16: What is the accuracy on the test set using the LGBM classifier?
#lightgbm
from lightgbm import LGBMClassifier
light = LGBMClassifier(random_state=1)
light.fit(X_train, y_train)
y_pred = light.predict(X_test)


[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001910 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


In [36]:
#Classification
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8688    0.8860    0.8773      1061
           1     0.6300    0.5920    0.6104       348

    accuracy                         0.8133      1409
   macro avg     0.7494    0.7390    0.7438      1409
weighted avg     0.8098    0.8133    0.8114      1409



In [37]:
#Question 17

#Hyperparameters
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]
hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

In [38]:
#Randomised Search Cross Validation
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier

tree2 = ExtraTreesClassifier(random_state=1)
clf = RandomizedSearchCV(tree2, hyperparameter_grid, cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state=1)
search_result = clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [39]:
#checking for the best parameter for the model
search_result.best_params_

{'n_estimators': 1000,
 'min_samples_split': 9,
 'min_samples_leaf': 8,
 'max_features': 'sqrt'}

In [40]:
#Question 18: Train a new ExtraTreesClassifier Model with the new Hyperparameters from the RandomizedSearchCV
#(with random_state = 1). Is the accuracy of the new optimal model higher or lower
#than the initial ExtraTreesClassifier model with no hyperparameter tuning?

tuned_tree = ExtraTreesClassifier(n_estimators=1000, min_samples_split=2,
                                 min_samples_leaf=8, max_features=None, random_state=1)
tuned_tree.fit(X_train, y_train)
tuned_tree_pred = tuned_tree.predict(X_test)

In [41]:
#classification report for this hyperparameter tuning
print(classification_report(y_test, tuned_tree_pred, digits=4))


              precision    recall  f1-score   support

           0     0.8629    0.8841    0.8734      1061
           1     0.6180    0.5718    0.5940       348

    accuracy                         0.8070      1409
   macro avg     0.7405    0.7280    0.7337      1409
weighted avg     0.8024    0.8070    0.8044      1409

