# Lab - Cross Validation

In [86]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import TomekLinks
import warnings
warnings.filterwarnings('ignore')

# Instructions

#### 1.Apply SMOTE for upsampling the data

* Use logistic regression to fit the model and compute the accuracy of the model.
* Use decision tree classifier to fit the model and compute the accuracy of the model.
* Compare the accuracies of the two models.

#### 2.Apply TomekLinks for downsampling

--> It is important to remember that TomekLinks does not make the two classes equal; it only removes the points from the majority class that are close to points in the minority class.

* Use logistic regression to fit the model and compute the accuracy of the model.
* Use decision tree classifier to fit the model and compute the accuracy of the model.
* Compare the accuracies of the two models.
* You can also apply this algorithm one more time and check how the imbalance between the two classes has changed since the last time.

In [11]:
# Load the raw customer-churn table from the lab folder and preview its first five rows.
df = pd.read_csv(r"files_for_lab/Customer-churn.csv")
df.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [12]:
# Class balance of the target: "No" clearly outnumbers "Yes", i.e. the data is imbalanced.
df.Churn.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [13]:
# Normalised column labels: lower-cased, with spaces turned into underscores.
col_names = [
    label.lower().replace(" ", "_")
    for label in df.columns.values.tolist()
]

In [14]:
# Apply the normalised labels to the frame in place. col_names was built from
# df.columns in order, so a positional assignment gives each column its new name.
df.columns = col_names

**1. We are going to apply SMOTE to balance the data**

In [15]:
# Inspect the dtype and non-null count of every column after renaming.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   seniorcitizen     7043 non-null   int64  
 2   partner           7043 non-null   object 
 3   dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   phoneservice      7043 non-null   object 
 6   onlinesecurity    7043 non-null   object 
 7   onlinebackup      7043 non-null   object 
 8   deviceprotection  7043 non-null   object 
 9   techsupport       7043 non-null   object 
 10  streamingtv       7043 non-null   object 
 11  streamingmovies   7043 non-null   object 
 12  contract          7043 non-null   object 
 13  monthlycharges    7043 non-null   float64
 14  totalcharges      7043 non-null   object 
 15  churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

In [21]:
# seniorcitizen is stored as an integer: check which values it actually takes.
df.seniorcitizen.value_counts()

0    5901
1    1142
Name: seniorcitizen, dtype: int64

In [17]:
# Count missing values per column. This only detects NaN/None: blank strings
# (such as the " " entries that are filtered out of totalcharges further down)
# are not reported here.
df.isna().sum()

gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

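The next step will be to standardize the numerical variables and to encode the categorical ones.
The numerical ones will be **monthlycharges, totalcharges** and **tenure**, as the **seniorcitizen** variable just has 0 and 1 values. In addition, **totalcharges** is stored as an object type, so we will need to convert it to *float* first in order to standardize it. Note that the `select_dtypes` call below also picks up **seniorcitizen** (it is an integer column), so it ends up being standardized together with the other three.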

In [36]:
# Drop the rows whose totalcharges is a single blank string so that the column
# can be cast to float afterwards. The explicit .copy() makes df an independent
# frame rather than a slice of the original, so assigning to its columns later
# is well-defined and does not depend on SettingWithCopyWarning being silenced.
df = df[df.totalcharges != " "].copy()

In [37]:
# Look at the remaining totalcharges values now that the blank entries are gone.
df.totalcharges.value_counts()

20.20      11
19.75       9
19.65       8
20.05       8
19.90       8
           ..
1066.15     1
249.95      1
8333.95     1
7171.70     1
1024.00     1
Name: totalcharges, Length: 6530, dtype: int64

In [38]:
# Cast totalcharges from object (strings) to float; every other column keeps its dtype.
df = df.astype({"totalcharges": float})

In [39]:
# Confirm that totalcharges is now float64 and see how many rows remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   object 
 1   seniorcitizen     7032 non-null   int64  
 2   partner           7032 non-null   object 
 3   dependents        7032 non-null   object 
 4   tenure            7032 non-null   int64  
 5   phoneservice      7032 non-null   object 
 6   onlinesecurity    7032 non-null   object 
 7   onlinebackup      7032 non-null   object 
 8   deviceprotection  7032 non-null   object 
 9   techsupport       7032 non-null   object 
 10  streamingtv       7032 non-null   object 
 11  streamingmovies   7032 non-null   object 
 12  contract          7032 non-null   object 
 13  monthlycharges    7032 non-null   float64
 14  totalcharges      7032 non-null   float64
 15  churn             7032 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory

In [40]:
# Keep only the numeric columns (int and float dtypes); these are the ones to be scaled.
df_num = df.select_dtypes(include=[np.number])

In [42]:
# Check which columns were selected as numeric.
df_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   seniorcitizen   7032 non-null   int64  
 1   tenure          7032 non-null   int64  
 2   monthlycharges  7032 non-null   float64
 3   totalcharges    7032 non-null   float64
dtypes: float64(2), int64(2)
memory usage: 274.7 KB


In [67]:
# Everything that is not numeric is treated as categorical (this still includes the target).
df_cat = df.drop(df_num.columns, axis=1)
# Categorical features only: same frame without the 'churn' target column.
df_cat1 = df_cat.drop('churn', axis=1)

In [68]:
# Target vector: the churn labels.
y = df_cat.loc[:, 'churn']

In [70]:
# Check the categorical feature columns that will be one-hot encoded.
df_cat1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   gender            7032 non-null   object
 1   partner           7032 non-null   object
 2   dependents        7032 non-null   object
 3   phoneservice      7032 non-null   object
 4   onlinesecurity    7032 non-null   object
 5   onlinebackup      7032 non-null   object
 6   deviceprotection  7032 non-null   object
 7   techsupport       7032 non-null   object
 8   streamingtv       7032 non-null   object
 9   streamingmovies   7032 non-null   object
 10  contract          7032 non-null   object
dtypes: object(11)
memory usage: 659.2+ KB


In [71]:
# Scaler used to standardise the numeric columns (default settings).
scaler=StandardScaler()

In [72]:
# Fit the scaler on every row of df_num and transform it in one step.
# NOTE(review): this runs before the train/test split further down, so the
# rows that later form the test set also contribute to the scaling statistics;
# consider fitting on the training split only.
scaled=scaler.fit_transform(df_num)

In [73]:
# Wrap the scaled array back into a DataFrame with the original column names.
# Passing df_num's index keeps the row labels aligned with df, df_cat1 and y:
# rows were removed earlier, so the original index has gaps and a fresh
# 0..n-1 index would no longer match it in any label-based operation.
df_num_st = pd.DataFrame(scaled, columns=df_num.columns, index=df_num.index)

In [74]:
# Preview the first three standardised rows.
df_num_st.head(3)

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges
0,-0.440327,-1.280248,-1.161694,-0.994194
1,-0.440327,0.064303,-0.260878,-0.17374
2,-0.440327,-1.239504,-0.363923,-0.959649


Next step will be to encode the categorical variables:

In [56]:
# One-hot encoder for the categorical features; drop='first' removes the first
# level of each column so the dummy columns are not redundant.
encoder = OneHotEncoder(handle_unknown='error', drop='first')

# Preview of the encoded matrix. Only the feature columns (df_cat1) are
# encoded: df_cat still contains the target 'churn', which must not be turned
# into a model input.
encoded_data = pd.DataFrame(encoder.fit_transform(df_cat1).toarray())
encoded_data.head()

In [75]:
# Fit the encoder on the categorical feature columns only (target excluded).
encoder.fit(df_cat1)

In [76]:
# Encode the categorical features and convert the result to a dense array
# so it can be concatenated with the scaled numeric columns.
encoded = encoder.transform(df_cat1).toarray()

In [77]:
# Feature matrix: scaled numeric columns first, one-hot encoded columns after
# them, joined side by side (rows are matched by position).
X_concat = np.concatenate((df_num_st.to_numpy(), encoded), axis=1)

In [79]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_concat,
    y,
    test_size=0.3,
    random_state=42,
)

The next step will be to balance the data with **SMOTE**

In [80]:
# SMOTE over-sampler for the minority class. The synthetic samples are drawn at
# random, so the seed is fixed (same value as the split and the models) to make
# the resampled training set, and every metric computed from it, reproducible.
smote = SMOTE(random_state=42)

In [84]:
# Resample the training split only; the test split is left untouched.
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
# Check the class counts after over-sampling.
y_train_sm.value_counts()

Yes    3614
No     3614
Name: churn, dtype: int64

By applying *logistic regression*:

In [85]:
# Train a logistic regression on the SMOTE-balanced training data
# (fit returns the estimator itself, so construction and fitting are chained).
classification = LogisticRegression(max_iter=10000, random_state=42).fit(
    X_train_sm, y_train_sm
)

# Evaluate on the test split, which was not resampled.
y_sm_predictions = classification.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_sm_predictions))

              precision    recall  f1-score   support

          No       0.90      0.72      0.80      1549
         Yes       0.51      0.79      0.62       561

    accuracy                           0.74      2110
   macro avg       0.71      0.76      0.71      2110
weighted avg       0.80      0.74      0.75      2110



We are going to make the same calculation but now we are going to use **Decision Trees**

In [112]:
# Shallow decision tree (depth 4, at least 5 samples per leaf) trained on the
# SMOTE-balanced training data.
clf_model = DecisionTreeClassifier(
    max_depth=4,
    min_samples_leaf=5,
    criterion="gini",
    random_state=42,
)
clf_model.fit(X_train_sm, y_train_sm)


In [113]:
# Predictions of the SMOTE-trained tree on the test split.
y_predict_dt = clf_model.predict(X_test)

In [114]:
# Overall accuracy of the SMOTE-trained tree on the test split.
accuracy_score(y_true=y_test, y_pred=y_predict_dt)

0.7260663507109004

In [111]:
# Per-class precision / recall / f1 for the SMOTE-trained tree.
print(classification_report(y_true=y_test, y_pred=y_predict_dt))

              precision    recall  f1-score   support

          No       0.89      0.70      0.78      1549
         Yes       0.48      0.76      0.59       561

    accuracy                           0.72      2110
   macro avg       0.68      0.73      0.69      2110
weighted avg       0.78      0.72      0.73      2110



- The accuracies and the other metrics for this dataset are very similar when we compare the results obtained with Logistic Regression and with Decision Trees, although, in general, Logistic Regression obtains slightly better results.

**2. Applying TomekLinks downsampling**

In [116]:
# Tomek-links under-sampler with default settings.
tomek = TomekLinks()

In [117]:
# Resample the original (non-SMOTE) training split; the test split is left untouched.
X_train_tl,y_train_tl=tomek.fit_resample(X_train,y_train)

In [118]:
# Check the class counts after under-sampling.
y_train_tl.value_counts()

No     3258
Yes    1308
Name: churn, dtype: int64

In [119]:
# Logistic regression trained on the TomekLinks-resampled training data,
# with the same hyper-parameters as the SMOTE run.
classification_lr = LogisticRegression(
    max_iter=10000,
    random_state=42,
)
classification_lr.fit(X_train_tl, y_train_tl)

In [122]:
# Predictions of the TomekLinks-trained logistic regression on the test split.
y_pred_tl=classification_lr.predict(X_test)

In [124]:
# Overall accuracy of the TomekLinks-trained logistic regression on the test split.
accuracy_score(y_true=y_test, y_pred=y_pred_tl)

0.7881516587677725

In [125]:
# Per-class precision / recall / f1 for the TomekLinks-trained logistic regression.
print(classification_report(y_true=y_test, y_pred=y_pred_tl))

              precision    recall  f1-score   support

          No       0.86      0.84      0.85      1549
         Yes       0.60      0.63      0.61       561

    accuracy                           0.79      2110
   macro avg       0.73      0.74      0.73      2110
weighted avg       0.79      0.79      0.79      2110



Next, we are going to make the same calculation using **Decision Trees**

In [157]:
# Shallow decision tree (depth 3, at least 5 samples per leaf) trained on the
# TomekLinks-resampled training data.
clf_model_tl = DecisionTreeClassifier(
    max_depth=3,
    min_samples_leaf=5,
    criterion="gini",
    random_state=42,
)
clf_model_tl.fit(X_train_tl, y_train_tl)

In [158]:
# Predict with the tree fitted on the TomekLinks-resampled data. It must be
# clf_model_tl here, not clf_model (the tree trained on the SMOTE data),
# otherwise the metrics computed from these predictions do not describe the
# TomekLinks model at all.
y_predict_dt_tl = clf_model_tl.predict(X_test)

In [159]:
# Overall accuracy on the test split for the predictions stored in y_predict_dt_tl.
accuracy_score(y_true=y_test, y_pred=y_predict_dt_tl)

0.7834123222748816

In [160]:
# Per-class precision / recall / f1 for the predictions stored in y_predict_dt_tl.
print(classification_report(y_true=y_test, y_pred=y_predict_dt_tl))

              precision    recall  f1-score   support

          No       0.82      0.90      0.86      1549
         Yes       0.63      0.46      0.53       561

    accuracy                           0.78      2110
   macro avg       0.72      0.68      0.69      2110
weighted avg       0.77      0.78      0.77      2110



Accuracies for both models (Logistic Regression and Decision Trees) are almost the same using TomekLinks. On the other hand,
Logistic Regression gets better "recall" and "f1-score" values when predicting "Yes" and worse ones when predicting "No". Regarding "precision", the calculated values are very similar.