### Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
import warnings
# Silence every warning raised from this point on in the notebook.
# NOTE(review): this also hides any warnings emitted by the model fits below — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# Dataset that was upsampled with SMOTE in the previous lab.
data1 = pd.read_csv(filepath_or_buffer='upsampled_data.csv')

In [3]:
# Preview the first five rows of the upsampled data.
data1.head(n=5)

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges,Churn
0,-1.280248,0,0.115423,-0.994194,No
1,0.064303,0,0.385075,-0.17374,No
2,-1.239504,0,0.354229,-0.959649,Yes
3,0.512486,0,0.239303,-0.195248,No
4,-1.239504,0,0.521891,-0.940457,Yes


In [4]:
# Dataset that was downsampled with Tomek links in the previous lab.
data2 = pd.read_csv(filepath_or_buffer='downsampled_data.csv')

In [5]:
# Preview the first five rows of the downsampled data.
data2.head(n=5)

Unnamed: 0,tenure,TotalCharges,SeniorCitizen,MonthlyCharges,Churn
0,0.0,0.001275,0.0,0.115423,No
1,0.464789,0.215867,0.0,0.385075,No
2,0.014085,0.01031,0.0,0.354229,Yes
3,0.619718,0.210241,0.0,0.239303,No
4,0.014085,0.01533,0.0,0.521891,Yes


### 1. Apply SMOTE for upsampling the data

In [7]:
# Separate the 'Churn' target from the feature columns. The SMOTE upsampling
# itself was already applied in the previous lab, so nothing is resampled here.
y_upsampled = data1['Churn']
X_upsampled = data1.drop(columns='Churn')

### 2. Use logistic regression to fit the model and compute the accuracy of the model.

In [8]:
# Hold out 20% of the upsampled rows for testing; the fixed seed keeps the
# split identical between runs.
X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(
    X_upsampled,
    y_upsampled,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Fit a logistic regression on the upsampled training split and measure its
# accuracy on the held-out test split.
logreg_up = LogisticRegression()
logreg_up.fit(X_train_up, y_train_up)
y_pred_up = logreg_up.predict(X_test_up)
# The score must be stored as `accuracy_up`: both the print below and the
# accuracy-comparison cell later in the notebook read it under that name.
accuracy_up = accuracy_score(y_test_up, y_pred_up)
print("Accuracy (Logistic Regression - Smotified from previous lab):", accuracy_up)

Accuracy (Logistic Regression - SMOTE from previous lab): 0.7434656340755083


In [21]:
# Per-class precision, recall and F1 for the logistic regression predictions
# on the upsampled test split.
print(classification_report(y_true=y_test_up, y_pred=y_pred_up))

              precision    recall  f1-score   support

          No       0.75      0.74      0.74      1037
         Yes       0.74      0.75      0.74      1029

    accuracy                           0.74      2066
   macro avg       0.74      0.74      0.74      2066
weighted avg       0.74      0.74      0.74      2066



### 3. Use decision tree classifier to fit the model and compute the accuracy of the model.

In [11]:
# Fit a decision tree on the upsampled training split and measure its accuracy
# on the held-out test split.
# random_state is pinned (same seed as the train/test split) because the tree
# builder permutes features randomly at each split; without a seed the reported
# accuracy can differ from one run to the next.
dt_up = DecisionTreeClassifier(random_state=42)
dt_up.fit(X_train_up, y_train_up)
y_pred_dt_up = dt_up.predict(X_test_up)
accuracy_dt_up = accuracy_score(y_test_up, y_pred_dt_up)
print("Accuracy (Decision Tree - Smotified from previous lab):", accuracy_dt_up)

Accuracy (Decision Tree - Upsampled): 0.7507260406582769


In [22]:
# Per-class precision, recall and F1 for the decision tree predictions on the
# upsampled test split.
print(classification_report(y_true=y_test_up, y_pred=y_pred_dt_up))

              precision    recall  f1-score   support

          No       0.75      0.75      0.75      1037
         Yes       0.75      0.76      0.75      1029

    accuracy                           0.75      2066
   macro avg       0.75      0.75      0.75      2066
weighted avg       0.75      0.75      0.75      2066



### 4. Compare the accuracies of the two models.

In [19]:
# Show both upsampled-data accuracies side by side.
print("Accuracy Comparison - Smotified")
for label, score in (
    ("Logistic Regression Accuracy:", accuracy_up),
    ("Decision Tree Accuracy:", accuracy_dt_up),
):
    print(label, score)

Accuracy Comparison - Smotified
Logistic Regression Accuracy: 0.7434656340755083
Decision Tree Accuracy: 0.7507260406582769


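### The logistic regression and decision tree classifiers perform similarly post-SMOTE. The decision tree is marginally better overall (accuracy 0.75 vs 0.74) and at predicting Yes (precision 0.75 vs 0.74, recall 0.76 vs 0.75). Note that after SMOTE the two classes are roughly balanced (1037 No vs 1029 Yes in the test set), so Yes is no longer a minority class here.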

### 5. Apply TomekLinks for downsampling

In [12]:
# Separate the 'Churn' target from the feature columns of the downsampled data.
y_downsampled = data2['Churn']
X_downsampled = data2.drop(columns='Churn')

### 6. Use logistic regression to fit the model and compute the accuracy of the model.

In [13]:
# Hold out 20% of the downsampled rows for testing; the fixed seed keeps the
# split identical between runs.
X_train_down, X_test_down, y_train_down, y_test_down = train_test_split(
    X_downsampled,
    y_downsampled,
    test_size=0.2,
    random_state=42,
)


In [24]:
# Fit a logistic regression on the downsampled training split (fit() returns
# the estimator itself, so construction and fitting can be chained) and
# measure its accuracy on the held-out test split.
logreg_down = LogisticRegression().fit(X_train_down, y_train_down)
y_pred_down = logreg_down.predict(X_test_down)
accuracy_down = accuracy_score(y_true=y_test_down, y_pred=y_pred_down)
print("Accuracy (Logistic Regression - TOMEKLinkified from previous lab):", accuracy_down)


Accuracy (Logistic Regression - TOMEKLinkified from previous lab): 0.7966231772831927


In [23]:
# Per-class precision, recall and F1 for the logistic regression predictions
# on the downsampled test split.
print(classification_report(y_true=y_test_down, y_pred=y_pred_down))

              precision    recall  f1-score   support

          No       0.82      0.91      0.86       926
         Yes       0.70      0.52      0.60       377

    accuracy                           0.80      1303
   macro avg       0.76      0.72      0.73      1303
weighted avg       0.79      0.80      0.79      1303



### 7. Use decision tree classifier to fit the model and compute the accuracy of the model.

In [16]:
# Fit a decision tree on the downsampled training split and measure its
# accuracy on the held-out test split.
# random_state is pinned (same seed as the train/test split) because the tree
# builder permutes features randomly at each split; without a seed the reported
# accuracy can differ from one run to the next.
dt_down = DecisionTreeClassifier(random_state=42)
dt_down.fit(X_train_down, y_train_down)
y_pred_dt_down = dt_down.predict(X_test_down)
accuracy_dt_down = accuracy_score(y_test_down, y_pred_dt_down)
print("Accuracy (Decision Tree - TOMEKLinkified from previous lab):", accuracy_dt_down)

Accuracy (Decision Tree - TOMEKLinkified from previous lab): 0.7674597083653109


In [25]:
# Per-class precision, recall and F1 for the decision tree predictions on the
# downsampled test split.
print(classification_report(y_true=y_test_down, y_pred=y_pred_dt_down))

              precision    recall  f1-score   support

          No       0.83      0.84      0.84       926
         Yes       0.60      0.59      0.59       377

    accuracy                           0.77      1303
   macro avg       0.72      0.71      0.72      1303
weighted avg       0.77      0.77      0.77      1303



### 8. Compare the accuracies of the two models.

In [20]:
# Show both downsampled-data accuracies side by side.
print("Accuracy Comparison - TOMEKLinkified")
for label, score in (
    ("Logistic Regression Accuracy:", accuracy_down),
    ("Decision Tree Accuracy:", accuracy_dt_down),
):
    print(label, score)

Accuracy Comparison - TOMEKLinkified
Logistic Regression Accuracy: 0.7966231772831927
Decision Tree Accuracy: 0.7674597083653109


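### Oddly enough, the accuracy of the Decision Tree model (0.77) is lower than that of the Logistic Regression model (0.80). For the Yes class, however, the picture is mixed: Logistic Regression has higher precision (0.70 vs 0.60) but lower recall (0.52 vs 0.59), so the two F1-scores are nearly identical (0.60 vs 0.59).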