In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score

df = pd.read_csv('data.csv')

df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')
df.dropna(how='any', inplace=True)
X=df.drop(['customerID', 'Churn'], axis=1)
y=df.Churn.values

# Convert categorical features to numericals --> Feature Encoding --> Dummy Encoding

X = pd.get_dummies(X, columns=['gender', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod'], drop_first=True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

# Feature Scaling
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [38]:
# Call the DT Classifier
from sklearn.tree import DecisionTreeClassifier

# Initiating the classifier
model_dt = DecisionTreeClassifier()

# Passing the data to classifier
model_dt.fit(X_train_sc, y_train)

y_pred_dt = model_dt.predict(X_test_sc)

print(accuracy_score(y_test,y_pred_dt)*100)

71.10352673492605


In [39]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred_dt))

              precision    recall  f1-score   support

          No       0.81      0.79      0.80      1288
         Yes       0.46      0.48      0.47       470

    accuracy                           0.71      1758
   macro avg       0.63      0.64      0.64      1758
weighted avg       0.72      0.71      0.71      1758



In [40]:
from imblearn.over_sampling import RandomOverSampler

# Upsample the minority class using RandomOverSampler
oversampler = RandomOverSampler()
X_train_upsampled, y_train_upsampled = oversampler.fit_resample(X_train_sc, y_train)

# Model Building
model_dt2 = DecisionTreeClassifier()
model_dt2.fit(X_train_upsampled, y_train_upsampled)

# Predict on the test set
y_pred_dt2 = model_dt2.predict(X_test_sc)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_dt2) * 100
print("Accuracy:", accuracy)

from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred_dt2))


Accuracy: 72.75312855517633
              precision    recall  f1-score   support

          No       0.81      0.82      0.81      1288
         Yes       0.49      0.49      0.49       470

    accuracy                           0.73      1758
   macro avg       0.65      0.65      0.65      1758
weighted avg       0.73      0.73      0.73      1758



In [41]:
y_train_upsampled

array(['No', 'Yes', 'No', ..., 'Yes', 'Yes', 'Yes'],
      shape=(7750,), dtype=object)

In [42]:
from imblearn.under_sampling import RandomUnderSampler

# Downsample the majority class using RandomUnderSampler
downsampler = RandomUnderSampler()
X_train_downsampled, y_train_downsampled = downsampler.fit_resample(X_train_sc, y_train)

# Model Building
model_dt3 = DecisionTreeClassifier()
model_dt3.fit(X_train_downsampled, y_train_downsampled)

# Predict on the test set
y_pred_dt3 = model_dt3.predict(X_test_sc)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_dt3) * 100
print("Accuracy:", accuracy)

from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred_dt3))


Accuracy: 67.57679180887372
              precision    recall  f1-score   support

          No       0.85      0.68      0.75      1288
         Yes       0.43      0.66      0.52       470

    accuracy                           0.68      1758
   macro avg       0.64      0.67      0.64      1758
weighted avg       0.74      0.68      0.69      1758



In [43]:
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN
#SMOTEENN: SMOTE + ENN: Synthetic Minority Over Sampling Technique + Edited Nearest Neighbors
from sklearn.metrics import accuracy_score, classification_report

# Create a SMOTEENN object
smote_enn = SMOTEENN(random_state=42)

# Resample the data
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train_sc, y_train)

# Model Building
model_dt3 = DecisionTreeClassifier()
model_dt3.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_dt3 = model_dt3.predict(X_test_sc)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_dt3) * 100
print("Accuracy:", accuracy)

# Print classification report
print(classification_report(y_test, y_pred_dt3))


Accuracy: 72.58248009101251
              precision    recall  f1-score   support

          No       0.89      0.72      0.79      1288
         Yes       0.49      0.75      0.59       470

    accuracy                           0.73      1758
   macro avg       0.69      0.73      0.69      1758
weighted avg       0.78      0.73      0.74      1758



In [44]:
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
#SMOTE: Synthetic Minority Over Sampling Technique
from sklearn.metrics import accuracy_score, classification_report

# Create a SMOTE object
smote = SMOTE(random_state=42)

# Resample the data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_sc, y_train)

# Model Building
model_dt3 = DecisionTreeClassifier()
model_dt3.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_dt3 = model_dt3.predict(X_test_sc)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_dt3) * 100
print("Accuracy:", accuracy)

# Print classification report
print(classification_report(y_test, y_pred_dt3))


Accuracy: 70.36405005688282
              precision    recall  f1-score   support

          No       0.82      0.76      0.79      1288
         Yes       0.45      0.54      0.49       470

    accuracy                           0.70      1758
   macro avg       0.64      0.65      0.64      1758
weighted avg       0.72      0.70      0.71      1758



In [45]:
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import ADASYN
#ADASYN: Adaptive Synthetic Sampling Approach for Imbalanced Learning
from sklearn.metrics import accuracy_score, classification_report

# Create an ADASYN object
adasyn = ADASYN(random_state=42)

# Resample the data
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train_sc, y_train)

# Model Building
model_dt3 = DecisionTreeClassifier()
model_dt3.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_dt3 = model_dt3.predict(X_test_sc)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_dt3) * 100
print("Accuracy:", accuracy)

# Print classification report
print(classification_report(y_test, y_pred_dt3))


Accuracy: 71.95676905574516
              precision    recall  f1-score   support

          No       0.83      0.78      0.80      1288
         Yes       0.48      0.56      0.51       470

    accuracy                           0.72      1758
   macro avg       0.65      0.67      0.66      1758
weighted avg       0.73      0.72      0.73      1758



In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import AllKNN
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Assume 'df' is your dataframe containing the dataset

# Separate features and target variable
X = df.drop(['customerID', 'Churn'], axis=1)  # Replace with your target column name
y = df["Churn"]

X = pd.get_dummies(X, columns=['gender', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod'], drop_first=True)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)  # Stratified split for imbalanced data

# Check for class imbalance in training data
print("Class imbalance in training data:")
print(y_train.value_counts())

# Use AllKNN for undersampling
allknn = AllKNN(sampling_strategy='auto')
X_train_resampled, y_train_resampled = allknn.fit_resample(X_train, y_train)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Create a decision tree classifier with appropriate parameters
print("Training decision tree...")
dtc = DecisionTreeClassifier(random_state=42, max_depth=5)  # Adjust parameters as needed
dtc.fit(X_train_scaled, y_train_resampled)

# Make predictions on the test set
y_pred = dtc.predict(X_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of decision tree:", accuracy * 100, "%")

# Print classification report
print(classification_report(y_test, y_pred))


Class imbalance in training data:
Churn
No     3872
Yes    1402
Name: count, dtype: int64
Training decision tree...
Accuracy of decision tree: 73.54948805460751 %
              precision    recall  f1-score   support

          No       0.91      0.71      0.80      1291
         Yes       0.50      0.81      0.62       467

    accuracy                           0.74      1758
   macro avg       0.71      0.76      0.71      1758
weighted avg       0.80      0.74      0.75      1758



In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.under_sampling import TomekLinks
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Assume 'df' is your dataframe containing the dataset

# Separate features and target variable
X = df.drop(['customerID', 'Churn'], axis=1)  # Replace with your target column name
y = df["Churn"]

X = pd.get_dummies(X, columns=['gender', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod'], drop_first=True)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)  # Stratified split for imbalanced data

# Check for class imbalance in training data
print("Class imbalance in training data:")
print(y_train.value_counts())

# Use TomekLinks for undersampling
tomek_links = TomekLinks(sampling_strategy='majority')  # not majority gives better results
X_train_resampled, y_train_resampled = tomek_links.fit_resample(X_train, y_train)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Create a decision tree classifier
dtc = DecisionTreeClassifier(random_state=42)

# Define the parameter grid for fine-tuning
param_grid = {
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV for fine-tuning
grid_search = GridSearchCV(dtc, param_grid, cv=5, scoring='accuracy', n_jobs=1)
grid_search.fit(X_train_scaled, y_train_resampled)

# Get the best parameters from the grid search
best_params = grid_search.best_params_

# Print the best parameters
print("Best Parameters:", best_params)

# Create a decision tree classifier with the best parameters
dtc_best = DecisionTreeClassifier(random_state=42, **best_params)
dtc_best.fit(X_train_scaled, y_train_resampled)

# Make predictions on the test set
y_pred = dtc_best.predict(X_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of decision tree:", accuracy * 100, "%")

# Print classification report
print(classification_report(y_test, y_pred))


Class imbalance in training data:
Churn
No     3872
Yes    1402
Name: count, dtype: int64
Best Parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}
Accuracy of decision tree: 78.32764505119454 %
              precision    recall  f1-score   support

          No       0.86      0.85      0.85      1291
         Yes       0.59      0.61      0.60       467

    accuracy                           0.78      1758
   macro avg       0.72      0.73      0.72      1758
weighted avg       0.79      0.78      0.78      1758



In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Assume 'df' is your dataframe containing the dataset

# Separate features and target variable
X = df.drop(['customerID', 'Churn'], axis=1)  # Replace with your target column name
y = df["Churn"]

X = pd.get_dummies(X, columns=['gender', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod'], drop_first=True)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)  # Stratified split for imbalanced data

# Check for class imbalance in training data
print("Class imbalance in training data:")
print(y_train.value_counts())

# Use SMOTETomek for both over-sampling and under-sampling
smote_tomek = SMOTETomek(sampling_strategy=0.5, random_state=42)
X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train, y_train)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Create a decision tree classifier with appropriate parameters
print("Training decision tree...")
dtc = DecisionTreeClassifier(random_state=42, max_depth=5)  # Adjust parameters as needed
dtc.fit(X_train_scaled, y_train_resampled)

# Make predictions on the test set
y_pred = dtc.predict(X_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of decision tree:", accuracy * 100, "%")

# Print classification report
print(classification_report(y_test, y_pred))


Class imbalance in training data:
Churn
No     3872
Yes    1402
Name: count, dtype: int64
Training decision tree...
Accuracy of decision tree: 78.15699658703072 %
              precision    recall  f1-score   support

          No       0.86      0.84      0.85      1291
         Yes       0.58      0.61      0.60       467

    accuracy                           0.78      1758
   macro avg       0.72      0.73      0.72      1758
weighted avg       0.79      0.78      0.78      1758

