<a href="https://colab.research.google.com/github/yj5x/AI-tasks/blob/main/task4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# ===================
#      libraries
#====================


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline



In [None]:

# ===========================
#       loading data
#============================
from google.colab import drive
drive.mount('/content/drive')
data = pd.read_csv('/content/drive/MyDrive/AI in business /WA_Fn-UseC_-Telco-Customer-Churn.csv')

# cleaning: derive the working frame from the raw one in a single chain,
# leaving `data` untouched so this cell can be re-run on its own
df = (
    data
    .drop(columns='customerID')
    # values that cannot be parsed as numbers become NaN ...
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    # ... and are removed here together with any other row holding a missing value
    .dropna()
    .reset_index(drop=True)
)

In [None]:

#========
#  EDA
#========

# distribution of Churn: absolute counts on the left, percentage share on the right
fig, (ax_counts, ax_ratio) = plt.subplots(1, 2, figsize=(12, 6))

sns.countplot(x='Churn', data=df, ax=ax_counts)
ax_counts.set_title('Churn Distribution')

df['Churn'].value_counts().plot.pie(autopct='%1.1f%%', ax=ax_ratio)
ax_ratio.set_title('Churn Ratio')

plt.show()

In [None]:
#=================
#   prepare data
#=================
# features: every column except the target
X = df.drop(columns='Churn')
# target: 1 where Churn is 'Yes', 0 for anything else
y = df['Churn'].eq('Yes').astype('int64')

# 80/20 hold-out split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
#===================
#    pretreatment
#===================

# split the feature columns by dtype: object columns vs int64/float64 columns
categorical_features = list(X.select_dtypes(include=['object']).columns)
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)

# numeric columns: fill gaps with the median, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# object columns: fill gaps with the most frequent value, then one-hot encode;
# handle_unknown='ignore' keeps transform from failing on categories not seen in fit
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# route each column group through its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:

#=================================
#   training + evaluation models
#=================================

def train_and_evaluate(model, model_name):
    """Fit ``model`` on the training split and report its hold-out performance.

    Prints a classification report and draws three figures: the confusion
    matrix, the ROC curve (AUC shown in the legend) and a 5-fold learning
    curve computed on the training split.

    The function reads the notebook-level ``X_train``, ``y_train``, ``X_test``
    and ``y_test`` variables, so the data-preparation cell must be run first.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict`` plus either ``predict_proba``
        or ``decision_function``. It is fitted in place.
    model_name : str
        Label used in the printed header and in every figure title.

    Returns
    -------
    None
    """
    banner = '=' * 50
    print(f"\n{banner}")
    print(f"Training and Evaluating {model_name}")
    print(f"{banner}")

    # training
    model.fit(X_train, y_train)

    # prediction
    y_pred = model.predict(X_test)
    if hasattr(model, 'predict_proba'):
        # second column of predict_proba is used as the score for the ROC curve
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        # estimators without probabilities (e.g. SVC(probability=False)) still
        # provide a continuous score that roc_curve / roc_auc_score accept
        y_score = model.decision_function(X_test)

    # evaluation
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")
    plt.figure(figsize=(6, 6))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    # confusion_matrix puts true classes on rows and predicted classes on columns
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.show()

    print("\nROC Curve:")
    fpr, tpr, _ = roc_curve(y_test, y_score)
    auc = roc_auc_score(y_test, y_score)
    plt.figure(figsize=(6, 6))
    plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    plt.title(f'ROC Curve - {model_name}')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

    # learning curve
    print("\nLearning Curve:")
    train_sizes, train_scores, val_scores = learning_curve(
        model, X_train, y_train, cv=5, scoring='accuracy',
        train_sizes=np.linspace(0.1, 1.0, 5), n_jobs=-1)

    plt.figure(figsize=(8, 6))
    # scores have one column per CV fold; average across folds for each size
    plt.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', label='Training Score')
    plt.plot(train_sizes, np.mean(val_scores, axis=1), 'o-', label='Validation Score')
    plt.title(f'Learning Curve - {model_name}')
    plt.xlabel('Training Examples')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
#==========================
#    Logistic Regression
#==========================

# baseline: preprocessing followed by the classifier
pipeline_lr = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])
train_and_evaluate(pipeline_lr, "Logistic Regression")

# same model with SMOTE oversampling inserted between preprocessing and classifier
smote_pipeline_lr = ImbPipeline([
    ('preprocessing', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(max_iter=1000)),
])
train_and_evaluate(smote_pipeline_lr, "Logistic Regression with SMOTE")

# hyperparameter search over the baseline pipeline (5-fold CV, accuracy)
param_grid_lr = dict(
    classifier__C=[0.01, 0.1, 1, 10],
    classifier__penalty=['l2'],
)
grid_lr = GridSearchCV(
    estimator=pipeline_lr,
    param_grid=param_grid_lr,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_lr.fit(X_train, y_train)

# evaluate the winning configuration the same way as the other variants
best_lr = grid_lr.best_estimator_
train_and_evaluate(best_lr, "Tuned Logistic Regression")

In [None]:

#==========================
#      Random Forest
#==========================

# baseline: preprocessing followed by the classifier
pipeline_rf = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])
train_and_evaluate(pipeline_rf, "Random Forest")

# same model with SMOTE oversampling inserted between preprocessing and classifier
smote_pipeline_rf = ImbPipeline([
    ('preprocessing', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42)),
])
train_and_evaluate(smote_pipeline_rf, "Random Forest with SMOTE")

# hyperparameter search over the baseline pipeline (5-fold CV, accuracy)
param_grid_rf = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[None, 10],
)
grid_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

# evaluate the winning configuration the same way as the other variants
best_rf = grid_rf.best_estimator_
train_and_evaluate(best_rf, "Tuned Random Forest")

In [None]:

#==========
#   SVM
#==========

# baseline: probability=True so the classifier exposes predict_proba for the ROC curve
pipeline_svc = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', SVC(probability=True, random_state=42)),
])
train_and_evaluate(pipeline_svc, "SVM")

# same model with SMOTE oversampling inserted between preprocessing and classifier
smote_pipeline_svc = ImbPipeline([
    ('preprocessing', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', SVC(probability=True, random_state=42)),
])
train_and_evaluate(smote_pipeline_svc, "SVM with SMOTE")

# hyperparameter search over the baseline pipeline (5-fold CV, accuracy)
param_grid_svc = dict(
    classifier__C=[0.1, 1],
    classifier__kernel=['linear', 'rbf'],
)
grid_svc = GridSearchCV(
    estimator=pipeline_svc,
    param_grid=param_grid_svc,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_svc.fit(X_train, y_train)

# evaluate the winning configuration the same way as the other variants
best_svc = grid_svc.best_estimator_
train_and_evaluate(best_svc, "Tuned SVM")

In [None]:
#====================
# feature selection
#====================

# run the preprocessing on the training split only
X_train_processed = preprocessor.fit_transform(X_train, y_train)

# names of the columns produced by the preprocessing
processed_feature_names = preprocessor.get_feature_names_out()

# keep the 10 columns with the highest f_classif score
selector = SelectKBest(score_func=f_classif, k=10)
selector.fit(X_train_processed, y_train)

# boolean mask over the processed columns -> names of the retained ones
keep_mask = selector.get_support()
selected_features = processed_feature_names[keep_mask]
print("Selected Features:", selected_features, sep="\n")

# 1. What is the difference between countplot, barplot, histplot, boxplot and scatterplot, and when should each be used?

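countplot: shows how many rows fall in each category; use it for a categorical column

barplot: shows an aggregate (the mean by default) of a numeric column for each category

histplot: shows how the values of a numeric column are spread

boxplot: shows the median, quartiles and outlier values of a numeric column

scatterplot: shows the relation between two numeric columns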





# 2. what step had the biggest impact on model performance

I take the F1-score as the measure:

For logistic regression, using the pipeline gave the better result.

For random forest, the pipeline and the hyperparameter-tuned model were very close in results.

For SVM, the performance decreased when I used SMOTE, but it did not change with the other steps.





# 3. how did hyperparameter tuning affect your results

In logistic regression:
precision and recall increased

In random forest:
precision increased,
but recall decreased

In SVM:
precision and recall increased





# 4. what trade offs did you encounter using pipelines

I had to shorten the EDA step to keep the code simple and avoid repeating processing steps.





# 5. how do we detect overfitting and underfitting

If the training score is high but the test (or validation) score is low, the model is overfitting.

If both scores are low, the model is underfitting.





# 6. which plots are most useful for analyzing data distribution

I think histplot and countplot





# 7. when do we use labelencoder vs onehotencoder  and why

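LabelEncoder is for data that has an order, or can be treated as ordered, like (low, medium, high), because it maps each category to a single integer.

OneHotEncoder is for categories with no order, like names, because separate 0/1 columns do not impose a false ordering on them.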





# 8. what does random_state do and when should we fix it

It controls the random steps (it sets their seed), and we fix it when we want the same result on every run.




# 9. what would change if the data were all floats or all integers

The model might not be affected much whether the data are floats or integers; I think what really matters is proper feature scaling, to keep the model balanced across features.





# 10. which part of the pipeline was most impactful overall

Hyperparameter tuning, applied after building the pipeline, had the biggest impact on performance.




# 11. how can unbalanced data be dealt with and which method did you apply in this project

I used SMOTE, and it helped to make the classes in the data more balanced.





# 12. Did you check for duplicate records? How many did you find, and what did you do with them?

Yes, I checked after reading this question and saw some duplicate rows; I now have to remove them from the data.





# 13. What is the difference between fit_transform and transform? Where did you apply each, and why?

fit_transform:
I used it on the training data to learn and apply the scaling

transform:
I used it on the test data to apply the same scaling, so the model doesn’t learn from the test data





# 14. What is the difference between model.predict and model.predict_proba? In which situations would you use each?

predict gives the final result, like 0 or 1, so use it when you just need the decision.

predict_proba shows the probability of each class before the final decision, so use it when you want to see how confident the model is.





# 15. what was your final best model pipeline and what made it the best

It was logistic regression with hyperparameter tuning; it gave a high score and behaved well on the learning curve.






# 16. how would you approach this task differently in a real world production setting


1. clean and prepare the data to ensure correct model input  

2. use pipelines to keep steps organized and repeatable  

3. monitor the models' predictions to catch performance drops early  

4. and keep trying to improve the model
