# preparation


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
def train_model(X_train, y_train, model):
    model.fit(X_train, y_train)
    return model
def predict_model(X_test, y_test, model):
    pred_test = model.predict(X_test)
    pred_test_proba = model.predict_proba(X_test)[:, 1]
    predictions_test.append(pred_test_proba)
    score = f1_score(y_test, pred_test)
    report = classification_report(y_test, pred_test, zero_division=1)
    return pred_test, pred_test_proba, score, report
def create_kfold_datasets(X, y, n_splits=5, shuffle=True, random_state=None):
    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    datasets = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        datasets.append((X_train, X_test, y_train, y_test))
    return datasets
def plotit(y_test, y_probs):
    precision, recall, thresholds = precision_recall_curve(y_test, y_probs)
    plt.plot(thresholds, precision[:-1], label='Precision')
    plt.plot(thresholds, recall[:-1], label='Recall')
    plt.plot(thresholds, 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1]), label='F1')
    plt.xlabel('Threshold')
    plt.ylabel('Score')
    plt.title('Precision-Recall vs Threshold')
    plt.legend()
    plt.grid()
    plt.show()

In [12]:
df = pd.read_csv("train_data.csv")


In [13]:
if 'customerID' in df.columns:
    df.drop('customerID', axis=1, inplace=True)
if 'TotalCharges' in df.columns:
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.dropna(inplace=True)
df['Contract'] = df['Contract'].map({'Month-to-month': 2, 'One year': 12, 'Two year': 24})
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
df['Partner'] = df['Partner'].map({'Yes': 1, 'No': 0})
df['Dependents'] = df['Dependents'].map({'Yes': 1, 'No': 0})
df['PhoneService'] = df['PhoneService'].map({'Yes': 1, 'No': 0})
df['MultipleLines'] = df['MultipleLines'].map({'Yes': 1, 'No': 0, 'No phone service': 0})
df['OnlineSecurity'] = df['OnlineSecurity'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['OnlineBackup'] = df['OnlineBackup'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['DeviceProtection'] = df['DeviceProtection'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['TechSupport'] = df['TechSupport'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['StreamingTV'] = df['StreamingTV'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['StreamingMovies'] = df['StreamingMovies'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['PaperlessBilling'] = df['PaperlessBilling'].map({'Yes': 1, 'No': 0})
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})
df['PaymentMethod'] = df['PaymentMethod'].map({'Electronic check': 2, 'Mailed check': 1, 'Bank transfer (automatic)': 4, 'Credit card (automatic)': 3})
df['InternetService'] = df['InternetService'].map({'Fiber optic': 2, 'DSL': 1, 'No':0})

# VVVFeature engineering

In [14]:
services = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
            'StreamingTV', 'StreamingMovies']
df['num_services'] = df[services].sum(axis=1)
df['revenue_proxy'] = df['MonthlyCharges'] * df['tenure']
df['contract_tenure'] = df['Contract'] * df['tenure']
df['tenure_squared'] = df['tenure'] ** 2
df['tenure_cubed'] = df['tenure'] ** 3
df['is_MTM'] = (df['Contract'] == 3).astype(int) # Assuming 3 represents Month-to-month
df['high_charge'] = (df['MonthlyCharges'] > df['MonthlyCharges'].mean()).astype(int)
df['MTM_high_charge'] = df['is_MTM'] * df['high_charge']
df['log_TotalCharges'] = np.log1p(df['TotalCharges'])
df['log_tenure'] = np.log1p(df['tenure'])
df['log_MonthlyCharges'] = np.log1p(df['MonthlyCharges'])
df['log_num_services'] = np.log1p(df['num_services'])
df['log_revenue_proxy'] = np.log1p(df['revenue_proxy'])
df['log_contract_tenure'] = np.log1p(df['contract_tenure'])
df['log_MTM_high_charge'] = np.log1p(df['MTM_high_charge'])
df['log_age'] = np.log1p(df['SeniorCitizen'])

In [15]:
X = df.drop('Churn', axis=1)
y = df['Churn']
datasets = create_kfold_datasets(X, y, n_splits=5, random_state=42)
predictions_test = []

# 1 Logistic Regression ✅

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix



# Initialize the Logistic Regression model


# Train and predict using k-fold cross-validation
def model1(X_train, X_test, y_train, y_test):
    log_reg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, solver = 'saga', penalty = 'l1'))
    log_reg2 = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, class_weight= 'balanced', solver = 'saga', penalty = 'l1'))
    alpha = 0.8
    beta = 0.5
    trained_model = train_model(X_train, y_train, log_reg)
    trained_model2 = train_model(X_train, y_train, log_reg2)
    y_train_probs1 = trained_model.predict_proba(X_train)[:, 1]
    y_train_probs2 = trained_model2.predict_proba(X_train)[:, 1]
    y_train_probs = (alpha * y_train_probs1 + beta * y_train_probs2) / 2

    y_probs1 = trained_model.predict_proba(X_test)[:, 1]
    y_probs2 = trained_model2.predict_proba(X_test)[:, 1]
    y_probs = ( alpha * y_probs1 + beta * y_probs2) / 2

    precision, recall, thresholds = precision_recall_curve(y_test, y_probs)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])
    best_threshold_index = np.argmax(f1_scores_thresholds)
    threshold = thresholds[best_threshold_index]
    print(f"Best threshold: {threshold:.4f} with F1 score: {f1_scores_thresholds[best_threshold_index]:.4f}")
    y_pred = (y_probs >= threshold).astype(int)
    report = classification_report(y_test, y_pred)

    print(report)
    return (y_train_probs, y_probs)
for X_train, X_test, y_train, y_test in datasets:
    model1(X_train, X_test, y_train, y_test)
    break

Best threshold: 0.2141 with F1 score: 0.6479
              precision    recall  f1-score   support

           0       0.91      0.69      0.79       796
           1       0.53      0.84      0.65       329

    accuracy                           0.73      1125
   macro avg       0.72      0.76      0.72      1125
weighted avg       0.80      0.73      0.75      1125



# 2 Decision Tree Classifier ✅

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
def model2(X_train, X_test, y_train, y_test):
    dt_classifier = DecisionTreeClassifier(class_weight='balanced', max_depth=3, min_samples_leaf=1, min_samples_split=2)
    trained_dt_model = train_model(X_train, y_train, dt_classifier)
    y_pred_dt, y_pred_proba_dt, score, report = predict_model(X_test, y_test, trained_dt_model)
    y_train_pred_proba_dt = trained_dt_model.predict_proba(X_train)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba_dt)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])
    best_threshold_index = np.argmax(f1_scores_thresholds)
    threshold = thresholds[best_threshold_index]
    print(classification_report(y_test, y_pred_dt))
    return (y_train_pred_proba_dt, y_pred_proba_dt)
for X_train, X_test, y_train, y_test in datasets:
    model2(X_train, X_test, y_train, y_test)
    break

              precision    recall  f1-score   support

           0       0.87      0.72      0.79       796
           1       0.52      0.74      0.61       329

    accuracy                           0.73      1125
   macro avg       0.70      0.73      0.70      1125
weighted avg       0.77      0.73      0.74      1125



# 3 Random Forest Classifier ✅


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
#param_grid = {'n_estimators': [100, 200],'max_depth': [6, 10, None],'min_samples_leaf': [1, 5, 10],'class_weight': [None, 'balanced', {0:1.0, 1:2.5}]}
#grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)


def model3(X_train, X_test, y_train, y_test):
    rf_classifier = RandomForestClassifier(random_state=42, class_weight='balanced', max_depth=6, min_samples_leaf=1, n_estimators=100)
    trained_rf_model = train_model(X_train, y_train, rf_classifier)
    y_pred_dt, y_pred_proba_dt, score, report = predict_model(X_test, y_test, trained_rf_model)
    y_train_pred_proba_dt = trained_rf_model.predict_proba(X_train)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba_dt)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])
    best_threshold_index = np.argmax(f1_scores_thresholds)
    threshold = thresholds[best_threshold_index]
    print(classification_report(y_test, y_pred_dt))
    return (y_train_pred_proba_dt, y_pred_proba_dt)

for X_train, X_test, y_train, y_test in datasets:
    model3(X_train, X_test, y_train, y_test)
    break
#0.64

              precision    recall  f1-score   support

           0       0.88      0.76      0.81       796
           1       0.56      0.76      0.64       329

    accuracy                           0.76      1125
   macro avg       0.72      0.76      0.73      1125
weighted avg       0.79      0.76      0.76      1125



# 4 Gradient Boosting Classifier ✅

In [None]:
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix

def model4(X_train, X_test, y_train, y_test):
    alpha = 0.8
    beta = 0.9
    sample_weights = np.where(y_train == 1, 2.5, 1.0)
    model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    model.fit(X_train, y_train, sample_weight=sample_weights)
    model2 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=10, random_state=42)
    model2.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    #y_pred = (y_proba >= threshold).astype(int)
    y_proba2 = model2.predict_proba(X_test)[:, 1]
    #y_pred2 = (y_proba2 >= threshold).astype(int)
    y_proba3 = alpha * y_proba + beta * y_proba2
    precision, recall, thresholds = precision_recall_curve(y_test, y_proba3)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])
    best_threshold_index = np.argmax(f1_scores_thresholds)
    threshold = thresholds[best_threshold_index]
    y_pred3 = (y_proba3 >= threshold).astype(int)

    y_train_proba = model.predict_proba(X_train)[:, 1]
    y_train_proba2 = model2.predict_proba(X_train)[:, 1]
    y_train_proba3 = alpha * y_train_proba + beta * y_train_proba2
    print(classification_report(y_test, y_pred3))
    return (y_train_proba3, y_proba3)
for X_train, X_test, y_train, y_test in datasets:
    model4(X_train, X_test, y_train, y_test)
    break

              precision    recall  f1-score   support

           0       0.89      0.73      0.81       796
           1       0.55      0.78      0.64       329

    accuracy                           0.75      1125
   macro avg       0.72      0.76      0.73      1125
weighted avg       0.79      0.75      0.76      1125



# 5 GaussianNB() ✅

In [None]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.calibration import CalibratedClassifierCV

def model5(X_train, X_test, y_train, y_test):
    gnb = GaussianNB()
    calibrated_gnb = CalibratedClassifierCV(gnb, method='sigmoid', cv=5)  # Use sigmoid calibration
    calibrated_gnb.fit(X_train, y_train)
    y_pred_proba = calibrated_gnb.predict_proba(X_test)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])
    best_threshold_index = np.argmax(f1_scores_thresholds)
    best_threshold = thresholds[best_threshold_index]
    y_pred = (y_pred_proba >= best_threshold).astype(int)
    y_train_pred = calibrated_gnb.predict_proba(X_train)[:, 1]
    print(classification_report(y_test, y_pred))
    return (y_train_pred, y_pred_proba)
for X_train, X_test, y_train, y_test in datasets:
    model5(X_train, X_test, y_train, y_test)
    break

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       796
           1       0.59      0.60      0.60       329

    accuracy                           0.76      1125
   macro avg       0.71      0.72      0.71      1125
weighted avg       0.76      0.76      0.76      1125



    #6 K neighbors

In [None]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
for X_train, X_test, y_train, y_test in datasets:
    param_grid = {
        'n_neighbors': [60],  # Example values, adjust as needed
        'weights': ['uniform', 'distance'],  # Example values
        'metric': ['euclidean', 'manhattan']  # Example values
    }
    knn = KNeighborsClassifier()
    grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='f1') # Use 5-fold cross-validation
    grid_search.fit(X_train, y_train)
    best_knn = grid_search.best_estimator_
    y_pred_knn = best_knn.predict(X_test)

    print("KNN Classifier:")
    print(grid_search.best_estimator_)
    print(classification_report(y_test, y_pred_knn))

    # Find the best threshold for KNN
    y_pred_proba_knn = best_knn.predict_proba(X_test)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba_knn)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])
    best_threshold_index = np.argmax(f1_scores_thresholds)
    best_threshold_knn = thresholds[best_threshold_index]
    print(f"Best threshold for KNN: {best_threshold_knn}")

    # ... (Rest of your code within the loop)


KNN Classifier:
KNeighborsClassifier(metric='manhattan', n_neighbors=60, weights='distance')
              precision    recall  f1-score   support

           0       0.79      0.88      0.83       796
           1       0.60      0.43      0.51       329

    accuracy                           0.75      1125
   macro avg       0.70      0.66      0.67      1125
weighted avg       0.74      0.75      0.74      1125

Best threshold for KNN: 0.2637954367830825
KNN Classifier:
KNeighborsClassifier(metric='manhattan', n_neighbors=60, weights='distance')
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       831
           1       0.58      0.46      0.51       294

    accuracy                           0.77      1125
   macro avg       0.70      0.67      0.68      1125
weighted avg       0.76      0.77      0.76      1125



  f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])


Best threshold for KNN: 0.9367707629710854
KNN Classifier:
KNeighborsClassifier(metric='manhattan', n_neighbors=60, weights='distance')
              precision    recall  f1-score   support

           0       0.82      0.91      0.87       834
           1       0.65      0.44      0.53       291

    accuracy                           0.79      1125
   macro avg       0.73      0.68      0.70      1125
weighted avg       0.78      0.79      0.78      1125

Best threshold for KNN: 0.362505957152888
KNN Classifier:
KNeighborsClassifier(metric='manhattan', n_neighbors=60, weights='distance')
              precision    recall  f1-score   support

           0       0.83      0.91      0.87       849
           1       0.60      0.43      0.50       276

    accuracy                           0.79      1125
   macro avg       0.72      0.67      0.69      1125
weighted avg       0.78      0.79      0.78      1125

Best threshold for KNN: 0.3270350404849127
KNN Classifier:
KNeighborsClassi

# 7 SVC (no idea now)

In [None]:
# prompt: use svc and grid search to predict ytest

import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, precision_recall_curve
# Assuming X_train, X_test, y_train, y_test are defined from your k-fold
for X_train, X_test, y_train, y_test in datasets:
    # Create an SVC classifier
    svc = SVC(probability=True)  # Enable probability estimates

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto', 0.1, 1]
    }

    # Perform GridSearchCV to find the best hyperparameters
    grid_search = GridSearchCV(svc, param_grid, scoring='f1')
    grid_search.fit(X_train, y_train)

    # Get the best estimator (SVC model with the best hyperparameters)
    best_svc = grid_search.best_estimator_

    # Predict on the test set using the best SVC model
    y_pred = best_svc.predict(X_test)
    y_pred_proba = best_svc.predict_proba(X_test)[:,1]

    # Evaluate the model
    print(classification_report(y_test, y_pred))

    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])
    best_threshold_index = np.argmax(f1_scores_thresholds)
    best_threshold = thresholds[best_threshold_index]
    print(f"Best threshold for SVC: {best_threshold}")
    y_pred_thresh = (y_pred_proba >= best_threshold).astype(int)

    print(classification_report(y_test, y_pred_thresh))

    break #remove to run for all folds


# 8 XGBoost ✅

In [None]:
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.metrics import confusion_matrix
def model8(X_train, X_test, y_train, y_test):
    xgb_model = XGBClassifier(random_state=42)
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],  # Example values
        'colsample_bytree': [0.8, 1.0], # Example values
    }
    grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='f1', cv=3, n_jobs=-1) # Use 3-fold to speed it up
    grid_search.fit(X_train, y_train)

    # Get the best estimator
    best_xgb_model = grid_search.best_estimator_

    # Predict on the test set
    y_pred = best_xgb_model.predict(X_test)
    y_pred_proba = best_xgb_model.predict_proba(X_test)[:, 1]

    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])
    best_threshold_index = np.argmax(f1_scores_thresholds)
    best_threshold = thresholds[best_threshold_index]
    y_pred_thresh = (y_pred_proba >= best_threshold).astype(int)
    y_train_pred = best_xgb_model.predict_proba(X_train)[:, 1]
    print(classification_report(y_test, y_pred_thresh))
    return (y_train_pred, y_pred_proba)
for X_train, X_test, y_train, y_test in datasets:
    model8(X_train, X_test, y_train, y_test)
    break

              precision    recall  f1-score   support

           0       0.89      0.70      0.79       796
           1       0.52      0.80      0.63       329

    accuracy                           0.73      1125
   macro avg       0.71      0.75      0.71      1125
weighted avg       0.79      0.73      0.74      1125



# 9 Extra trees ✅

In [None]:
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, precision_recall_curve

def model9(X_train, X_test, y_train, y_test):
    #et_model = ExtraTreesClassifier(random_state=42, class_weight = 'balanced')
    best_et_model = ExtraTreesClassifier(random_state=42, class_weight='balanced', max_depth=10, min_samples_split=10, min_samples_leaf=2)
    #best_et_model = ExtraTreesClassifier(random_state=42
    '''param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }
    grid_search = GridSearchCV(estimator=et_model, param_grid=param_grid, scoring='f1', cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_et_model = grid_search.best_estimator_'''
    best_et_model.fit(X_train, y_train)
    y_pred = best_et_model.predict(X_test)
    y_pred_proba = best_et_model.predict_proba(X_test)[:, 1]

    # Evaluate the model
    #print(classification_report(y_test, y_pred))

    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1]+1e-8)
    best_threshold_index = np.argmax(f1_scores_thresholds)
    best_threshold = thresholds[best_threshold_index]
    #print(f"Best threshold for ExtraTrees: {best_threshold}")
    y_pred_thresh = (y_pred_proba >= best_threshold).astype(int)
    y_train_pred = best_et_model.predict_proba(X_train)[:, 1]
    print(classification_report(y_test, y_pred_thresh))
    return (y_train_pred, y_pred_proba)
for X_train, X_test, y_train, y_test in datasets:
    model9(X_train, X_test, y_train, y_test)
    break

              precision    recall  f1-score   support

           0       0.90      0.72      0.80       796
           1       0.54      0.82      0.65       329

    accuracy                           0.75      1125
   macro avg       0.72      0.77      0.73      1125
weighted avg       0.80      0.75      0.76      1125



# 10 MultiLayer Perception ✅

In [None]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.utils.class_weight import compute_sample_weight

'''param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (64,), (64, 32), (128, 64)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001, 0.01, 0,1],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'learning_rate': ['constant', 'adaptive']
}

mlp = MLPClassifier(max_iter=500, random_state=42) # Increased max_iter
grid_search = GridSearchCV(mlp, param_grid, scoring='f1', cv=3, n_jobs=-1) # Use 3-fold for speed'''

def model10(X_train, X_test, y_train, y_test):
    weights = compute_sample_weight(class_weight='balanced', y=y_train)
    best_mlp = MLPClassifier(alpha=0.001, hidden_layer_sizes=(50,), max_iter=500,
              random_state=42)
    best_mlp.fit(X_train, y_train)

    #best_mlp = grid_search.best_estimator_

    y_pred = best_mlp.predict(X_test)
    y_pred_proba = best_mlp.predict_proba(X_test)[:, 1]

    #print(classification_report(y_test, y_pred))
    #print(best_mlp)
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])
    best_threshold_index = np.argmax(f1_scores_thresholds)
    best_threshold = thresholds[best_threshold_index]
    #print(f"Best threshold for MLP: {best_threshold}")
    y_pred_thresh = (y_pred_proba >= best_threshold).astype(int)
    y_train_pred = best_mlp.predict_proba(X_train)[:, 1]
    print(classification_report(y_test, y_pred_thresh))
    return (y_train_pred, y_pred_proba)
for X_train, X_test, y_train, y_test in datasets:
    model10(X_train, X_test, y_train, y_test)
    break


              precision    recall  f1-score   support

           0       0.93      0.51      0.66       796
           1       0.43      0.91      0.59       329

    accuracy                           0.63      1125
   macro avg       0.68      0.71      0.62      1125
weighted avg       0.78      0.63      0.64      1125



# 11 Light GBM

In [None]:
import numpy as np
#!pip install lightgbm

import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, precision_recall_curve

def model11(X_train, X_test, y_train, y_test):
    '''param_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5],
        'num_leaves': [31, 50],
        'min_child_samples': [20, 50],
        'reg_alpha': [0, 0.1],
        'reg_lambda': [0, 0.1],
        'class_weight': ['balanced', None]
    }
    lgb_model = lgb.LGBMClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, scoring='f1', cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_lgb_model = grid_search.best_estimator_'''

    best_lgm_model = lgb.LGBMClassifier(class_weight='balanced', learning_rate=0.01, max_depth=5,
               n_estimators=200, random_state=42, reg_alpha=0.1,
               reg_lambda=0.1)
    y_pred_proba = best_lgb_model.predict_proba(X_test)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-8)
    best_threshold_index = np.argmax(f1_scores_thresholds)
    best_threshold = thresholds[best_threshold_index]

    y_pred = (y_pred_proba >= best_threshold).astype(int)

    print(classification_report(y_test, y_pred))
    print(best_lgb_model)

    return y_pred_proba - best_threshold

for X_train, X_test, y_train, y_test in datasets:
    model11(X_train, X_test, y_train, y_test)
    break


# 12 Multinomial nb ()

In [None]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
def model12(X_train, X_test, y_train, y_test):
    mnb = MultinomialNB()
    param_grid = {
        'alpha': [0.1, 0.5, 1.0, 2.0],  # Laplace smoothing parameter
        'fit_prior': [True, False],  # Whether to learn class prior probabilities
        'class_prior': [
          None,
          [0.5, 0.5],
          [0.7, 0.3],
          [0.3, 0.7]
        ]
    }
    grid_search = GridSearchCV(mnb, param_grid, scoring='f1', cv=5) # Use 5-fold cross-validation
    grid_search.fit(X_train, y_train)
    best_mnb = grid_search.best_estimator_
    print(best_mnb)

    y_pred_proba = best_mnb.predict_proba(X_test)[:, 1]

    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-8)
    best_threshold_index = np.argmax(f1_scores_thresholds)
    best_threshold = thresholds[best_threshold_index]
    print(f"Best threshold for MultinomialNB: {best_threshold}")

    y_pred = (y_pred_proba >= best_threshold).astype(int)

    print(classification_report(y_test, y_pred))
    print(best_mnb)
    return y_pred_proba - best_threshold

for X_train, X_test, y_train, y_test in datasets:
    model12(X_train, X_test, y_train, y_test)
    break

# 13 Bernouli nb ()

In [None]:
# prompt: use bernouli nb and grid search class weight to predict _test and maximize f1

import numpy as np
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV

def model13(X_train, X_test, y_train, y_test):
    bnb = BernoulliNB()
    param_grid = {
        'alpha': [0.1, 0.5, 1.0, 2.0],
        'binarize': [0.0, 0.5, 1.0],
        'fit_prior': [True, False],
        'class_prior': [
          None,
          [0.5, 0.5],
          [0.7, 0.3],
          [0.3, 0.7]
        ]
    }
    grid_search = GridSearchCV(bnb, param_grid, scoring='f1', cv=5)
    grid_search.fit(X_train, y_train)
    best_bnb = grid_search.best_estimator_
    print(best_bnb)

    y_pred_proba = best_bnb.predict_proba(X_test)[:, 1]

    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-8)
    best_threshold_index = np.argmax(f1_scores_thresholds)
    best_threshold = thresholds[best_threshold_index]
    print(f"Best threshold for BernoulliNB: {best_threshold}")

    y_pred = (y_pred_proba >= best_threshold).astype(int)

    print(classification_report(y_test, y_pred))
    return y_pred_proba - best_threshold

for X_train, X_test, y_train, y_test in datasets:
    model13(X_train, X_test, y_train, y_test)
    break


# 14 CatBoost ✅(grid)

In [None]:
import numpy as np
!pip install catboost

from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

def model14(X_train, X_test, y_train, y_test):
    model = CatBoostClassifier(random_seed=42, verbose=0)

    param_grid = {
        'iterations': [100, 200],
        'learning_rate': [0.01, 0.1],
        'depth': [4, 6],
        'l2_leaf_reg': [1, 3],
        'class_weights': [[1, 1], [1, 2], [1, 3], [1,4], [1,5]] # Example class weights
    }
    grid_search = GridSearchCV(model, param_grid, scoring='f1', cv=3, n_jobs=-1) # Use 3-fold to speed up
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)
    best_model = grid_search.best_estimator_

    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-8)
    best_threshold_index = np.argmax(f1_scores_thresholds)
    best_threshold = thresholds[best_threshold_index]
    y_pred = (y_pred_proba >= best_threshold).astype(int)
    y_train_proba = best_model.predict_proba(X_train)[:, 1]
    print(classification_report(y_test, y_pred))
    #print(best_model)
    return (y_train_proba, y_pred_proba)
for X_train, X_test, y_train, y_test in datasets:
    model14(X_train, X_test, y_train, y_test)
    break


Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
{'class_weights': [1, 2], 'depth': 4, 'iterations': 200, 'l2_leaf_reg': 1, 'learning_rate': 0.01}
              precision    recall  f1-score   support

           0       0.88      0.76      0.82       796
           1       0.56      0.75      0.64       329

    accuracy                           0.76      1125
   macro avg       0.72      0.75      0.73      1125
weighted avg       0.79      0.76      0.76      1125



# 15 TabNet

In [None]:
import numpy as np
#!pip install pytorch-tabnet

import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.utils.class_weight import compute_sample_weight

# Define the parameter grid for TabNet


def model15(X_train, X_test, y_train, y_test):
    param_grid = {
        'n_d': [8, 16],
        'n_a': [8, 16],
        'n_steps': [3, 5],
        'gamma': [1.3, 1.5],
        'n_independent': [1,2],
        'lambda_sparse': [1e-3, 1e-4],
        'optimizer_fn': [torch.optim.Adam],
        'optimizer_params': [dict(lr=2e-2)],
        'mask_type': ['entmax'],
        'scheduler_params': [dict(mode="min", patience=5, min_lr=1e-5, factor=0.9)],
        'scheduler_fn': [torch.optim.lr_scheduler.ReduceLROnPlateau],
        'verbose': [10]
    }
    tabnet_model = TabNetClassifier()
    f1_scorer = make_scorer(f1_score, average='weighted')
    grid_search = GridSearchCV(estimator=tabnet_model, param_grid=param_grid, scoring=f1_scorer, cv=3, n_jobs=-1, verbose=1)
    #sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

    grid_search.fit(X_train.values, y_train.values, eval_set=[(X_test.values, y_test.values)], eval_metric=['f1'])

    best_tabnet_model = grid_search.best_estimator_
    y_pred = best_tabnet_model.predict(X_test.values)
    y_pred_proba = best_tabnet_model.predict_proba(X_test.values)[:,1]

    print("Classification Report for TabNet:")
    print(classification_report(y_test, y_pred))

    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])
    best_threshold_index = np.argmax(f1_scores_thresholds)
    best_threshold = thresholds[best_threshold_index]
    y_pred_thresh = (y_pred_proba >= best_threshold).astype(int)

    print(classification_report(y_test, y_pred_thresh))

    print(best_tabnet_model)
    return y_pred_proba - best_threshold
for X_train, X_test, y_train, y_test in datasets:
    model15(X_train, X_test, y_train, y_test)
    break # Remove to run for all folds


# 16 Node2Vec

In [None]:
import numpy as np
!pip install node2vec

import networkx as nx
from node2vec import Node2Vec

def create_graph_from_dataframe(df):
    graph = nx.Graph()
    for index, row in df.iterrows():
      graph.add_node(index, features=row.to_dict()) # or use a subset of columns
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            similarity = np.dot(df.iloc[i], df.iloc[j])/(np.linalg.norm(df.iloc[i])*np.linalg.norm(df.iloc[j]))
            if similarity > 0.5: # Example threshold
                graph.add_edge(i, j, weight=similarity)
    return graph

def get_node2vec_embeddings(graph, dimensions=64, walk_length=30, num_walks=200, workers=4):
  node2vec = Node2Vec(graph, dimensions=dimensions, walk_length=walk_length, num_walks=num_walks, workers=workers)
  model = node2vec.fit(window=10, min_count=1, batch_words=4)
  embeddings = {node: model.wv[str(node)] for node in graph.nodes()}
  return embeddings

def model16(X_train, X_test, y_train, y_test):
    train_graph = create_graph_from_dataframe(X_train)
    embeddings_train = get_node2vec_embeddings(train_graph)
    X_train_embeddings = np.array([embeddings_train[i] for i in range(len(X_train))])


    test_graph = create_graph_from_dataframe(X_test)
    embeddings_test = get_node2vec_embeddings(test_graph)
    X_test_embeddings = np.array([embeddings_test[i] for i in range(len(X_test))])


    param_grid = {
        'class_weight': [None, 'balanced', {0:1, 1:10}],
        # ... other hyperparameters for your classifier
    }

    # Example classifier: Logistic Regression
    clf = LogisticRegression()

    grid_search = GridSearchCV(clf, param_grid, scoring='f1', cv=3)  # Use 3-fold for speed

    grid_search.fit(X_train_embeddings, y_train)
    best_clf = grid_search.best_estimator_
    y_pred = best_clf.predict(X_test_embeddings)
    y_pred_proba = best_clf.predict_proba(X_test_embeddings)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])
    best_threshold_index = np.argmax(f1_scores_thresholds)
    best_threshold = thresholds[best_threshold_index]
    y_pred_thresh = (y_pred_proba >= best_threshold).astype(int)
    print(classification_report(y_test, y_pred))
    print(best_clf)
    return y_pred_proba - best_threshold
for X_train, X_test, y_train, y_test in datasets:
    model16(X_train, X_test, y_train, y_test)
    break

# 17 NGBoost

In [None]:
import numpy as np
!pip install ngboost
from ngboost import NGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

def model17(X_train, X_test, y_train, y_test):
    param_grid = {
        'Dist': ['Normal', 'LogNormal'],
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1],
        'minibatch_frac': [0.5, 1.0],
        'natural_gradient': [True, False],
        'verbose': [False],
        'Base': [DecisionTreeClassifier(max_depth=3)] # Example base learner, you can experiment
    }
    ngb_model = NGBClassifier()
    grid_search = GridSearchCV(estimator=ngb_model, param_grid=param_grid, scoring='f1', cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_ngb_model = grid_search.best_estimator_
    y_pred_proba = best_ngb_model.predict_proba(X_test)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-8)
    best_threshold_index = np.argmax(f1_scores_thresholds)
    best_threshold = thresholds[best_threshold_index]
    y_pred = (y_pred_proba >= best_threshold).astype(int)
    print(classification_report(y_test, y_pred))
    print(best_ngb_model)

    return y_pred_proba - best_threshold

for X_train, X_test, y_train, y_test in datasets:
  model17(X_train, X_test, y_train, y_test)
  break


# 18 DeepFM

In [None]:
# prompt: use DeepFM and grid search (including but not limited to class weight) to predict y_test and maximize f1

!pip install deepctr

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

# Assuming your data is preprocessed as in the previous example
# ... (Your existing code for data loading and preprocessing)

# Create feature columns
sparse_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
                   'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                   'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
dense_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'num_services', 'revenue_proxy',
                  'contract_tenure', 'tenure_squared', 'tenure_cubed', 'is_MTM', 'high_charge',
                  'MTM_high_charge', 'log_TotalCharges', 'log_tenure', 'log_MonthlyCharges',
                  'log_num_services', 'log_revenue_proxy', 'log_contract_tenure',
                  'log_MTM_high_charge', 'log_age']


for feat in sparse_features:
    lbe = LabelEncoder()
    X[feat] = lbe.fit_transform(X[feat])
mms = MinMaxScaler(feature_range=(0,1))
X[dense_features] = mms.fit_transform(X[dense_features])

fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=X[feat].nunique(),embedding_dim=4)
                        for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                        for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define parameter grid for GridSearchCV
param_grid = {
    'dnn_hidden_units': [(128, 64), (256, 128)],
    'l2_reg_linear': [0.001, 0.01],
    'l2_reg_embedding': [0.001, 0.01],
    'l2_reg_dnn': [0, 0.001],
    'dnn_dropout': [0, 0.1],
    'class_weight': [{0:1, 1:1.5}, {0:1, 1:2}, 'balanced']
}

# Initialize DeepFM model
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')

# Perform GridSearchCV
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(model, param_grid, scoring='f1', cv=3, n_jobs=-1, verbose=1) # Use 3-fold for speed
grid_search.fit(X_train[feature_names], y_train, )

# Get the best model
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred_proba = best_model.predict(X_test[feature_names], batch_size=256)

# Find the best threshold for maximizing F1-score
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])
best_threshold_index = np.argmax(f1_scores_thresholds)
best_threshold = thresholds[best_threshold_index]

y_pred = (y_pred_proba >= best_threshold).astype(int)
print(classification_report(y_test, y_pred))


# 19 RuleFit

In [None]:
import numpy as np
!pip install rulefit

from sklearn.model_selection import GridSearchCV
from rulefit import RuleFit

def model19(X_train, X_test, y_train, y_test):
    # Define the parameter grid for RuleFit
    param_grid = {
        'max_rules': [50, 100],  # Maximum number of rules to generate
        'tree_size': [2, 4],      # Maximum size of each rule tree
        'sample_fract': [0.7, 0.9], # Fraction of samples to use for rule generation
        'memory_par': [0.01, 0.05], # Memory parameter for rule generation
        'exp_rand_tree_size': [True, False], # Use randomized tree sizes
        'class_weight': ['balanced', None],
    }

    # Initialize RuleFit
    rulefit_model = RuleFit()

    # Perform GridSearchCV
    grid_search = GridSearchCV(estimator=rulefit_model, param_grid=param_grid, scoring='f1', cv=3, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # Get the best model
    best_rulefit_model = grid_search.best_estimator_

    # Predict on the test set
    y_pred_proba = best_rulefit_model.predict_proba(X_test)[:, 1]

    # Find the best threshold for maximizing F1-score
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-8)
    best_threshold_index = np.argmax(f1_scores_thresholds)
    best_threshold = thresholds[best_threshold_index]

    y_pred = (y_pred_proba >= best_threshold).astype(int)
    print(classification_report(y_test, y_pred))
    print(best_rulefit_model)
    return y_pred_proba - best_threshold


for X_train, X_test, y_train, y_test in datasets:
    model19(X_train, X_test, y_train, y_test)
    break


# 20 VIME

In [None]:
import numpy as np
!pip install VIME

import VIME

def model20(X_train, X_test, y_train, y_test):
    # Initialize VIME model
    vime_model = VIME.VIME()

    # Define parameter grid for GridSearchCV (adjust as needed)
    param_grid = {
        'n_estimators': [50, 100],  # Example values, adjust based on your data
        'learning_rate': [0.01, 0.1],  # Example values
        'max_depth': [3, 5],  # Example values
        'class_weight': [None, 'balanced'] # Include class_weight
    }

    # Use GridSearchCV to find optimal hyperparameters
    grid_search = GridSearchCV(vime_model, param_grid, scoring='f1', cv=3, n_jobs=-1)  # Use 3-fold for speed
    grid_search.fit(X_train, y_train)

    best_vime_model = grid_search.best_estimator_
    y_pred_proba = best_vime_model.predict_proba(X_test)[:, 1]

    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-8)
    best_threshold_index = np.argmax(f1_scores_thresholds)
    best_threshold = thresholds[best_threshold_index]
    y_pred = (y_pred_proba >= best_threshold).astype(int)

    print(classification_report(y_test, y_pred))
    print(best_vime_model)
    return y_pred_proba - best_threshold

for X_train, X_test, y_train, y_test in datasets:
    model20(X_train, X_test, y_train, y_test)
    break


# 21 Logistic Regression with Polynomial Features

In [None]:
# prompt: use logistic regression with polynomial features and grid search (including but not limited to class weight) to predict y_test and maximize f1

import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV

# Create polynomial features
poly = PolynomialFeatures(degree=2) # Example degree, adjust as needed
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Define the parameter grid for Logistic Regression with polynomial features
param_grid = {
    'logisticregression__C': [0.1, 1, 10],  # Regularization parameter
    'logisticregression__class_weight': [None, 'balanced'], # Class weights
    'logisticregression__solver': ['liblinear', 'saga'], # Solvers
    'logisticregression__penalty': ['l1', 'l2'] # Penalty
}

# Create a pipeline with polynomial features and Logistic Regression
log_reg_poly = make_pipeline(StandardScaler(), PolynomialFeatures(degree=2), LogisticRegression(max_iter=1000))

# Perform GridSearchCV
grid_search_poly = GridSearchCV(log_reg_poly, param_grid, scoring='f1', cv=5) # Use 5-fold cross-validation
grid_search_poly.fit(X_train, y_train)


# Get the best model
best_log_reg_poly = grid_search_poly.best_estimator_

# Predict on the test set
y_pred_proba = best_log_reg_poly.predict_proba(X_test)[:, 1]

# Find the best threshold for maximizing F1-score
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-8)
best_threshold_index = np.argmax(f1_scores_thresholds)
best_threshold = thresholds[best_threshold_index]

y_pred = (y_pred_proba >= best_threshold).astype(int)
print(classification_report(y_test, y_pred))
best_log_reg_poly


# 22 Factorization Machines

In [None]:
import numpy as np
!pip install pyfm

from pyfm import pylibfm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

def model22(X_train, X_test, y_train, y_test):
    # Convert pandas DataFrames to numpy arrays
    X_train = X_train.values
    X_test = X_test.values
    y_train = y_train.values
    y_test = y_test.values

    # Define the parameter grid for Factorization Machines
    param_grid = {
        'num_factors': [8, 16, 32],
        'num_iterations': [50, 100],
        'learning_rate': [0.01, 0.1],
        'regularization': [0.01, 0.1],
        'class_weight': [{0:1, 1:1.5}, {0:1, 1:2}, 'balanced'] # Include class_weight
        }

    # Initialize Factorization Machines model
    fm = pylibfm.FM()
    f1_scorer = make_scorer(f1_score, average='weighted')

    # Perform GridSearchCV
    grid_search = GridSearchCV(fm, param_grid, scoring=f1_scorer, cv=3, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # Get the best model
    best_fm_model = grid_search.best_estimator_

    # Predict probabilities on the test set
    y_pred_proba = best_fm_model.predict(X_test)

    # Find the best threshold for maximizing F1-score
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])
    best_threshold_index = np.argmax(f1_scores_thresholds)
    best_threshold = thresholds[best_threshold_index]

    y_pred = (y_pred_proba >= best_threshold).astype(int)
    print(classification_report(y_test, y_pred))
    print(best_fm_model)
    return y_pred_proba - best_threshold

for X_train, X_test, y_train, y_test in datasets:
    model22(X_train, X_test, y_train, y_test)
    break


# Meta Model

In [None]:
# Scikit-learn classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression


# 1) Logistic Regression
model1 = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, class_weight = 'balanced', solver = 'saga', penalty = 'l1', C = 0.5))
# 2) Decision Tree
#model2 = DecisionTreeClassifier(random_state=42)
# 3) Random Forest
model3 = RandomForestClassifier(random_state=42)
# 4) Gradient Boosting (scikit-learn)
model4 = GradientBoostingClassifier(random_state=42)
# 5) Gaussian Naive Bayes
model5 = GaussianNB()
# 6) KNeighbors
model6 = KNeighborsClassifier()
# 7) SVM
model7 = SVC(probability=True, random_state=42)
# 8) XGBoost
model8 = XGBClassifier(eval_metric='logloss', random_state=42)
# 9) Extra Trees
model9 = ExtraTreesClassifier(random_state=42)
# 10) Multi-Layer Perceptron
model10 = MLPClassifier(max_iter=300, random_state=42)
# light gbm
#multinomial nb
#bernouli nb

# Put them in a list for convenience
models = [
    ("LogisticRegression", model1),
    #("DecisionTree", model2),
    #("RandomForest", model3),
    #("GradientBoosting", model4),
    ("NaiveBayes", model5),
    #("KNN", model6),
    #("SVM", model7),
    #("XGB", model8),
    #("ExtraTrees", model9),
    ("MLP", model10)
]

In [None]:
# prompt: see which features impact churn the most

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'X_train' and 'model1' (Logistic Regression) are defined from the previous code

# Fit the model (if not already fitted)
for model in models:
  model = model[1]
  model.fit(X_train, y_train)

  # Get feature importances
  feature_importances = model.coef_[0]

  # Create a DataFrame for easier handling
  feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})

  # Sort by absolute importance (to see both positive and negative impacts)
  feature_importance_df = feature_importance_df.reindex(feature_importance_df['Importance'].abs().sort_values(ascending=False).index)

  # Plot the feature importances
  plt.figure(figsize=(10, 6))
  sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
  plt.title('Feature Importance (Logistic Regression)')
  plt.xlabel('Importance')
  plt.ylabel('Feature')
  plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)

feat_imp_rf = pd.Series(rf.feature_importances_, index=X_train.columns)
feat_imp_rf = feat_imp_rf.sort_values(ascending=False)

print("Top 10 RandomForest features:")
print(feat_imp_rf.head(10))

# Meta Model

In [None]:
train_preds = []
for name, m in models:
    train_preds.append(m.predict_proba(X_train)[:, 1])

meta_X_train = pd.DataFrame(np.column_stack(train_preds), columns=[n for n,_ in models])
meta_y_train = y_train

meta_model = LogisticRegression(max_iter=1000, random_state=42)

meta_model.fit(meta_X_train, meta_y_train)
print("\nMeta-model has been trained on stacked predictions of the 10 base models.")

In [None]:
# Combine base models' test predictions
test_preds = []
for name, m in models:
    predictions_test = m.predict_proba(X_test)[:, 1]
    test_preds.append(predictions_test)

meta_X_test = pd.DataFrame(np.column_stack(test_preds), columns=[n for n,_ in models])

# Predict using the meta-model
meta_test_pred = meta_model.predict(meta_X_test)
meta_test_prob = meta_model.predict_proba(meta_X_test)[:, 1]

# Evaluate the final stacked model
print("\n***** Evaluation of the final (stacked) meta-model *****")
print(classification_report(y_test, meta_test_pred))


In [None]:
models = [('1',model1), ('2', model2), ('3', model3), ('4', model4), ('5', model5), ('8', model8), ('9', model9), ('14', model14)]#, ('14', model14)]
def stack_models(X_train, X_test, y_train, y_test, models):
    X_train2 = X_train.copy()
    X_test2 = X_test.copy()

    for name, model in models:
        X_train2[name], X_test2[name] = model(X_train, X_test, y_train, y_test)
    for name, model in models:
        X_train2['final'+name], X_test2['final'+name] = model(X_train2, X_test2, y_train, y_test)

    return X_train2, X_test2, y_train, y_test


def evalulate(X_train2, X_test2, y_train, y_test):

    meta_model = LogisticRegression(max_iter=1000, class_weight='balanced',random_state=42, solver = 'saga', penalty = 'l1')
    #meta_model = XGBClassifier(eval_metric='logloss', random_state=42)
    #meta_model = RandomForestClassifier(random_state=42)
    #meta_model = GradientBoostingClassifier(random_state=42)
    meta_model.fit(X_train2, y_train)
    meta_test_pred = meta_model.predict(X_test2)

    print("\n***** Evaluation of the final (stacked) meta-model *****")
    print(classification_report(y_test, meta_test_pred))
    print(meta_model.coef_)

In [None]:
for i, (X_train, X_test, y_train, y_test) in enumerate(datasets):
    X_train2, X_test2, y_train2, y_test2 = stack_models(X_train, X_test, y_train, y_test, models)
    evalulate(X_train2, X_test2, y_train2, y_test2)
    break

              precision    recall  f1-score   support

           0       0.91      0.69      0.79       796
           1       0.53      0.84      0.65       329

    accuracy                           0.73      1125
   macro avg       0.72      0.76      0.72      1125
weighted avg       0.80      0.73      0.75      1125

              precision    recall  f1-score   support

           0       0.87      0.72      0.79       796
           1       0.52      0.74      0.61       329

    accuracy                           0.73      1125
   macro avg       0.70      0.73      0.70      1125
weighted avg       0.77      0.73      0.74      1125

              precision    recall  f1-score   support

           0       0.88      0.76      0.81       796
           1       0.56      0.76      0.64       329

    accuracy                           0.76      1125
   macro avg       0.72      0.76      0.73      1125
weighted avg       0.79      0.76      0.76      1125

              preci

  f1_scores_thresholds = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])


              precision    recall  f1-score   support

           0       0.71      1.00      0.83       796
           1       0.00      0.00      0.00       329

    accuracy                           0.71      1125
   macro avg       0.35      0.50      0.41      1125
weighted avg       0.50      0.71      0.59      1125

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       796
           1       0.61      0.54      0.57       329

    accuracy                           0.76      1125
   macro avg       0.71      0.70      0.71      1125
weighted avg       0.76      0.76      0.76      1125

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       796
           1       0.61      0.51      0.56       329

    accuracy                           0.76      1125
   macro avg       0.71      0.69      0.70      1125
weighted avg       0.75      0.76      0.76      1125

              preci



# Final Answer

In [19]:
df = pd.read_csv("test_data.csv")
customerID = df['customerID']
if 'customerID' in df.columns:
    df.drop('customerID', axis=1, inplace=True)
if 'TotalCharges' in df.columns:
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.dropna(inplace=True)
df['Contract'] = df['Contract'].map({'Month-to-month': 2, 'One year': 12, 'Two year': 24})
df['Partner'] = df['Partner'].map({'Yes': 1, 'No': 0})
df['Dependents'] = df['Dependents'].map({'Yes': 1, 'No': 0})
df['PhoneService'] = df['PhoneService'].map({'Yes': 1, 'No': 0})
df['MultipleLines'] = df['MultipleLines'].map({'Yes': 1, 'No': 0, 'No phone service': 0})
df['OnlineSecurity'] = df['OnlineSecurity'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['OnlineBackup'] = df['OnlineBackup'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['DeviceProtection'] = df['DeviceProtection'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['TechSupport'] = df['TechSupport'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['StreamingTV'] = df['StreamingTV'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['StreamingMovies'] = df['StreamingMovies'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['PaperlessBilling'] = df['PaperlessBilling'].map({'Yes': 1, 'No': 0})
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})
df['PaymentMethod'] = df['PaymentMethod'].map({'Electronic check': 2, 'Mailed check': 1, 'Bank transfer (automatic)': 4, 'Credit card (automatic)': 3})
df['InternetService'] = df['InternetService'].map({'Fiber optic': 2, 'DSL': 1, 'No':0})
services = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
            'StreamingTV', 'StreamingMovies']
df['num_services'] = df[services].sum(axis=1)
df['revenue_proxy'] = df['MonthlyCharges'] * df['tenure']
df['contract_tenure'] = df['Contract'] * df['tenure']
df['tenure_squared'] = df['tenure'] ** 2
df['tenure_cubed'] = df['tenure'] ** 3
df['is_MTM'] = (df['Contract'] == 3).astype(int) # Assuming 3 represents Month-to-month
df['high_charge'] = (df['MonthlyCharges'] > df['MonthlyCharges'].mean()).astype(int)
df['MTM_high_charge'] = df['is_MTM'] * df['high_charge']
df['log_TotalCharges'] = np.log1p(df['TotalCharges'])
df['log_tenure'] = np.log1p(df['tenure'])
df['log_MonthlyCharges'] = np.log1p(df['MonthlyCharges'])
df['log_num_services'] = np.log1p(df['num_services'])
df['log_revenue_proxy'] = np.log1p(df['revenue_proxy'])
df['log_contract_tenure'] = np.log1p(df['contract_tenure'])
df['log_MTM_high_charge'] = np.log1p(df['MTM_high_charge'])
df['log_age'] = np.log1p(df['SeniorCitizen'])
X_test = df.copy()

In [20]:
def finalmodel1(X_train, X_test, y_train):
    log_reg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, solver = 'saga', penalty = 'l1'))
    log_reg2 = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, class_weight= 'balanced', solver = 'saga', penalty = 'l1'))
    alpha = 0.8
    beta = 0.5
    trained_model = train_model(X_train, y_train, log_reg)
    trained_model2 = train_model(X_train, y_train, log_reg2)
    y_train_probs1 = trained_model.predict_proba(X_train)[:, 1]
    y_train_probs2 = trained_model2.predict_proba(X_train)[:, 1]
    y_train_probs = (alpha * y_train_probs1 + beta * y_train_probs2) / 2

    y_probs1 = trained_model.predict_proba(X_test)[:, 1]
    y_probs2 = trained_model2.predict_proba(X_test)[:, 1]
    y_probs = ( alpha * y_probs1 + beta * y_probs2) / 2


    threshold = 0.2141 
    y_pred = (y_probs >= threshold).astype(int)
    y_pred = pd.Series(y_pred).map({1: 'Yes', 0: 'No'})
    output_df = pd.DataFrame({'customerID': customerID, 'Churn': y_pred})
    output_df.to_csv('submission.csv', index=False)
    
finalmodel1(X, X_test, y)