In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# --- Load the training data ---
# Colab upload location; change this constant if the CSV lives elsewhere.
TRAIN_CSV_PATH = "/content/train_cleaned (3).csv"
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Split the textual "<min>-<max>" program age range into two float columns.
# Rows whose text does not contain a "digits-digits" pattern yield NaN.
age_split = (
    df['Age Range by Program']
    .astype(str)
    .str.extract(r'(?P<min_age>\d+)-(?P<max_age>\d+)')
)
for group_name, feature_name in (('min_age', 'Program Min Age'),
                                 ('max_age', 'Program Max Age')):
    df[feature_name] = age_split[group_name].astype(float)

In [None]:
# Age-based features, computed column-wise instead of with row-by-row
# df.apply(). Any comparison involving NaN evaluates to False, so rows with a
# missing age or an unparsed range get 0 for both flags -- the same values the
# row-wise lambdas produced.
age = df['Age']
min_age = df['Program Min Age']
max_age = df['Program Max Age']

# 1 when the age lies inside the program's inclusive [min, max] range.
df['Age In Range'] = ((min_age <= age) & (age <= max_age)).astype(int)
# Signed distances to each end of the range (negative = outside that end).
df['Age Gap To Min'] = age - min_age
df['Age Gap To Max'] = max_age - age
# 1 when the age is more than 5 years outside the range on either side.
df['Extreme Age'] = ((age < min_age - 5) | (age > max_age + 5)).astype(int)

In [None]:
# The raw range string is redundant once the min/max columns exist.
# No errors='ignore' here, so a missing column raises KeyError.
df.drop(columns=['Age Range by Program'], inplace=True)

# Identifier, location and date columns are excluded from the feature set;
# any that are absent from the frame are skipped silently.
unused_columns = [
    'Student ID',
    'Home City',
    'Program ID',
    'Program Start Date',
    'Program End Date',
]
df.drop(columns=unused_columns, errors='ignore', inplace=True)

In [None]:
# Target: the raw 'Y' labels are flipped so that 1 = completed, 0 = quit.
# Any value other than 0 or 1 becomes NaN under .map().
flipped_label = {0: 1, 1: 0}
y = df['Y'].map(flipped_label)

# Features: every column except the target (a new frame, independent of df).
X = df.drop(columns='Y')

In [None]:
# Encode binary Yes/No columns.
# X was split off from df before this cell, so mapping df alone never reaches
# the feature frame: the two columns would instead fall through to the
# LabelEncoder loop below, while the test cell maps them with
# {'Yes': 1, 'No': 0}. Apply the mapping to X (and to df, to keep both frames
# in step) so train and test are encoded the same way.
# NOTE(review): values other than 'Yes'/'No' (including missing ones) become
# NaN here, exactly as in the test cell -- impute upstream if that can occur.
binary_map = {'Yes': 1, 'No': 0}
for col in ['Completed Degree', 'Still Working']:
    for frame in (df, X):
        # Skip columns that are already numeric so that re-running this cell
        # does not map 0/1 to NaN.
        if not pd.api.types.is_numeric_dtype(frame[col]):
            frame[col] = frame[col].map(binary_map)

# Encode the remaining categorical (object-dtype) columns.
categorical_cols = X.select_dtypes(include='object').columns.tolist()
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # Missing values get their own 'Unknown' category.
    X[col] = le.fit_transform(X[col].fillna('Unknown'))
    label_encoders[col] = le  # Keep the fitted encoder for the test set / export

# Scale all features; the scaler is fitted on the training features only and
# reused (transform only) for the test set.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)


In [None]:
# --- Load and preprocess the test set, mirroring the training steps ---
test_df = pd.read_csv('/content/test_cleaned (3).csv')

# Parse "<min>-<max>" program age ranges; non-matching rows yield NaN.
age_split = test_df['Age Range by Program'].astype(str).str.extract(r'(?P<min_age>\d+)-(?P<max_age>\d+)')
test_df['Program Min Age'] = age_split['min_age'].astype(float)
test_df['Program Max Age'] = age_split['max_age'].astype(float)

# Age-based features, computed column-wise. Comparisons involving NaN are
# False, so such rows get 0 for both flags (same result as a row-wise apply).
test_age = test_df['Age']
test_min_age = test_df['Program Min Age']
test_max_age = test_df['Program Max Age']
test_df['Age In Range'] = ((test_min_age <= test_age) & (test_age <= test_max_age)).astype(int)
test_df['Age Gap To Min'] = test_age - test_min_age
test_df['Age Gap To Max'] = test_max_age - test_age
test_df['Extreme Age'] = ((test_age < test_min_age - 5) | (test_age > test_max_age + 5)).astype(int)

# Target: 1 when 'Completed Degree' is 'Yes', otherwise 0.
# NOTE(review): 'Completed Degree' also remains in X_test as a feature --
# confirm this is intended and not target leakage.
test_df['Y'] = test_df['Completed Degree'].eq('Yes').astype(int)

# Drop text column
test_df.drop(columns=['Age Range by Program'], inplace=True)

# Drop irrelevant columns
test_df.drop(columns=['Student ID', 'Home City', 'Program ID', 'Program Start Date', 'Program End Date'], inplace=True, errors='ignore')

# Encode binary Yes/No columns
test_df['Completed Degree'] = test_df['Completed Degree'].map({'Yes': 1, 'No': 0})
test_df['Still Working'] = test_df['Still Working'].map({'Yes': 1, 'No': 0})

# Encode categorical columns using the encoders fitted on the training data.
# Missing values are filled with 'Unknown' first, because that is how the
# encoders were fitted; a label never seen in training still raises ValueError.
categorical_cols = test_df.select_dtypes(include='object').columns.tolist()
for col in categorical_cols:
    if col in label_encoders:
        test_df[col] = label_encoders[col].transform(test_df[col].fillna('Unknown'))

# Features and target
X_test = test_df.drop(columns=['Y'])
# Use exactly the training feature columns, in training order, so the scaler
# and models see each feature in the position they were fitted on. A column
# missing from the test data raises KeyError here.
X_test = X_test[X.columns]
y_test = test_df['Y']

# Transform only -- the scaler must not be re-fitted on test data.
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [None]:
# Candidate estimators and their hyper-parameter grids, keyed by display name.
# Each value is an (unfitted estimator, param_grid) pair.
model_grids = {}

model_grids["Logistic Regression"] = (
    LogisticRegression(class_weight='balanced', max_iter=1000),
    {"C": [0.01, 0.1, 1, 10]},
)

model_grids["SVM"] = (
    SVC(class_weight='balanced', probability=True),
    {
        "C": [0.1, 1, 10],
        "kernel": ["linear", "rbf"],
    },
)

model_grids["Random Forest"] = (
    RandomForestClassifier(class_weight='balanced', random_state=42),
    {
        "n_estimators": [100, 200],
        "max_depth": [None, 10, 20],
    },
)

model_grids["Decision Tree"] = (
    DecisionTreeClassifier(class_weight='balanced', random_state=42),
    {
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5],
    },
)

# KNN is the only candidate configured without class weighting.
model_grids["KNN"] = (
    KNeighborsClassifier(),
    {
        "n_neighbors": [3, 5, 7],
        "weights": ["uniform", "distance"],
    },
)

In [None]:
# Tune every candidate with 5-fold cross-validated grid search (optimising F1)
# on the scaled training data, then score the refitted best estimator on the
# test set.
results = {}
for name, (model, params) in model_grids.items():
    grid = GridSearchCV(model, params, scoring='f1', cv=5, n_jobs=-1)
    grid.fit(X_scaled, y)
    # Predict on the *scaled* test features: the models are fitted on
    # X_scaled, so feeding them the raw X_test would give meaningless metrics.
    y_pred = grid.predict(X_test_scaled)
    results[name] = {
        "Model": name,
        "Best Params": grid.best_params_,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "Best Estimator": grid.best_estimator_
    }

# One row per model, best test-set F1 first.
results_df = pd.DataFrame(results).T.sort_values(by="F1 Score", ascending=False)
best_model_name = results_df.iloc[0]["Model"]
best_model = results[best_model_name]["Best Estimator"]

# Save the best model together with the fitted scaler and label encoders.
joblib.dump(best_model, "best_model_extreme_age.pkl")
joblib.dump(scaler, "scaler_extreme_age.pkl")
joblib.dump(label_encoders, "label_encoders_extreme_age.pkl")

In [None]:

# Report which model won and where the artefacts were written. The file names
# listed here match the ones passed to joblib.dump in the training cell.
print(f"\nBest model saved: {best_model_name}")
print("Files saved: best_model_extreme_age.pkl, scaler_extreme_age.pkl, label_encoders_extreme_age.pkl\n")
print("Model Performance Summary:")
print(results_df.drop(columns='Best Estimator'))

In [None]:

import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix

# Grouped bar chart comparing the candidates on each metric.
# Requires `results_df` from the training cell.

# Reshape to long form: one row per (model, metric) pair.
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
df_long = results_df.reset_index().melt(
    id_vars='Model',
    value_vars=metric_columns,
    var_name='Metric',
    value_name='Score',
)

# One bar group per model, one bar per metric, labelled with its score.
fig = px.bar(
    df_long,
    x='Model',
    y='Score',
    color='Metric',
    barmode='group',
    title='Model Performance Comparison',
    text='Score',
    height=600,
    color_discrete_sequence=px.colors.sequential.Blues[2:],
)

layout_options = dict(
    xaxis_title='Model',
    yaxis_title='Score',
    legend_title='Metric',
    font=dict(size=14),
    plot_bgcolor='white',
    yaxis=dict(showgrid=True, gridcolor='lightgray'),
    bargap=0.4,
)
fig.update_layout(**layout_options)

# Show each score with two decimals above its bar.
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig.show()

In [None]:
# Confusion matrix for the tuned Random Forest on the test set.
best_rf_model = results["Random Forest"]["Best Estimator"]
# The estimator was fitted on scaled features, so it must be given the scaled
# test features (X_test_scaled), not the raw X_test.
y_pred_rf = best_rf_model.predict(X_test_scaled)

# Compute confusion matrix (rows = actual, columns = predicted) and express
# each cell as a percentage of all test samples.
cm = confusion_matrix(y_test, y_pred_rf)
cm_percent = cm / cm.sum() * 100

# Cell text: label, raw count and percentage, laid out like the matrix itself.
labels = [["TN", "FP"], ["FN", "TP"]]
annotations = [[f"{label}<br>{value}<br>({percent:.1f}%)"
                for label, value, percent in zip(row_l, row_v, row_p)]
               for row_l, row_v, row_p in zip(labels, cm, cm_percent)]

# Create heatmap for confusion matrix
fig_cm = go.Figure(data=go.Heatmap(
    z=cm,
    x=["Predicted Negative", "Predicted Positive"],
    y=["Actual Negative", "Actual Positive"],
    text=annotations,
    texttemplate="%{text}",
    colorscale="Blues",
    showscale=True,
    hoverinfo="skip"
))

fig_cm.update_layout(
    title="Confusion Matrix for Random Forest",
    xaxis_title="Predicted Label",
    yaxis_title="Actual Label",
    font=dict(size=18),
    width=800,
    height=550,
    margin=dict(t=80, l=100, r=20, b=80)
)

fig_cm.show()