In [10]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix

# Load the data
df_student = pd.read_csv('data/cleaned_data.csv')  # Adjust the path as necessary

# Drop 'Nationality' and 'International' columns if needed
df_student = df_student.drop(columns=['Nationality', 'International'])

display(df_student)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression
pipeline_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(max_iter=2000, random_state=42))
])

param_grid_lr = {
    'logistic__C': [0.01, 0.1, 1, 10, 100],
    'logistic__solver': ['liblinear', 'lbfgs', 'newton-cg', 'saga']
}

grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_lr.fit(X_train, y_train)
best_model_lr = grid_search_lr.best_estimator_

# Train Random Forest
pipeline_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('random_forest', RandomForestClassifier(random_state=42))
])

param_grid_rf = {
    'random_forest__n_estimators': [50, 100],
    'random_forest__max_depth': [None, 10, 20],
    'random_forest__min_samples_split': [2, 5],
    'random_forest__min_samples_leaf': [1, 2],
}

grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)
best_model_rf = grid_search_rf.best_estimator_

# Train SVC
pipeline_svc = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(probability=True, random_state=42))
])

param_grid_svc = {
    'svc__C': [0.1, 1, 10],
    'svc__gamma': [1, 0.1, 0.01],
    'svc__kernel': ['rbf']
}

grid_search_svc = GridSearchCV(pipeline_svc, param_grid_svc, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_svc.fit(X_train, y_train)
best_model_svc = grid_search_svc.best_estimator_

# Train XGBoost
pipeline_xgb = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier(eval_metric='mlogloss', use_label_encoder=False))
])

param_grid_xgb = {
    'xgb__n_estimators': [50, 100, 200],
    'xgb__learning_rate': [0.01, 0.1, 0.2],
    'xgb__max_depth': [3, 5, 7],
    'xgb__subsample': [0.8, 1]
}

grid_search_xgb = GridSearchCV(pipeline_xgb, param_grid_xgb, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_xgb.fit(X_train, y_train)
best_model_xgb = grid_search_xgb.best_estimator_

# Evaluate individual models
models = {
    'Logistic Regression': best_model_lr,
    'Random Forest': best_model_rf,
    'SVC': best_model_svc,
    'XGBoost': best_model_xgb
}

f1_scores_train = {}
f1_scores_test = {}

for model_name, model in models.items():
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    f1_scores_train[model_name] = f1_score(y_train, y_train_pred)
    f1_scores_test[model_name] = f1_score(y_test, y_test_pred)
    print(f"{model_name} F1 Score on Training Data: {f1_scores_train[model_name]:.4f}")
    print(f"{model_name} F1 Score on Test Data: {f1_scores_test[model_name]:.4f}")

# Visualization of model comparison
def barplot_models(model_scores, title):
    model_names = list(model_scores.keys())
    f1_scores = list(model_scores.values())
    plt.figure(figsize=(8, 6))
    sns.set_palette("YlGnBu")
    bars = sns.barplot(x=model_names, y=f1_scores)
    plt.title(title)
    plt.xlabel("Model")
    plt.ylabel("F1 Score")
    plt.ylim(0, 1)
    for bar, f1_score in zip(bars.patches, f1_scores):
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02, f"{f1_score:.2f}", ha="center", va="bottom")
    plt.show()

barplot_models(model_scores=f1_scores_train, title="Model Comparison Plot Training")
barplot_models(model_scores=f1_scores_test, title="Model Comparison Plot Test")

Unnamed: 0,Marital_Status,Application_mode,Application_order,Course,Daytime_evening_attendance,Previous_qualification,Previous_qualification_(grade),Mothers_qualification,Fathers_qualification,Mothers_occupation,...,Curricular_units_2nd_sem_credited,Curricular_units_2nd_sem_enrolled,Curricular_units_2nd_sem_evaluations,Curricular_units_2nd_sem_approved,Curricular_units_2nd_sem_grade,Curricular_units_2nd_sem_without_evaluations,Unemployment_rate,Inflation_rate,GDP,Target
0,1,17,5,171,1,1,122.0,19,12,5,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,3,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,37,37,9,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,38,37,5,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,37,38,9,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,5,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,Graduate
4420,1,1,2,9773,1,1,120.0,1,1,9,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,Dropout
4421,1,1,1,9500,1,1,154.0,37,37,9,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,Dropout
4422,1,1,1,9147,1,1,180.0,37,37,7,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,Graduate


NameError: name 'X' is not defined

In [None]:
from sklearn.ensemble import StackingClassifier

# Define base learners
base_learners = [
    ('lr', best_model_lr),
    ('rf', best_model_rf),
    ('svc', best_model_svc),
    ('xgb', best_model_xgb)
]

# Define meta-learner
meta_learner = LogisticRegression()

# Create stacking classifier
stacking_clf = StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=3)

# Fit stacking classifier
stacking_clf.fit(X_train, y_train)

# Evaluate the stacking classifier
y_train_pred_stacking = stacking_clf.predict(X_train)
y_test_pred_stacking = stacking_clf.predict(X_test)

# Calculate F1 scores
f1_score_train_stacking = f1_score(y_train, y_train_pred_stacking)
f1_score_test_stacking = f1_score(y_test, y_test_pred_stacking)

print(f"Stacking Classifier F1 Score on Training Data: {f1_score_train_stacking:.4f}")
print(f"Stacking Classifier F1 Score on Test Data: {f1_score_test_stacking:.4f}")

In [None]:
from sklearn.ensemble import VotingClassifier

# Define the VotingClassifier
voting_clf = VotingClassifier(estimators=[
    ('lr', best_model_lr),
    ('rf', best_model_rf),
    ('svc', best_model_svc),
    ('xgb', best_model_xgb)
], voting='soft')

voting_clf.fit(X_train, y_train)

# Evaluate the voting classifier
y_train_pred_voting = voting_clf.predict(X_train)
y_test_pred_voting = voting_clf.predict(X_test)

# Calculate F1 scores
f1_score_train_voting = f1_score(y_train, y_train_pred_voting)
f1_score_test_voting = f1_score(y_test, y_test_pred_voting)

print(f"Voting Classifier F1 Score on Training Data: {f1_score_train_voting:.4f}")
print(f"Voting Classifier F1 Score on Test Data: {f1_score_test_voting:.4f}")