In [1]:
import sys
sys.path.append("../src")

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from preprocess import (
    split_features_target,
    train_test_split,
    scale_features
)

from model import (
    train_logistic_regression,
    train_random_forest
)

In [3]:
# loading the preprocessed data
df = pd.read_csv("../data/processed/admissions_cleaned.csv")
df.head()

Unnamed: 0,Serial_No.,GRE_Score,TOEFL_Score,University_Rating,SOP,LOR,CGPA,Research,Chance_of_Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [6]:
TARGET = "Chance_of_Admit"

df["Admitted"] = (df[TARGET] >= 0.5).astype(int)

X, y = split_features_target(df.drop(columns=[TARGET]), "Admitted")

X.head(), y.value_counts()

(   Serial_No.  GRE_Score  TOEFL_Score  University_Rating  SOP  LOR  CGPA  \
 0           1        337          118                  4  4.5  4.5  9.65   
 1           2        324          107                  4  4.0  4.5  8.87   
 2           3        316          104                  3  3.0  3.5  8.00   
 3           4        322          110                  3  3.5  2.5  8.67   
 4           5        314          103                  2  2.0  3.0  8.21   
 
    Research  
 0         1  
 1         1  
 2         1  
 3         1  
 4         0  ,
 Admitted
 1    463
 0     37
 Name: count, dtype: int64)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [8]:
X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test)

In [9]:
log_model = train_logistic_regression(X_train_scaled, y_train)
rf_model = train_random_forest(X_train, y_train)

## Model Evaluation

In [10]:
from evaluate import (
    evaluate_classification_model,
    plot_confusion_matrix,
    plot_feature_importance
)


In [12]:
log_metrics = evaluate_classification_model(
    log_model,
    X_test_scaled,
    y_test,
    model_name="Logistic Regression"
)

print("--- Logistic Regression model evaluation ---")
log_metrics

--- Logistic Regression model evaluation ---


{'model': 'Logistic Regression',
 'accuracy': 0.936,
 'recall': 0.9829059829059829,
 'f1_score': 0.9663865546218487}

In [14]:
rf_metrics = evaluate_classification_model(
    rf_model,
    X_test,
    y_test,
    model_name="Random Forest"
)

print("--- Random Forest model evaluation ---")
rf_metrics

--- Random Forest model evaluation ---


{'model': 'Random Forest',
 'accuracy': 0.952,
 'recall': 0.9914529914529915,
 'f1_score': 0.9747899159663865}

In [15]:
results = pd.DataFrame([log_metrics, rf_metrics])
results

Unnamed: 0,model,accuracy,recall,f1_score
0,Logistic Regression,0.936,0.982906,0.966387
1,Random Forest,0.952,0.991453,0.97479


In [20]:
plot_confusion_matrix(
    log_model,
    X_test_scaled,
    y_test,
    title="Logistic Regression Confusion Matrix"
)

NameError: name 'Confusion_Matrix' is not defined