In [1]:
import torch

if torch.cuda.is_available():
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")
else:
    print("GPU not available, running on CPU.")


GPU is available: Tesla T4


In [2]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import precision_score, recall_score, log_loss, classification_report

# 3. Load Dataset
df = pd.read_csv('Micro-credit-Data-file.csv')  # Update path if needed

# 4. Preprocessing
df.dropna(inplace=True)  # Drop rows with missing values

# Convert categorical features using LabelEncoder or pd.get_dummies
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Set features and target
X = df.drop('label', axis=1)  # Update if your target column has a different name
y = df['label']

# 5. Train-Test Split and Scaling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [3]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, PassiveAggressiveClassifier, Perceptron
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

models = [
    LogisticRegression(penalty='l2', solver='lbfgs', max_iter=300),
    LogisticRegression(penalty='none', solver='lbfgs', max_iter=300),
    LogisticRegression(penalty='l2', solver='saga', max_iter=300),
    SGDClassifier(loss="log_loss", max_iter=300, tol=1e-3),
    SGDClassifier(loss="hinge", max_iter=300, tol=1e-3),
    SGDClassifier(loss="modified_huber", max_iter=300, tol=1e-3),
    RidgeClassifier(),
    PassiveAggressiveClassifier(max_iter=300),
    Perceptron(max_iter=300),
    LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, max_iter=300),

    GaussianNB(),
    MultinomialNB(),
    ComplementNB(),
    BernoulliNB(),

    DecisionTreeClassifier(max_depth=3),
    DecisionTreeClassifier(max_depth=5),
    DecisionTreeClassifier(max_depth=7),
    RandomForestClassifier(n_estimators=10, max_depth=5),
    RandomForestClassifier(n_estimators=20, max_depth=5),
    ExtraTreesClassifier(n_estimators=10, max_depth=5),
    ExtraTreesClassifier(n_estimators=20, max_depth=5),
    BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=3), n_estimators=10),
    BaggingClassifier(estimator=LogisticRegression(max_iter=300), n_estimators=10),


    KNeighborsClassifier(n_neighbors=3),
    KNeighborsClassifier(n_neighbors=5),
    KNeighborsClassifier(n_neighbors=7),
    NearestCentroid(),
    RadiusNeighborsClassifier(radius=1.0),

    GradientBoostingClassifier(n_estimators=20, max_depth=3),
    GradientBoostingClassifier(n_estimators=30, max_depth=3),
    HistGradientBoostingClassifier(max_iter=20),
    HistGradientBoostingClassifier(max_iter=30),
    XGBClassifier(n_estimators=20, use_label_encoder=False, eval_metric='logloss', verbosity=0),
    LGBMClassifier(n_estimators=20),

    DummyClassifier(strategy='most_frequent'),
    DummyClassifier(strategy='stratified'),
    DummyClassifier(strategy='uniform'),
    DummyClassifier(strategy='constant', constant=1),
    DummyClassifier(strategy='constant', constant=0)
]


In [4]:
results = []

for i, model in enumerate(models, 1):
    model_name = type(model).__name__
    try:
        start = time.time()
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_prob = model.predict_proba(X_test_scaled) if hasattr(model, "predict_proba") else None
        end = time.time()

        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        logloss = log_loss(y_test, y_prob) if y_prob is not None else np.nan
        elapsed = end - start

        results.append({
            'Model': model_name,
            'Precision': precision,
            'Recall': recall,
            'Log Loss': logloss,
            'Time (s)': elapsed
        })

        print(f"{i:02d}. {model_name} ✔️ — Time: {elapsed:.2f}s")

    except Exception as e:
        print(f"{i:02d}. {model_name} ❌ Failed: {e}")
        results.append({
            'Model': model_name,
            'Precision': np.nan,
            'Recall': np.nan,
            'Log Loss': np.nan,
            'Time (s)': np.nan
        })


01. LogisticRegression ✔️ — Time: 3.90s
02. LogisticRegression ❌ Failed: The 'penalty' parameter of LogisticRegression must be a str among {'l1', 'l2', 'elasticnet'} or None. Got 'none' instead.




03. LogisticRegression ✔️ — Time: 41.61s
04. SGDClassifier ✔️ — Time: 0.73s
05. SGDClassifier ✔️ — Time: 1.01s
06. SGDClassifier ✔️ — Time: 2.10s
07. RidgeClassifier ✔️ — Time: 0.12s
08. PassiveAggressiveClassifier ✔️ — Time: 0.50s
09. Perceptron ✔️ — Time: 0.31s




10. LogisticRegression ✔️ — Time: 48.66s
11. GaussianNB ✔️ — Time: 0.15s
12. MultinomialNB ❌ Failed: Negative values in data passed to MultinomialNB (input X).
13. ComplementNB ❌ Failed: Negative values in data passed to ComplementNB (input X).
14. BernoulliNB ✔️ — Time: 0.18s
15. DecisionTreeClassifier ✔️ — Time: 0.95s
16. DecisionTreeClassifier ✔️ — Time: 1.36s
17. DecisionTreeClassifier ✔️ — Time: 1.76s
18. RandomForestClassifier ✔️ — Time: 1.71s
19. RandomForestClassifier ✔️ — Time: 4.29s
20. ExtraTreesClassifier ✔️ — Time: 0.30s
21. ExtraTreesClassifier ✔️ — Time: 0.59s
22. BaggingClassifier ✔️ — Time: 6.23s
23. BaggingClassifier ✔️ — Time: 39.87s
24. KNeighborsClassifier ✔️ — Time: 88.73s
25. KNeighborsClassifier ✔️ — Time: 87.61s
26. KNeighborsClassifier ✔️ — Time: 86.22s
27. NearestCentroid ✔️ — Time: 0.14s




28. RadiusNeighborsClassifier ❌ Failed: No neighbors found for test samples array([    1,     7,    14, ..., 41908, 41910, 41915]), you can try using larger radius, giving a label for outliers, or considering removing them from your dataset.
29. GradientBoostingClassifier ✔️ — Time: 19.61s
30. GradientBoostingClassifier ✔️ — Time: 28.77s
31. HistGradientBoostingClassifier ✔️ — Time: 1.71s
32. HistGradientBoostingClassifier ✔️ — Time: 2.78s
33. XGBClassifier ✔️ — Time: 1.36s




[LightGBM] [Info] Number of positive: 146744, number of negative: 20930
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047344 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6152
[LightGBM] [Info] Number of data points in the train set: 167674, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.875174 -> initscore=1.947506
[LightGBM] [Info] Start training from score 1.947506
34. LGBMClassifier ✔️ — Time: 1.67s
35. DummyClassifier ✔️ — Time: 0.00s
36. DummyClassifier ✔️ — Time: 0.00s
37. DummyClassifier ✔️ — Time: 0.00s
38. DummyClassifier ✔️ — Time: 0.00s
39. DummyClassifier ✔️ — Time: 0.00s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Log Loss', ascending=True).reset_index(drop=True)
results_df.style.background_gradient(cmap='Blues')


Unnamed: 0,Model,Precision,Recall,Log Loss,Time (s)
0,XGBClassifier,0.934732,0.977485,0.19607,1.359246
1,HistGradientBoostingClassifier,0.933548,0.979148,0.199815,2.780254
2,HistGradientBoostingClassifier,0.928338,0.983046,0.210503,1.705726
3,LGBMClassifier,0.928362,0.984109,0.210669,1.673996
4,DecisionTreeClassifier,0.922815,0.981247,0.220118,1.763219
5,GradientBoostingClassifier,0.914527,0.989261,0.227846,28.772829
6,DecisionTreeClassifier,0.918772,0.984136,0.237053,1.36305
7,GradientBoostingClassifier,0.902793,0.995639,0.241779,19.609189
8,RandomForestClassifier,0.904737,0.99433,0.254041,4.294132
9,RandomForestClassifier,0.901528,0.995448,0.25669,1.707813


In [7]:
results_df.to_csv('model_comparison_results.csv', index=False)


In [8]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.preprocessing import MinMaxScaler

# Prepare data for Naive Bayes
minmax_scaler = MinMaxScaler()
X_train_nb = minmax_scaler.fit_transform(X_train)
X_test_nb = minmax_scaler.transform(X_test)

# Define models
nb_models = [
    ('MultinomialNB', MultinomialNB()),
    ('ComplementNB', ComplementNB())
]

# Evaluate them
for name, model in nb_models:
    try:
        start = time.time()
        model.fit(X_train_nb, y_train)
        y_pred = model.predict(X_test_nb)
        y_prob = model.predict_proba(X_test_nb)
        end = time.time()

        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        logloss = log_loss(y_test, y_prob)
        elapsed = end - start

        results_df.loc[len(results_df)] = [name, precision, recall, logloss, elapsed]
        print(f"{name} ✔️ Reprocessed — Time: {elapsed:.2f}s")

    except Exception as e:
        print(f"{name} ❌ Error after fix: {e}")


MultinomialNB ✔️ Reprocessed — Time: 0.05s
ComplementNB ✔️ Reprocessed — Time: 0.08s


In [None]:
from sklearn.neighbors import RadiusNeighborsClassifier

try:
    model_name = 'RadiusNeighborsClassifier'
    start = time.time()
    radius_model = RadiusNeighborsClassifier(radius=10.0)
    radius_model.fit(X_train_scaled, y_train)
    y_pred = radius_model.predict(X_test_scaled)
    y_prob = radius_model.predict_proba(X_test_scaled)
    end = time.time()

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    logloss = log_loss(y_test, y_prob)
    elapsed = end - start

    results_df.loc[len(results_df)] = [model_name, precision, recall, logloss, elapsed]
    print(f"{model_name} ✔️ Reprocessed — Time: {elapsed:.2f}s")

except Exception as e:
    print(f"{model_name} ❌ Error after fix: {e}")


In [1]:
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Log Loss', ascending=True).reset_index(drop=True)
results_df.style.background_gradient(cmap='Blues')


NameError: name 'pd' is not defined

In [2]:
import joblib

# Replace with your actual trained model variable
final_model = XGBClassifier()
final_model.fit(X_train, y_train)

# Save the trained model
joblib.dump(final_model, 'final_model.pkl')

# If you used a scaler
joblib.dump(scaler, 'scaler.pkl')


NameError: name 'XGBClassifier' is not defined

In [3]:
import pandas as pd

results_df = pd.read_csv('model_comparison_results.csv')


In [4]:
# Example: Select model with highest recall
best_model = results_df.loc[results_df['Recall'].idxmax()]

print("Best model based on Recall:")
print(best_model)


Best model based on Recall:
Model        ExtraTreesClassifier
Precision                0.875188
Recall                        1.0
Log Loss                 0.331129
Time (s)                 0.593977
Name: 17, dtype: object


In [6]:
import pandas as pd

# Load the CSV file containing model evaluation metrics
results_df = pd.read_csv('model_comparison_results.csv')

# Drop rows where Precision is NaN (if any)
results_df = results_df.dropna(subset=['Precision'])

# Find the row with the highest Precision
best_model_row = results_df.loc[results_df['Precision'].idxmax()]

print("Best model based on Precision:")
print(best_model_row)


Best model based on Precision:
Model        GaussianNB
Precision      0.969805
Recall         0.546297
Log Loss        3.44679
Time (s)       0.153859
Name: 26, dtype: object


In [15]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load your results CSV
results_df = pd.read_csv('model_comparison_results.csv')

# Drop rows with NaN in critical columns
results_df = results_df.dropna(subset=['Precision', 'Recall', ])

# Min-max scale Precision, Recall, and LogLoss (LogLoss is to be minimized, so invert it)
scaler = MinMaxScaler()

# Scale Precision and Recall normally
results_df[['Precision_scaled', 'Recall_scaled']] = scaler.fit_transform(results_df[['Precision', 'Recall']])



# Define weights (you can adjust these as per your priority)
w_precision = 0.4
w_recall = 0.4


# Calculate combined score
results_df['Combined_Score'] = (
    w_precision * results_df['Precision_scaled'] +
    w_recall * results_df['Recall_scaled']

)

# Select the best model
best_model_row = results_df.loc[results_df['Combined_Score'].idxmax()]

print("Best model balancing Precision, Recall, and Log Loss:")
print(best_model_row)

# Get model name
best_model_name = best_model_row['Model']  # Adjust if your column name is different
print(f"Selected model: {best_model_name}")


Best model balancing Precision, Recall, and Log Loss:
Model               HistGradientBoostingClassifier
Precision                                 0.933548
Recall                                    0.979148
Log Loss                                  0.199815
Time (s)                                  2.780254
Precision_scaled                          0.962614
Recall_scaled                             0.979148
Combined_Score                            0.776705
Name: 1, dtype: object
Selected model: HistGradientBoostingClassifier


In [17]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load your CSV results
results_df = pd.read_csv('model_comparison_results.csv')

# Drop rows with NaN in critical columns
results_df = results_df.dropna(subset=['Precision', 'Recall', 'Time'])

# Min-max scale Precision and Recall normally
scaler = MinMaxScaler()
results_df[['Precision_scaled', 'Recall_scaled']] = scaler.fit_transform(results_df[['Precision', 'Recall']])

# Scale time and invert because lower time is better
time_scaled = scaler.fit_transform(results_df[['Time']])
results_df['Time_scaled'] = 1 - time_scaled  # invert to make higher better

# Weights for each metric - adjust if needed
w_precision = 0.4
w_recall = 0.4
w_time = 0.2

# Combined score calculation
results_df['Combined_Score'] = (
    w_precision * results_df['Precision_scaled'] +
    w_recall * results_df['Recall_scaled'] +
    w_time * results_df['Time_scaled']
)

# Find best model based on combined score
best_model_row = results_df.loc[results_df['Combined_Score'].idxmax()]

print("Best model balancing Precision, Recall, and Execution Time:")
print(best_model_row)

# Model name
best_model_name = best_model_row['Model']
print(f"Selected model: {best_model_name}")


KeyError: ['Time']

In [19]:
import pandas as pd

# Replace 'results.csv' with your actual file path
df = pd.read_csv('model_comparison_results.csv')


In [20]:
print(df.head())


                            Model  Precision    Recall  Log Loss  Time (s)
0                   XGBClassifier   0.934732  0.977485  0.196070  1.359246
1  HistGradientBoostingClassifier   0.933548  0.979148  0.199815  2.780254
2  HistGradientBoostingClassifier   0.928338  0.983046  0.210503  1.705726
3                  LGBMClassifier   0.928362  0.984109  0.210669  1.673996
4          DecisionTreeClassifier   0.922815  0.981247  0.220118  1.763219


In [21]:
df_sorted = df.sort_values(by=['Precision', 'Recall', 'PredictionTime'], ascending=[False, False, True])


KeyError: 'PredictionTime'

In [23]:
import pandas as pd

# Load the CSV file (replace 'results.csv' with your actual file path)
df = pd.read_csv('model_comparison_results.csv')

# Display first few rows to verify
print(df.head())

# Sort by Precision (desc), Recall (desc), and Time (asc)
df_sorted = df.sort_values(by=['Precision', 'Recall', 'Time (s)'], ascending=[False, False, True])

# Get the best model row
best_model_row = df_sorted.iloc[0]
print("Best model based on Precision, Recall and Time:")
print(best_model_row)

# Extract best model name
best_model_name = best_model_row['Model']

print(f"Selected model: {best_model_name}")


                            Model  Precision    Recall  Log Loss  Time (s)
0                   XGBClassifier   0.934732  0.977485  0.196070  1.359246
1  HistGradientBoostingClassifier   0.933548  0.979148  0.199815  2.780254
2  HistGradientBoostingClassifier   0.928338  0.983046  0.210503  1.705726
3                  LGBMClassifier   0.928362  0.984109  0.210669  1.673996
4          DecisionTreeClassifier   0.922815  0.981247  0.220118  1.763219
Best model based on Precision, Recall and Time:
Model        GaussianNB
Precision      0.969805
Recall         0.546297
Log Loss        3.44679
Time (s)       0.153859
Name: 26, dtype: object
Selected model: GaussianNB


In [24]:
import joblib

model_filename = f"{best_model_name}.pkl"
best_model = joblib.load(model_filename)


FileNotFoundError: [Errno 2] No such file or directory: 'GaussianNB.pkl'

In [25]:
import joblib

model_filename = f"{best_model_name}.pkl"
best_model = joblib.load(model_filename)


FileNotFoundError: [Errno 2] No such file or directory: 'GaussianNB.pkl'

In [26]:
import joblib
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(X_train, y_train)

# Save the model
joblib.dump(model, 'GaussianNB.pkl')
print("Model saved as GaussianNB.pkl")


NameError: name 'X_train' is not defined

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import joblib

# Example: Load your dataset CSV again
data = pd.read_csv('Micro-credit-Data-file.csv')

# Replace 'features' and 'target' with your actual columns
X = data.drop('target_column', axis=1)
y = data['target_column']

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = GaussianNB()
model.fit(X_train, y_train)

# Save model
joblib.dump(model, 'GaussianNB.pkl')
print("Model saved!")


KeyError: "['target_column'] not found in axis"

In [29]:
from sklearn.naive_bayes import GaussianNB
import joblib

# Assuming your earlier code has run and you have these:
# X_train_scaled, y_train, X_test_scaled, y_test

model = GaussianNB()
model.fit(X_train_scaled, y_train)

# Save the model for later use
joblib.dump(model, 'GaussianNB.pkl')

# Predict and evaluate
y_pred = model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))

# If you want to load the model later:
# loaded_model = joblib.load('GaussianNB.pkl')


NameError: name 'X_train_scaled' is not defined

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load Dataset
df = pd.read_csv('Micro-credit-Data-file.csv')  # Adjust path if needed

# Preprocessing
df.dropna(inplace=True)

# Convert categorical columns
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Features and target
X = df.drop('label', axis=1)
y = df['label']

# Train-test split and scaling
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [31]:
from sklearn.naive_bayes import GaussianNB
import joblib

model = GaussianNB()
model.fit(X_train_scaled, y_train)

joblib.dump(model, 'GaussianNB.pkl')

y_pred = model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))


NameError: name 'classification_report' is not defined

In [32]:
from sklearn.metrics import classification_report



In [33]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
import joblib

model = GaussianNB()
model.fit(X_train_scaled, y_train)

joblib.dump(model, 'GaussianNB.pkl')

y_pred = model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.22      0.88      0.35      5232
           1       0.97      0.55      0.70     36687

    accuracy                           0.59     41919
   macro avg       0.59      0.71      0.52     41919
weighted avg       0.88      0.59      0.66     41919



In [34]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, log_loss
import joblib

# Initialize model
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Fit model on scaled training data
model.fit(X_train_scaled, y_train)

# Predict on test data
y_pred = model.predict(X_test_scaled)

# Predict probabilities for log loss
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

# Print classification report
print("XGBClassifier Results:")
print(classification_report(y_test, y_pred))

# Print precision, recall, log loss
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("Log Loss:", log_loss(y_test, y_pred_proba))

# Save the model
joblib.dump(model, 'XGBClassifier.pkl')
print("Model saved as XGBClassifier.pkl")


Parameters: { "use_label_encoder" } are not used.



XGBClassifier Results:
              precision    recall  f1-score   support

           0       0.75      0.57      0.64      5232
           1       0.94      0.97      0.96     36687

    accuracy                           0.92     41919
   macro avg       0.84      0.77      0.80     41919
weighted avg       0.92      0.92      0.92     41919

Precision: 0.9401833219196123
Recall: 0.9729604492054407
Log Loss: 0.18963605712718073
Model saved as XGBClassifier.pkl


In [35]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
from sklearn.metrics import precision_score, recall_score, log_loss
import time

results = []

# QDA
start = time.time()
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train_scaled, y_train)
y_pred = qda.predict(X_test_scaled)
y_prob = qda.predict_proba(X_test_scaled)
results.append({
    "Model": "QuadraticDiscriminantAnalysis",
    "Precision": precision_score(y_test, y_pred),
    "Recall": recall_score(y_test, y_pred),
    "Log Loss": log_loss(y_test, y_prob),
    "Time (s)": time.time() - start
})

# LDA
start = time.time()
lda = LinearDiscriminantAnalysis()
lda.fit(X_train_scaled, y_train)
y_pred = lda.predict(X_test_scaled)
y_prob = lda.predict_proba(X_test_scaled)
results.append({
    "Model": "LinearDiscriminantAnalysis",
    "Precision": precision_score(y_test, y_pred),
    "Recall": recall_score(y_test, y_pred),
    "Log Loss": log_loss(y_test, y_prob),
    "Time (s)": time.time() - start
})

# DummyClassifier (stratified)
start = time.time()
dummy = DummyClassifier(strategy="stratified")
dummy.fit(X_train_scaled, y_train)
y_pred = dummy.predict(X_test_scaled)
y_prob = dummy.predict_proba(X_test_scaled)
results.append({
    "Model": "DummyClassifier-stratified",
    "Precision": precision_score(y_test, y_pred),
    "Recall": recall_score(y_test, y_pred),
    "Log Loss": log_loss(y_test, y_prob),
    "Time (s)": time.time() - start
})

# Calibrated Classifier (with RidgeClassifier)
start = time.time()
ridge = RidgeClassifier()
calib = CalibratedClassifierCV(ridge)
calib.fit(X_train_scaled, y_train)
y_pred = calib.predict(X_test_scaled)
y_prob = calib.predict_proba(X_test_scaled)
results.append({
    "Model": "Calibrated-RidgeClassifier",
    "Precision": precision_score(y_test, y_pred),
    "Recall": recall_score(y_test, y_pred),
    "Log Loss": log_loss(y_test, y_prob),
    "Time (s)": time.time() - start
})

# GaussianNB (Binarized)
from sklearn.preprocessing import Binarizer
start = time.time()
binarizer = Binarizer().fit(X_train_scaled)
X_train_bin = binarizer.transform(X_train_scaled)
X_test_bin = binarizer.transform(X_test_scaled)
gnb = GaussianNB()
gnb.fit(X_train_bin, y_train)
y_pred = gnb.predict(X_test_bin)
y_prob = gnb.predict_proba(X_test_bin)
results.append({
    "Model": "GaussianNB-Binarized",
    "Precision": precision_score(y_test, y_pred),
    "Recall": recall_score(y_test, y_pred),
    "Log Loss": log_loss(y_test, y_prob),
    "Time (s)": time.time() - start
})

# Create DataFrame of new results
import pandas as pd
new_models_df = pd.DataFrame(results)
print(new_models_df)




                           Model  Precision    Recall  Log Loss  Time (s)
0  QuadraticDiscriminantAnalysis   0.977794  0.446480  5.611335  1.330966
1     LinearDiscriminantAnalysis   0.875418  0.999428  0.312908  1.819527
2     DummyClassifier-stratified   0.875521  0.876141  7.913112  0.055706
3     Calibrated-RidgeClassifier   0.877495  0.993404  0.311834  3.442340
4           GaussianNB-Binarized   0.961228  0.669011  2.738266  0.401710


In [36]:
final_df = pd.concat([existing_df, new_models_df], ignore_index=True)
final_df.to_csv('updated_models_metrics.csv', index=False)


NameError: name 'existing_df' is not defined

In [38]:
import pandas as pd

# Load the existing model results CSV
existing_df = pd.read_csv('model_comparison_results.csv')  # Replace with your actual filename if different

# Now concatenate with the new models
final_df = pd.concat([existing_df, new_models_df], ignore_index=True)

# Save the updated list of models
final_df.to_csv('updated_models_metrics.csv', index=False)

print("✅ Model list updated and saved as 'updated_models_metrics.csv'")


✅ Model list updated and saved as 'updated_models_metrics.csv'


In [39]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import precision_score, recall_score, log_loss, classification_report
import time

# Start timing
start = time.time()

# Initialize and train the model
qda_model = QuadraticDiscriminantAnalysis()
qda_model.fit(X_train_scaled, y_train)

# Predict
y_pred = qda_model.predict(X_test_scaled)
y_proba = qda_model.predict_proba(X_test_scaled)

# Evaluate
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
loss = log_loss(y_test, y_proba)
duration = time.time() - start

print("QDA Results:")
print(classification_report(y_test, y_pred))
print("Precision:", precision)
print("Recall:", recall)
print("Log Loss:", loss)
print("Time Taken:", duration)

# Add to DataFrame if you're collecting model results
new_model_result = pd.DataFrame([{
    'Model': 'QuadraticDiscriminantAnalysis',
    'Precision': precision,
    'Recall': recall,
    'Log Loss': loss,
    'Time (s)': duration
}])

# If you already have a DataFrame of results (like existing_df or final_df):
# final_df = pd.concat([final_df, new_model_result], ignore_index=True)




QDA Results:
              precision    recall  f1-score   support

           0       0.19      0.93      0.32      5232
           1       0.98      0.45      0.61     36687

    accuracy                           0.51     41919
   macro avg       0.59      0.69      0.47     41919
weighted avg       0.88      0.51      0.58     41919

Precision: 0.9777936962750716
Recall: 0.44647967945048656
Log Loss: 5.611335371070336
Time Taken: 0.9648327827453613




In [40]:
import pandas as pd

# Load existing results CSV
final_df = pd.read_csv('updated_models_metrics.csv')  # Adjust the filename if different


In [41]:
# Replace these with actual values from your QDA model run
qda_result = pd.DataFrame([{
    'Model': 'QuadraticDiscriminantAnalysis',
    'Precision': precision,  # from your QDA evaluation
    'Recall': recall,
    'Log Loss': loss,
    'Time (s)': duration
}])

# Concatenate and save
final_df = pd.concat([final_df, qda_result], ignore_index=True)
final_df.to_csv('updated_models_metrics.csv', index=False)


In [42]:
print(final_df.tail())  # Shows the last few rows including the new entry


                            Model  Precision    Recall  Log Loss  Time (s)
40     LinearDiscriminantAnalysis   0.875418  0.999428  0.312908  1.819527
41     DummyClassifier-stratified   0.875521  0.876141  7.913112  0.055706
42     Calibrated-RidgeClassifier   0.877495  0.993404  0.311834  3.442340
43           GaussianNB-Binarized   0.961228  0.669011  2.738266  0.401710
44  QuadraticDiscriminantAnalysis   0.977794  0.446480  5.611335  0.964833
