In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, balanced_accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# 1. Load the CSV file
df = pd.read_csv('nigfl.csv', index_col=0)

# List of columns to be treated as categorical (factors)
categorical_columns = ['television', 'motorbike', 'bicycle', 'radio', 'computer', 'telephone', 'hle', 'hospital_1m', 'hosp_adm', 'disability', 'agrishort', 'job7d', 'urban', 'loan', 'refrigerator', 'home_ownership', 'toilet', 'drinking_water_rainy']

# Convert specified columns to categorical dtype
df[categorical_columns] = df[categorical_columns].astype('category')

# 2. Preprocess the data
X = df.drop(columns=['folate_mcg'])  # Features
y = df['folate_mcg']  # Target variable


# 3. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Calculate class weights to handle class imbalance
class_weights = y_train.value_counts(normalize=True).to_dict()
class_weights = {0: 1.0, 1: class_weights[0] / class_weights[1]}  # Custom class weights (can be omitted as 'balanced' is used)

# 5. Create preprocessing steps
# For numerical data: impute missing values and scale
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = X.select_dtypes(include=['category']).columns.tolist()

numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# For categorical data: impute missing values and encode
categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Updated parameter
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_preprocessor, numerical_columns),
        ('cat', categorical_preprocessor, categorical_columns)
    ]
)

# 6. Build and tune models with different regularizations
# Create a pipeline for Logistic Regression
model = LogisticRegression(solver='saga', class_weight='balanced')  # 'saga' supports Elastic Net

# Set up parameter grids for different penalties
param_grids = {
    'l1': {
        'model__penalty': ['l1'],
        'model__C': [0.001, 0.01, 0.1, 1, 10]
    },
    'l2': {
        'model__penalty': ['l2'],
        'model__C': [0.001, 0.01, 0.1, 1, 10]
    },
    'elasticnet': {
        'model__penalty': ['elasticnet'],
        'model__C': [0.001, 0.01, 0.1, 1, 10],
        'model__l1_ratio': [0, 0.5, 1]  # Only for Elastic Net
    }
}

# Define a function to run GridSearchCV with the given parameter grid
def run_grid_search(param_grid):
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='f1', verbose=1)
    grid_search.fit(X_train, y_train)
    return grid_search

# Run GridSearchCV for each penalty type and store results
results = {}
for penalty, param_grid in param_grids.items():
    print(f"Running GridSearchCV for penalty: {penalty}")
    grid_search = run_grid_search(param_grid)
    results[penalty] = grid_search

# Select the best model based on the best score
best_penalty = max(results, key=lambda penalty: results[penalty].best_score_)
best_model = results[best_penalty].best_estimator_

# 7. Predict on the test data
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probability estimates for the positive class

# 8. Evaluate the model
f1 = f1_score(y_test, y_pred)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_proba)

print(f'F1 Score: {f1}')
print(f'Balanced Accuracy: {balanced_acc}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'AUC-ROC: {auc_roc}')

# Optionally, print the best parameters found by GridSearchCV
print(f'Best Parameters: {results[best_penalty].best_params_}')


Running GridSearchCV for penalty: l1
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Running GridSearchCV for penalty: l2
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Running GridSearchCV for penalty: elasticnet
Fitting 5 folds for each of 15 candidates, totalling 75 fits
F1 Score: 0.6868188361093975
Balanced Accuracy: 0.6532287077061716
Precision: 0.6999161777032691
Recall: 0.6742026645135244
AUC-ROC: 0.7187236079896009
Best Parameters: {'model__C': 0.01, 'model__penalty': 'l2'}
