In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score

train = pd.read_csv('/Users/vaibhavrangan/Downloads/Stat_303-2/Datasets/social_network_ads_train.csv')
test = pd.read_csv('/Users/vaibhavrangan/Downloads/Stat_303-2/Datasets/social_network_ads_test.csv')

In [2]:
# convert gender to numeric and drop userID
train['Gender'] = train['Gender'].map({'Female': 0, 'Male': 1})
test['Gender'] = test['Gender'].map({'Female': 0, 'Male': 1})

train = train.drop('User ID', axis=1)
test = test.drop('User ID', axis=1)

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score  
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


train["Age_Bin"] = pd.cut(train["Age"], bins=5, labels=False)
test["Age_Bin"] = pd.cut(test["Age"], bins=5, labels=False)
train["Salary_Bin"] = pd.cut(train["EstimatedSalary"], bins=5, labels=False)
test["Salary_Bin"] = pd.cut(test["EstimatedSalary"], bins=5, labels=False)

categorical_features = ["Gender"]
numeric_features = ["Age", "EstimatedSalary", "Age_Bin", "Salary_Bin"]

train["AgeBin_SalaryBin"] = train["Age_Bin"] * train["Salary_Bin"]
test["AgeBin_SalaryBin"] = test["Age_Bin"] * test["Salary_Bin"]

# Add interaction term to feature list
numeric_features.append("AgeBin_SalaryBin")

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(drop='first'), categorical_features)  
])

X_train_transformed = preprocessor.fit_transform(train[numeric_features + categorical_features])
X_test_transformed = preprocessor.transform(test[numeric_features + categorical_features])

y_train = train["Purchased"]
y_test = test["Purchased"]

model = LogisticRegression(penalty=None, solver='newton-cg')
model.fit(X_train_transformed, y_train)

y_pred = model.predict(X_test_transformed)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.93


In [None]:
from sklearn.model_selection import TunedThresholdClassifierCV
from sklearn.model_selection import StratifiedKFold

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


# Define TunedThresholdClassifierCV
tuned_clf = TunedThresholdClassifierCV(
    estimator=model,
    scoring="accuracy",  
    cv=cv
)

# Fit the model
tuned_clf.fit(X_train_transformed, y_train)


# Print the best threshold and the corresponding score
print(f"Best threshold: {tuned_clf.best_threshold_:.2f}")
print(f"Best accuracy: {tuned_clf.best_score_:.4f}")

 threshold  accuracy
       0.1  0.806667
       0.2  0.866667
       0.3  0.900000
       0.4  0.893333
       0.5  0.890000
       0.6  0.876667
       0.7  0.846667
       0.8  0.826667


In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Define Preprocessor
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# Define Parameter Grid
param_grid = [
    {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100],
     'logisticregression__penalty': ['l1', 'l2'],
     'logisticregression__solver': ['liblinear']},

    {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100],
     'logisticregression__penalty': ['elasticnet'],
     'logisticregression__solver': ['saga'],
     'logisticregression__l1_ratio': [0.1, 0.5, 0.9]},

    {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100],
     'logisticregression__penalty': ['none'],
     'logisticregression__solver': ['saga']}
]

# Define Pipeline
pipeline = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# Fit the Model
grid_search.fit(train[numeric_features + categorical_features], train["Purchased"])

# Get Best Model and Parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Predict Using Best Model
y_pred_best = best_model.predict(test[numeric_features + categorical_features])

# Compute Accuracy
best_accuracy = accuracy_score(test["Purchased"], y_pred_best)

# Print Results
print("Best Parameters:", best_params)
print("Best Tuned Model Accuracy:", best_accuracy)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters: {'logisticregression__C': 100, 'logisticregression__penalty': 'l1', 'logisticregression__solver': 'liblinear'}
Best Tuned Model Accuracy: 0.93


30 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/vaibhavrangan/Downloads/Stat_303-2/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/vaibhavrangan/Downloads/Stat_303-2/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/vaibhavrangan/Downloads/Stat_303-2/.venv/lib/python3.12/site-packages/sklearn/pipeline.py", line 660, in fit
    self._final_estimator.fit(Xt,

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
import numpy as np

# Load dataset
X, y = load_iris(return_X_y=True)

# Define pipeline
pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

# Define hyperparameter grid
param_distributions = {
    'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'logisticregression__penalty': ['l1', 'l2', 'elasticnet'],
    'logisticregression__solver': ['liblinear', 'saga'],
    'logisticregression__l1_ratio': [0.1, 0.5, 0.9]  # Only used for 'elasticnet'
}

# Create RandomizedSearchCV with n_iter=20
random_search = RandomizedSearchCV(
    pipeline, param_distributions, n_iter=20, cv=5, 
    scoring='accuracy', n_jobs=-1, verbose=1, random_state=42
)

# Fit the model
random_search.fit(X, y)

# Print best parameters
print("Best Parameters:", random_search.best_params_)
print("Best Model Accuracy:", random_search.best_score_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'logisticregression__solver': 'saga', 'logisticregression__penalty': 'elasticnet', 'logisticregression__l1_ratio': 0.1, 'logisticregression__C': 10}
Best Model Accuracy: 0.9733333333333334


20 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/vaibhavrangan/Downloads/Stat_303-2/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/vaibhavrangan/Downloads/Stat_303-2/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/vaibhavrangan/Downloads/Stat_303-2/.venv/lib/python3.12/site-packages/sklearn/pipeline.py", line 660, in fit
    self._final_estimator.fit(Xt