In [1]:
# 1. Import pandas, NumPy, and the scikit-learn utilities used in this notebook
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression



# 2. Load the saved train/test splits
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

# 3. read_csv always returns a DataFrame; collapse the label frames to Series.
#    Squeeze only the column axis: a bare squeeze() also collapses the row axis
#    and would hand back a scalar if a label file ever held a single row.
y_train = y_train.squeeze("columns")
y_test = y_test.squeeze("columns")

# 4. Verify shapes
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# Features and labels must line up row-for-row before any model is fitted.
assert len(X_train) == len(y_train), "X_train and y_train have different row counts"
assert len(X_test) == len(y_test), "X_test and y_test have different row counts"


(22500, 23)
(7500, 23)
(22500,)
(7500,)


In [2]:
# Q18: fit an unregularized (penalty=None) logistic regression on the training split
# and report the learned weights and intercept.
logreg = linear_model.LogisticRegression(penalty=None)
logreg.fit(X_train, y_train)
w_logreg = logreg.coef_
intercept_logreg = logreg.intercept_
print('Q18 - w_logreg: ', w_logreg)
print('Q18 - intercept_logreg: ', intercept_logreg)

# Hard class predictions on the held-out split; reused for the Q20 metrics below.
y_hat_logreg = logreg.predict(X_test)

# Q19: accuracy on the TEST split -- score() is evaluated on (X_test, y_test),
# so the label below must say "test", not "training".
acc_logreg = logreg.score(X_test, y_test)

print("Q19 - Accuracy on test data = %f" % acc_logreg)

# Q20: precision, recall and F-score of the test predictions.
# average='binary' reports a single value per metric instead of one per class.
# NOTE(review): with an averaged result scikit-learn documents the support value
# as None, so `sup` is kept only to unpack the 4-tuple -- confirm nothing later uses it.
prec, recal, fscore, sup = precision_recall_fscore_support(y_test, y_hat_logreg, average='binary')

print('Q20 - prec: ', prec)
print('Q20 - recal: ', recal)
print('Q20 - fscore: ', fscore)

Q18 - w_logreg:  [[-0.10278017 -0.05212978 -0.07538191 -0.0836807   0.07350815  0.65359134
   0.10919094  0.090552    0.03963627  0.02866674  0.01787741 -0.406916
   0.16856665  0.14502597 -0.06159572  0.00657866  0.02513549 -0.17581801
  -0.23733501 -0.03878504 -0.04188511 -0.0310234  -0.05065429]]
Q18 - intercept_logreg:  [-1.45570814]
Q19 - Accuracy on training data = 0.808800
Q20 - prec:  0.6714031971580817
Q20 - recal:  0.23232944068838352
Q20 - fscore:  0.3452054794520548


In [None]:
def run_logistic_regression(X_train, X_test, y_train, y_test,
                            regularization_value=1.0,
                            feature_transform=None,
                            degree=2,
                            n_components=10):
    """
    Fit an L2-regularized logistic regression and score it on the test split.

    An optional feature transformation is fitted on the training features only
    and then applied to both splits before the model is trained.

    Parameters:
    - X_train, X_test: Training and testing feature matrices (must expose
      ``.shape`` when feature_transform='pca').
    - y_train, y_test: Training and testing labels.
    - regularization_value: Inverse of regularization strength (passed as the
      ``C`` parameter of LogisticRegression).
    - feature_transform: None (use features as-is), 'pca', or 'polynomial'.
    - degree: Degree for polynomial features; only used when
      feature_transform='polynomial'.
    - n_components: Upper bound on the number of PCA components; only used when
      feature_transform='pca'. The effective value is capped at the number of
      input features. Defaults to 10.

    Returns:
    A tuple ``(acc_logreg, prec, recall, fscore)``:
    - acc_logreg: Accuracy on test set
    - prec: Precision (average='binary')
    - recall: Recall (average='binary')
    - fscore: F1 score (average='binary')

    Raises:
    - ValueError: if feature_transform is not None, 'pca', or 'polynomial'.
    """
    # Validate first: an unrecognized option (e.g. the typo 'poly') would otherwise
    # fall through both branches and silently train on untransformed features.
    allowed_transforms = (None, 'pca', 'polynomial')
    if feature_transform not in allowed_transforms:
        raise ValueError(
            "feature_transform must be None, 'pca', or 'polynomial'; got %r"
            % (feature_transform,)
        )

    # Feature transformation. Work on new local names so the caller-supplied
    # arguments are never rebound inside the function.
    X_train_model, X_test_model = X_train, X_test
    if feature_transform == 'pca':
        # Cannot request more components than there are input features.
        pca = PCA(n_components=min(X_train.shape[1], n_components))
        X_train_model = pca.fit_transform(X_train)
        X_test_model = pca.transform(X_test)
    elif feature_transform == 'polynomial':
        poly = PolynomialFeatures(degree=degree, include_bias=False)
        X_train_model = poly.fit_transform(X_train)
        X_test_model = poly.transform(X_test)

    # Logistic regression model
    logreg = LogisticRegression(C=regularization_value, penalty='l2', max_iter=1000)
    logreg.fit(X_train_model, y_train)

    # Predict on the held-out split
    y_hat_logreg = logreg.predict(X_test_model)

    # Evaluate
    acc_logreg = accuracy_score(y_test, y_hat_logreg)
    prec, recall, fscore, _ = precision_recall_fscore_support(
        y_test, y_hat_logreg, average='binary')

    return acc_logreg, prec, recall, fscore





In [4]:
 # Baseline run: L2 logistic regression (C=1.0) on the untransformed features.
 # `degree` is only read on the polynomial path, so it has no effect here.
 run_logistic_regression(
     X_train, X_test, y_train, y_test,
     regularization_value=1.0,
     feature_transform=None,
     degree=2,
 )

accuracy:  0.8088 precision:  0.6714031971580817 recall:  0.23232944068838352 fscore:  0.3452054794520548


(0.8088, 0.6714031971580817, 0.23232944068838352, 0.3452054794520548)