In [1]:
# 1. Import pandas, NumPy, and the scikit-learn utilities used below
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression



# 2. Read the previously saved train/test splits (features first, then targets)
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in ('X_train.csv', 'X_test.csv', 'y_train.csv', 'y_test.csv')
)

# 3. read_csv returns DataFrames; squeeze the targets down so the
#    estimators receive 1-D labels instead of a single-column frame
y_train, y_test = y_train.squeeze(), y_test.squeeze()

# 4. Sanity-check the shapes, one per line
print(
    X_train.shape,
    X_test.shape,
    y_train.shape,
    y_test.shape,
    sep='\n',
)


(22449, 31)
(7483, 31)
(22449,)
(7483,)


In [2]:
# Q18: fit a logistic regression with no penalty term on the training split
# and report the learned weights and intercept.
logreg = linear_model.LogisticRegression(penalty = None)
logreg.fit(X_train, y_train)
w_logreg = logreg.coef_
intercept_logreg = logreg.intercept_
print('Q18 - w_logreg: ', w_logreg)
print('Q18 - intercept_logreg: ', intercept_logreg)

# Class predictions on the held-out test split (reused for Q20 below)
y_hat_logreg = logreg.predict(X_test)

# Q19: accuracy on the test set via logreg.score and y_test
acc_logreg = logreg.score(X_test, y_test)

# The score above is computed from X_test / y_test, so it is reported as
# test accuracy (the label previously said "training data").
print("Q19 - Accuracy on test data = %f" % acc_logreg)

# Q20: precision, recall and F-score on the test set, using y_test and
# y_hat_logreg. The call uses average='binary'; `sup` (the support) is
# kept for completeness but not printed.
prec, recal, fscore, sup = precision_recall_fscore_support(y_test, y_hat_logreg, average='binary')

print('Q20 - prec: ', prec)
print('Q20 - recal: ', recal)
print('Q20 - fscore: ', fscore)

Q18 - w_logreg:  [[-0.09144571  0.04813232  0.66387844  0.11132103  0.05620192  0.02105238
   0.04287396  0.01705268 -0.48163099  0.11769651  0.24062338  0.01904728
   0.01770346 -0.00862994 -0.19544778 -0.30863672 -0.01256714 -0.04386799
  -0.04296814 -0.0319785   0.02994752 -0.02994752  0.04092838  0.01268984
  -0.01752303 -0.08952477 -0.13677393 -0.0048192   0.05513189 -0.051769
  -0.01562738]]
Q18 - intercept_logreg:  [-1.47203668]
Q19 - Accuracy on training data = 0.808232
Q20 - prec:  0.6793103448275862
Q20 - recal:  0.2398052343274498
Q20 - fscore:  0.35447593342330186


In [3]:
def run_logistic_regression(X_train, X_test, y_train, y_test, 
                             regularization_value=1.0, 
                             feature_transform=None,
                             degree=2,
                             n_components=10):
    """
    Fit an L2-penalised logistic regression and evaluate it on the test split,
    optionally transforming the features first.

    Any transform is fitted on the training features only and then applied to
    the test features, so no test information is used during fitting.

    Parameters:
    - X_train, X_test, y_train, y_test: Training and testing data
    - regularization_value: Inverse of regularization strength (C parameter)
    - feature_transform: None, 'pca', or 'polynomial'
    - degree: Degree for polynomial features if feature_transform='polynomial'
    - n_components: Upper bound on the number of PCA components if
      feature_transform='pca'; capped at the number of input features

    Returns:
    - acc_logreg: Accuracy on test set
    - prec: Precision
    - recall: Recall
    - fscore: F1 score

    Raises:
    - ValueError: if feature_transform is not None, 'pca', or 'polynomial'
    """

    # Reject unknown transform names up front. Previously a typo such as
    # 'PCA' or 'poly' silently skipped the transform and fitted on the raw
    # features, producing results that looked valid but were mislabelled.
    valid_transforms = (None, 'pca', 'polynomial')
    if feature_transform not in valid_transforms:
        raise ValueError(
            "feature_transform must be one of %r, got %r"
            % (valid_transforms, feature_transform)
        )

    # Feature Transformation
    if feature_transform == 'pca':
        # Never request more components than there are input features
        pca = PCA(n_components=min(X_train.shape[1], n_components))
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)

    elif feature_transform == 'polynomial':
        # include_bias=False: the logistic regression fits its own intercept
        poly = PolynomialFeatures(degree=degree, include_bias=False)
        X_train = poly.fit_transform(X_train)
        X_test = poly.transform(X_test)

    # Logistic Regression Model
    logreg = LogisticRegression(C=regularization_value, penalty='l2', max_iter=1000)
    logreg.fit(X_train, y_train)

    # Predict
    y_hat_logreg = logreg.predict(X_test)

    # Evaluate
    acc_logreg = accuracy_score(y_test, y_hat_logreg)
    prec, recall, fscore, _ = precision_recall_fscore_support(
        y_test, y_hat_logreg, average='binary')

    return acc_logreg, prec, recall, fscore


In [4]:
 # Baseline run: C=1.0 and no feature transform. The returned tuple
 # (accuracy, precision, recall, fscore) is displayed as the cell output.
 run_logistic_regression(
     X_train,
     X_test,
     y_train,
     y_test,
     regularization_value=1.0,
     feature_transform=None,
     degree=2,
 )

(0.8082319925163705,
 0.6793103448275862,
 0.2398052343274498,
 0.35447593342330186)