In [51]:
# 1. Import pandas
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from utils import feature_transform 
from sklearn.metrics.pairwise import rbf_kernel





# 2. Load the saved CSVs
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

# 3. (Optional but important) - if y_train and y_test are DataFrames, squeeze to make them Series
y_train = y_train.squeeze()
y_test = y_test.squeeze()

# 4. Verify shapes
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(747, 31)
(250, 31)
(747,)
(250,)


In [52]:
logreg = linear_model.LogisticRegression(penalty = None)
logreg.fit(X_train, y_train)
w_logreg = logreg.coef_
intercept_logreg = logreg.intercept_
print('Q18 - w_logreg: ', w_logreg)
print('Q18 - intercept_logreg: ', intercept_logreg)
y_hat_logreg = logreg.predict(X_test)

# Find the accuracy achieved on test set using logreg.score and y_test 
acc_logreg = logreg.score(X_test, y_test)

print("Q19 - Accuracy on training data = %f" % acc_logreg)

# TODO Q20


print('Q20 - prec: ', prec)
print('Q20 - recal: ', recal)
print('Q20 - fscore: ', fscore)

Q18 - w_logreg:  [[ 0.18458759 -0.08049333  0.65967944  0.17160387 -0.1336652   0.09562481
   0.00763028  0.10202784 -1.44520027  1.47488404  0.15478932  0.66014534
  -2.0635181   1.67083045 -0.52909525 -1.13490081 -0.92120242 -0.35406633
  -0.4769137  -0.13604357  0.02723772 -0.02723772  0.09473924 -0.014406
   0.03339916 -0.72611491  0.03901717  0.          0.03613793 -0.03329378
  -0.017095  ]]
Q18 - intercept_logreg:  [-1.81823517]
Q19 - Accuracy on training data = 0.788000
Q20 - prec:  0.6793103448275862
Q20 - recal:  0.2398052343274498
Q20 - fscore:  0.35447593342330186


In [53]:


def run_logistic_regression(X_train, X_test, y_train, y_test, 
                            regularization_value=1.0, 
                            feature_method=None,
                            degree=2, 
                            n_components=2, 
                            gamma=None):
    """
    Runs logistic regression with optional feature transformation.

    feature_method: None, 'pca', 'polynomial', or 'rbf'
    """

    # 1) PCA
    if feature_method == 'pca':
        pca = PCA(n_components=n_components)
        X_train = pca.fit_transform(X_train)
        X_test  = pca.transform(X_test)

    # 2) Polynomial
    elif feature_method == 'polynomial':
        poly = PolynomialFeatures(degree=degree, include_bias=False)
        X_train = poly.fit_transform(X_train)
        X_test  = poly.transform(X_test)

    # 3) RBF kernel
    elif feature_method == 'rbf':
        # first compute the train–train kernel
        K_train = rbf_kernel(X_train, X_train, gamma=gamma)
        # then compute the test–train kernel
        K_test  = rbf_kernel(X_test,  X_train, gamma=gamma)
        X_train, X_test = K_train, K_test

    # 4) Train
    logreg = LogisticRegression(C=regularization_value, penalty='l2', max_iter=10000)
    logreg.fit(X_train, y_train)

    # 5) Predict & Evaluate
    y_hat = logreg.predict(X_test)
    acc   = accuracy_score(y_test, y_hat)
    prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_hat, average='binary')

    return acc, prec, rec, f1






In [54]:
# No transformation
print("No transformation: ", run_logistic_regression(X_train, X_test, y_train, y_test))

# Example with polynomial transformation
print("Polynomial transformation: ", run_logistic_regression(X_train, X_test, y_train, y_test, 
                        regularization_value=1.0, 
                        feature_method='polynomial', degree=3))

# Example with PCA
print("PCA: ", run_logistic_regression(X_train, X_test, y_train, y_test, 
                        regularization_value=1.0, 
                        feature_method='pca', n_components=10))

# Example with RBF kernel
print("RBF Kernel: ", run_logistic_regression(X_train, X_test, y_train, y_test, 
                        regularization_value=1.0, 
                        feature_method='rbf', gamma=0.5))



No transformation:  (0.796, 0.7037037037037037, 0.3064516129032258, 0.42696629213483145)
Polynomial transformation:  (0.692, 0.38461538461538464, 0.4032258064516129, 0.3937007874015748)
PCA:  (0.74, 0.42105263157894735, 0.12903225806451613, 0.19753086419753085)
RBF Kernel:  (0.752, 0.5, 0.016129032258064516, 0.03125)
