In [1]:
# Q5: Compare Linear vs Logistic
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

# Toy dataset: HoursStudy -> ExamScore and Pass (score >= 50 -> pass)
hours = np.array([1,2,3,4,5,6,7,8,9,10]).reshape(-1,1)
scores = np.array([30,35,40,45,52,60,65,72,80,90])  # exam scores
pass_label = (scores >= 50).astype(int)

# Linear regression to predict scores
lr = LinearRegression().fit(hours, scores)
pred_scores = lr.predict(hours)

# If you naively threshold linear predictions to classify:
pred_class_from_linear = (pred_scores >= 50).astype(int)

# Logistic regression (proper classifier)
clf = LogisticRegression(solver='liblinear').fit(hours, pass_label)
pred_class_log = clf.predict(hours)

print("Linear regression MSE:", mean_squared_error(scores, pred_scores))
print("Accuracy of naive linear->threshold classifier:", accuracy_score(pass_label, pred_class_from_linear))
print("Accuracy of logistic classifier:", accuracy_score(pass_label, pred_class_log))

# Show why linear is not ideal: predictions not bounded [0,1] and not probabilistic
print("Some linear predictions (not bounded between 0 and 1):", pred_scores[:5])
print("Logistic predicted probabilities:", clf.predict_proba(hours)[:5,1])


Linear regression MSE: 3.2690909090909117
Accuracy of naive linear->threshold classifier: 1.0
Accuracy of logistic classifier: 0.8
Some linear predictions (not bounded between 0 and 1): [27.36363636 33.92727273 40.49090909 47.05454545 53.61818182]
Logistic predicted probabilities: [0.33937886 0.43384898 0.53338395 0.63033025 0.7177915 ]
