Tisya Sharma
DS 4400
Homework #3

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the recipe nutrition table from disk and preview its first 20 rows.
df = pd.read_csv("recipes_df.csv")
df.head(n=20)

Unnamed: 0,Protein,Sodium,Calcium,Potassium,Recipe
0,61,1113,1189,687,macncheese
1,18,524,368,229,macncheese
2,13,286,336,303,macncheese
3,28,1194,643,287,macncheese
4,28,988,773,273,macncheese
5,20,1204,332,265,macncheese
6,63,1767,915,913,macncheese
7,11,382,251,113,macncheese
8,27,777,568,380,macncheese
9,29,1233,161,633,macncheese


##### Protein Prediction 1 (a)

In [3]:
# Target vector: copy the Protein column out of the frame into a NumPy array.
# It is kept 1-D here; y.reshape(-1, 1) would turn it into a column vector
# if one is ever needed.
y = np.array(df.loc[:, "Protein"])
print(y)

# Feature table: every remaining column, with the target removed.
raw_X = df.drop(columns=["Protein"])
raw_X.head(n=10)

[61 18 13 28 28 20 63 11 27 29 47 17 20 27 26 17 14 16 22 19 30 20  6  3
  6 71 12 39 18 17  6 59 24  9]


Unnamed: 0,Sodium,Calcium,Potassium,Recipe
0,1113,1189,687,macncheese
1,524,368,229,macncheese
2,286,336,303,macncheese
3,1194,643,287,macncheese
4,988,773,273,macncheese
5,1204,332,265,macncheese
6,1767,915,913,macncheese
7,382,251,113,macncheese
8,777,568,380,macncheese
9,1233,161,633,macncheese


##### Protein Prediction 1 (b)
Note: I one-hot encoded Recipe with drop_first=True to avoid perfect multicollinearity with the required intercept column. The remaining indicator, Recipe_pizza, is interpreted relative to the baseline macncheese (Recipe_pizza = 0 for macncheese, 1 for pizza).

In [4]:
# Build the Part I design matrix Phi.
#
# The feature table is rebuilt from `df` on every run instead of being
# modified in place. That keeps this cell idempotent: re-running it would
# otherwise fail, because 'Recipe' has already been encoded away and
# DataFrame.insert refuses to add a second 'Intercept' column.
features = df.drop(columns="Protein")

# One-hot encode the categorical column. drop_first=True keeps a single
# indicator column so the dummies are not collinear with the intercept.
encoded = pd.get_dummies(features, columns=["Recipe"], drop_first=True, dtype=int)

# Standardize only the continuous nutrient columns; the 0/1 indicator is
# left untouched.
numeric_cols = ["Sodium", "Calcium", "Potassium"]
scaler = StandardScaler()
encoded[numeric_cols] = scaler.fit_transform(encoded[numeric_cols])

# Prepend the constant column so the intercept is the first column of Phi.
encoded.insert(0, "Intercept", 1)

# Rebind the notebook-level name to the processed frame, as this cell has
# always done, then convert it to NumPy for the final Phi.
raw_X = encoded
Phi = raw_X.to_numpy()

# The design matrix and the target must describe the same observations.
assert Phi.shape[0] == y.shape[0], "Phi and y must have the same number of rows"

print("Shape of Phi:", Phi.shape)
print("Shape of y:", y.shape)

# Sanity check
raw_X.head()


Shape of Phi: (34, 5)
Shape of y: (34,)


Unnamed: 0,Intercept,Sodium,Calcium,Potassium,Recipe_pizza
0,1,0.429341,3.236758,1.540719,0
1,1,-0.521342,-0.063007,-0.499597,0
2,1,-0.905489,-0.191621,-0.169939,0
3,1,0.56008,1.042274,-0.241216,0
4,1,0.227583,1.56477,-0.303584,0


##### Protein Prediction 1 (c)

In [5]:
# Batch gradient descent on the mean-squared-error objective.
# Uses Phi and y from (1b); the intercept is the first column of Phi.
n, p = Phi.shape
w = np.zeros(p)

# Step size: take L, the largest eigenvalue of (X^T X)/n, and step with
# 1 / (1.05 * L).
XtX = Phi.T.dot(Phi)
L = np.linalg.eigvalsh(XtX).max() / n
lr = 1.0 / (1.05 * L)

epochs = 20000
tol = 1e-6

for t in range(epochs):
    residual = Phi.dot(w) - y
    grad = Phi.T.dot(residual) / n
    w -= lr * grad
    # Stop once the gradient used for this step is already below tol.
    if np.linalg.norm(grad) < tol:
        break

# Fitted values and sum of squared errors at the final weights.
y_pred_gd = Phi.dot(w)
sse_gd = np.square(y - y_pred_gd).sum()

print("== GRADIENT DESCENT ==")
print("Weights (w):", w)
print("SSE (GD):", sse_gd)
print("iters:", t+1, " | grad norm:", np.linalg.norm(grad))
print("lr used:", lr)

== GRADIENT DESCENT ==
Weights (w): [23.67265688  2.70929964  4.00849588 11.48051114  2.93305036]
SSE (GD): 1470.4637230947985
iters: 225  | grad norm: 9.574103810232422e-07
lr used: 0.4725170078034517


##### Protein Prediction 1 (d)

In [6]:
# Closed-form least squares: solve the normal equations
# (Phi^T Phi) w = Phi^T y for w.
XtX = Phi.T.dot(Phi)
Xty = Phi.T.dot(y)

# A direct linear solve is used; Phi^T Phi is expected to be full rank because
# one dummy column was dropped and a single intercept column was added.
w_closed = np.linalg.solve(XtX, Xty)

# Fitted values and sum of squared errors for the closed-form weights.
y_pred_cf = Phi.dot(w_closed)
sse_cf = np.square(y - y_pred_cf).sum()

print("== CLOSED-FORM ==")
print("Weights (w_closed):", w_closed)

== CLOSED-FORM ==
Weights (w_closed): [23.67265959  2.70930022  4.00849354 11.4805117   2.93304414]


##### Protein Prediction 1 (e)

In [7]:
# Mean squared error (SSE / n) for both solvers, printed side by side.
mse_gd = sse_gd / n
mse_cf = sse_cf / n
print("MSE (GD):", mse_gd, " | MSE (CF):", mse_cf)

MSE (GD): 43.24893303219996  | MSE (CF): 43.248933032193456


In [8]:
# Report the sum of squared errors from each solver.
print("SSE (GD):", sse_gd)
print("SSE (CF):", sse_cf)

# Relative gap: absolute SSE difference scaled by the larger of the two SSEs.
sse_gap = abs(sse_cf - sse_gd)
sse_scale = max(sse_cf, sse_gd)
rel_diff = sse_gap / sse_scale
print("Relative SSE diff (GD vs CF):", rel_diff)


SSE (GD): 1470.4637230947985
SSE (CF): 1470.4637230945775
Relative SSE diff (GD vs CF): 1.5029762996511105e-13


##### Protein Prediction II (a and b)

In [9]:
# Part II starts again from the CSV, so this cell rebuilds everything it needs.
df = pd.read_csv("recipes_df.csv")

# Target as a NumPy vector; all other columns form the raw feature table.
y = df["Protein"].to_numpy()
raw_X = df.drop(columns=["Protein"])

# Encode Recipe with an explicit category order and keep a single dummy
# (drop_first=True) to avoid multicollinearity with the intercept.
raw_X["Recipe"] = pd.Categorical(raw_X["Recipe"], categories=["macncheese", "pizza"])
raw_X = pd.get_dummies(raw_X, columns=["Recipe"], drop_first=True, dtype=int)

# Standardize the numeric nutrient columns only.
num_cols = ["Sodium", "Calcium", "Potassium"]
scaler = StandardScaler()
raw_X[num_cols] = scaler.fit_transform(raw_X[num_cols])

# Constant column goes first.
raw_X.insert(0, "Intercept", 1)

# Pull each column out as a NumPy vector.
S, C, K, D, ones = (
    raw_X[name].to_numpy()
    for name in ("Sodium", "Calcium", "Potassium", "Recipe_pizza", "Intercept")
)

# Degree-2 design matrix. Column order:
#   intercept | S, C, K, D | S^2, C^2, K^2 | S*C, S*K, C*K
# Only the three numeric features get squared and interaction terms.
linear_terms = [S, C, K, D]
squared_terms = [np.square(S), np.square(C), np.square(K)]
cross_terms = [S * C, S * K, C * K]
Phi2 = np.column_stack([ones, *linear_terms, *squared_terms, *cross_terms])

print("Phi2 shape:", Phi2.shape)
print("y shape:", y.shape)

Phi2 shape: (34, 11)
y shape: (34,)


##### Protein Prediction II (c)

In [10]:
# Batch gradient descent on the mean-squared-error objective for the
# degree-2 design matrix Phi2 (target y).
n2, p2 = Phi2.shape
w2 = np.zeros(p2)

# Safe step size: 1 / (1.05 * lambda_max((X'X)/n))
XtX2 = np.dot(Phi2.T, Phi2)
L2   = np.max(np.linalg.eigvalsh(XtX2)) / n2
lr2  = 1.0 / (1.05 * L2)

epochs2, tol2 = 40000, 1e-6
converged2 = False  # becomes True only if the tolerance is met before the cap
for t in range(epochs2):
    grad2 = np.dot(Phi2.T, (np.dot(Phi2, w2) - y)) / n2
    w2   -= lr2 * grad2
    if np.linalg.norm(grad2) < tol2:
        converged2 = True
        break

# Fitted values and sum of squared errors at the final weights.
y_pred_gd2 = np.dot(Phi2, w2)
sse_gd2    = np.sum((y - y_pred_gd2)**2)

print("== GD (Polynomial Regression) ==")
print("w2:", w2)
print("SSE (GD, poly):", sse_gd2)
# Same diagnostics as the Part I gradient-descent cell, so the reader can tell
# a converged run from one that simply hit the epoch cap.
print("iters:", t+1, " | grad norm:", np.linalg.norm(grad2))
print("lr used:", lr2)
if not converged2:
    print("WARNING: stopped at the epoch cap before the gradient norm fell below tol2")


== GD (Polynomial Regression) ==
w2: [ 20.48245531   2.21524119   2.82878308   5.08324322  -4.02164366
   4.42582876   0.67845837   9.63720087  -2.81330081 -13.37162258
   0.91309558]
SSE (GD, poly): 807.7145613775615


##### Protein Prediction II (d)

In [11]:
# Closed-form fit for the degree-2 design matrix via the normal equations
# (Phi2^T Phi2) w = Phi2^T y.
# Both sides are built here from Phi2 and y, so this cell does not rely on
# intermediates left behind by the gradient-descent cell.
XtX2 = np.dot(Phi2.T, Phi2)
Xty2 = np.dot(Phi2.T, y)

# Use a direct solve when the Gram matrix has full rank (one per column of
# Phi2); otherwise fall back to the pseudo-inverse.
if np.linalg.matrix_rank(XtX2) == Phi2.shape[1]:
    w2_closed = np.linalg.solve(XtX2, Xty2)
else:
    w2_closed = np.dot(np.linalg.pinv(XtX2), Xty2)

# Fitted values and sum of squared errors for the closed-form weights.
y_pred_cf2 = np.dot(Phi2, w2_closed)
sse_cf2    = np.sum((y - y_pred_cf2)**2)

print("== Closed-Form (Polynomial Regression) ==")
print("w2_closed:", w2_closed)
print("SSE (CF, poly):", sse_cf2)


== Closed-Form (Polynomial Regression) ==
w2_closed: [ 20.48245313   2.21524119   2.8287748    5.08324402  -4.0216564
   4.42584462   0.67845829   9.63721214  -2.8132791  -13.37166524
   0.91309521]
SSE (CF, poly): 807.7145613757315


##### Protein Prediction II (e)

In [12]:
# Mean squared error (SSE / n2) for each solver on the polynomial model.
mse_gd2 = sse_gd2 / n2
mse_cf2 = sse_cf2 / n2

# Relative gap: absolute SSE difference scaled by the larger of the two SSEs.
rel_diff2 = abs(sse_cf2 - sse_gd2) / max(sse_cf2, sse_gd2)

print("MSE (GD, Polynomial Regression):", mse_gd2, " | MSE (CF, Polynomial Regression):", mse_cf2)
print("Relative SSE diff (GD vs CF):", rel_diff2)

MSE (GD, Polynomial Regression): 23.75631062875181  | MSE (CF, Polynomial Regression): 23.756310628697985
Relative SSE diff (GD vs CF): 2.265672942287619e-12
