In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, train_test_split

ques 1

In [None]:
from google.colab import files
uploaded = files.upload()
house_data = pd.read_csv('USA_Housing.csv')
X = house_data.drop(columns=["Price"]).values
y = house_data["Price"].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_beta = None
best_r2 = -np.inf

fold = 1
for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    beta = np.linalg.inv(X_train.T @ X_train) @ (X_train.T @ y_train)

    y_pred = X_test @ beta
    r2 = r2_score(y_test, y_pred)

    print(f"Fold {fold} R2 Score: {r2:.4f}")
    fold += 1
    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

print("\nBest R2 Score from CV:", best_r2)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
beta_final = np.linalg.inv(X_train.T @ X_train) @ (X_train.T @ y_train)

y_pred_final = X_test @ beta_final
print("Final R2 Score (70/30 split):", r2_score(y_test, y_pred_final))



Saving USA_Housing.csv to USA_Housing (2).csv
Fold 1 R2 Score: -11.4420
Fold 2 R2 Score: -11.8285
Fold 3 R2 Score: -11.6144
Fold 4 R2 Score: -10.9997
Fold 5 R2 Score: -10.5798

Best R2 Score from CV: -10.579758582041713
Final R2 Score (70/30 split): -12.003903385562284


ques 2

In [None]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

def gradient_descent(X, y, lr, iterations=1000):
    m, n = X.shape
    beta = np.zeros(n)
    for _ in range(iterations):
        gradient = -(2/m) * (X.T @ (y - X @ beta))
        beta -= lr * gradient
    return beta

learning_rates = [0.001, 0.01, 0.1, 1]
best_beta = None
best_val_r2 = -np.inf

for lr in learning_rates:
    beta = gradient_descent(X_train, y_train, lr=lr, iterations=1000)
    y_val_pred = X_val @ beta
    y_test_pred = X_test @ beta

    r2_val = r2_score(y_val, y_val_pred)
    r2_test = r2_score(y_test, y_test_pred)

    print(f"\nLearning Rate {lr}:")
    print("Validation R2:", r2_val)
    print("Test R2:", r2_test)

    if r2_val > best_val_r2:
        best_val_r2 = r2_val
        best_beta = beta

print("\nBest Beta (from validation) coefficients:\n", best_beta)


Learning Rate 0.001:
Validation R2: -11.309840188168787
Test R2: -12.05506738296265

Learning Rate 0.01:
Validation R2: -11.318069140536533
Test R2: -12.010979529759709

Learning Rate 0.1:
Validation R2: -11.31806970348623
Test R2: -12.010979232167713

Learning Rate 1:
Validation R2: -inf
Test R2: -inf

Best Beta (from validation) coefficients:
 [196011.02066971 138171.30668778 117004.69583016  39664.25921488
 116553.34299286]


  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)
  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)


ques 3

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

# 1 part
columns = [
    "symboling","normalized_losses","make","fuel_type","aspiration","num_doors",
    "body_style","drive_wheels","engine_location","wheel_base","length","width",
    "height","curb_weight","engine_type","num_cylinders","engine_size","fuel_system",
    "bore","stroke","compression_ratio","horsepower","peak_rpm","city_mpg",
    "highway_mpg","price"
]

data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
    header=None, names=columns, na_values="?"
)

# 2 part
categorical = [
    "make","fuel_type","aspiration","num_doors","body_style",
    "drive_wheels","engine_location","engine_type","fuel_system","num_cylinders"
]
numerical = [c for c in columns if c not in categorical + ["price"]]

data[numerical] = data[numerical].apply(pd.to_numeric, errors="coerce")

data = data.dropna(subset=["price"])

for col in numerical:
    data[col] = data[col].fillna(data[col].mean())
for col in categorical:
    data[col] = data[col].fillna(data[col].mode()[0])

# Special encodings
door_map = {"two": 2, "four": 4}
cyl_map = {"two":2, "three":3, "four":4, "five":5, "six":6, "eight":8, "twelve":12}
data["num_doors"] = data["num_doors"].map(door_map)
data["num_cylinders"] = data["num_cylinders"].map(cyl_map)

data["fuel_system"] = data["fuel_system"].astype(str).str.contains("pfi").astype(int)
data["engine_type"] = data["engine_type"].astype(str).str.contains("ohc").astype(int)

# Label encoding using factorize
for col in ["make","aspiration","engine_location","fuel_type"]:
    data[col] = pd.factorize(data[col])[0]

# dummy encoding
data = pd.get_dummies(data, columns=["body_style","drive_wheels"], drop_first=True)

# Features and target
X = data.drop("price", axis=1).to_numpy()
y = data["price"].astype(float).to_numpy()

# Standardization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 5 part
n_samples = len(X_scaled)
train_size = int(0.7 * n_samples)
X_train, X_test = X_scaled[:train_size], X_scaled[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# 6 part
lr = LinearRegression().fit(X_train, y_train)
pred_base = lr.predict(X_test)
print("Baseline R²:", r2_score(y_test, pred_base))

# PCA + Linear Regression
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

lr_pca = LinearRegression().fit(X_train_pca, y_train)
pred_pca = lr_pca.predict(X_test_pca)
print("R² after PCA:", r2_score(y_test, pred_pca))
print("Did performance improve?", r2_score(y_test, pred_pca) > r2_score(y_test, pred_base))


Baseline R²: 0.18961326980816962
R² after PCA: 0.3415833571426846
Did performance improve? True
