In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, train_test_split
df=pd.read_csv(r"C:\Users\hp\Downloads\USA_Housing.csv")

In [5]:
X = df.drop("Price", axis=1).values
y = df["Price"].values.reshape(-1, 1)

In [6]:
scaler = StandardScaler()     
X_scaled = scaler.fit_transform(X)

In [7]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_beta = None
best_r2 = -np.inf 
r2_scores = []


In [8]:
for fold, (train_idx, test_idx) in enumerate(kf.split(X_scaled)):      #kf.split(X_scaled) generates 5 splits (since we used KFold(n_splits=5)).
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Add bias column of ones for intercept
    X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

    # Compute beta using Least Squares: β = (XᵀX)^(-1) Xᵀy
    beta = np.linalg.inv(X_train_bias.T @ X_train_bias) @ (X_train_bias.T @ y_train)
     # Predictions
    y_pred = X_test_bias @ beta

    # R2 score
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    print(f"Fold {fold+1}: R2 Score = {r2:.4f}")

    # Track best beta
    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

print("\nAverage R2 Score across 5 folds:", np.mean(r2_scores))
print("Best R2 Score:", best_r2)

# Step e: Train on 70% using best beta and test on 30%
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

# Train using best beta on 70% data
y_train_pred = X_train_bias @ best_beta
y_test_pred = X_test_bias @ best_beta

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\nFinal Model Performance using Best Beta:")
print(f"Train R2 Score: {train_r2:.4f}")
print(f"Test R2 Score: {test_r2:.4f}")

Fold 1: R2 Score = 0.9180
Fold 2: R2 Score = 0.9146
Fold 3: R2 Score = 0.9116
Fold 4: R2 Score = 0.9193
Fold 5: R2 Score = 0.9244

Average R2 Score across 5 folds: 0.9175745431092714
Best R2 Score: 0.9243869413350316

Final Model Performance using Best Beta:
Train R2 Score: 0.9193
Test R2 Score: 0.9147


In [None]:
def gradient_descent(X,y,learning_rate,num_iterations):
    m=X.shape[0]
    n=X.shape[1]
    beta=np.zeros((n,1))

    for i in range(num_iterations):
        predictions= X @ beta
        error=predictions-y
        gradient=(1/m)*X.T @ error 
        beta= beta-learning_rate*gradient
    return beta
def calculate_r2(y_true,y_pred):
    ss_total=np.sum((y_true,np.mean(y_true))**2)
    ss_residual=np.sum((y_true-y_pred)**2)
    r2=1-(ss_residual/ss_total)
    return r2
learning_rates = [0.001, 0.01, 0.1, 1]
best_r2_val = -np.inf 
best_beta = None
best_lr = None
num_iterations = 1000

for lr in learning_rates:
    print(f'Training with learning rate: {lr}')
    beta_trained=gradient_descent(X_train,y_train,lr,num_iterations)
    y_val_pred=X_val @ beta_trained
    r2_val=r2_score(y_val,y_val_pred)
    print(f"R2 score on validation set: {r2_val}")
    if r2_val>best_r2_val:
        best_r2_val=r2_val
        best_beta=beta_trained
        best_lr=lr
    



In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
url="https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

cols=["symboling", "normalized_losses",  
"make", "fuel_type", "aspiration","num_doors", "body_style", "drive_wheels",  
"engine_location", "wheel_base", "length", "width", "height", "curb_weight",  
"engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke",  
"compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]
data=pd.read_csv(url, names=cols)
#df=pd.DataFrame(data)

for col in data.columns:
    if data[col].dtype == "object":
        data[col].fillna(data[col].mode()[0], inplace=True)
    else:
        data[col] = data[col].astype(float)
        data[col].fillna(data[col].mean(), inplace=True)

data.dropna(subset=["price"], inplace=True)
data["price"] = data["price"].astype(float)

num_map = {"two": 2, "three": 3, "four": 4, "five": 5, "six": 6, "eight": 8, "twelve": 12}
data["num_doors"].replace(num_map, inplace=True)
data["num_cylinders"].replace(num_map, inplace=True)

data = pd.get_dummies(data, columns=["body_style", "drive_wheels"], drop_first=True)

label_cols = ["make", "aspiration", "engine_location", "fuel_type"]
for col in label_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

data["fuel_system"] = data["fuel_system"].apply(lambda x: 1 if "pfi" in x else 0)
data["engine_type"] = data["engine_type"].apply(lambda x: 1 if "ohc" in x else 0)

X = data.drop("price", axis=1)
y = data["price"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("R2 Score (Without PCA):", r2_score(y_test, y_pred))

pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_scaled)
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_reduced, y, test_size=0.3, random_state=42)
lr.fit(X_train_r, y_train_r)
y_pred_r = lr.predict(X_test_r)
print("R2 Score (With PCA):", r2_score(y_test_r, y_pred_r))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting

ValueError: could not convert string to float: '?'