## Import and clean data  

In [None]:
from util.clean_data import clean_data
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import Ridge

# Load the raw diamond data and pass it through the project's cleaning step.
# Per the original note, clean_data performs integer encoding and
# log-transforms carat and price (its implementation is not visible here).
df = clean_data(pd.read_csv('data/diamond.csv'))

# Keep the raw price aside before the column is removed from the frame.
Y_price = df['price'].values

# Remove the untransformed columns, then z-score every remaining column using
# the sample standard deviation (ddof=1).
# NOTE(review): this scaling uses the full data set (before the train/test
# split) and also rescales the log_price column used as the target later on
# -- confirm this is intended.
df = df.drop(columns=["carat", "price"])
df = df.sub(df.mean()).div(df.std(ddof=1))

# Summary statistics of the standardized frame (shown as the cell output).
df.describe()

In [None]:
# Target vector: the log_price column of the prepared frame.
Y = df['log_price'].values

# Alternative target: the raw price saved earlier (uncomment to use).
#Y = Y_price

# Remove the target so that only predictor columns remain.
df = df.drop('log_price', axis=1)

# Capture the names AFTER dropping the target so that feature_names[i] always
# labels column i of X. Taking them before the drop left an extra
# 'log_price' entry, which shifts every later name/coefficient pairing unless
# that column happens to be last.
feature_names = df.columns

# Design matrix, columns in the same order as feature_names.
X = df.values

## Find the best regularization parameter

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out 20% of the rows as a test set; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Candidate regularization strengths and the (alpha, CV RMSE) pairs collected
# for them.
reg_tries = [0, 0.00001, 0.0001, 0.001, 0.01, 0.1, 10, 100, 1000, 5000, 10000]
results = []

for alpha in reg_tries:
    # 10-fold cross-validation on the training split only.
    fold_scores = cross_val_score(
        Ridge(alpha=alpha),
        X_train,
        Y_train,
        cv=10,
        scoring="neg_root_mean_squared_error",
    )
    # The scorer reports negated RMSE, so flip the sign back.
    cv_rmse = -fold_scores.mean()
    results.append((alpha, cv_rmse))
    print(f"Regularization: {alpha:<10} Cross-Validation RMSE: {cv_rmse:.8f}")

# Split the pairs into parallel tuples (reused for plotting).
reg_values, rmse_values = zip(*results)

# Alpha with the lowest cross-validated RMSE (first one wins on ties).
opt_reg = min(results, key=lambda pair: pair[1])[0]

# Fit the all-features model with the chosen alpha and score it on the test
# split.
model = Ridge(alpha=opt_reg).fit(X_train, Y_train)
y_pred = model.predict(X_test)
test_rmse_all_features = np.sqrt(mean_squared_error(Y_test, y_pred))

print("\nFinal model :")
print(f"Test RMSE: {test_rmse_all_features:.8f}")

In [None]:
# Plot the cross-validated RMSE against the regularization strength.
# The alpha grid contains 0, which a plain log axis cannot display, so that
# point would be lost. A symmetric-log axis is linear below the smallest
# positive alpha and logarithmic above it, so every tried value is shown.
positive_alphas = [alpha for alpha in reg_values if alpha > 0]

plt.figure(figsize=(10, 6))
plt.plot(reg_values, rmse_values, marker='o', linestyle='-', color='b')
if positive_alphas:
    plt.xscale('symlog', linthresh=min(positive_alphas))
plt.xlabel('Regularization Strength (alpha)')
plt.ylabel('Cross-Validation RMSE')
plt.title('Effect of Regularization on RMSE')
plt.grid(True)
plt.show()


# Feature selection:

Backwards selection

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

# Backward elimination: begin with every feature and greedily remove one per
# step, judging candidate subsets by 10-fold cross-validated RMSE.
backward_selector = SequentialFeatureSelector(
    estimator=Ridge(alpha=opt_reg),  # ridge model at the tuned alpha
    n_features_to_select="auto",
    direction="backward",
    scoring="neg_root_mean_squared_error",
    cv=10,
)
backward_selector.fit(X_train, Y_train)

# Column indices of the features that were kept.
selected_backward_features = np.flatnonzero(backward_selector.get_support())
print(f"Selected Features (Backward): {selected_backward_features}")

# Training matrix restricted to the kept columns.
X_train_backward = np.take(X_train, selected_backward_features, axis=1)

# Cross-validated RMSE of a ridge model on the reduced feature set.
backward_score = cross_val_score(
    Ridge(alpha=opt_reg),
    X_train_backward,
    Y_train,
    cv=10,
    scoring="neg_root_mean_squared_error",
)

# Scores are negated RMSE; report the positive mean.
print("Backward score:", -backward_score.mean())


Forward Selection:

In [None]:
# Forward selection: begin with an empty set and greedily add one feature per
# step, judging candidate subsets by 10-fold cross-validated RMSE.
forward_selector = SequentialFeatureSelector(
    estimator=Ridge(alpha=opt_reg),  # ridge model at the tuned alpha
    n_features_to_select="auto",
    direction="forward",
    scoring="neg_root_mean_squared_error",
    cv=10,
)
forward_selector.fit(X_train, Y_train)

# Column indices of the features that were picked.
selected_forward_features = np.flatnonzero(forward_selector.get_support())
print(f"Selected Features (Forward): {selected_forward_features}")

# Training matrix restricted to the picked columns.
X_train_forward = np.take(X_train, selected_forward_features, axis=1)

# Cross-validated RMSE of a ridge model on the reduced feature set.
forward_score = cross_val_score(
    Ridge(alpha=opt_reg),
    X_train_forward,
    Y_train,
    cv=10,
    scoring="neg_root_mean_squared_error",
)

# Scores are negated RMSE; report the positive mean.
print("Forward score:", -forward_score.mean())


# Final model

In [None]:
# Test-set views restricted to the columns each selector kept.
X_test_forward = X_test[:, selected_forward_features]
X_test_backward = X_test[:, selected_backward_features]

# Forward-selected model: fit on the training subset, score on the test subset.
model_forward = Ridge(alpha=opt_reg).fit(X_train_forward, Y_train)
y_pred_forward = model_forward.predict(X_test_forward)
test_rmse_forward = np.sqrt(mean_squared_error(Y_test, y_pred_forward))

# Backward-selected model: same procedure on its own feature subset.
model_backward = Ridge(alpha=opt_reg).fit(X_train_backward, Y_train)
y_pred_backward = model_backward.predict(X_test_backward)
test_rmse_backward = np.sqrt(mean_squared_error(Y_test, y_pred_backward))

# R^2 on the test split for the all-features model and both reduced models.
r2_all_features = model.score(X_test, Y_test)
r2_forward = model_forward.score(X_test_forward, Y_test)
r2_backward = model_backward.score(X_test_backward, Y_test)

# Report R^2 for the three models.
print(f"R^2 All features          : {r2_all_features:.8f}")
print(f"R^2 Forward model         : {r2_forward:.8f}")
print(f"R^2 Backward model        : {r2_backward:.8f}")

# Report test RMSE for the three models.
print(f"\nTest RMSE All features          : {test_rmse_all_features:.8f}")
print(f"Test RMSE Forward model         : {test_rmse_forward:.8f}")
print(f"Test RMSE Backward model        : {test_rmse_backward:.8f}")





In [None]:
# Print the fitted coefficients of each model next to the feature names.

# All-features model. The header previously said "Forward Model", which made
# this listing indistinguishable from the forward-selection one below.
print("\nAll Features Model Feature Weights:")
weights = model.coef_
for name, weight in zip(feature_names, weights):
    print(f"Feature: {name:<10} Weight: {weight:.8f}")


# Forward-selected model: coefficients follow the order of the selected
# column indices, so index the names with the same array.
print("\nForward Model Feature Weights:")
forward_weights = model_forward.coef_
for name, weight in zip(feature_names[selected_forward_features], forward_weights):
    print(f"Feature: {name:<10} Weight: {weight:.8f}")

# Backward-selected model: same pairing with its own selected indices.
print("\nBackward Model Feature Weights:")
backward_weights = model_backward.coef_
for name, weight in zip(feature_names[selected_backward_features], backward_weights):
    print(f"Feature: {name:<10} Weight: {weight:.8f}")