In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from category_encoders import MEstimateEncoder
from sklearn.preprocessing import MinMaxScaler
import regression as reg
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.linear_model import Lasso,Ridge
from sklearn.model_selection import GridSearchCV


In [None]:
# Read the cleaned car listings and one-hot encode `country` in one step.
# drop_first=True omits the first dummy level; indicators are stored as int8.
df = pd.get_dummies(
    pd.read_csv('data/cleaned_cars.csv'),
    columns=['country'],
    drop_first=True,
    dtype=np.int8,
)
df.head()

## Target encoding

In [None]:
# Separate the feature columns from the `price` target.
X = df.drop(columns='price')
y = df['price']

# Reserve 20% of the rows exclusively for fitting the encoder; those rows are
# then removed from X / y so they never reach the model's training data.
X_encode = X.sample(frac=0.2, random_state=42)
y_encode = y.loc[X_encode.index]
X = X.drop(index=X_encode.index)
y = y.loc[X.index]

# M-estimate target encoder for the `brand` column (m chosen to control noise).
encoder = MEstimateEncoder(cols=["brand"], m=5.0)

# Learn the encoding from the held-out encoding split only.
encoder.fit(X_encode, y_encode)

# Encode the `brand` column of the remaining rows, then renumber both
# X and y from 0 so their indices stay aligned.
X = encoder.transform(X).reset_index(drop=True)
y = y.reset_index(drop=True)

In [None]:
# Preview the first five rows of the encoded feature frame.
X.head(n=5)

In [None]:
# Preview the first five target values.
y.head(n=5)

In [None]:
# Stage 1: keep 60% of the rows for training; the other 40% go to a
# temporary pool. random_state is fixed so the split is reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Stage 2: halve the temporary pool into validation and test sets
# (each about 20% of the full dataset), using the same random_state.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)


In [None]:
# Fit the min-max scaler on the training split only, then apply the same
# fitted transform to every split.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Report how many rows ended up in each split.
print(f"Training set size: {(X_train_scaled.shape[0])}")
print(f"Validation set size: {(X_val_scaled.shape[0])}")
print(f"Test set size: {(X_test_scaled.shape[0])}")

# Show a preview of the scaled training features. The scaler returns a plain
# array, so wrap it in a DataFrame to restore the column names. (A bare
# `X_train_scaled` expression in the middle of a cell displays nothing, so it
# was removed; only the last expression of a cell is rendered.)
print("\nTraining Set:")
df_ok = pd.DataFrame(X_train_scaled, columns=X_train.columns)
df_ok.head()

## Linear regression without the scikit-learn API (custom `regression` module)

In [None]:
# Fit weights with the project's own `regression` module on the scaled
# training features.
w = reg.linear_regression_fit(X_train_scaled, y_train)

# Predict on the *scaled* test features: the weights were learned in the
# scaled feature space, so unscaled inputs would give meaningless predictions.
y_predict = reg.linear_regression_predict(X_test_scaled, w)


In [None]:
# Mean squared error of `y_predict` against the test targets.
# scikit-learn metrics take (y_true, y_pred) in that order; MSE happens to be
# symmetric, but the conventional order matches the other metric calls in
# this notebook and stays correct if the metric is ever swapped.
mse = mean_squared_error(y_test, y_predict)
print(f"Mean Squared Error: {mse}")

In [None]:
# Fit weights with the project's gradient-descent routine, which returns the
# weights together with a sequence of errors.
num_iterations = 1000
# NOTE(review): 0.01 is presumably the learning rate — confirm against
# reg.gradient_descent's signature.
w, errors = reg.gradient_descent(X_train_scaled, y_train, 0.01, num_iterations)
print(w.shape)

# Evaluate on the scaled test split; metric arguments are (y_true, y_pred).
y_predict = reg.linear_regression_predict(X_test_scaled, w)
mean_squared_error(y_test, y_predict)

In [None]:
# Plot the error values returned by gradient descent.
plt.figure(figsize=(10, 6))
# The x-axis comes from len(errors), so the plot does not depend on
# `num_iterations` still matching the run that produced `errors`.
# The label gives plt.legend() an entry to show; without a labelled artist
# matplotlib warns and draws an empty legend.
plt.plot(range(len(errors)), errors, label='Error')
plt.xlabel('Iteration')
plt.ylabel('Error')
plt.title('Gradient Descent Error')
plt.legend()
plt.show()

## Lasso Implementation (API)

In [None]:
# Baseline Lasso model with scikit-learn's default hyperparameters,
# trained on the scaled training split.
lasso = Lasso()
lasso.fit(X=X_train_scaled, y=y_train)

In [None]:
# Evaluate the baseline Lasso model on the validation split.
y_val_pred = lasso.predict(X_val_scaled)
mae, mse = (
    metric(y_val, y_val_pred)
    for metric in (mean_absolute_error, mean_squared_error)
)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")


In [None]:
# Hyperparameter values: candidate regularisation strengths (alpha)
# tried by the grid searches below.
param_grid = dict(alpha=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100])

In [None]:
# Grid search with 5-fold cross-validation over `param_grid` to pick the best
# alpha for Lasso; n_jobs=-1 uses all available CPU cores.
lasso_cv = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
lasso_cv.fit(X_train_scaled, y_train)

In [None]:
# Evaluate the grid-searched Lasso model on the validation split.
y_val_pred = lasso_cv.predict(X_val_scaled)
mae, mse, r2 = (
    metric(y_val, y_val_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

## Ridge Implementation (API)

In [None]:
# Baseline Ridge model with scikit-learn's default hyperparameters,
# trained on the scaled training split.
ridge = Ridge()
ridge.fit(X=X_train_scaled, y=y_train)

In [None]:
# Grid search with 5-fold cross-validation over `param_grid` to pick the best
# alpha for Ridge; n_jobs=-1 uses all available CPU cores.
ridge_cv = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
ridge_cv.fit(X_train_scaled, y_train)

In [None]:
# Evaluate the grid-searched Ridge model on the validation split.
y_val_pred = ridge_cv.predict(X_val_scaled)
mae = mean_absolute_error(y_true=y_val, y_pred=y_val_pred)
mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
r2 = r2_score(y_true=y_val, y_pred=y_val_pred)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")