In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [3]:
df = pd.read_csv('autopricecleaned.csv')

#Separate features and target

X = df.drop(['price'], axis =1, inplace = False)
y = df['price']

#Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.6, random_state = 30)

#Initialize untrain model with default parameter settings
model = SGDRegressor(random_state = 30)

#Train the model
model.fit(X_train, y_train)

In [4]:
#Score the model
y_pred = model.predict (X_test)

#Evaluate the model

#r-squared indicates the proportion  of the variance in the dependant variable that is
# predictable from the independant variables in a regression model

mae = metrics. mean_absolute_error(y_test, y_pred)
mse = metrics. mean_squared_error (y_test, y_pred)
rmse = mse ** 0.5
r2 = metrics.r2_score(y_test, y_pred)

#Calculate Adjusted R-squared
n= len(y_test) # Number of observations
p = X_test.shape[1] # Number of predictors
adjusted_r2 = 1-(1-r2) * (n-1)/ (n - p - 1)

print(f"n is {n} and p is {p}.")

#print out the metrics
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squsred Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared : {r2}")
print(f"Adjusted R-squared: {adjusted_r2}")

n is 116 and p is 64.
Mean Absolute Error (MAE): 3.873782100672409e+16
Mean Squsred Error (MSE): 1.5075234865017117e+33
Root Mean Squared Error (RMSE): 3.882683976969683e+16
R-squared : -3.4314368210939965e+25
Adjusted R-squared: -7.737553616192344e+25


In [6]:
# Initialize untrain model with default parameters and standardize the features when doing the model training
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(StandardScaler(), SGDRegressor(random_state =30))

In [7]:
pipeline.fit(X_train, y_train)

In [8]:
y_pred = pipeline.predict(X_test)

In [9]:
#r-squared indicates the proportion  of the variance in the dependant variable that is
# predictable from the independant variables in a regression model

mae = metrics. mean_absolute_error(y_test, y_pred)
mse = metrics. mean_squared_error (y_test, y_pred)
rmse = mse ** 0.5
r2 = metrics.r2_score(y_test, y_pred)

#Calculate Adjusted R-squared
n= len(y_test) # Number of observations
p = X_test.shape[1] # Number of predictors
adjusted_r2 = 1-(1-r2) * (n-1)/ (n - p - 1)

print(f"n is {n} and p is {p}.")

#print out the metrics
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squsred Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared : {r2}")
print(f"Adjusted R-squared: {adjusted_r2}")

n is 116 and p is 64.
Mean Absolute Error (MAE): 2143.1713861166304
Mean Squsred Error (MSE): 7795375.86255046
Root Mean Squared Error (RMSE): 2792.020032619834
R-squared : 0.8225610412803831
Adjusted R-squared: 0.5998925440636089


In [10]:
#in-sample performance
y_train_pred = pipeline.predict(X_train)

In [12]:
#r-squared indicates the proportion  of the variance in the dependant variable that is
# predictable from the independant variables in a regression model

mae = metrics. mean_absolute_error(y_train, y_train_pred)
mse = metrics. mean_squared_error (y_train, y_train_pred)
rmse = mse ** 0.5
r2 = metrics.r2_score(y_train, y_train_pred)

#Calculate Adjusted R-squared
# n= len(y_test) # Number of observations
# p= X_test.shape[1] # Number  of predictors
adjusted_r2 = 1-(1-r2) * (n-1)/ (n - p - 1)

print(f"n is {n} and p is {p}.")

#print out the metrics
print(f"In-Sample Mean Absolute Error (MAE): {mae}")
print(f"In-Sample Mean Squsred Error (MSE): {mse}")
print(f"In-Sample Root Mean Squared Error (RMSE): {rmse}")
print(f"In-Sample R-squared : {r2}")
print(f"In-Sample Adjusted R-squared: {adjusted_r2}")

n is 116 and p is 64.
In-Sample Mean Absolute Error (MAE): 933.373232928378
In-Sample Mean Squsred Error (MSE): 1909528.8209273634
In-Sample Root Mean Squared Error (RMSE): 1381.8570189883478
In-Sample R-squared : 0.9794455964500691
In-Sample Adjusted R-squared: 0.9536518351325088


In [13]:
# Define the parameter grid  for GridSearchCV
param_grid = {
    'sgdregressor__alpha': [0.0001, 0.001, 0.01, 0.1], #Regularization strength
    'sgdregressor__learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive'],
    'sgdregressor__eta0' : [0.001, 0.01, 0.1], # Initial Learning rate
    'sgdregressor__max_iter' : [800, 900, 1000] #Maximum iterations
}

In [17]:
# Create a pipeline  that scales data and  applies  SGDRegressor
pipeline_tune = make_pipeline(StandardScaler(), SGDRegressor(random_state = 42))

#Initialize  GridSearch CV with 5-fold cross-validation
grid_search = GridSearchCV(estimator= pipeline_tune, param_grid= param_grid, cv=5, scoring= 'neg_mean_squared_error', verbose=1)

In [18]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits




In [19]:
#print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)

#Get the best model

best_SGDmodel = grid_search.best_estimator_

#Score the model with training set
y_tune_pred = best_SGDmodel.predict(X_test)

Best Parameters: {'sgdregressor__alpha': 0.1, 'sgdregressor__eta0': 0.001, 'sgdregressor__learning_rate': 'invscaling', 'sgdregressor__max_iter': 800}


In [21]:
#Calculate performance metrics
#r-squared indicates the proportion  of the variance in the dependant variable that is
# predictable from the independant variables in a regression model

mae = metrics. mean_absolute_error(y_test, y_tune_pred)
mse = metrics. mean_squared_error (y_test, y_tune_pred)
rmse = mse ** 0.5
r2 = metrics.r2_score(y_test, y_tune_pred)

#Calculate Adjusted R-squared
#n= len(y_testset) # Number of observations
#p = X_testset.shape[1] # Number of predictors
adjusted_r2 = 1-(1-r2) * (n-1)/ (n - p - 1)

#print(f"n is {n} and p is {p}.")

#print out the metrics
print(f"Tuned Mean Absolute Error (MAE): {mae}")
print(f"Tuned Mean Squsred Error (MSE): {mse}")
print(f"Tuned Root Mean Squared Error (RMSE): {rmse}")
print(f"Tuned R-squared : {r2}")
print(f"Tuned Adjusted R-squared: {adjusted_r2}")

Tuned Mean Absolute Error (MAE): 2066.117620709363
Tuned Mean Squsred Error (MSE): 6438317.129932971
Tuned Root Mean Squared Error (RMSE): 2537.383914572836
Tuned R-squared : 0.8534505189249194
Tuned Adjusted R-squared: 0.669545287771877
