In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OrdinalEncoder 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
# Load the data
data = pd.read_csv(r"C:\Firstendtoendproject\notebooks\data\GemstonePricePrediction.csv")
data.drop('Unnamed: 0', axis=1, inplace=True)

In [3]:
# Split the data into features (X) and target (y)
X = data.drop(labels=["price"], axis=1)
y = data[["price"]]

In [4]:
# Get the unique values for each categorical column
unique_carat_values = X['carat'].unique()
unique_cut_values = X['cut'].unique()
unique_color_values = X['color'].unique()
unique_clarity_values = X['clarity'].unique()

In [5]:
# Define the categorical and numerical columns
categorical_cols = X.select_dtypes(include='object').columns
numerical_col = X.select_dtypes(exclude='object').columns

In [6]:
# Define the custom ranking for each ordinal variable
cut_categories = ['Fair','Good','Very Good','Premium','Ideal']
color_categories = ['D','E','F','G','H','I','J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','WS2','WS1','IF', 'VVS1', 'VVS2']

In [7]:
# Create pipelines for numerical and categorical columns
num_pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

In [8]:
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('OrdinalEncoder', OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories]))
])

In [9]:
# Create a ColumnTransformer to combine the pipelines
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_col),
    ('cat_pipeline', cat_pipeline, categorical_cols)
])

In [10]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=30)

In [11]:
# Fit the preprocessor to the training data and transform both the training and testing data
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())

In [12]:
# Define the models
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
    }

In [13]:
# Define a function to evaluate the models
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, mse, rmse, r2_square

In [14]:
# Train and evaluate each model
trained_model_list = []
model_list = []
r2_list = []
accuracy_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    mae, mse, rmse, r2_square = evaluate_model(y_test, y_pred)

     # Calculate R2 score
    r2 = r2_score(y_test, y_pred)


    # Calculate accuracy score (for regression models, this is not applicable, so we'll use R2 score as a proxy)
    accuracy = r2

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print('R2 score', r2_square * 100)
    print('Accuracy score', accuracy * 100)

    r2_list.append(r2_square)
    accuracy_list.append(accuracy)

    trained_model_list.append(model)

    print('=' * 35)
    print('\n')


LinearRegression
Model Training Performance
RMSE: 1230.5428399867646
MAE: 834.5657751536753
R2 score 90.68727752593722
Accuracy score 90.68727752593722


Lasso
Model Training Performance
RMSE: 1231.8029968888798
MAE: 836.5567165966868
R2 score 90.6681940774303
Accuracy score 90.6681940774303


Ridge
Model Training Performance
RMSE: 1230.7316884496001
MAE: 834.8716560443316
R2 score 90.68441890416784
Accuracy score 90.68441890416784


ElasticNet
Model Training Performance
RMSE: 1623.8448986685437
MAE: 1091.7042767422076
R2 score 83.7829343099173
Accuracy score 83.7829343099173




In [15]:
trained_model_list

[LinearRegression(), Lasso(), Ridge(), ElasticNet()]

In [16]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']

In [17]:
r2_list


[0.9068727752593722, 0.906681940774303, 0.9068441890416784, 0.8378293430991729]

In [18]:
accuracy_list

[0.9068727752593722, 0.906681940774303, 0.9068441890416784, 0.8378293430991729]