In [38]:
import pandas as pd

In [39]:
df = pd.read_csv("data/diamonds.csv")
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [40]:
## Independent and Depenent Features
X = df.drop(labels=['price'], axis=1)
Y = df[['price']]
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
49995,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50
49996,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61
49997,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56
49998,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74


In [41]:
Y

Unnamed: 0,price
0,326
1,326
2,327
3,334
4,335
...,...
49995,2757
49996,2757
49997,2757
49998,2757


In [42]:
## Categorise columns - Define which column should be ordinal 
## and which should be scaled

numerical_columns = X.select_dtypes(exclude='object').columns
categocal_columns = X.select_dtypes(include='object').columns
numerical_columns

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [43]:
# Define the custom ranking for each ordinal variable.

cut_categories = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
clarity_categories = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']

In [44]:
from sklearn.impute import SimpleImputer # Handling missing values
from sklearn.preprocessing import StandardScaler # Handling feature scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal encoding
# Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [45]:
## Numerical Pipeline

num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)


# Categorical Pipeline
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories])),
        ('scaler', StandardScaler())
    ]
)


preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_columns),
    ('cat_pipelines', cat_pipeline, categocal_columns)
])


In [46]:
## Train Test split

from sklearn.model_selection import train_test_split

X_train, X_test,Y_train,Y_test = train_test_split(X,Y, test_size=0.30, random_state=30)

In [47]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())

In [48]:
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipelines__cut,cat_pipelines__color,cat_pipelines__clarity
0,0.427486,-0.734707,2.026689,0.534705,0.653911,0.480201,-1.695471,-0.94028,-0.030036
1,0.448607,1.422875,0.688367,0.525783,0.501056,0.677628,-1.695471,-0.350678,-0.637341
2,-1.029824,-0.595509,1.134474,-1.205024,-1.234299,-1.240233,0.090041,2.007729,-0.637341
3,-0.924222,0.170085,-1.096063,-1.02659,-1.018504,-0.972297,0.982797,-0.94028,0.577269
4,-0.966463,0.587682,-0.649956,-1.151494,-1.135393,-1.056908,0.982797,1.418127,0.577269


In [54]:
# Model Training

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [50]:
regression = LinearRegression()
regression.fit(X_train, Y_train)

In [51]:
regression.coef_

array([[ 5120.67334547,  -107.43282725,   -47.34203126, -1892.86436483,
          962.48826487,   -58.24021585,   142.75632976,  -547.76716432,
          815.97991893]])

In [52]:
regression.intercept_

array([3932.71168571])

In [53]:
import numpy as np

def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [57]:
# Train multiple models

models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}

trained_models_list = []
models_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, Y_train)
    
    y_pred = model.predict(X_test)
    
    mae, rmse, r2_square = evaluate_model(Y_test, y_pred)
    
    print(list(models.keys())[i])
    models_list.append(list(models.keys())[i])
    
    print('Model Training Performance')
    
    print("Model training performance")
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 Score:", r2_square*100)
    
    r2_list.append(r2_square)
    print('='*40)
    print("\n")

LinearRegression
Model Training Performance
Model training performance
RMSE: 1300.294195035091
MAE: 815.2975483079103
R2 Score: 89.4998126489661


Lasso
Model Training Performance
Model training performance
RMSE: 1244.1114652053602
MAE: 812.805209284584
R2 Score: 90.38758781200978


Ridge
Model Training Performance
Model training performance
RMSE: 1297.3051187973483
MAE: 815.2759170818821
R2 Score: 89.5480321768205


ElasticNet
Model Training Performance
Model training performance
RMSE: 1647.7194780371813
MAE: 1084.031807333947
R2 Score: 83.13911555798865


