In [2]:
import pandas as pd
!pip install scikit-learn==1.4.2



## Model Training

In [4]:
df = pd.read_csv("./data/gemstone.csv")

In [5]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [6]:
df = df.drop(labels=["id"], axis=1)

In [7]:
## Independent and dependent feature
X=df.drop(labels=["price"], axis=1)
Y=df[["price"]]

In [8]:
Y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [9]:
## Define which column should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include="object").columns
numerical_cols = X.select_dtypes(exclude="object").columns

In [10]:
## Define the custom ranking for each ordinal value

cut_categories = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
color_categories = ["D", "E", "F", "G", "H", "I", "J"]
clarity_categories = ["I1", 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', "IF"]

In [11]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [12]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])


In [13]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [14]:
X_train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
168192,0.34,Ideal,I,VVS2,60.9,57.0,4.56,4.53,2.76
35202,0.90,Good,E,SI1,63.8,57.0,6.07,6.03,3.87
41091,1.02,Premium,G,VS1,62.7,58.0,6.35,6.39,4.00
31239,0.32,Premium,G,VS2,62.1,59.0,4.37,4.35,2.71
45722,0.35,Ideal,J,VVS2,61.1,56.0,4.53,4.57,2.78
...,...,...,...,...,...,...,...,...,...
66455,0.31,Ideal,E,SI1,61.8,56.0,4.31,4.35,2.68
46220,1.25,Ideal,G,SI2,62.0,56.0,6.88,6.95,4.28
98804,1.00,Good,G,SI1,63.5,56.0,6.29,6.37,4.02
48045,1.10,Ideal,G,VS1,59.9,60.0,6.68,6.77,4.01


In [27]:
# Assuming you have already fitted your ColumnTransformer named 'preprocessor'
# and stored it in a variable 'preprocessor'

# Get the column names of the numerical features
num_feature_names = preprocessor.transformers[0][2]

# Get the column names of the categorical features after encoding
cat_feature_names = preprocessor.transformers[1][2]
feature_names = num_feature_names.tolist() + cat_feature_names.tolist()
print(feature_names)

X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=feature_names)
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=feature_names)


['carat', 'depth', 'table', 'x', 'y', 'z', 'cut', 'color', 'clarity']


In [29]:
X_train

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
0,-0.975439,-0.849607,-0.121531,-1.042757,-1.080970,-1.123150,0.874076,1.528722,1.352731
1,0.235195,1.833637,-0.121531,0.318447,0.279859,0.485354,-2.144558,-0.935071,-0.646786
2,0.494617,0.815855,0.399800,0.570855,0.606458,0.673737,-0.132136,0.296826,0.686225
3,-1.018676,0.260701,0.921131,-1.214034,-1.244270,-1.195605,-0.132136,0.296826,0.019720
4,-0.953821,-0.664555,-0.642862,-1.069801,-1.044681,-1.094168,0.874076,2.144670,1.352731
...,...,...,...,...,...,...,...,...,...
135496,-1.040295,-0.016876,-0.642862,-1.268122,-1.244270,-1.239078,0.874076,-0.935071,-0.646786
135497,0.991842,0.168176,-0.642862,1.048629,1.114501,1.079486,0.874076,0.296826,-1.313292
135498,0.451380,1.556060,-0.642862,0.516768,0.588314,0.702719,-2.144558,0.296826,-0.646786
135499,0.667565,-1.774863,1.442462,0.868337,0.951202,0.688228,0.874076,0.296826,0.686225


In [31]:
X_test

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
0,-0.564688,-0.942132,-0.642862,-0.429765,-0.464061,-0.500036,-0.132136,-0.935071,0.019720
1,-0.175556,1.000906,-0.121531,-0.042137,-0.028595,0.036132,-1.138347,0.912774,-0.646786
2,-1.061913,0.260701,-0.121531,-1.304180,-1.298703,-1.268060,0.874076,0.912774,2.685743
3,0.970223,-0.201927,1.963794,1.048629,0.996563,0.978049,-0.132136,0.296826,0.019720
4,-0.932202,-1.312235,0.399800,-1.006699,-0.990248,-1.065186,-0.132136,-0.935071,0.686225
...,...,...,...,...,...,...,...,...,...
58067,1.013460,1.185958,-0.642862,1.003556,1.041924,1.151941,-1.138347,0.912774,0.019720
58068,-0.997058,0.260701,-1.164193,-1.141917,-1.126331,-1.108659,0.874076,-0.319122,2.019237
58069,-0.197174,-3.347799,1.442462,0.102096,0.071199,-0.224706,-0.132136,2.144670,0.019720
58070,-0.824110,-0.201927,-0.121531,-0.853450,-0.881382,-0.876803,0.874076,0.296826,-0.646786


In [51]:
## Model Training
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [53]:
regression = LinearRegression()
regression.fit(X_train, y_train)

In [55]:
regression.coef_

array([[ 6433.66003594,  -132.75843566,   -70.42922179, -1720.30971463,
         -499.29302619,   -63.39317848,    72.44537247,  -460.41604642,
          650.76431652]])

In [57]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [59]:
## Train multiple models

models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "ElasticNet": ElasticNet()
}

trained_model_list = []
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    
    # Make prediction
    y_pred = model.predict(X_test)
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)
    
    print(list(models.values())[i])
    model_list.append(list(models.values())[i])
    
    print("Model Training Performance")
    print("rmse: ", rmse)   
    print("mae: ", mae)    
    print("r2_score: ", r2_square)
    
    r2_list.append(r2_square)
    
    print("="*35)
    print("\n")
    

LinearRegression()
Model Training Performance
rmse:  1013.9047094344003
mae:  674.0255115796848
r2_score:  0.9368908248567511


Lasso()
Model Training Performance
rmse:  1013.8784226767013
mae:  675.071692336216
r2_score:  0.9368940971841704


Ridge()
Model Training Performance
rmse:  1013.9059272771556
mae:  674.0555800798325
r2_score:  0.9368906732505949


ElasticNet()
Model Training Performance
rmse:  1533.4162456064046
mae:  1060.7368759154729
r2_score:  0.8556494831165182


