In [79]:
##Model Training

Model training

In [80]:
import pandas as pd
df = pd.read_csv('gemstone.csv')

In [81]:
#df = df.drop(labels='id',axis=1)

In [82]:
## independent and dependent features
x = df.drop(labels='id',axis=1)
y = df[['price']]

In [83]:
x.clarity.unique()

array(['VS2', 'SI2', 'VS1', 'SI1', 'IF', 'VVS2', 'VVS1', 'I1'],
      dtype=object)

In [84]:
# define which columns should be ordinal encoded
categorical_cols = x.select_dtypes(include='object').columns
numerical_cols = y.select_dtypes(exclude='object').columns

In [85]:
# define the custom ranking for each ordinal variable
cut_categories = ['Fair','Good','Very Good','Premium','Ideal']
color_categories = ['D','E','F','G','H','I','J']
claritY_categories = ['I1','SI2','SI1','VS2','VS1','VVS2', 'VVS1','IF']

In [86]:
from sklearn.impute import SimpleImputer ##Handling Missing values
from sklearn.preprocessing import StandardScaler #Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # ordinal Encoding --- categorical features

In [87]:
### pipelines is just for combining multiple files
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [88]:
### numerical pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
   
    ]

    
    
)

# categorical Pipeline
cat_pipeline = Pipeline(
    steps = [
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,claritY_categories])),
        ('scaler',StandardScaler())
    ]
)

preprocessor = ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [89]:
# train test split

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30,random_state=42)

In [90]:
preprocessor.fit_transform(x_train)

array([[-0.69195358,  0.87410007, -0.93674681,  1.35074594],
       [ 0.85164501, -1.13764403,  0.91085333,  0.68445511],
       [ 2.17250211, -0.13177198,  0.91085333,  0.01816428],
       ...,
       [ 0.79991949, -0.13177198,  0.29498662,  0.01816428],
       [-0.78228764, -1.13764403,  0.29498662,  2.01703677],
       [-0.88326378, -1.13764403,  0.29498662, -1.31441737]])

In [91]:
# x_train= pd.DataFrame(preprocessor.fit_transform(x_train),columns=preprocessor.get_params_out())## gettog error while pulling column name
x_train= pd.DataFrame(preprocessor.fit_transform(x_train))
x_test = pd.DataFrame(preprocessor.transform(x_test)) ### data leakage will happen if we give fit in test dataset

In [92]:
x_train.head()

Unnamed: 0,0,1,2,3
0,-0.691954,0.8741,-0.936747,1.350746
1,0.851645,-1.137644,0.910853,0.684455
2,2.172502,-0.131772,0.910853,0.018164
3,-0.7318,0.8741,-0.32088,2.017037
4,-0.874107,-1.137644,1.52672,-0.648127


In [95]:
# Model training
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [96]:
regression = LinearRegression()
regression.fit(x_train,y_train)

LinearRegression()

In [97]:
regression.coef_

array([[ 4.04055821e+03,  7.98062063e-13,  9.41509043e-13,
        -3.32305345e-13]])

In [98]:
regression.intercept_

array([3976.8787389])

In [99]:
import numpy as np
def evaluate_model(true,predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2_squared = r2_score(true,predicted)
    return mae, rmse, r2_squared

In [109]:
### train multiple models

models ={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'ElasticNet':ElasticNet()

}
    


model_list = []
r2_list = []


for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train,y_train)

    # make prediction
    y_pred = model.predict(x_test)

    evaluate_model(y_test,y_pred)

    mae, rmse, r2_squared=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])


    print('Model Training Performance')
    print('RMsE:',rmse)
    print("R2 score",r2_squared*100)

    r2_list.append(r2_squared)

    print('*'*35)
    print('\n')

LinearRegression
Model Training Performance
RMsE: 6.400461278177252e-12
R2 score 100.0
***********************************


Lasso
Model Training Performance
RMsE: 0.9948866766758573
R2 score 99.99999387459123
***********************************


Ridge
Model Training Performance
RMsE: 0.03110081724823071
R2 score 99.99999999401408
***********************************


ElasticNet
Model Training Performance
RMsE: 1339.5160395427893
R2 score 88.89589185592791
***********************************


