In [1]:
import pandas as pd

## Model Training

In [3]:
df=pd.read_csv('data/gemstone.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
df.drop(labels=['id'],axis=1,inplace=True)

In [10]:
X=df.drop(labels=['price'],axis=1)
Y=df[['price']]

In [11]:
Y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [9]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77


In [12]:
# Segregating numerical and categorical columns
numerical_col=X.select_dtypes(exclude='object').columns
categorical_col=X.select_dtypes(include='object').columns


In [13]:
# Define custom ranking for each ordinal variables

cut_categories=['Fair','Good','Very Good','Premium', 'Ideal']
color_categories=['D','E','F','G','H','I','J']
clarity_categories=['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [15]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [16]:
#Numerical Pipeline
numerical_pipeline=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="median")),
    ('scaler',StandardScaler())
])

#Categorical Pipeline
categorical_pipeline=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="most_frequent")),
    ('encoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
    ('scaler',StandardScaler())
])

preprocessor=ColumnTransformer([
    ('numerical_pipeline',numerical_pipeline,numerical_col),
    ('categorical_pipeline',categorical_pipeline,categorical_col)
])

In [20]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.33,random_state=30)

In [21]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [23]:
X_train.head()

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,numerical_pipeline__y,numerical_pipeline__z,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,-1.190903,0.538078,-0.121487,-1.573815,-1.57007,-1.527658,0.873771,-0.934593,1.352685
1,0.904529,-0.479492,-0.642431,1.03915,1.04146,0.97729,0.873771,0.297424,2.019062
2,-1.039686,-2.144607,0.399457,-1.195386,-1.189222,-1.339424,-2.143965,-0.318584,0.686308
3,-0.845265,-0.664505,1.441345,-0.880028,-0.880916,-0.91952,-0.132141,0.297424,0.019931
4,1.250167,-1.127037,1.441345,1.282427,1.340698,1.165523,-0.132141,2.14545,-0.646446


In [24]:
# Model Training

from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [25]:
reg=LinearRegression()
reg.fit(X_train,y_train)

In [26]:
reg.coef_

array([[ 6432.37246544,  -133.37085491,   -71.5594995 , -1712.75940631,
         -509.03110192,   -56.40910864,    72.29448314,  -459.25648103,
          650.78049561]])

In [27]:
reg.intercept_

array([3971.54247338])

In [29]:
import numpy as np
def evaluate_model(true,predict):
    mae=mean_absolute_error(true,predict)
    mse=mean_squared_error(true,predict)
    rmse=np.sqrt(mean_squared_error(true,predict))
    r2_square=r2_score(true,predict)

    return mae,mse,r2_square

In [35]:
models={'linear':LinearRegression(),
        'lasso':Lasso(),
        'ridge':Ridge(),
        'elasticnet':ElasticNet()
        }

trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_predict=model.predict(X_test)
    mae,rmse,r2=evaluate_model(y_test,y_predict)
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance:')
    print('MAE',mae)
    print('r2_score',r2*100)
    print('RMSE',rmse)


    r2_list.append(r2_list)

    print("=="*35)
    print('\n')

linear
Model Training Performance:
MAE 673.1192939264242
r2_score 93.69455849250947
RMSE 1024282.5674097423


lasso
Model Training Performance:
MAE 674.1819437208048
r2_score 93.69449114520184
RMSE 1024293.5075910259


ridge
Model Training Performance:
MAE 673.1514706239266
r2_score 93.69452971111568
RMSE 1024287.2427809915


elasticnet
Model Training Performance:
MAE 1059.8546884824789
r2_score 85.56339858023905
RMSE 2345142.548596719




In [38]:
from lazypredict.Supervised import LazyRegressor

In [1]:
reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
print(models)

NameError: name 'LazyRegressor' is not defined