In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw training table from the project-relative data directory.
data = pd.read_csv("./data/train.csv")

In [3]:
# Remove the "id" column. This rebinds `data`, so running the cell a second
# time without reloading raises KeyError because "id" is already gone.
data = data.drop(columns="id")

In [4]:
# Preview the first rows to sanity-check the load and the remaining columns.
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [5]:
# Feature frame: every column of `data` except the "price" target.
x = data.drop(columns=["price"])

In [6]:
# Target kept as a one-column DataFrame (not a Series) holding "price".
y = data.loc[:, ["price"]]

In [7]:
# Echo the feature frame (all columns except "price") as the cell output.
x

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81


In [8]:
# Echo the target frame (the single "price" column) as the cell output.
y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [9]:
# Names of the object-dtype feature columns; these go through the categorical branch.
categorical_cols = x.select_dtypes(include=["object"]).columns

In [10]:
# Names of every non-object feature column; these go through the numeric branch.
numerical_cols = x.select_dtypes(exclude=["object"]).columns

In [11]:
# Echo the selected object-dtype column names.
categorical_cols

Index(['cut', 'color', 'clarity'], dtype='object')

In [12]:
# Echo the selected non-object column names.
numerical_cols

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [13]:
# Explicit category orderings for the three ordinal features. The position of
# a label in its list is the integer code the ordinal encoder assigns to it.
# NOTE(review): cut and clarity look ordered worst -> best, whereas color runs
# D -> J, which is conventionally best -> worst; confirm the direction is intended.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = [
    'D',
    'E',
    'F',
    'G',
    'H',
    'I',
    'J',
]
clarity_categories = [
    'I1',
    'SI2',
    'SI1',
    'VS2',
    'VS1',
    'VVS2',
    'VVS1',
    'IF',
]

In [14]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [15]:
# Numeric branch: fill missing values with SimpleImputer's default settings
# (scikit-learn's default strategy is the column mean), then standardise.
num_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer()),
        ("scaler", StandardScaler()),
    ]
)

In [16]:
# Categorical branch: fill missing values with the most frequent label, then
# map labels to integer codes using the explicit rankings defined earlier.
# The order of the `categories` lists must follow the column order of
# `categorical_cols` (cut, color, clarity).
cat_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "ordinalencoder",
            OrdinalEncoder(
                categories=[cut_categories, color_categories, clarity_categories]
            ),
        ),
    ]
)

In [17]:
# Route each column group through its own pipeline. The transformer names
# given here become the prefixes of the output feature names
# (e.g. "num_pipeline__carat", "cat_pipeline__cut").
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, numerical_cols),
        ("cat_pipeline", cat_pipeline, categorical_cols),
    ]
)

In [18]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=25,
)

In [19]:
# Fit the preprocessor on the training split and echo the transformed array.
# The result is only displayed here; it is not assigned to a variable.
preprocessor.fit_transform(x_train)

array([[ 1.60331087, -0.5764124 ,  1.44824605, ...,  3.        ,
         2.        ,  4.        ],
       [ 0.30396158, -0.29901802, -0.11686541, ...,  4.        ,
         2.        ,  1.        ],
       [-0.88710859,  0.25577075, -0.11686541, ...,  4.        ,
         1.        ,  2.        ],
       ...,
       [-1.06035517,  0.81055951, -1.68197687, ...,  4.        ,
         4.        ,  7.        ],
       [-0.84379695, -0.6688772 , -0.63856923, ...,  4.        ,
         1.        ,  5.        ],
       [-0.84379695, -0.11408843, -0.63856923, ...,  2.        ,
         3.        ,  6.        ]])

In [20]:
# Apply the already-fitted preprocessor to the test split (transform only,
# no refit) and echo the result without storing it.
preprocessor.transform(x_test)

array([[-0.9953877 ,  0.07084116, -0.11686541, ...,  4.        ,
         0.        ,  3.        ],
       [-0.93042024, -0.29901802, -0.63856923, ...,  4.        ,
         0.        ,  2.        ],
       [ 1.55999923, -0.76134199, -0.63856923, ...,  4.        ,
         3.        ,  3.        ],
       ...,
       [-1.01704352, -0.39148281, -0.63856923, ...,  4.        ,
         0.        ,  3.        ],
       [-0.45399217, -0.48394761, -0.11686541, ...,  4.        ,
         3.        ,  5.        ],
       [-0.80048531, -1.50106034,  0.92654223, ...,  3.        ,
         2.        ,  5.        ]])

In [21]:
# Echo the output column names of the fitted transformer; each name is
# prefixed with the branch that produced it.
preprocessor.get_feature_names_out()

array(['num_pipeline__carat', 'num_pipeline__depth',
       'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
       'num_pipeline__z', 'cat_pipeline__cut', 'cat_pipeline__color',
       'cat_pipeline__clarity'], dtype=object)

In [22]:
# Replace the raw splits with their transformed versions, wrapped back into
# DataFrames labelled with the preprocessor's output feature names. The
# preprocessor is (re)fitted on the training split only; the test split is
# transformed with those fitted parameters.
# NOTE: this rebinds x_train / x_test. Re-running this cell without first
# re-running the split would feed already-transformed frames (whose column
# names no longer match the original ones) back into the preprocessor.
x_train = pd.DataFrame(
    data=preprocessor.fit_transform(x_train),
    columns=preprocessor.get_feature_names_out(),
)
x_test = pd.DataFrame(
    data=preprocessor.transform(x_test),
    columns=preprocessor.get_feature_names_out(),
)

In [23]:
x_train

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,1.603311,-0.576412,1.448246,1.513117,1.581904,1.488971,3.0,2.0,4.0
1,0.303962,-0.299018,-0.116865,0.448527,0.492411,0.445151,4.0,2.0,1.0
2,-0.887109,0.255771,-0.116865,-0.976942,-0.969326,-0.932110,4.0,1.0,2.0
3,0.888669,-0.391483,-0.116865,0.998866,1.037158,0.967061,4.0,3.0,3.0
4,0.910325,0.810560,-0.638569,1.016910,0.946367,1.068544,4.0,5.0,2.0
...,...,...,...,...,...,...,...,...,...
135496,-0.627239,0.625630,-0.116865,-0.561932,-0.606161,-0.526181,4.0,0.0,3.0
135497,2.816037,0.995489,-0.116865,2.144654,2.190205,2.286333,2.0,6.0,3.0
135498,-1.060355,0.810560,-1.681977,-1.274667,-1.250778,-1.193065,4.0,4.0,7.0
135499,-0.843797,-0.668877,-0.638569,-0.850635,-0.833139,-0.888618,4.0,1.0,5.0


In [24]:
x_test

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.995388,0.070841,-0.116865,-1.166403,-1.150908,-1.135075,4.0,0.0,3.0
1,-0.930420,-0.299018,-0.638569,-1.004008,-0.996563,-1.019095,4.0,0.0,2.0
2,1.559999,-0.761342,-0.638569,1.558227,1.527430,1.445478,4.0,3.0,3.0
3,-0.150811,-0.299018,0.926542,-0.002571,0.047535,-0.004271,2.0,4.0,2.0
4,-0.150811,1.180419,-0.638569,-0.011593,0.011218,0.097212,2.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...
58067,-1.017044,1.272883,-0.116865,-1.220535,-1.196303,-1.106080,1.0,4.0,4.0
58068,-0.887109,-0.576412,-0.116865,-0.940854,-0.914851,-0.961105,4.0,5.0,4.0
58069,-1.017044,-0.391483,-0.638569,-1.130315,-1.159987,-1.164070,4.0,0.0,3.0
58070,-0.453992,-0.483948,-0.116865,-0.318339,-0.279313,-0.337713,4.0,3.0,5.0


In [25]:
# Free-text list of the regressors compared below. It is a bare string
# expression, so the notebook simply echoes it as the cell output.
""" linear regression
    ridge regression
    lasso regression
    elastic net
"""

' linear regression\n    ridge regression\n    lasso regression\n    elastic net\n'

In [26]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet # type: ignore
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error # type: ignore

In [27]:
import numpy as np
def evaluate_model(true_value, predicted_value):
    """Score a set of predictions against the ground truth.

    Both arguments are passed unchanged to the scikit-learn metric functions.

    Args:
        true_value: Observed target values.
        predicted_value: Model predictions for the same samples.

    Returns:
        A 3-tuple ``(mae, rmse, r2)``: mean absolute error, root mean squared
        error (square root of the mean squared error), and the R^2 score.
    """
    r2 = r2_score(true_value, predicted_value)
    # RMSE is derived from MSE so it is expressed in the target's own units.
    root_mse = np.sqrt(mean_squared_error(true_value, predicted_value))
    abs_err = mean_absolute_error(true_value, predicted_value)
    return abs_err, root_mse, r2

In [29]:
# Candidate regressors, each with its default hyper-parameters, keyed by the
# display name used when reporting results. Insertion order is the order in
# which they are trained and reported.
models = dict(
    [
        ("LinearRegression", LinearRegression()),
        ("Ridge", Ridge()),
        ("Lasso", Lasso()),
        ("ElasticNet", ElasticNet()),
    ]
)

In [31]:
# Parallel accumulators filled by the training loop: model names and their
# R^2 scores, index-aligned. Two distinct list objects.
model_list, r2_list = [], []

In [39]:
# Fit every candidate regressor on the training split, score it on the
# held-out test split, print the metrics, and record the model name and its
# R^2 score in the parallel lists `model_list` / `r2_list`.

# Empty the accumulators first so that re-running this cell does not append
# duplicate entries to the lists created in the earlier cell.
model_list.clear()
r2_list.clear()

# Iterate name/estimator pairs directly instead of rebuilding key and value
# lists and indexing into them on every pass.
for model_name, model in models.items():
    model.fit(x_train, y_train)

    # Make predictions on the held-out test split
    y_pred = model.predict(x_test)

    # This is a validation (test) score, not a training score
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE(review): the heading says "Training", but the metrics printed below
    # are computed on the test split. The text is left unchanged so it still
    # matches the recorded output; consider renaming it on the next re-run.
    print('Model Training Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score", r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE: 1014.9136406378977
MAE: 673.4234973219693
R2 score 93.74806057731074


Ridge
Model Training Performance
RMSE: 1014.9163243190184
MAE: 673.456340590975
R2 score 93.74802751393804


Lasso
Model Training Performance
RMSE: 1014.8238071073433
MAE: 674.5835453256957
R2 score 93.74916729006459


ElasticNet
Model Training Performance
RMSE: 1526.9362038140018
MAE: 1057.589298585818
R2 score 85.84863388147417




In [40]:
# Echo the collected R^2 scores (as fractions, not percentages), in the same
# order as model_list.
r2_list

[0.9374806057731074, 0.9374802751393805, 0.937491672900646, 0.8584863388147417]

In [41]:
# Echo the model names in the order they were trained.
model_list

['LinearRegression', 'Ridge', 'Lasso', 'ElasticNet']