In [2]:
import pandas as pd

## Model Training

In [3]:
# Load the training data; the path is relative to the notebook's working directory
df=pd.read_csv('data/train.csv')

In [4]:
# Preview the first rows to check which columns were read
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [6]:
# Drop the row-identifier column; it is not used as a feature.
# errors='ignore' keeps this cell idempotent: re-running it after 'id' has
# already been removed is a no-op instead of raising KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [7]:
# Separate the predictor columns (X) from the regression target (Y).
# Y is kept as a one-column DataFrame rather than a Series.
X = df.drop(columns=['price'])
Y = df.loc[:, ['price']]

In [10]:
# Partition the feature columns by dtype: object columns are ordinal-encoded
# downstream, every other column is treated as numeric and scaled
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

In [12]:
# Custom rank order for each ordinal variable; position in the list is the
# integer code the ordinal encoder assigns (first entry -> 0)
cut_categories = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_categories = ['J', 'I', 'H', 'G', 'F', 'E', 'D']
clarity_categories = [
    'I1', 'SI2', 'SI1', 'VS2',
    'VS1', 'VVS2', 'VVS1', 'IF',
]

In [11]:
from sklearn.impute import SimpleImputer # handle missing value
from sklearn.preprocessing import StandardScaler# handling feature scaling
from sklearn.preprocessing import OrdinalEncoder # ordinal encoding 
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [14]:
## Numerical Pipeline
# Fill missing numeric values with the column median, then standardize.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Rank order for each ordinal column, keyed by column name.
# OrdinalEncoder matches its `categories` lists to input columns purely by
# position, so the list is built from `categorical_cols` below instead of
# being hard-coded in one assumed column order. A categorical column with no
# entry here fails immediately with a KeyError naming the column.
ordinal_categories = {
    'cut': cut_categories,
    'color': color_categories,
    'clarity': clarity_categories,
}

# Categorical Pipeline
# Fill missing values with the most frequent category, map each category to
# its rank, then standardize the resulting integer codes.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder(
            categories=[ordinal_categories[col] for col in categorical_cols]
        )),
        ('scaler', StandardScaler()),
    ]
)

# Route each column group through its own pipeline; the transformer names
# become the prefixes of the output feature names.
preprocessor = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)




In [15]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
)

In [16]:
# Fit the preprocessor on the training split only, then apply the same fitted
# transform to the test split.
train_matrix = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()

# Wrapping the transformed arrays in new DataFrames gives them a fresh 0..n-1
# index, while Y_train / Y_test keep the original row labels.
X_train = pd.DataFrame(train_matrix, columns=feature_names)
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=feature_names)

In [17]:
# Inspect the first rows of the preprocessed training features
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,0.8741,0.936747,1.350746
1,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,-1.137644,-0.910853,0.684455
2,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,-0.131772,-0.910853,0.018164
3,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,0.8741,0.32088,2.017037
4,-0.995648,0.25823,0.400636,-1.176382,-1.152082,-1.136403,-1.137644,-1.52672,-0.648127


In [18]:
# Inspect the first rows of the training target (still carries the original row labels)
Y_train.head()

Unnamed: 0,price
11504,1181
95284,7418
184777,12755
5419,1020
45466,445


In [21]:
# model training
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error


In [22]:
# Baseline: fit a plain linear regression on the preprocessed training split.
# The fit call is left as the last expression so the fitted estimator is displayed.
regression = LinearRegression()
regression.fit(X_train,Y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [23]:
# Fitted coefficients, one per preprocessed feature column
regression.coef_

array([[ 6432.97591819,  -132.34206204,   -70.48787525, -1701.38593925,
         -494.17005097,   -76.32351645,    68.80035873,   464.67990411,
          652.10059539]])

In [28]:
# Fitted intercept term of the baseline model
regression.intercept_

array([3976.8787389])

In [32]:
# Display the preprocessed training frame (the rendered output elides the middle rows)
X_train

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,0.874100,0.936747,1.350746
1,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,-1.137644,-0.910853,0.684455
2,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,-0.131772,-0.910853,0.018164
3,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,0.874100,0.320880,2.017037
4,-0.995648,0.258230,0.400636,-1.176382,-1.152082,-1.136403,-1.137644,-1.526720,-0.648127
...,...,...,...,...,...,...,...,...,...
135496,-0.629077,-1.500179,1.964434,-0.546492,-0.518125,-0.644575,-1.137644,0.936747,-0.648127
135497,2.411307,0.443325,2.485700,1.919078,1.872797,1.930288,-1.137644,0.320880,-0.648127
135498,0.923460,0.906065,0.400636,0.992240,0.921862,1.047891,-0.131772,-0.294987,0.018164
135499,-1.038774,-0.667249,-0.641897,-1.212375,-1.197364,-1.252127,-1.137644,-0.294987,2.017037


In [29]:
import numpy as np 
def evaluate_model(true,prediction):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    prediction : array-like
        Predicted target values, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square, mse)`` -- note the order: RMSE comes second
        and MSE last.
    """
    mae = mean_absolute_error(true, prediction)
    mse = mean_squared_error(true, prediction)
    # Derive RMSE from the MSE already computed instead of scoring twice
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, prediction)
    return mae, rmse, r2_square, mse

In [35]:
# Train multiple models and compare them on the held-out test split.
# Every estimator is constructed with no arguments, i.e. library defaults.
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
}

model_list = []  # model names, in the order they were fitted
r2_list = []     # R^2 on the test split, parallel to model_list

for model_name, model in models.items():
    model.fit(X_train, Y_train)

    # Make predictions on the test split and score them once
    y_pred = model.predict(X_test)
    mae, rmse, r2_square, mse = evaluate_model(Y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE(review): the heading below says "Training", but the metrics are
    # computed from X_test / Y_test. The text is left unchanged so the
    # recorded cell output still matches; consider renaming it.
    print('Model Training Performance')
    print("RMSE:", rmse)
    print("MAE", mae)
    print("MSE", mse)
    print("r2Square:", r2_square*100)

    r2_list.append(r2_square)
    print('='*35)
    print('\n')
      

LinearRegression
Model Training Performance
RMSE: 1014.6296630375463
MAE 675.0758270067479
MSE 1029473.3531156846
r2Square: 93.62906819996049


Lasso
Model Training Performance
RMSE: 1014.6591302750638
MAE 676.2421173665509
MSE 1029533.150650549
r2Square: 93.62869814082755


Ridge
Model Training Performance
RMSE: 1014.6343233534399
MAE 675.1077629781348
MSE 1029482.8101268928
r2Square: 93.62900967491633


ElasticNet
Model Training Performance
RMSE: 1533.3541245902313
MAE 1060.9432977143006
MSE 2351174.8713978743
r2Square: 85.44967219374031




In [36]:
# Names of the evaluated models, in the order they were fitted
model_list


['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']