# Model Training

In [92]:
import pandas as pd

# Read the CSV from the local data directory and preview its first five rows.
df = pd.read_csv('./data/diamonds.csv')
df.head(5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [93]:
# Discard the 'id' column; every other column is kept.
df = df.drop(columns=['id'])

In [94]:
# Independent and dependent features

# X: every column except the target.
X = df.drop(columns=['price'])
# Y: the target as a single-column DataFrame. Only Y is bound here; df is
# deliberately NOT rebound, so it keeps the full feature+target frame and
# this cell can be re-run without 'price' having disappeared from df.
Y = df[['price']]

In [95]:
# Split the feature columns by dtype: object-typed columns are routed to the
# ordinal-encoding pipeline, all remaining columns to the scaling pipeline.

categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

In [96]:
# Custom rank order for each ordinal variable.
# A value's position in its list is the rank order given to that category.

cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = [
    'D',
    'E',
    'F',
    'G',
    'H',
    'I',
    'J',
]
clarity_categories = [
    'I1',
    'SI2',
    'SI1',
    'VS2',
    'VS1',
    'VVS2',
    'VVS1',
    'IF',
]

In [97]:
from sklearn.impute import SimpleImputer # Handing Missing Values
from sklearn.preprocessing import StandardScaler # Handing Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # ordinal encoding
## Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer # Column


In [98]:
# Numerical pipeline: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline: fill missing values with the most frequent category,
# map each category to its rank using the custom orders, then standardize.
# NOTE(review): the `categories` list is positional, so it assumes
# categorical_cols is ordered (cut, color, clarity) -- confirm if columns change.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'ordinalencoder',
        OrdinalEncoder(
            categories=[cut_categories, color_categories, clarity_categories]
        ),
    ),
    ('scaler', StandardScaler()),
])

# Apply each pipeline to its own column group. The transformer names become
# the prefixes of the output feature names.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [99]:
## Train/test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=30,
)

In [100]:
# Fit the preprocessor on the training features only, then apply the already
# fitted transformation to the test features.
train_array = preprocessor.fit_transform(X_train)
test_array = preprocessor.transform(X_test)

# Re-wrap the transformed arrays as DataFrames labelled with the
# preprocessor's output feature names.
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(train_array, columns=feature_names)
X_test = pd.DataFrame(test_array, columns=feature_names)

In [101]:
# Preview the first five rows of the transformed training features.
X_train.head(5)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.817406,1.290192,-0.650469,-0.893064,-0.897014,-0.758306,-1.709604,-0.93989,-0.640054
1,0.048948,0.102662,-0.650469,0.23294,0.248832,0.256665,0.981376,-1.528664,-0.640054
2,-0.479316,-0.386321,-0.201363,-0.374745,-0.332771,-0.391789,-0.812611,-0.93989,-1.248906
3,-0.204619,0.102662,0.247742,-0.017284,-0.05499,-0.025271,0.084383,-1.528664,-1.248906
4,0.450429,-0.386321,0.247742,0.679767,0.6221,0.594989,0.084383,0.237658,1.186501


In [102]:
# Preview the first five rows of the transformed test features.
X_test.head(5)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,1.485828,-3.18051,2.044163,1.707469,1.629056,1.158861,-1.709604,1.415207,0.57765
1,1.528089,-0.106902,-0.650469,1.466182,1.472804,1.440798,-0.812611,0.237658,-0.031202
2,0.217993,1.290192,-0.201363,0.331242,0.266194,0.45402,-1.709604,0.237658,-1.248906
3,0.746257,-0.037048,0.696847,0.849561,0.865159,0.848731,0.981376,1.415207,-0.031202
4,-1.007581,0.242371,-1.099574,-1.223716,-1.157433,-1.153017,0.981376,-0.93989,-0.031202


## Model Training

In [103]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error