In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [2]:
# Data ingestion
# Read the gemstone dataset from a CSV path relative to the working directory.
data = pd.read_csv('data/gemstone.csv')
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


,'x','y','z','depth'

In [3]:
# Remove the row-identifier column `id` so it does not end up among the features.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run: a second execution no longer raises KeyError once `id` is gone.
data = data.drop(columns=['id'], errors='ignore')

In [4]:
# Preview the frame again to confirm the `id` column is no longer present.
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [5]:
# Separate the predictors from the target.
# `x` holds every column except `price`; `y` stays a one-column DataFrame
# (list selector), not a Series.
x = data.drop(columns='price')
y = data.loc[:, ['price']]

In [6]:
# Partition the feature columns by dtype: object-typed columns are the ones to
# ordinal-encode, every other column is treated as numerical and only scaled.
is_object_column = x.dtypes == 'object'
numerical_columns = x.columns[~is_object_column]
categorical_columns = x.columns[is_object_column]

In [7]:
# Explicit level ordering for each ordinal feature.
# The position of a level inside its list is the rank used when the feature is
# ordinal-encoded, so the order of the entries matters.
cut_categories = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_categories = list('DEFGHIJ')  # single-letter grades 'D' through 'J'
clarity_categories = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

In [8]:
# Numerical features: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent level, map
# the levels to integer codes using the explicit orderings, then standardize.
# `categories` is positional: its order (cut, color, clarity) has to match the
# order of the columns handed to this pipeline.
ordinal_levels = [cut_categories, color_categories, clarity_categories]
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Step name spelling is kept as-is; it is part of the pipeline's parameter keys.
    ('oridinalencoder', OrdinalEncoder(categories=ordinal_levels)),
    ('scaler', StandardScaler()),
])

In [9]:
# Route each column group to its own pipeline. The transformer names below
# become the prefixes of the output feature names.
column_routes = [
    ('num_pipeline', num_pipeline, numerical_columns),
    ('categorical_pipeline', categorical_pipeline, categorical_columns),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [10]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [11]:
# Fit the preprocessor on the training rows only, then reuse the fitted
# transformer on the test rows so both splits share the same statistics.
train_matrix = preprocessor.fit_transform(x_train)
feature_names = preprocessor.get_feature_names_out()
test_matrix = preprocessor.transform(x_test)

# NOTE(review): x_train / x_test are overwritten with the transformed frames
# (new column names, fresh 0..n-1 index), so re-run the split cell before
# re-running this one.
x_train = pd.DataFrame(train_matrix, columns=feature_names)
x_test = pd.DataFrame(test_matrix, columns=feature_names)

In [12]:
# Inspect the preprocessed training features (columns are prefixed with the
# name of the pipeline that produced them).
x_train

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,0.472019,0.257973,-0.640440,0.606460,0.624162,0.644271,0.873371,1.527634,-1.315033
1,0.623099,0.812463,-1.161268,0.732515,0.787278,0.803549,0.873371,0.911526,0.017494
2,2.651895,-1.867571,1.442873,2.236173,2.318751,2.005371,-0.132110,1.527634,0.017494
3,-1.017203,1.089708,-0.640440,-1.248351,-1.233542,-1.151223,-1.137590,0.295418,-0.648770
4,-0.110720,1.459368,3.526185,0.039212,-0.010176,0.137478,-2.143070,-0.936798,-1.315033
...,...,...,...,...,...,...,...,...,...
145174,-0.628711,-1.497911,1.963701,-0.546044,-0.517646,-0.644430,-1.137590,-0.936798,-0.648770
145175,2.414483,0.442803,2.484529,1.921035,1.874714,1.932972,-1.137590,-0.320690,-0.648770
145176,0.925260,0.904878,0.401217,0.993629,0.923207,1.049705,-0.132110,0.295418,0.017494
145177,-1.038786,-0.666177,-0.640440,-1.212335,-1.197294,-1.252581,-1.137590,0.295418,2.016286


In [13]:
# Baseline model: a plain linear regression fitted on the preprocessed training split.
regression = LinearRegression()
regression.fit(X=x_train, y=y_train)

In [14]:
# Fitted coefficients of the baseline model, one per preprocessed feature column.
regression.coef_

array([[ 6430.67767479,  -131.82597587,   -71.57263787, -1756.49729491,
         -430.33020445,   -87.5346621 ,    67.85347115,  -464.01098494,
          651.10737279]])

In [15]:
# Fitted intercept term of the baseline model.
regression.intercept_

array([3972.75199581])

In [16]:
def evaluate_model(true,predicted):
    """Score a set of predictions against the observed target values.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions for the same samples as ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # The MSE is only needed as an intermediate for the RMSE, so compute it
    # once here rather than keeping a separate unused value around.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square


In [17]:
# Candidate regressors to compare, keyed by the label printed for each one.
# The random forest gets a fixed seed and has out-of-bag scoring switched on.
models = dict(
    LinearRegression=LinearRegression(),
    Lasso=Lasso(),
    Ridge=Ridge(),
    Elasticnet=ElasticNet(),
    Random_forest=RandomForestRegressor(random_state=42, oob_score=True),
)

In [18]:
# Empty accumulators for the model comparison: fitted estimators, their labels,
# and their R^2 scores. Each name is bound to its own separate list.
trained_model_list, model_list, r2_list = [], [], []

In [19]:
# Fit every candidate on the training split and score it on the held-out test split.
# The accumulators are reset here so re-running this cell does not append
# duplicate entries to them.
trained_model_list, model_list, r2_list = [], [], []

# y_train is a one-column frame; flatten it to 1-D so estimators that expect a
# 1-D target (e.g. RandomForestRegressor) do not emit a DataConversionWarning.
y_train_1d = np.ravel(y_train)

for model_name, model in models.items():
    model.fit(x_train, y_train_1d)

    # predictions on the test split
    y_pred = model.predict(x_test)

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)
    trained_model_list.append(model)

    # The metrics below are computed from x_test / y_test, so they are
    # labelled as test performance rather than training performance.
    print('Model Test Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)

    print('='*30)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 1018.1581640245518
MAE: 675.8646403365859
R2 score 93.61253831282487


Lasso
Model Training Performance
RMSE: 1018.2756930699593
MAE: 677.07214907871
R2 score 93.61106358005803


Ridge
Model Training Performance
RMSE: 1018.1611429116626
MAE: 675.8935091374923
R2 score 93.61250093640183


Elasticnet
Model Training Performance
RMSE: 1538.6866758139975
MAE: 1062.8025492504894
R2 score 85.41191791514899




  return fit_method(estimator, *args, **kwargs)


Random_forest
Model Training Performance
RMSE: 611.387978925198
MAE: 309.5469240741178
R2 score 97.69680025885195




In [20]:
# Scratch check: the index range covering the entries of `models`.
range(len(list(models)))

range(0, 5)