In [9]:
import pandas as pd
import numpy as np
import sklearn

# Reading Dataset

In [2]:
# Load the raw gemstone data set. The path is written with forward slashes so
# it resolves on Windows as well as on POSIX systems (a backslash-separated
# path is only understood on Windows).
df=pd.read_csv('../dataset/Gemstones/cubic_zirconia.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,3,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,4,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
4,5,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779


#### Dropping unnecessary columns

In [4]:
# 'Unnamed: 0' is removed from the frame in place before any features are
# selected, so it never reaches the predictor matrix.
df.drop(columns=['Unnamed: 0'], inplace=True)

#### Separating target and predictor variables

In [5]:
# `price` is the regression target; every remaining column is a predictor.
y = df['price']
X = df.drop(columns=['price'])

#### Ordinal encoding maps for categorical features

In [7]:
# Split the predictor columns by dtype so each group can get its own
# preprocessing pipeline.
numerical_columns = X.select_dtypes(include=np.number).columns
# `np.object` was only an alias of the builtin `object`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
categorical_columns = X.select_dtypes(include=object).columns

In [8]:
# Category orderings handed to OrdinalEncoder: the position of a label in its
# list becomes the integer code assigned to that label.
cut_map = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
clarity_map = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]
color_map = [
    'D', 'E', 'F', 'G', 'H', 'I', 'J',
]

# Data Preparation pipeline

In [11]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
## Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [13]:
# Numerical features: fill missing values with the column median, then
# standardise.
num_steps = [
    ('imputation', SimpleImputer(strategy='median')),
    ('scaling', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

# Categorical features: fill missing values with the most frequent label,
# map labels to integer codes, then standardise the codes.
# NOTE(review): the `categories` lists are matched to columns by position, so
# this order must mirror the order of `categorical_columns` (presumably
# cut, color, clarity -- verify against the data frame).
ordinal_encoder = OrdinalEncoder(categories=[cut_map, color_map, clarity_map])
cat_steps = [
    ('imputation', SimpleImputer(strategy='most_frequent')),
    ('ordinalencoding', ordinal_encoder),
    ('scaling', StandardScaler()),
]
cat_pipeline = Pipeline(steps=cat_steps)

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_columns),
        ('cat_pipeline', cat_pipeline, categorical_columns),
    ]
)

#### Train-test split

In [14]:
from sklearn.model_selection import train_test_split

In [75]:
# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.7, random_state=1033)

# ColumnTransformer emits its output columns in transformer order (all
# numerical columns first, then the categorical ones), NOT in the original
# column order of X. Labelling the result with X.columns would attach the
# wrong name to most features, so the labels are rebuilt in output order.
transformed_columns = list(numerical_columns) + list(categorical_columns)

# The preprocessor is fitted on the training rows only and then re-used on the
# test rows. The original row labels are kept so the features stay aligned
# with y_train / y_test.
X_train=pd.DataFrame(preprocessor.fit_transform(X_train), columns=transformed_columns, index=X_train.index)
X_test=pd.DataFrame(preprocessor.transform(X_test), columns=transformed_columns, index=X_test.index)

# Model Training

In [27]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

#### Manual process

In [57]:
# Fit an ordinary least-squares model on the preprocessed training data and
# show its predictions for the held-out rows.
regression = LinearRegression()
regression.fit(X_train, y_train)
test_predictions = regression.predict(X_test)
print("Predicted values:\t", test_predictions)

Predicted values:	 [4196.23633048  -53.02311236 9735.0446373  ... 5516.98791272  508.87122163
 5276.67528972]


In [56]:
# Report the fitted parameters of the manual model, one labelled line each.
for label, fitted_value in (
    ('Intercept: ', regression.intercept_),
    ('Coefficients: ', regression.coef_),
):
    print(label, fitted_value)

Intercept:  3944.7929646111465
Coefficients:  [5188.68090315 -115.96757356  -75.19539928 -971.67032229   12.26987337
  -35.22665321  124.57722674 -553.61244506  823.40929563]


# Automated Model training

In [77]:
def evaluate_model(true,pred):
    """Score predictions against the ground truth.

    Parameters
    ----------
    true : observed target values.
    pred : predicted target values.

    Returns
    -------
    tuple
        ``(mse, mae, r2)`` -- mean squared error, mean absolute error and
        R2 score, in that order.
    """
    return (
        mean_squared_error(true, pred),
        mean_absolute_error(true, pred),
        r2_score(true, pred),
    )

def model_build_predict(algorithm, X_train, y_train, X_test):
    """Instantiate, fit and apply one estimator.

    Parameters
    ----------
    algorithm : callable taking no arguments that returns an estimator
        exposing ``fit``, ``predict``, ``intercept_`` and ``coef_``.
    X_train, y_train : training features and targets passed to ``fit``.
    X_test : features passed to ``predict``.

    Returns
    -------
    tuple
        ``(y_pred, intercept, coeff)`` -- the predictions for ``X_test`` and
        the fitted estimator's ``intercept_`` and ``coef_`` attributes.
    """
    estimator = algorithm()
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    return predictions, estimator.intercept_, estimator.coef_

# One entry per estimator, keyed by its class name. Each value starts as
# [estimator class] and is extended in the loop below with the rounded
# (MSE, MAE, R2) scores, giving [class, mse, mae, r2].
algo_dict = {
    estimator_cls.__name__: [estimator_cls]
    for estimator_cls in (LinearRegression, Ridge, Lasso, ElasticNet)
}

for model_label, model_record in algo_dict.items():
    y_pred, intercept, coeff = model_build_predict(model_record[0], X_train, y_train, X_test)

    # Keep the scores (rounded to 7 decimals) alongside the class.
    mse, mae, r2 = (round(score, 7) for score in evaluate_model(y_test, y_pred))
    model_record += [mse, mae, r2]

    print (model_label.upper())
    for tag, score, digits in (('MSE :\t', mse, 2), ('MAE :\t', mae, 2), ('R2  :\t', r2, 5)):
        print (tag, round(score, digits))
    rounded_coefficients = [round(weight, 2) for weight in coeff]
    print ('PARAMS:\n', '\tIntercept:', round(intercept,2), '\n\tcoefficients:\n\t', rounded_coefficients)
    print ('-'*20, end='\n\n')

LINEARREGRESSION
MSE :	 1503428.45
MAE :	 810.92
R2  :	 0.90586
PARAMS:
 	Intercept: 3944.79 
	coefficients:
	 [5188.68, -115.97, -75.2, -971.67, 12.27, -35.23, 124.58, -553.61, 823.41]
--------------------

RIDGE
MSE :	 1503446.42
MAE :	 811.16
R2  :	 0.90586
PARAMS:
 	Intercept: 3944.79 
	coefficients:
	 [5181.21, -115.5, -75.14, -963.67, 11.96, -35.64, 124.6, -553.41, 823.51]
--------------------

LASSO
MSE :	 1503755.99
MAE :	 812.29
R2  :	 0.90584
PARAMS:
 	Intercept: 3944.79 
	coefficients:
	 [5140.92, -112.28, -73.75, -914.21, 0.0, -34.07, 124.59, -551.59, 823.62]
--------------------

ELASTICNET
MSE :	 2572667.96
MAE :	 1068.76
R2  :	 0.83891
PARAMS:
 	Intercept: 3944.79 
	coefficients:
	 [1200.9, -13.83, 2.11, 878.13, 697.14, 681.47, 99.34, -166.1, 414.77]
--------------------



# Model Evaluation

In [78]:
# Each algo_dict value is [estimator class, mse, mae, r2]; label those four
# slots as rows, discard the row holding the classes, and transpose so that
# every model becomes one row of metrics.
result_rows = ['Model', 'Mean Squared Error', 'Mean Absolute Error', 'R2_score']
results_by_model = pd.DataFrame(algo_dict, index=result_rows)
modelling_result = results_by_model.drop(index='Model').T
modelling_result

Unnamed: 0,Mean Squared Error,Mean Absolute Error,R2_score
LinearRegression,1503430.0,810.922,0.90586
Ridge,1503450.0,811.158,0.905858
Lasso,1503760.0,812.295,0.905839
ElasticNet,2572670.0,1068.76,0.838907


#### Best Model
(Based on the R² score)

In [79]:
# Keep only the row(s) whose R2 score equals the highest one observed.
r2_scores = modelling_result['R2_score']
modelling_result[r2_scores == max(r2_scores)]

Unnamed: 0,Mean Squared Error,Mean Absolute Error,R2_score
LinearRegression,1503430.0,810.922,0.90586
