In [57]:
import pandas as pd
import numpy as np

In [58]:
data = pd.read_csv(r"C:\FSDSMENDTOEND\notebook\data\cubic_zirconia.csv")
data.drop('Unnamed: 0', axis=1, inplace=True)

In [59]:
X = data.drop(labels=["price"], axis=1)
y = data[["price"]]


In [60]:
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.30,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66
1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.70
2,0.90,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78
3,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.80,2.96
4,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65
...,...,...,...,...,...,...,...,...,...
26962,1.11,Premium,G,SI1,62.3,58.0,6.61,6.52,4.09
26963,0.33,Ideal,H,IF,61.9,55.0,4.44,4.42,2.74
26964,0.51,Premium,E,VS2,61.7,58.0,5.12,5.15,3.17
26965,0.27,Very Good,F,VVS2,61.8,56.0,4.19,4.20,2.60


In [61]:
y

Unnamed: 0,price
0,499
1,984
2,6289
3,1082
4,779
...,...
26962,5408
26963,1114
26964,1656
26965,682


In [62]:
unique_carat_values = X['carat'].unique()
print(unique_carat_values)

unique_cut_values = X['cut'].unique()
print(unique_cut_values)
unique_color_values = X['color'].unique()
print(unique_color_values)
unique_clarity_values = X['clarity'].unique()
print(unique_clarity_values)

[0.3  0.33 0.9  0.42 0.31 1.02 1.01 0.5  1.21 0.35 0.32 1.1  0.71 1.5
 0.34 0.54 1.04 0.4  1.52 1.19 0.66 0.52 0.72 0.77 0.51 1.26 1.55 1.58
 0.43 2.   0.73 1.14 0.78 0.91 0.27 1.8  1.13 0.38 0.57 0.44 0.7  1.22
 0.59 1.2  2.16 1.68 0.76 0.74 0.41 1.51 1.69 1.3  1.   0.28 0.55 1.39
 0.36 0.23 1.37 0.81 2.02 2.8  1.56 2.2  0.75 1.71 1.11 0.92 1.45 1.16
 0.58 0.97 1.03 0.26 1.53 1.63 0.96 1.24 0.39 0.61 0.24 2.01 1.7  0.79
 0.67 1.28 0.25 0.56 1.09 2.11 1.17 0.82 0.53 0.46 2.43 1.65 0.84 1.74
 0.8  1.83 1.25 1.15 0.6  1.06 0.29 1.05 1.18 2.27 2.36 1.07 0.95 0.93
 2.48 1.23 2.03 1.27 0.83 1.43 0.45 1.12 1.59 0.62 1.61 2.04 1.33 0.37
 1.35 1.6  3.04 1.57 2.14 0.94 0.49 1.49 1.76 2.1  1.78 1.34 1.38 2.33
 2.51 2.05 0.87 1.79 1.73 0.69 1.32 1.86 2.61 1.72 1.66 3.01 0.63 2.06
 2.29 1.47 1.08 0.86 0.68 1.31 1.41 0.65 1.54 1.91 2.22 1.29 2.49 0.98
 1.44 0.64 1.87 0.47 2.56 2.28 1.9  1.67 2.45 1.82 1.64 0.48 2.39 2.21
 2.24 2.25 1.99 1.75 2.44 2.09 1.36 0.88 2.07 1.62 0.85 2.35 1.48 2.13
 0.89 4

In [63]:
categorical_cols = X.select_dtypes(include='object').columns
numerical_col = X.select_dtypes(exclude='object').columns

In [64]:
cut_categories = ['Fair','Good','Very Good','Premium','Ideal']
color_categories = ['D','E','F','G','H','I','J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','WS2','WS1','IF', 'VVS1', 'VVS2']

In [65]:
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OrdinalEncoder 
# pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [66]:
# num pipepline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

In [67]:
# cat_pipeline
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('OrdinalEncoder', OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories]))
])

In [68]:
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_col),
    ('cat_pipeline', cat_pipeline, categorical_cols)
])

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=30)


In [70]:

preprocessor.fit_transform(X_train)

array([[ 0.56167397,  0.31982286, -0.6510053 , ...,  4.        ,
         3.        ,  4.        ],
       [-0.60408689, -0.39681436,  1.58987791, ...,  2.        ,
         1.        ,  2.        ],
       [-0.52081826, -0.25348692, -1.09918195, ...,  4.        ,
         4.        ,  1.        ],
       ...,
       [ 1.87315495,  0.7498052 , -0.20282866, ...,  2.        ,
         4.        ,  4.        ],
       [-1.04124722,  1.03646009, -0.20282866, ...,  1.        ,
         0.        ,  1.        ],
       [-0.54163542, -0.68346925, -0.20282866, ...,  4.        ,
         0.        ,  2.        ]])

In [71]:
preprocessor.transform(X_train)

array([[ 0.56167397,  0.31982286, -0.6510053 , ...,  4.        ,
         3.        ,  4.        ],
       [-0.60408689, -0.39681436,  1.58987791, ...,  2.        ,
         1.        ,  2.        ],
       [-0.52081826, -0.25348692, -1.09918195, ...,  4.        ,
         4.        ,  1.        ],
       ...,
       [ 1.87315495,  0.7498052 , -0.20282866, ...,  2.        ,
         4.        ,  4.        ],
       [-1.04124722,  1.03646009, -0.20282866, ...,  1.        ,
         0.        ,  1.        ],
       [-0.54163542, -0.68346925, -0.20282866, ...,  4.        ,
         0.        ,  2.        ]])

In [72]:
preprocessor.get_feature_names_out()

array(['num_pipeline__carat', 'num_pipeline__depth',
       'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
       'num_pipeline__z', 'cat_pipeline__cut', 'cat_pipeline__color',
       'cat_pipeline__clarity'], dtype=object)

In [73]:
# now doing the or showing the transform data
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [74]:
X_train

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.561674,0.319823,-0.651005,0.704838,0.692859,0.723432,4.0,3.0,4.0
1,-0.604087,-0.396814,1.589878,-0.533052,-0.486169,-0.533329,2.0,1.0,2.0
2,-0.520818,-0.253487,-1.099182,-0.409263,-0.376688,-0.410385,4.0,4.0,1.0
3,-0.999613,0.248159,-1.099182,-1.178523,-1.134635,-1.107067,4.0,4.0,4.0
4,0.228599,0.964796,-0.651005,0.413049,0.330729,0.477544,2.0,4.0,3.0
...,...,...,...,...,...,...,...,...,...
18871,-0.187744,-2.045080,2.038055,0.103577,0.044393,-0.150837,3.0,3.0,2.0
18872,0.436771,0.319823,0.245348,0.607575,0.558113,0.614148,3.0,3.0,2.0
18873,1.873155,0.749805,-0.202829,1.535992,1.543444,1.611360,2.0,4.0,4.0
18874,-1.041247,1.036460,-0.202829,-1.346523,-1.260959,-1.189030,1.0,0.0,1.0


In [75]:
X_test

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.833076,1.764954e-01,1.589878,-0.904419,-0.898829,-0.861180,3.0,1.0,2.0
1,0.291051,-9.701241e-01,0.693525,0.536838,0.532848,0.409242,3.0,1.0,1.0
2,0.624125,-3.968144e-01,-0.202829,0.819784,0.734967,0.709771,3.0,6.0,3.0
3,0.832297,-1.328443e+00,2.934408,0.978942,0.886556,0.737092,1.0,1.0,2.0
4,-1.041247,-2.534869e-01,0.693525,-1.293470,-1.210430,-1.243672,2.0,3.0,7.0
...,...,...,...,...,...,...,...,...,...
8086,-0.166927,-1.471770e+00,0.693525,0.032840,0.069658,-0.109855,2.0,1.0,1.0
8087,-0.166927,-9.701241e-01,0.245348,0.032840,0.069658,-0.055213,2.0,3.0,9.0
8088,-1.041247,8.214689e-01,-0.202829,-1.293470,-1.218851,-1.161709,2.0,2.0,3.0
8089,0.436771,-1.818232e-01,0.693525,0.660627,0.583378,0.586827,3.0,3.0,2.0


In [None]:
# its a regression problem
'liner regressin,lasso regression,ridge,Elasticenet'

In [84]:
# model import training
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [85]:
# evlatue the model
import numpy as np
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2_square=r2_score(true,predicted)
    return mae,mse,r2_square

In [86]:
## Train multiple models

models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}

In [87]:
trained_model_list=[]
model_list=[]
r2_list=[]


In [88]:
for i in range (len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

In [95]:
trained_model_list


[LinearRegression(), Lasso(), Ridge(), ElasticNet()]

In [None]:
import pandas as pd
import numpy as np

In [None]:
data=pd.read_csv(r"C:\FSDSMENDTOEND\notebook\data\cubic_zirconia.csv")

In [None]:
data.drop('Unnamed: 0', axis=1, inplace=True)


In [None]:
data

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.30,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.70,984
2,0.90,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.80,2.96,1082
4,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779
...,...,...,...,...,...,...,...,...,...,...
26962,1.11,Premium,G,SI1,62.3,58.0,6.61,6.52,4.09,5408
26963,0.33,Ideal,H,IF,61.9,55.0,4.44,4.42,2.74,1114
26964,0.51,Premium,E,VS2,61.7,58.0,5.12,5.15,3.17,1656
26965,0.27,Very Good,F,VVS2,61.8,56.0,4.19,4.20,2.60,682


In [None]:
X=data.drop(labels=["price"],axis=1)

In [None]:
y=data[["price"]]

In [None]:
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.30,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66
1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.70
2,0.90,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78
3,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.80,2.96
4,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65
...,...,...,...,...,...,...,...,...,...
26962,1.11,Premium,G,SI1,62.3,58.0,6.61,6.52,4.09
26963,0.33,Ideal,H,IF,61.9,55.0,4.44,4.42,2.74
26964,0.51,Premium,E,VS2,61.7,58.0,5.12,5.15,3.17
26965,0.27,Very Good,F,VVS2,61.8,56.0,4.19,4.20,2.60


In [None]:
print(data['carat'])


0        0.30
1        0.33
2        0.90
3        0.42
4        0.31
         ... 
26962    1.11
26963    0.33
26964    0.51
26965    0.27
26966    1.25
Name: carat, Length: 26967, dtype: float64


In [None]:
unique_carat_values = data['carat'].unique()
print(unique_carat_values)


[0.3  0.33 0.9  0.42 0.31 1.02 1.01 0.5  1.21 0.35 0.32 1.1  0.71 1.5
 0.34 0.54 1.04 0.4  1.52 1.19 0.66 0.52 0.72 0.77 0.51 1.26 1.55 1.58
 0.43 2.   0.73 1.14 0.78 0.91 0.27 1.8  1.13 0.38 0.57 0.44 0.7  1.22
 0.59 1.2  2.16 1.68 0.76 0.74 0.41 1.51 1.69 1.3  1.   0.28 0.55 1.39
 0.36 0.23 1.37 0.81 2.02 2.8  1.56 2.2  0.75 1.71 1.11 0.92 1.45 1.16
 0.58 0.97 1.03 0.26 1.53 1.63 0.96 1.24 0.39 0.61 0.24 2.01 1.7  0.79
 0.67 1.28 0.25 0.56 1.09 2.11 1.17 0.82 0.53 0.46 2.43 1.65 0.84 1.74
 0.8  1.83 1.25 1.15 0.6  1.06 0.29 1.05 1.18 2.27 2.36 1.07 0.95 0.93
 2.48 1.23 2.03 1.27 0.83 1.43 0.45 1.12 1.59 0.62 1.61 2.04 1.33 0.37
 1.35 1.6  3.04 1.57 2.14 0.94 0.49 1.49 1.76 2.1  1.78 1.34 1.38 2.33
 2.51 2.05 0.87 1.79 1.73 0.69 1.32 1.86 2.61 1.72 1.66 3.01 0.63 2.06
 2.29 1.47 1.08 0.86 0.68 1.31 1.41 0.65 1.54 1.91 2.22 1.29 2.49 0.98
 1.44 0.64 1.87 0.47 2.56 2.28 1.9  1.67 2.45 1.82 1.64 0.48 2.39 2.21
 2.24 2.25 1.99 1.75 2.44 2.09 1.36 0.88 2.07 1.62 0.85 2.35 1.48 2.13
 0.89 4

In [None]:
unique_cut_values=data['cut'].unique()
print(unique_cut_values)

['Ideal' 'Premium' 'Very Good' 'Good' 'Fair']


In [None]:
unique_color_values=data['color'].unique()
print(unique_color_values)

['E' 'G' 'F' 'D' 'H' 'J' 'I']


In [None]:
unique_clarity_values=data['clarity'].unique()
print(unique_clarity_values)

['SI1' 'IF' 'VVS2' 'VS1' 'VVS1' 'VS2' 'SI2' 'I1']


In [None]:
y

Unnamed: 0,price
0,499
1,984
2,6289
3,1082
4,779
...,...
26962,5408
26963,1114
26964,1656
26965,682


In [None]:
categorical_cols=X.select_dtypes(include='object').columns

In [None]:
categorical_cols

Index(['cut', 'color', 'clarity'], dtype='object')

In [None]:
numerical_col=X.select_dtypes(exclude='object').columns

In [None]:
numerical_col

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [None]:
# segrate the cat and num
X.select_dtypes(include='object')

Unnamed: 0,cut,color,clarity
0,Ideal,E,SI1
1,Premium,G,IF
2,Very Good,E,VVS2
3,Ideal,F,VS1
4,Ideal,F,VVS1
...,...,...,...
26962,Premium,G,SI1
26963,Ideal,H,IF
26964,Premium,E,VS2
26965,Very Good,F,VVS2


In [None]:
# Define the custome ranking for each ordinal variable
cut_categories=['Fair','Good','Very Good','Premium','Ideal']
color_categories=['D','E','F','G','H','I','J']
clarity_categories=['I1','SI2','SI1','VS2','VS1','WS2','WS1','IF']

In [None]:
from sklearn.impute import SimpleImputer ## handling the missing values
from sklearn.preprocessing import StandardScaler ## standardizing the data # handling  feature scaling
from sklearn.preprocessing import OrdinalEncoder # ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
#pipe line 
num_pipeline=Pipeline(

    steps=[
        ('imputer',SimpleImputer()),
        ('scaler',StandardScaler())
    ]
  
 )

In [None]:
cat_pipeline=Pipeline(

    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('OrdinalEncoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories]))

    ]
)

In [None]:
preprocessor=ColumnTransformer(

    [
        ('num_pipeline',num_pipeline,numerical_col),
        ('cat_pipeline',cat_pipeline,categorical_cols)
    ]
)

In [None]:
## train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=30)

In [103]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

# Define the models
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}

trained_model_list = []
model_list = []
r2_list = []

# Train and evaluate each model
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print('R2 score', r2_square * 100)

    r2_list.append(r2_square)

    trained_model_list.append(model)

    print('=' * 35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 1514235.6810426922
MAE: 834.5657751536753
R2 score 90.68727752593722


Lasso
Model Training Performance
RMSE: 1517338.6231444255
MAE: 836.5567165966868
R2 score 90.6681940774303


Ridge
Model Training Performance
RMSE: 1514700.488954004
MAE: 834.8716560443316
R2 score 90.68441890416784


ElasticNet
Model Training Performance
RMSE: 2636872.254931853
MAE: 1091.7042767422076
R2 score 83.7829343099173




In [102]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score

# Define the models
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}

trained_model_list = []
model_list = []
r2_list = []
accuracy_list = []

# Train and evaluate each model
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    mae, rmse, _ = evaluate_model(y_test, y_pred)

    # Calculate R2 score
    r2 = r2_score(y_test, y_pred)

    # Calculate accuracy score (for regression models, this is not applicable, so we'll use R2 score as a proxy)
    accuracy = r2

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print('R2 score', r2 * 100)
    print('Accuracy score', accuracy * 100)

    r2_list.append(r2)
    accuracy_list.append(accuracy)

    trained_model_list.append(model)

    print('=' * 35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 1514235.6810426922
MAE: 834.5657751536753
R2 score 90.68727752593722
Accuracy score 90.68727752593722


Lasso
Model Training Performance
RMSE: 1517338.6231444255
MAE: 836.5567165966868
R2 score 90.6681940774303
Accuracy score 90.6681940774303


Ridge
Model Training Performance
RMSE: 1514700.488954004
MAE: 834.8716560443316
R2 score 90.68441890416784
Accuracy score 90.68441890416784


ElasticNet
Model Training Performance
RMSE: 2636872.254931853
MAE: 1091.7042767422076
R2 score 83.7829343099173
Accuracy score 83.7829343099173




In [104]:
trained_model_list

[LinearRegression(), Lasso(), Ridge(), ElasticNet()]

In [105]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']

In [106]:
r2_list

[0.9068727752593722, 0.906681940774303, 0.9068441890416784, 0.8378293430991729]

In [107]:
accuracy_list

[0.9068727752593722, 0.906681940774303, 0.9068441890416784, 0.8378293430991729]