In [52]:
import pandas as pd
import numpy as np

In [54]:
data = pd.read_csv('D:\\study\\Data_science\\code\\machine_learning\\end_to_end_project\\notebooks\data\\gemstone.csv')

In [55]:
data.drop(labels=["id"],axis=1,inplace=True)

In [56]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [117]:
X=data.drop(labels=["price"],axis=1)
y=data["price"]

In [None]:
X.head()

In [59]:
numerical_cols=X.select_dtypes(exclude='object').columns

In [60]:
categorical_cols=X.select_dtypes(include='object').columns

In [None]:
X.head(3)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
cut_map = ['Fair','Good','Very Good','Ideal','Premium']
clarity_map = ["I1","SI2" ,"SI1" ,"VS2" , "VS1" , "VVS2" , "VVS1" ,"IF"]
color_map = ["D" ,"E" ,"F" , "G" ,"H" , "I", "J"]

In [66]:
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [67]:
num_pipeline=Pipeline(
    
    
    steps=[
        
        ('imputer',SimpleImputer()),
        ('scaler', StandardScaler())
    ]
    
    
)

In [68]:
cat_pipeline=Pipeline(
    
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories]))
    ]
    
)

In [None]:
cat_pipeline

In [69]:
preprocessor=ColumnTransformer(
    [
        
        ('num_pipeline',num_pipeline,numerical_cols),
        ('cat_pipeline',cat_pipeline,categorical_cols)
    ]
)

In [62]:
preprocessor

In [63]:
from sklearn.model_selection import  train_test_split

In [118]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=30)

In [119]:
preprocessor.fit_transform(X_train)

array([[-0.54220555,  2.11279474, -1.68414474, ...,  1.        ,
         1.        ,  4.        ],
       [-0.15318745,  0.44710281, -1.16300271, ...,  4.        ,
         2.        ,  1.        ],
       [ 1.57578188, -2.14397354,  0.92156539, ...,  2.        ,
         2.        ,  2.        ],
       ...,
       [ 0.45195182,  1.5575641 , -0.64186069, ...,  1.        ,
         3.        ,  2.        ],
       [ 0.66807298, -1.77381977,  1.44270741, ...,  4.        ,
         3.        ,  4.        ],
       [ 0.25744277,  0.81725657, -0.12071866, ...,  4.        ,
         3.        ,  2.        ]])

In [120]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [121]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [131]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [147]:
trained_model_list=[]
model_list=[]
r2_list=[]
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'ElasticNet':ElasticNet()
}
for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    y_pred = model.predict(X_test)
    mae , mse ,r2_square = evaluate_model(y_test,y_pred)


    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)
    r2_list.append(r2_square)
    trained_model_list.append(model.score(X_train,y_train))
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 1520.0186486838768
MAE: 675.5483623703055
R2 score 93.67464279030936


Lasso
Model Training Performance
RMSE: 1520.0186486838768
MAE: 676.8315267472232
R2 score 93.67270723445247


Ridge
Model Training Performance
RMSE: 1520.0186486838768
MAE: 675.5753135470308
R2 score 93.67463565166098


ElasticNet
Model Training Performance
RMSE: 1520.0186486838768
MAE: 1053.9094724261593
R2 score 85.83594443217535




In [155]:
model_train_df = pd.DataFrame(list(zip(model_list,trained_model_list,r2_list)),columns=['model_list','trained_model_list','r2_score'])

In [156]:
model_train_df

Unnamed: 0,model_list,trained_model_list,r2_score
0,LinearRegression,0.936711,0.936746
1,Lasso,0.936696,0.936727
2,Ridge,0.936711,0.936746
3,ElasticNet,0.858474,0.858359
