In [18]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [4]:
df = pd.read_csv(r"D:\ML\projects\Gemstone\notebooks\data\gemstone.csv")

In [5]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [6]:
df.drop(columns=['id'],inplace=True)

In [7]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [8]:
X = df.drop(columns=['price'])
X.shape

(193573, 9)

In [9]:
y = df['price']
y.shape

(193573,)

In [10]:
categorical_columns = X.select_dtypes(include='object').columns
categorical_columns

Index(['cut', 'color', 'clarity'], dtype='object')

In [11]:
numerical_columns = X.select_dtypes(exclude='object').columns
numerical_columns

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [12]:
df['clarity'].unique()

array(['VS2', 'SI2', 'VS1', 'SI1', 'IF', 'VVS2', 'VVS1', 'I1'],
      dtype=object)

In [13]:
cut_cat = ['Fair','Good','Very Good','Premium','Ideal']
color_cat = ['D','E','F','G','H','I','J']
clarity_cat = ['IF', 'VVS1', 'VVS2', 'VS1', 'VS2',  'SI1', 'SI2',  'I1']

In [14]:
num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer()),
        ('scaler',StandardScaler())
    ]
)

In [15]:
cat_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('encoding',OrdinalEncoder(categories=[cut_cat,color_cat,clarity_cat]))
    ]
)

In [16]:
preprocessor = ColumnTransformer(
    [
        ('num_pipeline',num_pipeline,numerical_columns),
        ('cat_pipeline',cat_pipeline,categorical_columns)
    ]
)

In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=32)

In [18]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((154858, 9), (38715, 9), (154858,), (38715,))

In [19]:
preprocessor.fit_transform(X_train)

array([[-0.99620193, -2.23548775,  1.44197583, ...,  3.        ,
         4.        ,  4.        ],
       [-0.19722643,  0.71981039, -1.16242997, ...,  4.        ,
         5.        ,  3.        ],
       [-0.8450444 , -0.01901415, -0.64154881, ...,  4.        ,
         3.        ,  2.        ],
       ...,
       [-0.88823227, -0.57313255, -0.64154881, ...,  2.        ,
         3.        ,  4.        ],
       [ 0.47218547,  1.36628185,  1.44197583, ...,  3.        ,
         1.        ,  5.        ],
       [ 0.60174906, -0.20372028, -0.12066765, ...,  4.        ,
         4.        ,  6.        ]])

In [20]:
preprocessor.transform(X_test)

array([[-1.08257766, -0.38842641,  0.92109467, ...,  2.        ,
         0.        ,  3.        ],
       [-0.19722643, -1.86607548,  1.44197583, ...,  2.        ,
         5.        ,  6.        ],
       [-1.03938979, -0.85019175, -0.12066765, ...,  4.        ,
         0.        ,  3.        ],
       ...,
       [ 0.47218547,  0.25804505,  0.40021351, ...,  3.        ,
         2.        ,  4.        ],
       [-1.03938979, -1.21960401,  0.92109467, ...,  3.        ,
         3.        ,  2.        ],
       [ 2.71795443,  1.08922265, -0.64154881, ...,  3.        ,
         2.        ,  6.        ]])

In [21]:
preprocessor.get_feature_names_out()

array(['num_pipeline__carat', 'num_pipeline__depth',
       'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
       'num_pipeline__z', 'cat_pipeline__cut', 'cat_pipeline__color',
       'cat_pipeline__clarity'], dtype=object)

In [22]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=X_train.columns)
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=X_test.columns)

In [23]:
X_test

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,-1.082578,-0.388426,0.921095,-1.312745,-1.307333,-1.326729,2.0,0.0,3.0
1,-0.197226,-1.866075,1.441976,0.029854,0.107291,-0.080010,2.0,5.0,6.0
2,-1.039390,-0.850192,-0.120668,-1.195605,-1.189448,-1.239749,4.0,0.0,3.0
3,-0.154039,-0.573133,-0.120668,0.056887,0.080087,0.006970,4.0,4.0,2.0
4,0.493779,0.719810,0.400214,0.642584,0.596969,0.688317,3.0,0.0,6.0
...,...,...,...,...,...,...,...,...,...
38710,-1.039390,0.627457,-0.120668,-1.231648,-1.225720,-1.181762,4.0,1.0,5.0
38711,-1.039390,0.627457,-0.641549,-1.186595,-1.180380,-1.123775,4.0,3.0,2.0
38712,0.472185,0.258045,0.400214,0.633574,0.587900,0.644826,3.0,2.0,4.0
38713,-1.039390,-1.219604,0.921095,-1.177584,-1.243856,-1.283239,3.0,3.0,2.0


In [42]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [25]:
models = {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'ElasticNet':ElasticNet()

}

In [48]:
for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    y_pred = model.predict(X_test)
    
    print(list(models.keys())[i])
    print('-'*10)
    print('mse =', mean_squared_error(y_test,y_pred))
    print('mae =', mean_absolute_error(y_test,y_pred))
    print('r2 =', r2_score(y_test,y_pred))
    print('rmse =', np.sqrt(mean_squared_error(y_test,y_pred)))

    print('='*40)
    print('\n')

LinearRegression
----------
mse = 1038699.3483466238
mae = 677.8454830976056
r2 = 0.9353718686958897
rmse = 1019.1660062750444


Lasso
----------
mse = 1038984.5249432425
mae = 679.0548195865002
r2 = 0.935354124937255
rmse = 1019.3059035163303


Ridge
----------
mse = 1038681.1448119304
mae = 677.8707856961937
r2 = 0.9353730013243373
rmse = 1019.1570756325692


ElasticNet
----------
mse = 2250010.1012708656
mae = 1044.303880852743
r2 = 0.8600038129493638
rmse = 1500.0033670865096




In [3]:
from src.DiamondPricePrediction.components.data_transformation import DataTransformation,DataTransformationConfig
from src.DiamondPricePrediction.components.data_ingestion import DataIngestion

In [4]:
obj1 = DataTransformation()
obj2 = DataTransformationConfig()

In [5]:
train = "D:\\ML\\projects\\Gemstone\\artifacts\\train.csv"
test = "D:\\ML\\projects\\Gemstone\\artifacts\\test.csv"

In [6]:
import pandas as pd
import numpy as np

train_df = pd.read_csv(train)
test_df = pd.read_csv(test)


# creating an object for preprocessor

preprocessor_obj = obj1.get_data_transformation()

target_column_name = 'price'
drop_columns = [target_column_name,'id']

input_feature_train_df = train_df.drop(columns=drop_columns,axis=1)
target_feature_train_df = train_df[target_column_name]

input_feature_test_df = test_df.drop(columns=drop_columns,axis=1)
target_feature_test_df = test_df[target_column_name]



input_feature_train_arr = preprocessor_obj.fit_transform(input_feature_train_df)
input_feature_test_arr = preprocessor_obj.transform(input_feature_test_df)

train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]


In [10]:
train_df.shape,test_df.shape

((145179, 11), (48394, 11))

In [11]:
len(train_arr),len(test_arr)

(145179, 48394)

In [12]:
X_train,y_train,X_test,y_test=(
                train_arr[:,:-1],
                train_arr[:,-1],
                test_arr[:,:-1],
                test_arr[:,-1])

In [13]:
len(X_train)

145179

In [1]:
from src.DiamondPricePrediction.pipelines.prediction_pipeline import CustomData

In [3]:
obj = CustomData(1.13,62.7,58.0,6.62,6.66,4.17,'Premium','H','SI1')

In [8]:
data = obj.get_data_as_dataframe()

In [9]:
print(data)

   carat  depth  table     x     y     z      cut color clarity
0   1.13   62.7   58.0  6.62  6.66  4.17  Premium     H     SI1
