In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Lasso, Ridge,ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [3]:
df= pd.read_csv(r"D:\Pycharm\diamond_price_predictor\notebook\data\dataset\train.csv")

In [4]:
df.shape

(193573, 11)

In [5]:
X = df.drop(labels = ['price'] , axis = 1)
y = df[['price']]

In [6]:
num_cols = X.columns[X.dtypes != 'object']
cat_cols = X.columns[X.dtypes == 'object']

print('Numerical Columns = ' , num_cols)
print('Categorical Columns = ' , cat_cols)

Numerical Columns =  Index(['id', 'carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')
Categorical Columns =  Index(['cut', 'color', 'clarity'], dtype='object')


In [7]:
# Ranking 'cut' values according to the cut quality
cut_categories = ['Fair',  'Good', 'Very Good' , 'Premium' , 'Ideal' ]
#Ideal given the highest order and fair being the lowest

# Ranking clarity values
clarity_categories = ['I1', 'SI2', 'SI1' , 'VS2' , 'VS1' , 'VVS2' , 'VVS1' , 'IF' ]
# I1 given the lowest rank and IF the highest

# Ranking the color values
color_categories = ['D' , 'E' , 'F' , 'G' , 'H' , 'I' , 'J']
# lowest to highest rank                  

### Creating Pipelines

In [8]:
# Numerical Pipelines
num_pipeline = Pipeline(
        steps  = [
            ('imputer', SimpleImputer(strategy= 'median')),
            ('scaler' , StandardScaler())
        ]
)


# Categorical Pipelines
cat_pipeline = Pipeline(
            steps = [
                ('imputer' ,SimpleImputer(strategy= 'most_frequent')),
                ('ordinal_encoder' , OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories])),
                ('scaler' , StandardScaler() )
                
            ]
)


preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, num_cols ),
    ('cat_pipeline' , cat_pipeline, cat_cols)
]
)

### Train test Split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1613)

### Transforming

In [10]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns= preprocessor.get_feature_names_out())

X_test = pd.DataFrame(preprocessor.transform(X_test), columns= preprocessor.get_feature_names_out())

In [11]:
X_train.head()

Unnamed: 0,num_pipeline__id,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.666604,-1.041934,0.353366,-2.101834,-1.26995,-1.246037,-1.211727,0.873353,-0.32126,0.017539
1,-1.215055,0.235044,-0.384779,0.401858,0.480094,0.433513,0.427076,-1.141188,0.294812,-1.314206
2,1.530523,-0.132899,-0.753852,0.923461,0.101219,0.061288,0.006498,-0.133918,-1.553405,0.017539
3,1.141436,-0.02468,0.261098,-0.641347,0.137302,0.179311,0.166028,0.873353,0.294812,-0.648334
4,-1.717761,-0.19783,0.906975,-0.119744,-0.061157,-0.029498,0.035504,-1.141188,-0.937333,-0.648334


### Model Selection

In [12]:
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

In [13]:
print(linear_reg.coef_)

[[ 4.65143976e-01  6.39917723e+03 -1.31585343e+02 -6.77787455e+01
  -1.74540487e+03 -4.42166999e+02 -6.48006016e+01  7.13223594e+01
  -4.66512654e+02  6.55150439e+02]]


In [14]:
print(linear_reg.intercept_)

[3972.91352831]


In [15]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_sq = r2_score(true, predicted)
    
    return mae, rmse, r2_sq

In [16]:
models = {
    'linear_reg' : LinearRegression(),
    'lasso' : Lasso(),
    'ridge' : Ridge(),
    'elastic_net' : ElasticNet()
}

trained_model = []
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model Training Performance')
    print('RMSE :' , rmse)
    print('MAE :', mae)
    print('R2_score :' , r2_square*100)
    
    r2_list.append(r2_square)
    
    print('=========================================')
    print('\n')
    

linear_reg
Model Training Performance
RMSE : 1020.1817969017212
MAE : 680.288729817892
R2_score : 93.6638767066512


lasso
Model Training Performance
RMSE : 1020.3671297450252
MAE : 681.4981941395923
R2_score : 93.66157437502015


ridge
Model Training Performance
RMSE : 1020.1867804981565
MAE : 680.3222438426384
R2_score : 93.6638148024716


elastic_net
Model Training Performance
RMSE : 1545.4507373364113
MAE : 1065.4789720149554
R2_score : 85.45951772333332


