In [None]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score


In [None]:
# Load the 'diamonds' example dataset through seaborn's dataset loader.
df = sns.load_dataset(name='diamonds')

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [None]:
# Remove the x/y/z columns. Reassigning the result (instead of mutating with
# inplace=True) keeps the step explicit, and `columns=` states the axis by
# name rather than by number.
df = df.drop(columns=['x', 'y', 'z'])

In [None]:
# Display the frame to confirm which columns remain after dropping x/y/z.
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,0.23,Ideal,E,SI2,61.5,55.0,326
1,0.21,Premium,E,SI1,59.8,61.0,326
2,0.23,Good,E,VS1,56.9,65.0,327
3,0.29,Premium,I,VS2,62.4,58.0,334
4,0.31,Good,J,SI2,63.3,58.0,335
...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757
53936,0.72,Good,D,SI1,63.1,55.0,2757
53937,0.70,Very Good,D,SI1,62.8,60.0,2757
53938,0.86,Premium,H,SI2,61.0,58.0,2757


In [None]:
# List the distinct values of the categorical 'cut' column.
df['cut'].unique()

['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']
Categories (5, object): ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']

In [None]:
# One-hot encode the three categorical columns, then print the resulting
# schema to check the new indicator columns and their dtypes.
# NOTE(review): every level is kept (no drop_first), so each group of
# indicators sums to 1 per row -- confirm this is intended for the linear model.
categorical_columns = ['cut', 'color', 'clarity']
df = pd.get_dummies(df, columns=categorical_columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   carat          53940 non-null  float64
 1   depth          53940 non-null  float64
 2   table          53940 non-null  float64
 3   price          53940 non-null  int64  
 4   cut_Ideal      53940 non-null  bool   
 5   cut_Premium    53940 non-null  bool   
 6   cut_Very Good  53940 non-null  bool   
 7   cut_Good       53940 non-null  bool   
 8   cut_Fair       53940 non-null  bool   
 9   color_D        53940 non-null  bool   
 10  color_E        53940 non-null  bool   
 11  color_F        53940 non-null  bool   
 12  color_G        53940 non-null  bool   
 13  color_H        53940 non-null  bool   
 14  color_I        53940 non-null  bool   
 15  color_J        53940 non-null  bool   
 16  clarity_IF     53940 non-null  bool   
 17  clarity_VVS1   53940 non-null  bool   
 18  clarit

In [None]:
# Convert the boolean indicator columns to int64 (0/1).
# An explicit astype on the bool columns only is used instead of
# df.replace({False: 0, True: 1}): replace scans every column and depends on
# pandas downcasting the result back to an integer dtype, which pandas has
# deprecated. Selecting by dtype also makes the cell safe to re-run, because
# once no bool columns remain the mapping is empty and nothing changes.
bool_columns = df.select_dtypes(include='bool').columns
df = df.astype({column: 'int64' for column in bool_columns})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   carat          53940 non-null  float64
 1   depth          53940 non-null  float64
 2   table          53940 non-null  float64
 3   price          53940 non-null  int64  
 4   cut_Ideal      53940 non-null  int64  
 5   cut_Premium    53940 non-null  int64  
 6   cut_Very Good  53940 non-null  int64  
 7   cut_Good       53940 non-null  int64  
 8   cut_Fair       53940 non-null  int64  
 9   color_D        53940 non-null  int64  
 10  color_E        53940 non-null  int64  
 11  color_F        53940 non-null  int64  
 12  color_G        53940 non-null  int64  
 13  color_H        53940 non-null  int64  
 14  color_I        53940 non-null  int64  
 15  color_J        53940 non-null  int64  
 16  clarity_IF     53940 non-null  int64  
 17  clarity_VVS1   53940 non-null  int64  
 18  clarit

In [None]:
# Preview the first rows after encoding to inspect the indicator columns.
df.head()

Unnamed: 0,carat,depth,table,price,cut_Ideal,cut_Premium,cut_Very Good,cut_Good,cut_Fair,color_D,...,color_I,color_J,clarity_IF,clarity_VVS1,clarity_VVS2,clarity_VS1,clarity_VS2,clarity_SI1,clarity_SI2,clarity_I1
0,0.23,61.5,55.0,326,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0.21,59.8,61.0,326,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.23,56.9,65.0,327,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0.29,62.4,58.0,334,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
4,0.31,63.3,58.0,335,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0


In [None]:
# Standardise the three continuous predictors with StandardScaler and write
# the scaled values back into the frame.
# NOTE(review): the scaler is fitted on the whole frame, before the
# train/test split performed further down, so statistics from the eventual
# test rows influence the scaling. Consider fitting on the training rows only.
columns_to_scale = ['carat', 'depth', 'table']
scaler = StandardScaler()
numeric_data = df[columns_to_scale]
scaled_data = scaler.fit_transform(numeric_data)
df[columns_to_scale] = scaled_data
df.head()

Unnamed: 0,carat,depth,table,price,cut_Ideal,cut_Premium,cut_Very Good,cut_Good,cut_Fair,color_D,...,color_I,color_J,clarity_IF,clarity_VVS1,clarity_VVS2,clarity_VS1,clarity_VS2,clarity_SI1,clarity_SI2,clarity_I1
0,-1.198168,-0.174092,-1.099672,326,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,-1.240361,-1.360738,1.585529,326,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,-1.198168,-3.385019,3.375663,327,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,-1.071587,0.454133,0.242928,334,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
4,-1.029394,1.082358,0.242928,335,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0


In [None]:
# Correlate every column with 'price' and order the result by absolute
# strength, weakest first.
correlations = df.corr()
price_correlation = correlations['price'].sort_values(key=lambda column: column.abs())
print(price_correlation)

clarity_I1      -0.000255
cut_Good        -0.000312
clarity_VS2     -0.001062
cut_Very Good    0.006593
color_G          0.008556
clarity_SI1      0.008957
clarity_VS1     -0.009886
depth           -0.010647
cut_Fair         0.018728
color_F         -0.024161
clarity_IF      -0.049596
clarity_VVS2    -0.052381
color_H          0.059223
color_D         -0.072473
color_J          0.081710
clarity_VVS1    -0.095266
cut_Premium      0.095706
color_I          0.097125
cut_Ideal       -0.097175
color_E         -0.101089
table            0.127134
clarity_SI2      0.128420
carat            0.921591
price            1.000000
Name: price, dtype: float64


In [None]:
# Shuffle the rows with a fixed seed, separate the target ('price') from the
# features, and hold out 20% of the rows for testing.
df = df.sample(frac=1, random_state=50)
X = df.drop(columns='price')
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=50
)

# Report the training feature and target shapes.
for split in (X_train, y_train):
    print(split.shape)

(43152, 23)
(43152,)


In [None]:
# Disabled baseline: fits a plain LinearRegression on the training split and
# prints its score on the test split, without any hyper-parameter search.
# model_lg = LinearRegression()

# model_lg.fit(X_train, y_train)
# y_pred = model_lg.predict(X_test)
# model_lg_score = model_lg.score(X_test, y_test)

# print(model_lg_score)

In [None]:
# Grid-search LinearRegression over its two boolean options using 5-fold
# cross-validation on the training split, running on all available cores.
parameters = dict(fit_intercept=[True, False], positive=[True, False])
gridSearch = GridSearchCV(
    estimator=LinearRegression(),
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
)
gridSearch.fit(X_train, y_train)

# Take the selected estimator, predict on the held-out rows and print its
# score on the test split.
bestModel = gridSearch.best_estimator_
y_pred = bestModel.predict(X_test)
bestModel_score = bestModel.score(X_test, y_test)
print(bestModel_score)

0.9160298449027704
