In [38]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [39]:
data_frame = pd.read_csv('./Diamonds Prices2022.csv',index_col=0)
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53943 entries, 1 to 53943
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53943 non-null  float64
 1   cut      53943 non-null  object 
 2   color    53943 non-null  object 
 3   clarity  53943 non-null  object 
 4   depth    53943 non-null  float64
 5   table    53943 non-null  float64
 6   price    53943 non-null  int64  
 7   x        53943 non-null  float64
 8   y        53943 non-null  float64
 9   z        53943 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.5+ MB


In [40]:
data_frame

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74
53940,0.75,Ideal,D,SI2,62.2,55.0,2757,5.83,5.87,3.64
53941,0.71,Premium,E,SI1,60.5,55.0,2756,5.79,5.74,3.49
53942,0.71,Premium,F,SI1,59.8,62.0,2756,5.74,5.73,3.43


In [41]:
#Label encoding for variable cut, it will order them in the order - Fair - the worst cut, Premium - the best cut
cut_names =['Fair', 'Good', 'Very Good', 'Ideal','Premium']
cut_labels = pd.factorize(cut_names)[0]
data_frame['cut'] = data_frame['cut'].map(dict(zip(cut_names, cut_labels)))

In [42]:
#Label encoding, we can also encode the colors of diamonds using Label endocding,
# because with the variable color we can order theoretically the most valuable diamonds are those with color D, 
# and the least valuable are those with color J, between them we keep the alphabetical order
color_names =['J', 'I', 'H', 'G','F','E','D']
color_labels = pd.factorize(color_names)[0]
data_frame['color'] = data_frame['color'].map(dict(zip(color_names, color_labels)))

In [43]:
#We can also order this categorical variable, since diamonds have a clarity scale 
clarity_names =['I1', 'VVS1','VVS2', 'VS1', 'VS2','SI1','SI2','IF']
clarity_labels = pd.factorize(clarity_names)[0]
data_frame['clarity'] = data_frame['clarity'].map(dict(zip(clarity_names, clarity_labels)))

In [44]:
model = LinearRegression()
scaler = StandardScaler()

In [49]:
#scale the numerical data
final_df = data_frame.copy()
final_df[["carat", "depth", "table", "price", 'x', 'y', 'z']] = scaler.fit_transform(data_frame
                                                                                     [["carat", "depth", "table", "price", 'x', 'y', 'z']])

### Forward selection

In [55]:
#create targets and features
y_name = "price"
attr_names = list(final_df.drop('price', axis=1).columns)
all_possible = attr_names.copy()

In [51]:
tolerance = 0.01

In [53]:
attr_names = list(final_df.drop('price', axis=1).columns)
current_names = []
last_score = 0 
cur_diff = 1
while current_names != all_possible and cur_diff > tolerance:
        scores = []
        for name in attr_names:
            model.fit(final_df[current_names + [name]], final_df[y_name])
            score = model.score(final_df[current_names + [name]], final_df[y_name])
            scores.append((name, score))
        
        scores_val = [item[1] for item in scores]
        best = np.argmax(scores_val)
        chosen_col = scores[best][0]
        current_names.append(chosen_col)
        cur_diff = scores[best][1] - last_score
        attr_names.remove(chosen_col)
        last_score = scores[best][1]

        print(scores, scores[best], current_names, sep='\nChosen feature(s):')

[('carat', 0.8493304833200088), ('cut', 0.0008313413672876857), ('color', 0.029763033875571), ('clarity', 0.012569365746941896), ('depth', 0.0001129984176113652), ('table', 0.01615895063447781), ('x', 0.782221323509542), ('y', 0.7489494819516715), ('z', 0.7417495130634612)]
Chosen feature(s):('carat', 0.8493304833200088)
Chosen feature(s):['carat']
[('cut', 0.8515060145547461), ('color', 0.8594162644199514), ('clarity', 0.8564956314037662), ('depth', 0.8506748265259675), ('table', 0.8510053541832346), ('x', 0.8534319855784495), ('y', 0.8507780949229875), ('z', 0.8526496705185767)]
Chosen feature(s):('color', 0.8594162644199514)
Chosen feature(s):['carat', 'color']
[('cut', 0.8617239766294208), ('clarity', 0.8681188512689051), ('depth', 0.8604781949174541), ('table', 0.861331930712862), ('x', 0.8644252343082075), ('y', 0.8612490425523843), ('z', 0.8631368171768955)]
Chosen feature(s):('clarity', 0.8681188512689051)
Chosen feature(s):['carat', 'color', 'clarity']


As a result of forward selection, the selected variables are "Carat", "Color" and "Clarity"
### Backward selection

In [54]:
y_name = 'price'
X_names = final_df.drop('price', axis=1)


attr_names = list(final_df.drop('price', axis=1).columns)

tolerance = 0.01
last_score = 0
cur_diff = 0 # starting value
i = 0
while cur_diff < tolerance: # condition
    scores = []
    for name in attr_names:
        model.fit(X_names.drop(name, axis=1), final_df[y_name])
        score = model.score(X_names.drop(name, axis=1), final_df[y_name])
        scores.append((name, score))

    scores_val = [item[1] for item in scores]
    best = np.argmax(scores_val)
    chosen_col = scores[best][0]
    cur_diff = last_score - scores[best][1] # kolejność
    if cur_diff >= tolerance:
        print(f'Model is most optimal when {attr_names} used')
        break
    X_names = X_names.drop(chosen_col, axis=1)
    attr_names.remove(chosen_col)
    last_score = scores[best][1]
    i += 1

    print(chosen_col, cur_diff, last_score)

print('='*100)

z -0.8779738325982578 0.8779738325982578
y 3.615273190782364e-05 0.87793767986635
cut 0.0009312118132327463 0.8770064680531172
table 0.0026951859537250877 0.8743112820993921
depth 0.002359311827999977 0.8719519702713922
x 0.003833119002487062 0.8681188512689051
clarity 0.008702586848953642 0.8594162644199514
Model is most optimal when ['carat', 'color'] used


As a result of backward selection, the selected variables are "Carat" and "Color".