In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import PowerTransformer

import statsmodels.api as sm
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.feature_selection import SelectKBest

from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error,make_scorer,r2_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import joblib

In [2]:
import random

# Seed Python's and NumPy's global random generators so a fresh
# top-to-bottom run of the notebook draws the same random numbers.
random.seed(42)
np.random.seed(42)

In [3]:
# Load the raw diamonds table from the CSV file in the working directory.
df_diamonds = pd.read_csv(filepath_or_buffer="diamonds.csv")

In [4]:
# Remove the "Unnamed: 0" column (presumably a row index saved with the CSV — confirm)
# without modifying the raw frame.
df_diamond = df_diamonds.drop("Unnamed: 0", axis="columns")

In [5]:
# Hold out 20% of the rows for the final evaluation.
# The seed is passed explicitly so the split no longer depends on how much of the
# global NumPy random state was consumed before this cell ran (for example when the
# cell is re-executed on its own); it is reproducible regardless of execution order.
train_set, test_set = train_test_split(df_diamond, test_size=0.2, random_state=42)

In [6]:
# Separate the target ("price") from the predictors for both splits.
# Labels are kept as single-column DataFrames so the scaler below receives 2-D input.
train_labels = train_set["price"].to_frame()
train_features = train_set.drop("price", axis="columns")

test_labels = test_set["price"].to_frame()
test_features = test_set.drop("price", axis="columns")


In [7]:
# Standardise the target: statistics are learned on the training labels only
# and then applied unchanged to the test labels.
label_process_ss = StandardScaler().fit(train_labels)
train_labels_ss = label_process_ss.transform(train_labels)
test_labels_ss = label_process_ss.transform(test_labels)

# Wrap the scaled arrays back into single-column DataFrames (fresh 0..n-1 index).
df_train_labels_ss = pd.DataFrame(data=train_labels_ss, columns=["price"])
df_test_labels_ss = pd.DataFrame(data=test_labels_ss, columns=["price"])

In [8]:
#3.3.2
from sklearn.preprocessing import PolynomialFeatures

In [9]:
# Ordinal-encode the three categorical columns; every other column is passed through untouched.
num_attributes = ["carat", "depth", "table", "x", "y", "z"]
cat_attributes = ["cut", "color", "clarity"]

# Category orderings, listed in the order the encoder maps to 0, 1, 2, ...
cat_cut_level = [['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']]
cat_color_level = [['J', 'I', 'H', 'G', 'F', 'E', 'D']]
cat_clarity_level = [['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']]

# One OrdinalEncoder per categorical column, named "cat_cut", "cat_color", "cat_clarity".
full_pipeline_cat = ColumnTransformer(
    transformers=[
        (f"cat_{column}", OrdinalEncoder(categories=levels), [column])
        for column, levels in zip(
            cat_attributes,
            (cat_cut_level, cat_color_level, cat_clarity_level),
        )
    ],
    remainder="passthrough",
)

train_features_cat = full_pipeline_cat.fit_transform(train_features)
test_features_cat = full_pipeline_cat.transform(test_features)

# Densify only when the transformer produced a sparse matrix.
if full_pipeline_cat.sparse_output_:
    train_features_cat = train_features_cat.toarray()
    test_features_cat = test_features_cat.toarray()

# Back to DataFrames: encoded columns first, then the passthrough columns.
# NOTE(review): the labels assume the passthrough columns arrive in num_attributes order — confirm against the CSV.
df_train_features_cat = pd.DataFrame(data=train_features_cat, columns=cat_attributes + num_attributes)
df_test_features_cat = pd.DataFrame(data=test_features_cat, columns=cat_attributes + num_attributes)

In [10]:
# Grid search over the polynomial degree and the Ridge regularisation strength.
num_attributes = ["carat", "depth", "table", "x", "y", "z"]
cat_attributes = ["cut", "color", "clarity"]
all_attributes = cat_attributes + num_attributes

# Expand the columns into polynomial terms (no constant column), then standardise them.
numeric_transformer = Pipeline(
    steps=[
        ("poly_feat", PolynomialFeatures(degree=2, include_bias=False)),
        ("scaler", StandardScaler()),
    ]
)

# all_attributes names every column of df_train_features_cat, so the
# passthrough remainder receives no columns here.
preprocessor = ColumnTransformer(
    transformers=[("num_transform", numeric_transformer, all_attributes)],
    remainder="passthrough",
)

# The "model" step is a placeholder; the grid below swaps in the estimator to tune.
pipeline_poly = Pipeline(
    steps=[
        ("preprocesser", preprocessor),
        ("model", LinearRegression()),
    ]
)

# Earlier revisions of this cell also carried (commented-out) grids for
# LinearRegression and Lasso over degrees 1-6; only the Ridge grid is active.
param_grid = [
    {
        "preprocesser__num_transform__poly_feat__degree": [2, 3, 4],
        "model": [Ridge()],
        "model__alpha": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5],
    },
]

# 10-fold shuffled CV with a fixed seed; candidates are scored by negative RMSE
# on the standardised price.
grid_search = GridSearchCV(
    estimator=pipeline_poly,
    param_grid=param_grid,
    cv=KFold(n_splits=10, shuffle=True, random_state=42),
    verbose=10,
    n_jobs=-1,
    scoring='neg_root_mean_squared_error',
)
grid_search.fit(df_train_features_cat, df_train_labels_ss)

# Persist the fitted search so later cells can reload it from disk.
joblib.dump(grid_search, 'poly_grid_search.pkl')

Fitting 10 folds for each of 33 candidates, totalling 330 fits


['poly_grid_search.pkl']

In [11]:
# Reload the fitted search saved by the previous cell and rank its candidates.
# joblib files are pickles: only load a file this notebook wrote itself.
grid_search = joblib.load('poly_grid_search.pkl')

# One row per candidate, best rank (rank_test_score == 1) first.
df_g_s2 = pd.DataFrame(data=grid_search.cv_results_)
sorted_g_s2 = df_g_s2.sort_values('rank_test_score')

In [12]:
# Show the 20 best-ranked candidates.
sorted_g_s2.head(20)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_model__alpha,param_preprocesser__num_transform__poly_feat__degree,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
24,0.386954,0.027963,0.027419,0.007221,Ridge(alpha=1000.0),1000.0,2,"{'model': Ridge(alpha=1000.0), 'model__alpha':...",-0.210629,-0.203316,...,-0.200981,-0.213655,-0.246422,-0.212899,-0.205115,-0.258422,-0.217659,-0.218255,0.017979,1
21,0.406931,0.017037,0.021901,0.005375,Ridge(alpha=1000.0),100.0,2,"{'model': Ridge(alpha=1000.0), 'model__alpha':...",-0.200691,-0.188892,...,-0.198093,-0.200477,-0.221536,-0.198789,-0.192928,-0.393339,-0.204,-0.220046,0.05833,2
28,1.172191,0.061412,0.041463,0.006702,Ridge(alpha=1000.0),10000.0,3,"{'model': Ridge(alpha=1000.0), 'model__alpha':...",-0.201612,-0.194066,...,-0.337729,-0.20387,-0.231426,-0.203842,-0.197709,-0.289138,-0.20729,-0.226837,0.045639,3
25,1.287699,0.06933,0.038882,0.007363,Ridge(alpha=1000.0),1000.0,3,"{'model': Ridge(alpha=1000.0), 'model__alpha':...",-0.178279,-0.168734,...,-0.406306,-0.180359,-0.185358,-0.177668,-0.172231,-0.449563,-0.178865,-0.227288,0.100878,4
18,0.405426,0.023751,0.028577,0.00491,Ridge(alpha=1000.0),10.0,2,"{'model': Ridge(alpha=1000.0), 'model__alpha':...",-0.199636,-0.185716,...,-0.255402,-0.198402,-0.208209,-0.195924,-0.190981,-0.737718,-0.194989,-0.256467,0.161471,5
27,0.377319,0.048931,0.020843,0.003196,Ridge(alpha=1000.0),10000.0,2,"{'model': Ridge(alpha=1000.0), 'model__alpha':...",-0.252612,-0.256733,...,-0.278445,-0.263431,-0.283726,-0.263669,-0.258572,-0.306182,-0.26414,-0.268964,0.015319,6
22,1.188118,0.078229,0.040181,0.00776,Ridge(alpha=1000.0),100.0,3,"{'model': Ridge(alpha=1000.0), 'model__alpha':...",-0.173024,-0.159748,...,-0.470413,-0.171928,-0.175492,-0.167577,-0.161658,-0.950034,-0.168097,-0.276303,0.242064,7
31,1.232992,0.075343,0.037764,0.005448,Ridge(alpha=1000.0),100000.0,3,"{'model': Ridge(alpha=1000.0), 'model__alpha':...",-0.245933,-0.247819,...,-0.453421,-0.253469,-0.274682,-0.255327,-0.253213,-0.395274,-0.259165,-0.289172,0.069222,8
32,5.309045,0.087695,0.084937,0.014086,Ridge(alpha=1000.0),100000.0,4,"{'model': Ridge(alpha=1000.0), 'model__alpha':...",-0.208075,-0.200119,...,-0.973567,-0.208119,-0.235907,-0.209611,-0.206511,-0.365164,-0.216441,-0.303039,0.228339,9
29,6.319579,0.166137,0.186122,0.013288,Ridge(alpha=1000.0),10000.0,4,"{'model': Ridge(alpha=1000.0), 'model__alpha':...",-0.180224,-0.169745,...,-1.143945,-0.180551,-0.18306,-0.178473,-0.174292,-0.588483,-0.18059,-0.315414,0.302135,10


In [13]:
# Take the winning hyper-parameters from the search object itself rather than from
# the first row of the sorted results table: candidates that tie on rank_test_score
# can land in arbitrary order after sort_values (its default sort is not stable),
# whereas best_params_ always refers to the candidate at grid_search.best_index_.
best_degree = grid_search.best_params_["preprocesser__num_transform__poly_feat__degree"]
best_alpha = grid_search.best_params_["model__alpha"]

In [14]:
# Display every recorded field of the best-ranked candidate.
sorted_g_s2.iloc[0, :]

mean_fit_time                                                                                    0.386954
std_fit_time                                                                                     0.027963
mean_score_time                                                                                  0.027419
std_score_time                                                                                   0.007221
param_model                                                                           Ridge(alpha=1000.0)
param_model__alpha                                                                                 1000.0
param_preprocesser__num_transform__poly_feat__degree                                                    2
params                                                  {'model': Ridge(alpha=1000.0), 'model__alpha':...
split0_test_score                                                                               -0.210629
split1_test_score                             

In [15]:
# Build the polynomial feature matrices at the selected degree (outside any model)
# so the individual terms can be scored in the cells below.
num_attributes = ["carat", "depth", "table", "x", "y", "z"]
cat_attributes = ["cut", "color", "clarity"]
all_attributes = cat_attributes + num_attributes

# Polynomial expansion without the constant column, followed by standardisation.
numeric_transformer = Pipeline(
    steps=[
        ("poly_feat", PolynomialFeatures(degree=best_degree, include_bias=False)),
        ("scaler", StandardScaler()),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[("num_transform", numeric_transformer, all_attributes)],
    remainder="passthrough",
)

# Fit on the training features only, then reuse the fitted transformer on the test features.
train_features_poly = preprocessor.fit_transform(df_train_features_cat)
test_features_poly = preprocessor.transform(df_test_features_cat)

# Densify only when the transformer produced a sparse matrix.
if preprocessor.sparse_output_:
    train_features_poly = train_features_poly.toarray()
    test_features_poly = test_features_poly.toarray()

# Wrap as DataFrames; columns are left as positional integers.
df_train_features_poly = pd.DataFrame(data=train_features_poly)
df_test_features_poly = pd.DataFrame(data=test_features_poly)

In [16]:
# Fit a stand-alone PolynomialFeatures at the same degree to recover, for each
# generated term, the exponent applied to every input column.
poly = PolynomialFeatures(degree=best_degree, include_bias=False)
poly.fit(df_train_features_cat)
gather_names = poly.transform(df_train_features_cat)
powers_array = poly.powers_

# One row per polynomial term, one column per original feature.
powers_df = pd.DataFrame(data=powers_array, columns=df_train_features_cat.columns)
powers_df

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z
0,1,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0
6,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,1,0
8,0,0,0,0,0,0,0,0,1
9,2,0,0,0,0,0,0,0,0


In [17]:
# Score every polynomial term against the standardised price with the univariate F-test.
# k="all" keeps every feature, so only the fitted scores_ are of interest here.
f_selector = SelectKBest(score_func=f_regression, k="all")
f_selector.fit(df_train_features_poly, df_train_labels_ss["price"].to_numpy())

# Attach the scores to the exponent table (row i describes polynomial term i).
powers_f_score = powers_df.copy()
powers_f_score['f_score'] = f_selector.scores_


In [18]:
# Peek at the first five terms with their F-scores.
powers_f_score.head(n=5)

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z,f_score
0,1,0,0,0,0,0,0,0,0,131.765
1,0,1,0,0,0,0,0,0,0,1303.003535
2,0,0,1,0,0,0,0,0,0,944.679968
3,0,0,0,1,0,0,0,0,0,243413.123241
4,0,0,0,0,1,0,0,0,0,1.345108


In [19]:
# Rank the polynomial terms from highest to lowest F-score.
powers_f_sorted = powers_f_score.sort_values('f_score', ascending=False)
powers_f_sorted

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z,f_score
36,0,0,0,1,0,0,1,0,0,248978.052771
3,0,0,0,1,0,0,0,0,0,243413.123241
38,0,0,0,1,0,0,0,0,1,236356.451542
34,0,0,0,1,1,0,0,0,0,230977.43823
37,0,0,0,1,0,0,0,1,0,228559.603766
35,0,0,0,1,0,1,0,0,0,225005.529883
48,0,0,0,0,0,0,2,0,0,203784.071083
50,0,0,0,0,0,0,1,0,1,189191.574219
49,0,0,0,0,0,0,1,1,0,181333.434011
33,0,0,0,2,0,0,0,0,0,166862.826563


In [20]:
# Score every polynomial term against the standardised price with mutual information.
# The target is passed as a 1-D array; k="all" keeps every feature, so only scores_ matter.
mut_info_selector = SelectKBest(score_func=mutual_info_regression, k="all")
mut_info_selector.fit(df_train_features_poly, df_train_labels_ss["price"].to_numpy())

# Attach the scores to the exponent table (row i describes polynomial term i).
powers_mi_score = powers_df.copy()
powers_mi_score['mi_score'] = mut_info_selector.scores_


In [21]:
# Peek at the first five terms with their mutual-information scores.
powers_mi_score.head(n=5)

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z,mi_score
0,1,0,0,0,0,0,0,0,0,0.053732
1,0,1,0,0,0,0,0,0,0,0.13758
2,0,0,1,0,0,0,0,0,0,0.214539
3,0,0,0,1,0,0,0,0,0,1.634221
4,0,0,0,0,1,0,0,0,0,0.03236


In [22]:
# Rank the polynomial terms from highest to lowest mutual information.
powers_mi_sorted = powers_mi_score.sort_values('mi_score', ascending=False)
powers_mi_sorted

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z,mi_score
3,0,0,0,1,0,0,0,0,0,1.634221
33,0,0,0,2,0,0,0,0,0,1.630034
37,0,0,0,1,0,0,0,1,0,1.557618
36,0,0,0,1,0,0,1,0,0,1.536699
35,0,0,0,1,0,1,0,0,0,1.489833
38,0,0,0,1,0,0,0,0,1,1.48912
27,0,0,1,1,0,0,0,0,0,1.470276
51,0,0,0,0,0,0,0,2,0,1.412635
52,0,0,0,0,0,0,0,1,1,1.411098
7,0,0,0,0,0,0,0,1,0,1.411085


In [23]:
# Refit the final model on the full training set with the selected degree and alpha.
num_attributes = ["carat", "depth", "table", "x", "y", "z"]
cat_attributes = ["cut", "color", "clarity"]
all_attributes = cat_attributes + num_attributes

# Polynomial expansion without the constant column, followed by standardisation.
numeric_transformer = Pipeline(
    steps=[
        ("poly_feat", PolynomialFeatures(degree=best_degree, include_bias=False)),
        ("scaler", StandardScaler()),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[("num_transform", numeric_transformer, all_attributes)],
    remainder="passthrough",
)

# Preprocessing and Ridge regression chained so predict() applies both.
pipeline_poly = Pipeline(
    steps=[
        ("preprocesser", preprocessor),
        ("model", Ridge(alpha=best_alpha)),
    ]
)
pipeline_poly.fit(df_train_features_cat, df_train_labels_ss)

Pipeline(steps=[('preprocesser',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num_transform',
                                                  Pipeline(steps=[('poly_feat',
                                                                   PolynomialFeatures(include_bias=False)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['cut', 'color', 'clarity',
                                                   'carat', 'depth', 'table',
                                                   'x', 'y', 'z'])])),
                ('model', Ridge(alpha=1000.0))])

In [24]:
# Evaluate the refit pipeline on the held-out split.
# Both metrics are computed on the standardised price, not on the original price scale.
preds = pipeline_poly.predict(df_test_features_cat)
print("R2: Performance on test dataset:", r2_score(df_test_labels_ss, preds))
# mean_squared_error(..., squared=False) was deprecated in scikit-learn 1.4 and removed
# in 1.6; taking the square root explicitly gives the same RMSE on every version.
print("RMSE: Performance on test dataset:", np.sqrt(mean_squared_error(df_test_labels_ss, preds)))

R2: Performance on test dataset: 0.9519841228606234
RMSE: Performance on test dataset: 0.21896731503022598
