In [1]:
# Another basic test of sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the diamonds table from a CSV in the current working directory.
# The file's leading unnamed column is read as an ordinary column
# called 'Unnamed: 0' rather than being used as the index.
data = pd.read_csv('diamonds.csv')

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

Unnamed: 0.1,Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,26970.5,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,15571.281097,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,1.0,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,13485.75,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,26970.5,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,40455.25,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,53940.0,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [4]:
# List every column name, including the non-numeric ones (cut, color, clarity)
data.columns

Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',
       'price', 'x', 'y', 'z'],
      dtype='object')

In [5]:
# Show the distinct levels of each categorical column, one array per line
print(*(data[name].unique() for name in ('cut', 'color', 'clarity')), sep='\n')

['Ideal' 'Premium' 'Good' 'Very Good' 'Fair']
['E' 'I' 'J' 'H' 'F' 'G' 'D']
['SI2' 'SI1' 'VS1' 'VS2' 'VVS2' 'VVS1' 'I1' 'IF']


In [6]:
# One-hot encode the three categorical columns so sklearn receives numbers only.
# get_dummies replaces each listed column with '<column>_<level>' indicator
# columns appended after the remaining columns; drop_first=True omits the first
# level of each column so the indicators are not redundant.
data = pd.get_dummies(data, columns=['cut', 'color', 'clarity'], drop_first=True)

In [7]:
# Preview the first rows to check the frame after the indicator columns were added
data.head()

Unnamed: 0.1,Unnamed: 0,carat,depth,table,price,x,y,z,cut_Good,cut_Ideal,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,1,0.23,61.5,55.0,326,3.95,3.98,2.43,0,1,...,0,0,0,0,0,1,0,0,0,0
1,2,0.21,59.8,61.0,326,3.89,3.84,2.31,0,0,...,0,0,0,0,1,0,0,0,0,0
2,3,0.23,56.9,65.0,327,4.05,4.07,2.31,1,0,...,0,0,0,0,0,0,1,0,0,0
3,4,0.29,62.4,58.0,334,4.2,4.23,2.63,0,0,...,0,1,0,0,0,0,0,1,0,0
4,5,0.31,63.3,58.0,335,4.34,4.35,2.75,1,0,...,0,0,1,0,0,1,0,0,0,0


In [9]:
# Target: the diamond price.
y = data['price']

# Features: every column except the target and 'Unnamed: 0'. That column is the
# CSV's row counter (it runs 1..53940), not a property of the diamond, and in the
# first rows of the file the price rises with it -- keeping it would let the
# models read the target off the row position.
feature_frame = data.drop(columns=['price', 'Unnamed: 0'])

# Split BEFORE scaling so the scaler's median/IQR come from the training rows
# only; fitting it on all rows would leak test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    feature_frame, y, test_size=.2, random_state=55)

robust_scaler = RobustScaler()
X_train = robust_scaler.fit_transform(X_train_raw)
X_test = robust_scaler.transform(X_test_raw)

# Keep X defined as the scaled full feature matrix (now scaled with the
# training-set statistics) for any later cell that still refers to it.
X = robust_scaler.transform(feature_frame)

In [10]:
# Results table: one row per metric, one column per model. Every cell starts
# empty (NaN) and is filled in as each model is evaluated.
metric_rows = ['train_mse', 'test_mse']
model_columns = ['KNN', 'Bagging', 'RandomForest', 'Boosting']
models = pd.DataFrame(index=metric_rows, columns=model_columns)

In [11]:
from sklearn.neighbors import KNeighborsRegressor

# 20-nearest-neighbour regressor: neighbours are weighted by inverse distance,
# distances are Euclidean, and the neighbour search uses all available cores.
knn = KNeighborsRegressor(n_neighbors=20, weights='distance', metric='euclidean', n_jobs=-1)
knn.fit(X_train, y_train)

# Score the fitted model on the rows it was trained on and on the held-out rows
knn_train_pred = knn.predict(X_train)
knn_test_pred = knn.predict(X_test)

models.loc['train_mse', 'KNN'] = mean_squared_error(y_true=y_train, y_pred=knn_train_pred)
models.loc['test_mse', 'KNN'] = mean_squared_error(y_true=y_test, y_pred=knn_test_pred)

In [13]:
from sklearn.ensemble import BaggingRegressor

# Base learner for the ensemble: same KNN settings as the standalone model, but
# without n_jobs because the parallelism is handled by the bagging wrapper.
knn_for_bagging = KNeighborsRegressor(n_neighbors=20, weights='distance', metric='euclidean')

# 15 KNN models, each fit on a bootstrap sample of the rows and a random 75% of
# the features. The base learner is passed positionally because its keyword was
# renamed between scikit-learn releases (base_estimator -> estimator); the
# positional form is accepted by both.
bagging = BaggingRegressor(knn_for_bagging, n_estimators=15, max_features=.75,
                           random_state=55, n_jobs=-1)

bagging.fit(X_train, y_train)

# Record both errors, like the other model cells do; previously only the
# training error was stored and the 'test_mse' entry for Bagging stayed NaN.
models.loc['train_mse', 'Bagging'] = mean_squared_error(y_pred=bagging.predict(X_train),
                                                        y_true=y_train)
models.loc['test_mse', 'Bagging'] = mean_squared_error(y_pred=bagging.predict(X_test),
                                                       y_true=y_test)


In [14]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 50 trees, each limited to depth 16, trained on all available cores;
# random_state pins the bootstrap and feature sampling so reruns match.
RF = RandomForestRegressor(n_estimators=50, max_depth=16, random_state=55, n_jobs=-1)
RF.fit(X_train, y_train)

# Score the fitted forest on the training rows and on the held-out rows
rf_train_pred = RF.predict(X_train)
rf_test_pred = RF.predict(X_test)

models.loc['train_mse', 'RandomForest'] = mean_squared_error(y_true=y_train, y_pred=rf_train_pred)
models.loc['test_mse', 'RandomForest'] = mean_squared_error(y_true=y_test, y_pred=rf_test_pred)