Fit the diamonds dataset to a regression model to see whether a diamond's properties can predict its price.

In [1]:
import pandas as pd
# index_col = 0: use the CSV's first column as the row index so pandas does not
# add a second, redundant index column.
df = pd.read_csv("datasets/diamonds.csv", index_col = 0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


Convert the categorical columns (cut, color, clarity) into numerical values so the model can use them.

In [2]:
# List the distinct cut grades that need a numeric encoding.
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [3]:
# Arbitrary classes: cat.codes numbers the labels without regard to cut quality
# (in the output below Good -> 1, Ideal -> 2, Premium -> 3), so these codes carry
# no meaningful order for a regression model.
df['cut'].astype("category").cat.codes

1        2
2        3
3        1
4        3
5        1
6        4
7        4
8        4
9        0
10       4
11       1
12       2
13       3
14       2
15       3
16       3
17       2
18       1
19       1
20       4
21       1
22       4
23       4
24       4
25       4
26       4
27       3
28       4
29       4
30       4
        ..
53911    3
53912    3
53913    3
53914    1
53915    1
53916    2
53917    1
53918    4
53919    3
53920    2
53921    4
53922    4
53923    4
53924    2
53925    2
53926    2
53927    2
53928    1
53929    3
53930    2
53931    3
53932    3
53933    4
53934    4
53935    3
53936    2
53937    1
53938    4
53939    3
53940    2
Length: 53940, dtype: int8

In [4]:
# Ordinal encodings for the three categorical columns. Each grade is mapped to
# its 1-based position in the list, so the codes increase in the order listed
# (presumably worst-to-best quality -- confirm against the grading scales).
cut_class_dict = {
    grade: code
    for code, grade in enumerate(
        ["Fair", "Good", "Very Good", "Premium", "Ideal"], start=1
    )
}
clarity_dict = {
    grade: code
    for code, grade in enumerate(
        ["I3", "I2", "I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF", "FL"],
        start=1,
    )
}
color_dict = {
    grade: code
    for code, grade in enumerate(["J", "I", "H", "G", "F", "E", "D"], start=1)
}

In [5]:
# Replace each categorical label with its integer code. Labels missing from a
# dictionary become NaN, which is how Series.map treats unmapped values.
df = df.assign(
    cut=df["cut"].map(cut_class_dict),
    clarity=df["clarity"].map(clarity_dict),
    color=df["color"].map(color_dict),
)

In [6]:
# Check that cut, color and clarity now hold their integer codes.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [12]:
import sklearn
from sklearn import svm, preprocessing

The first step is to shuffle the dataset in case its rows are sorted in some way, so that the rows held out for testing are not a biased slice of the data.

In [13]:
# Shuffle the rows with a fixed seed so that a fresh Restart & Run All always
# produces the same row order, and therefore the same train/test split and scores.
df = sklearn.utils.shuffle(df, random_state=42)

# Features: every column except the target.
X = df.drop("price", axis = 1).values
# Standardise each feature column (zero mean, unit variance); SVMs are sensitive
# to feature scale.
# NOTE(review): the scaling statistics are computed over all rows, including the
# rows that are later held out as the test set. Fitting the scaler on the
# training rows only would avoid leaking test information.
X = preprocessing.scale(X)
# Target: the price column as a plain array.
y = df['price'].values

In [14]:
# Inspect the scaled feature matrix.
X

array([[ 0.25750736,  0.08588908, -0.8264134 , ...,  0.59624924,
         0.53888544,  0.3135449 ],
       [-0.20662095, -0.80969515, -1.41427211, ..., -0.05451945,
        -0.00396275, -0.02654667],
       [-1.02939387, -0.80969515, -0.8264134 , ..., -1.26691043,
        -1.20348215, -1.1743557 ],
       ...,
       [-0.81842646, -0.80969515,  0.34930404, ..., -0.87466629,
        -0.83574628, -0.7775822 ],
       [ 0.21531388,  0.08588908, -2.00213083, ...,  0.37338325,
         0.30248381,  0.39856779],
       [-1.0926841 , -0.80969515,  0.93716275, ..., -1.38280074,
        -1.38735009, -1.28771955]])

In [15]:
# Hold out the final `test_size` rows for evaluation and train on the rest.
test_size = 200
X_train, X_test = X[:-test_size], X[-test_size:]
y_train, y_test = y[:-test_size], y[-test_size:]

In [16]:
# `clf` is the conventional short name for a classifier, but this model is a
# support-vector *regressor* (SVR) with a linear kernel: it predicts a
# continuous price, not a class.
clf = svm.SVR(kernel = "linear")
clf.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
# R-squared (coefficient of determination) on the held-out rows.
clf.score(X_test, y_test)

0.8693292902633908

In [20]:
# Print the linear model's prediction next to the true price for each held-out row.
# The loop variables are deliberately not named X / y: those names hold the full
# feature matrix and target vector, and rebinding them here would leave them
# pointing at a single test row, breaking any later re-run of the split cell.
# The whole test set is predicted in one call instead of one call per row.
for predicted, actual in zip(clf.predict(X_test), y_test):
    print(f"Model: {predicted}, Acutal: {actual}")

Model: 5103.999583281298, Acutal: 5497
Model: -111.64802379724233, Acutal: 544
Model: 1008.9507786829331, Acutal: 1094
Model: 1601.045018097181, Acutal: 1746
Model: 9251.973162418368, Acutal: 12981
Model: 8784.91646196959, Acutal: 7811
Model: 2573.863418593498, Acutal: 2343
Model: 752.2174569518534, Acutal: 642
Model: 1516.1474234610419, Acutal: 1624
Model: 635.063463737245, Acutal: 906
Model: 8412.24158836202, Acutal: 9494
Model: 6069.123855600466, Acutal: 6180
Model: 323.9434223201938, Acutal: 603
Model: 2922.8988749966493, Acutal: 2818
Model: 2498.8896055031964, Acutal: 2120
Model: 534.7709719539002, Acutal: 928
Model: 1001.803152659807, Acutal: 845
Model: 6364.13850520183, Acutal: 8858
Model: 4346.922637143377, Acutal: 4315
Model: 12227.07666645158, Acutal: 16914
Model: 2763.0488702003577, Acutal: 2515
Model: 1065.5348843319493, Acutal: 810
Model: 759.2667797596491, Acutal: 591
Model: 959.1726561276628, Acutal: 1000
Model: 947.7322205560381, Acutal: 1035
Model: 2166.482789353596, A

In [21]:
# Second model for comparison: the same support-vector regressor, but with an
# RBF kernel; all other hyperparameters are left at their defaults.
clf2 = svm.SVR(kernel = 'rbf')
clf2.fit(X_train, y_train)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [22]:
# Print the RBF model's prediction next to the true price for each held-out row.
# The loop variables are deliberately not named X / y: those names hold the full
# feature matrix and target vector, and rebinding them here would leave them
# pointing at a single test row, breaking any later re-run of the split cell.
# The whole test set is predicted in one call instead of one call per row.
for predicted, actual in zip(clf2.predict(X_test), y_test):
    print(f"Model: {predicted}, Acutal: {actual}")

Model: 4193.778233488596, Acutal: 5497
Model: 966.567719541325, Acutal: 544
Model: 959.0002078646398, Acutal: 1094
Model: 1261.3734623381374, Acutal: 1746
Model: 6572.504013660244, Acutal: 12981
Model: 6528.014623295886, Acutal: 7811
Model: 2749.612025914901, Acutal: 2343
Model: 1221.9631460802466, Acutal: 642
Model: 1449.6693852927326, Acutal: 1624
Model: 691.1189060995453, Acutal: 906
Model: 6825.251466576901, Acutal: 9494
Model: 6195.554785244762, Acutal: 6180
Model: 680.2716884376514, Acutal: 603
Model: 2769.5547726707036, Acutal: 2818
Model: 2793.2998687591244, Acutal: 2120
Model: 677.3672638808052, Acutal: 928
Model: 1224.1896531490893, Acutal: 845
Model: 6290.473689423823, Acutal: 8858
Model: 3930.6940833757867, Acutal: 4315
Model: 5189.019110252065, Acutal: 16914
Model: 2583.099278908936, Acutal: 2515
Model: 1229.723210842576, Acutal: 810
Model: 1094.2730320115847, Acutal: 591
Model: 1381.5999753173728, Acutal: 1000
Model: 996.7146336219162, Acutal: 1035
Model: 2047.16313288289

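No negative predictions this time, but the R² score (computed in the next cell) is much worse than the linear kernel's. That's why voting (ensemble) models, which combine several estimators, tend to give much better scores.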

In [23]:
# Score of the RBF model on the same held-out rows, for comparison with the
# linear model's score above.
clf2.score(X_test, y_test)

0.5616380115387061