In [1]:
import pandas as pd

In [2]:
# Load the diamonds table; the first CSV column is used as the row index
# instead of being kept as a feature column.
df = pd.read_csv("dataset/diamonds.csv", index_col=0)

In [3]:
# Preview the first five rows to confirm the columns parsed as expected.
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
# List the distinct cut labels so the encoding dictionary can cover all of them.
df["cut"].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [5]:
# List the distinct color labels so the encoding dictionary can cover all of them.
df["color"].unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [6]:
# List the distinct clarity labels so the encoding dictionary can cover all of them.
df["clarity"].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [7]:
# Ordinal encoding for the `cut` column: each label maps to its 1-based
# position in this list (presumably ordered from lowest to highest grade).
cut_class_dict = {
    grade: rank
    for rank, grade in enumerate(
        ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'], start=1
    )
}

In [8]:
# Ordinal encoding for the `clarity` column: each label maps to its 1-based
# position in this list (presumably ordered from lowest to highest grade).
clarity_dict = {
    grade: rank
    for rank, grade in enumerate(
        ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'], start=1
    )
}

In [9]:
# Ordinal encoding for the `color` column: each letter maps to its 1-based
# position in this list (presumably ordered from lowest to highest grade).
color_dict = {
    grade: rank
    for rank, grade in enumerate(['J', 'I', 'H', 'G', 'F', 'E', 'D'], start=1)
}

In [10]:
# Replace the cut labels with their integer ranks.
# Series.map yields NaN for every value that is not a dictionary key. That
# includes the already-encoded integers if this cell is executed a second time,
# so check the result before overwriting the column instead of silently
# destroying it.
cut_encoded = df['cut'].map(cut_class_dict)
assert cut_encoded.notna().all(), "unmapped 'cut' values (cell re-run or unexpected label?)"
df['cut'] = cut_encoded

In [11]:
# Replace the clarity labels with their integer ranks.
# Series.map yields NaN for every value that is not a dictionary key. That
# includes the already-encoded integers if this cell is executed a second time,
# so check the result before overwriting the column instead of silently
# destroying it.
clarity_encoded = df['clarity'].map(clarity_dict)
assert clarity_encoded.notna().all(), "unmapped 'clarity' values (cell re-run or unexpected label?)"
df['clarity'] = clarity_encoded

In [12]:
# Replace the color labels with their integer ranks.
# Series.map yields NaN for every value that is not a dictionary key. That
# includes the already-encoded integers if this cell is executed a second time,
# so check the result before overwriting the column instead of silently
# destroying it.
color_encoded = df['color'].map(color_dict)
assert color_encoded.notna().all(), "unmapped 'color' values (cell re-run or unexpected label?)"
df['color'] = color_encoded

In [13]:
# Re-inspect the first five rows: cut, color and clarity should now be integers.
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,3,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,5,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,4,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,2,63.3,58.0,335,4.34,4.35,2.75


In [14]:
import sklearn

In [15]:
from sklearn import svm, preprocessing

In [16]:
# Shuffle the rows so that the tail slice taken later as the test set is a
# random sample (the preview above shows the file's prices in ascending order).
# A fixed random_state keeps the split, and so every score reported below,
# reproducible across kernel restarts.
df = sklearn.utils.shuffle(df, random_state=42)

In [17]:
# Feature matrix: every column except the regression target, as a NumPy array.
X = df.drop(columns="price").values

In [18]:
# Standardise each feature column (axis 0) to zero mean and unit variance.
# NOTE(review): this runs before the train/test slices are taken, so the rows
# later held out for testing also contribute to the mean/variance. Consider
# fitting the scaler on the training rows only.
X = preprocessing.scale(X, axis=0)

In [19]:
# Regression target: the price column, row-aligned with X.
y = df["price"].values

In [20]:
# Number of rows taken from the end of the shuffled data as the hold-out test set.
test_size = 200

In [21]:
# Training features: every row except the final `test_size` rows.
X_train = X[:-test_size]

In [22]:
# Training targets, sliced the same way as X_train so the rows stay aligned.
y_train = y[:-test_size]

In [23]:
# Test features: the final `test_size` rows.
X_test = X[-test_size:]

In [24]:
# Test targets, sliced the same way as X_test so the rows stay aligned.
y_test = y[-test_size:]

In [25]:
# First model: support vector regression with a linear kernel; every other
# hyperparameter is left at the library default.
clf = svm.SVR(kernel="linear")

In [26]:
# Train on the training split; the fitted estimator's repr is the cell output.
clf.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [27]:
# Evaluate on the held-out rows (scikit-learn regressors report R^2 from `score`).
clf.score(X_test, y_test)

0.8755270509552836

In [28]:
# Show each prediction next to the true price.
# Predict the whole test set in a single call rather than once per row, and
# use loop names that do not overwrite the notebook-level `X` / `y` arrays
# (the feature matrix and target vector), so earlier cells stay re-runnable.
predicted_prices = clf.predict(X_test)
for predicted, actual in zip(predicted_prices, y_test):
    print(f"Model: {predicted}, Actual:{actual}")

Model: 4693.166229608753, Actual:3718
Model: 450.1891951000207, Actual:900
Model: 599.4730316188657, Actual:789
Model: 190.28981043865133, Actual:802
Model: 1612.785267289528, Actual:1577
Model: 3251.1249387785842, Actual:3247
Model: 625.1167858566564, Actual:752
Model: -237.41292766609013, Actual:628
Model: 5889.947087156641, Actual:4773
Model: 1915.353474376255, Actual:1689
Model: 8949.33393304845, Actual:14220
Model: 8413.880048152432, Actual:7953
Model: 1138.621827427014, Actual:1064
Model: 187.94577035184602, Actual:625
Model: 3047.4208206772482, Actual:2232
Model: 1707.2717250880514, Actual:1791
Model: 3531.970016358365, Actual:3136
Model: 1657.002240367134, Actual:1746
Model: 3101.0748412058765, Actual:2553
Model: 5969.235788093933, Actual:9160
Model: 388.72494248420116, Actual:854
Model: 99.46884344106411, Actual:612
Model: 2755.23803261363, Actual:1998
Model: 708.8861894506854, Actual:905
Model: 6216.51678135677, Actual:5398
Model: -504.33057620794807, Actual:523
Model: 10683.

In [29]:
# Second model: the same regressor with an RBF kernel, for comparison.
# Rebinding `clf` discards the linear model fitted above.
clf = svm.SVR(kernel="rbf")

In [30]:
# Fit on the same training split so both kernels are compared on identical data.
clf.fit(X_train, y_train)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [31]:
# Evaluate on the same held-out rows (scikit-learn regressors report R^2 from `score`).
clf.score(X_test, y_test)

0.662374833326965

In [32]:
# Show each RBF-model prediction next to the true price.
# Predict the whole test set in a single call rather than once per row, and
# use loop names that do not overwrite the notebook-level `X` / `y` arrays
# (the feature matrix and target vector), so earlier cells stay re-runnable.
predicted_prices = clf.predict(X_test)
for predicted, actual in zip(predicted_prices, y_test):
    print(f"Model:{predicted}, Actual:{actual}")

Model:4240.805326180425, Actual:3718
Model:817.1971360255288, Actual:900
Model:749.7028493776188, Actual:789
Model:381.05911872699653, Actual:802
Model:1267.0642248844233, Actual:1577
Model:3033.750724144264, Actual:3247
Model:622.8992824232282, Actual:752
Model:1477.4343194122348, Actual:628
Model:5034.6942248047, Actual:4773
Model:1841.0016323722975, Actual:1689
Model:6931.153423655154, Actual:14220
Model:5531.987121140044, Actual:7953
Model:1165.9898703339786, Actual:1064
Model:466.9636054043058, Actual:625
Model:2935.545948422638, Actual:2232
Model:1858.5576992778642, Actual:1791
Model:3480.718982510479, Actual:3136
Model:1277.855914841968, Actual:1746
Model:3037.2040964744765, Actual:2553
Model:5891.719598917961, Actual:9160
Model:582.4746786738865, Actual:854
Model:1122.4048329107022, Actual:612
Model:2698.9776585063487, Actual:1998
Model:936.286526413393, Actual:905
Model:5301.062007129565, Actual:5398
Model:783.0634142409235, Actual:523
Model:4769.492851664562, Actual:6632
Mode