In [1]:
"""Machine Learning with Scikit-learn - Data Analysis with Python 3 and Pandas"""
import pandas as pd 
df = pd.read_csv("diamonds.csv",index_col=0)
df.tail()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.5
53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53938,0.7,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74
53940,0.75,Ideal,D,SI2,62.2,55.0,2757,5.83,5.87,3.64


In [10]:
df["cut"].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [11]:
#in machine learning everything has to be in numerical values
#takes the amount of uniques there are and assigns it codes bad on that, starting from 0 1 2 3 and so on
df["cut"].astype("category").cat.codes
#but since in our cuts there's different types and orders of it, we want to keep it 

1        2
2        3
3        1
4        3
5        1
        ..
53936    2
53937    1
53938    4
53939    3
53940    2
Length: 53940, dtype: int8

In [12]:
#what we're mapping to our columns
cut_class_dict = {"Fair": 1, "Good": 2, "Very Good": 3, "Premium": 4, "Ideal": 5}
clarity_dict = {"I3": 1, "I2": 2, "I1": 3, "SI2": 4, "SI1": 5, "VS2": 6, "VS1": 7, "VVS2": 8, "VVS1": 9, "IF": 10, "FL": 11}
color_dict = {"J": 1,"I": 2,"H": 3,"G": 4,"F": 5,"E": 6,"D": 7}

df['cut'] = df['cut'].map(cut_class_dict)
df['clarity'] = df['clarity'].map(clarity_dict)
df['color'] = df['color'].map(color_dict)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [16]:
import sklearn 
from sklearn import svm, preprocessing

#when training a model it is always a good idea to shuffle the data first
df = sklearn.utils.shuffle(df)

"""
Scaling can help simplify our data for our model and so our model can digest it better
"""

#In machine learning, the standard is typically featuresets are stored as a capital X and labels as a lowercase y.
#featureset: the list of features that points to the labels, basically everything except for pirce 
#labels: the price in this case because we are traning a model for price
#similar to sentiment analysis, where we label our texts as either as neg or pos cuz we're looking for the sentiment
X=df.drop("price",axis=1).values
X=preprocessing.scale(X)
y=df['price'].values

test_size=200

X_train=X[:-test_size] #up to the last 200
y_train=y[:-test_size]
#we are testing it on datas that wasn't in the training datas
X_test=X[-test_size:] #last 200
y_test=y[-test_size:]

clf = svm.SVR(kernel="linear")
#training
clf.fit(X_train,y_train)

# df['cut'].values

SVR(kernel='linear')

In [15]:
X
#if we hadn't done index_col=0 then the index would have been one of columns and it could mess up our data

array([[0.61, 5.  , 5.  , ..., 5.38, 5.44, 3.37],
       [0.51, 5.  , 5.  , ..., 5.14, 5.15, 3.17],
       [0.4 , 2.  , 6.  , ..., 4.69, 4.65, 2.97],
       ...,
       [0.31, 5.  , 6.  , ..., 4.35, 4.39, 2.68],
       [0.41, 5.  , 4.  , ..., 4.81, 4.84, 2.97],
       [0.9 , 2.  , 3.  , ..., 6.11, 6.09, 3.88]])

In [17]:
#0: bad, 1:good
clf.score(X_test,y_test)

0.8256020879842636

In [19]:
#seeing the actual predictions made
for X,y in list(zip(X_test,y_test)):
    #in predict always pass a list and it will always return a list, even if only 1 thing was wanted
    #X is the what the computer predicted, y is the actual thing
    print(f"Model: {clf.predict([X])[0]}, Actual:{y}")

Model: 5516.083550708801, Actual:3964
Model: 11281.856974039298, Actual:6002
Model: 4650.503495336751, Actual:3669
Model: 4919.378439301233, Actual:4578
Model: 3164.2841766384527, Actual:2657
Model: 4407.737665014716, Actual:3977
Model: -351.96142675384954, Actual:412
Model: 4348.640458159056, Actual:4788
Model: 8556.800566115911, Actual:10935
Model: -73.27264555673719, Actual:596
Model: 3181.1025241236816, Actual:2482
Model: 2203.7325993520944, Actual:1942
Model: 366.7732706629836, Actual:625
Model: 822.8047598072767, Actual:1013
Model: 3281.0155226736497, Actual:3615
Model: 1093.4475157872325, Actual:764
Model: 63.15372744774322, Actual:472
Model: 3253.7854345434057, Actual:3553
Model: 5721.609848871403, Actual:4542
Model: 281.9608077811381, Actual:743
Model: 189.4843242811262, Actual:573
Model: 2528.9119365036613, Actual:2189
Model: 12028.516817783724, Actual:14837
Model: 3380.458747013714, Actual:2867
Model: 8194.882019607572, Actual:11956
Model: 2567.0772574357243, Actual:1985
Mod

In [20]:
clf = svm.SVR(kernel="rbf")
#training
clf.fit(X_train,y_train)

SVR()

In [22]:
print(clf.score(X_test,y_test))
for X,y in list(zip(X_test,y_test)):
    print(f"Model: {clf.predict([X])[0]}, Actual:{y}")

0.5381994566260571
Model: 4509.797593923654, Actual:3964
Model: 4630.27796417502, Actual:6002
Model: 4079.248362625749, Actual:3669
Model: 4707.847323018979, Actual:4578
Model: 2902.486691389991, Actual:2657
Model: 4057.6425162994674, Actual:3977
Model: 1663.1139137599469, Actual:412
Model: 4031.961557882383, Actual:4788
Model: 6296.391289182459, Actual:10935
Model: 1275.0602092612203, Actual:596
Model: 3073.786585513257, Actual:2482
Model: 2149.4195241176276, Actual:1942
Model: 498.87587960091105, Actual:625
Model: 1216.3666371218824, Actual:1013
Model: 3082.2437208885426, Actual:3615
Model: 1008.2617464914892, Actual:764
Model: 791.1472695265411, Actual:472
Model: 3121.901208758623, Actual:3553
Model: 3916.6366001058354, Actual:4542
Model: 841.1963034833138, Actual:743
Model: 569.6226392542176, Actual:573
Model: 2150.116782760435, Actual:2189
Model: 6534.5615714923515, Actual:14837
Model: 3258.7052261783747, Actual:2867
Model: 7146.607635791175, Actual:11956
Model: 2749.787795451729,