In [2]:
import pandas as pd
import os 

os.chdir("d:\\programs\\ai-ml-da\\pandas-learn\\")
print(os.getcwd())

d:\programs\ai-ml-da\pandas-learn


In [3]:
diamonds_df = pd.read_csv("datasets/diamonds.csv", index_col=0)
diamonds_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
print(diamonds_df.shape)
print(diamonds_df.columns)

(53940, 10)
Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')


In [5]:
# see how many different types of cuts there are in the dataset
diamonds_df["cut"].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [6]:
# see different classes of clarity there are in the dataset
diamonds_df["clarity"].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

## method to let pandas arbitrarily map coded ints to to each category in the df which is 

In [7]:
# Series (1D array)

# diamonds_df["cut"].astype("category").cat.codes

In [8]:
# but we will create codings manually
cut_classes_dict = {"Fair" : 1, "Good" : 2, "Very Good" : 3, "Premium" : 4, "Ideal" : 5}
clarity_classes_dict = {"I1" : 1, "S12" : 2, "SI1" : 3, "VS2" : 4, "VS1" : 5, "VVS2" : 6, "VVS1" : 7, "IF" : 8}
color_classes_dict = {"J" : 1, "I" : 2, "H" : 3, "G" : 4, "F" : 5, "E" : 6, "D" : 7}

# map
diamonds_df["cut"] = diamonds_df["cut"].map(cut_classes_dict)
diamonds_df["clarity"] = diamonds_df["clarity"].map(clarity_classes_dict)
diamonds_df["color"] = diamonds_df["color"].map(color_classes_dict)

In [9]:
diamonds_df.dropna(inplace=True)
diamonds_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
2,0.21,4,6,3.0,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,5.0,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,4.0,62.4,58.0,334,4.2,4.23,2.63
6,0.24,3,1,6.0,62.8,57.0,336,3.94,3.96,2.48
7,0.24,3,2,7.0,62.3,57.0,336,3.95,3.98,2.47


In [10]:
import sklearn
from sklearn import svm, preprocessing

diamonds_df = sklearn.utils.shuffle(diamonds_df)

# feeding in our feature set
X = diamonds_df.drop("price", axis=1).values # we drop the Price col as we are projecting Price
                                                # so we are getting vals of all cols except Price
Y = diamonds_df["price"].values

In [11]:
X

array([[0.76, 2.  , 4.  , ..., 5.72, 5.8 , 3.72],
       [0.54, 5.  , 4.  , ..., 5.24, 5.26, 3.25],
       [0.2 , 4.  , 7.  , ..., 3.77, 3.72, 2.31],
       ...,
       [1.87, 5.  , 1.  , ..., 7.88, 7.82, 4.89],
       [1.14, 5.  , 3.  , ..., 6.72, 6.74, 4.17],
       [0.72, 4.  , 3.  , ..., 5.73, 5.69, 3.58]])

In [12]:
import numpy as np
print(np.shape(X))

(44746, 9)


In [13]:
# scaling the data using PreProcessing (simplifying data between 0 and 1 - Linear Coding)
X = sklearn.preprocessing.scale(X)
X
# data has successfully been scaled

array([[ 0.0440482 , -1.77057821, -0.23235407, ...,  0.1167346 ,
         0.18602742,  0.38754207],
       [-0.45208536,  0.94945106, -0.23235407, ..., -0.32602859,
        -0.31185816, -0.29861392],
       [-1.21883722,  0.04277464,  1.54307542, ..., -1.68199087,
        -1.73175408, -1.67092591],
       ...,
       [ 2.54726751,  0.94945106, -2.00778355, ...,  2.10916897,
         2.0484883 ,  2.09563253],
       [ 0.90100616,  0.94945106, -0.82416389, ...,  1.03915792,
         1.05271714,  1.04449994],
       [-0.0461579 ,  0.04277464, -0.82416389, ...,  0.12595884,
         0.08460629,  0.18315518]])

In [14]:
test_size = 200

# things the model is gonna fit against (train on)
X_train = X[:-test_size] # [:-test_size] FIRST 200 ROWS
Y_train = Y[:-test_size]

X_test = X[-test_size:]  # LAST 200 ROWS
Y_test = Y[-test_size:] 


# defining the SVR (Support Vector Machine) Classifier:
clf = svm.SVR(kernel="linear")
clf.fit(X_train, Y_train)

In [15]:
print(clf.score(X_test, Y_test))

0.867855798954309


In [22]:
# now lets step through the model and see how it is making predictions along with an accuracy for each test 
for X, Y in zip(X_test, Y_test):
    print(f"Model predicts: {clf.predict([X])[0]}, Actual value: {Y}")

Model predicts: 673.4155223440139, Actual value: 929
Model predicts: 9710.193016534718, Actual value: 12437
Model predicts: 575.9644407146625, Actual value: 873
Model predicts: 3219.7465797962586, Actual value: 2575
Model predicts: 2316.6515240585713, Actual value: 2030
Model predicts: 4707.044863150064, Actual value: 4381
Model predicts: 922.8616323455235, Actual value: 868
Model predicts: 1364.6055853031385, Actual value: 1103
Model predicts: 4374.5454948775705, Actual value: 3863
Model predicts: 3479.7831740646147, Actual value: 3354
Model predicts: 774.15905549121, Actual value: 694
Model predicts: 4617.21925597679, Actual value: 5595
Model predicts: 11886.142790716753, Actual value: 7294
Model predicts: 3909.841870526734, Actual value: 3388
Model predicts: 597.2012681958104, Actual value: 904
Model predicts: 767.9093591417118, Actual value: 895
Model predicts: 2102.6078830398637, Actual value: 1802
Model predicts: 4245.7823289755015, Actual value: 3780
Model predicts: 832.57856019

In [23]:
clf2 = svm.SVR(kernel="rbf")
clf2.fit(X_train, Y_train)

print(clf2.score(X_test, Y_test))


0.6525206306140094


In [25]:
for X, Y in zip(X_test, Y_test):
    print(f"Model predicts: {clf2.predict([X])[0]}, Actual Value: {Y}")

Model predicts: 841.1254186240471, Actual Value: 929
Model predicts: 5277.302858468897, Actual Value: 12437
Model predicts: 934.0414181053247, Actual Value: 873
Model predicts: 2566.7257448818827, Actual Value: 2575
Model predicts: 2013.3836471727066, Actual Value: 2030
Model predicts: 3697.256423280345, Actual Value: 4381
Model predicts: 1101.1000684644678, Actual Value: 868
Model predicts: 1210.186210081032, Actual Value: 1103
Model predicts: 4083.312593298068, Actual Value: 3863
Model predicts: 3249.6779055503907, Actual Value: 3354
Model predicts: 1225.054867435691, Actual Value: 694
Model predicts: 4469.7013036724, Actual Value: 5595
Model predicts: 3328.5122918142006, Actual Value: 7294
Model predicts: 3742.4276373602725, Actual Value: 3388
Model predicts: 848.8087701271843, Actual Value: 904
Model predicts: 1193.7742710125247, Actual Value: 895
Model predicts: 1841.6168113293695, Actual Value: 1802
Model predicts: 3989.2682775774783, Actual Value: 3780
Model predicts: 694.608902

## We can see different types of models for our classifiers give different accuracies, where e.g. In this case kernel=linear regression gives better accuracy than kernel=rbf

## So overall, we covered the basics of analyzing data using pandas so now from here to get a deeper understanding, we should dive deeper into the intricacies behind how these models work and the algorithms used to create them.