In [3]:
import pandas as pd
podaci = pd.read_csv('skupovi_podataka/diamonds.csv')
podaci.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
podaci["cut"].value_counts()

Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: cut, dtype: int64

In [5]:
podaci["color"].value_counts()

G    11292
E     9797
F     9542
H     8304
D     6775
I     5422
J     2808
Name: color, dtype: int64

In [6]:
podaci["clarity"].value_counts()

SI1     13065
VS2     12258
SI2      9194
VS1      8171
VVS2     5066
VVS1     3655
IF       1790
I1        741
Name: clarity, dtype: int64

Skup podataka sadrži kategoričke podatke (cut, color, clarity), a mi s njima ne možemo raditi kao sa  numeričkim podacima pa moramo svakoj vrijednosti dodijeliti neki broj (npr. Ideal = 1, Premium = 2, Good = 3, ...), odnosno moramo ih pretvoriti u kategoričke varijable.

Bitno je shvatiti da ovi brojevi nisu obične numeričke vrijednosti nego samo predstavljaju pripadajuću klasu. Kako bi sačuvali stare podatke nećemo mijenjati postojeće stupce nego ćemo dodati nove stupce sa sufiksom _cat.

In [7]:
podaci["cut_cat"] = podaci["cut"].astype('category').cat.codes
podaci["color_cat"] = podaci["color"].astype('category').cat.codes
podaci["clarity_cat"] = podaci["clarity"].astype('category').cat.codes
podaci.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price,x,y,z,cut_cat,color_cat,clarity_cat
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,2,1,3
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,3,1,2
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,1,1,4
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,3,5,5
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,1,6,3


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(podaci[['carat', 'depth', 'table', 'x', 'y', 'z', 'color_cat', 'cut_cat', 'clarity_cat']], podaci[['price']], random_state=2)

X_train.head()

Unnamed: 0,carat,depth,table,x,y,z,color_cat,cut_cat,clarity_cat
20404,1.51,62.6,59.0,7.31,7.33,4.58,6,3,7
14626,1.01,59.1,59.0,6.5,6.59,3.87,4,4,4
30081,0.32,62.6,54.0,4.41,4.37,2.75,5,2,7
8217,1.13,61.9,56.0,6.67,6.71,4.14,6,2,3
32762,0.34,59.6,59.0,4.57,4.55,2.72,0,3,2


In [13]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.8853629358300954

In [11]:
y_test[:3]

Unnamed: 0,price
30960,746
50149,2215
28326,666


In [10]:
lr.predict(X_test[:3]) # usporedi sa y_test[:3] (stvarnim vrijednostima)

array([[ 853.24033601],
       [3233.85414092],
       [ 176.28072324]])

In [12]:
from sklearn.linear_model import Ridge

rid = Ridge(alpha=100) # Isprobaj alpha = 0.01, = 10, = 1000
rid.fit(X_train, y_train)
rid.score(X_test, y_test)

0.8826339235233143

In [14]:
rid.predict(X_test[:3]) 

array([[ 860.54378928],
       [3358.27104445],
       [  48.70640067]])

In [15]:
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor(n_neighbors=2)
knr.fit(X_train, y_train)
knr.score(X_test, y_test)

0.9425523720203168

In [16]:
knr.predict(X_test[:3]) 

array([[ 544.5],
       [2274.5],
       [ 736.5]])

In [18]:
# Neuronska mreža (Multilayer Perceptron)
# U nastavku ćemo raditi sa dosta robusnijom NN.

from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(max_iter=10, alpha=1)
mlp.fit(X_train, y_train)
mlp.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.1856087456349681

In [19]:
mlp.predict(X_test[:3]) 

array([3756.8263517 , 3820.86551979, 3590.81948802])

In [20]:
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(max_iter=500, alpha=1) # Povećavamo na 500 iteracija
mlp.fit(X_train, y_train)
mlp.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.9362574472299919

In [21]:
mlp.predict(X_test[:3]) # Puno bolji rezultati.

array([ 975.243583  , 2829.8872757 ,  660.24800343])