# PLS Model

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression, PLSSVD
from sklearn.metrics import mean_squared_error

In [3]:
data_file = "Hitters.csv"

In [4]:
df = pd.read_csv(data_file).dropna()
df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A


In [5]:
x64 = df.drop(["Salary", "League", "Division", "NewLeague"], axis=1).astype("float64")
y = df["Salary"]
dummies = pd.get_dummies(df[["League", "Division", "NewLeague"]])
X = pd.concat([x64, dummies[["League_N", "Division_W", "NewLeague_N"]]], axis=1)
X.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,League_N,Division_W,NewLeague_N
1,315.0,81.0,7.0,24.0,38.0,39.0,14.0,3449.0,835.0,69.0,321.0,414.0,375.0,632.0,43.0,10.0,1,1,1
2,479.0,130.0,18.0,66.0,72.0,76.0,3.0,1624.0,457.0,63.0,224.0,266.0,263.0,880.0,82.0,14.0,0,1,0
3,496.0,141.0,20.0,65.0,78.0,37.0,11.0,5628.0,1575.0,225.0,828.0,838.0,354.0,200.0,11.0,3.0,1,0,1
4,321.0,87.0,10.0,39.0,42.0,30.0,2.0,396.0,101.0,12.0,48.0,46.0,33.0,805.0,40.0,4.0,1,0,1
5,594.0,169.0,4.0,74.0,51.0,35.0,11.0,4408.0,1133.0,19.0,501.0,336.0,194.0,282.0,421.0,25.0,0,1,0


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3232)
pls_model = PLSRegression().fit(X_train, y_train)
#print(pls_model.intercept_)
print(pls_model.coef_)

[[ 23.41189525]
 [ 47.21536343]
 [  8.23578664]
 [ 39.6366971 ]
 [ 28.08139918]
 [ 35.9048357 ]
 [  3.69559952]
 [ 25.30165501]
 [ 38.05292994]
 [ 28.71762107]
 [ 39.83308613]
 [ 35.27335435]
 [ 16.57406637]
 [ 58.9205953 ]
 [ -2.13748543]
 [ -7.52410175]
 [ 29.56575798]
 [-56.35741172]
 [ 23.08841141]]


# Tahmin

In [13]:
pls_model.predict(X_train)

array([[ 6.59215662e+02],
       [ 3.13051401e+02],
       [ 6.62410444e+02],
       [ 5.93143456e+02],
       [ 4.67470984e+02],
       [ 4.08432965e+02],
       [ 7.31269374e+02],
       [ 3.51712444e+02],
       [ 7.45605418e+02],
       [ 2.89663303e+01],
       [-2.80546577e+01],
       [ 7.68515658e+02],
       [ 4.99753960e+02],
       [ 1.97330810e+02],
       [-1.68075006e+01],
       [ 3.73804015e+02],
       [ 6.08840426e+02],
       [ 3.55984728e+02],
       [ 7.55265629e+02],
       [ 9.27557378e+02],
       [ 2.61000154e+02],
       [ 7.80840005e+02],
       [ 4.40365083e+02],
       [ 1.31445314e+02],
       [ 1.49975749e+02],
       [ 3.44815480e+02],
       [ 4.13494103e+02],
       [ 5.48794914e+02],
       [ 1.02810709e+03],
       [ 1.18915579e+03],
       [ 1.00853693e+03],
       [ 2.69034294e+02],
       [ 8.78176624e+02],
       [ 6.46894072e+02],
       [ 2.03518308e+02],
       [ 6.35015566e+02],
       [ 1.85509599e+02],
       [ 9.16395500e+02],
       [ 4.4

In [17]:
y_pred = pls_model.predict(X_train)
np.sqrt(mean_squared_error(y_train, y_pred))

335.3100497983514

In [18]:
y_pred = pls_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

300.8141648358929

# Model Tuning

In [19]:
pls_model = PLSRegression(n_components=2).fit(X_train, y_train)

In [20]:
y_pred = pls_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

300.8141648358929