## Çoklu Doğrusal Regresyon

In [8]:
import pandas as pd
ad = pd.read_csv("Advertising.csv", usecols = [1,2,3,4])
df = ad.copy()
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [12]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

In [13]:
X = df.drop("sales", axis = 1) 
y = df["sales"]   

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [14]:
X_train.shape

(160, 3)

In [15]:
y_train.shape

(160,)

In [16]:
y_test.shape

(40,)

In [18]:
X_test.shape

(40, 3)

In [19]:
training = df.copy()

In [20]:
training.shape

(200, 4)

## Statsmodels

In [25]:
import pandas as pd

In [26]:
import statsmodels.api as sm

In [27]:
lm = sm.OLS(y_train, X_train)

In [28]:
model = lm.fit() 
model.summary()

0,1,2,3
Dep. Variable:,sales,R-squared (uncentered):,0.982
Model:,OLS,Adj. R-squared (uncentered):,0.982
Method:,Least Squares,F-statistic:,2935.0
Date:,"Wed, 28 Apr 2021",Prob (F-statistic):,1.28e-137
Time:,22:53:56,Log-Likelihood:,-336.65
No. Observations:,160,AIC:,679.3
Df Residuals:,157,BIC:,688.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TV,0.0531,0.001,36.467,0.000,0.050,0.056
radio,0.2188,0.011,20.138,0.000,0.197,0.240
newspaper,0.0239,0.008,3.011,0.003,0.008,0.040

0,1,2,3
Omnibus:,11.405,Durbin-Watson:,1.895
Prob(Omnibus):,0.003,Jarque-Bera (JB):,15.574
Skew:,-0.432,Prob(JB):,0.000415
Kurtosis:,4.261,Cond. No.,13.5


In [None]:
# f istatistiği modelin anlamlılığını belirler. 
# r ve r kare bağımlı değişkenlerin bağımsız değişkenlerce açıklanabilirliğini verir. 

In [31]:
#özel bir kesit almak istersek
model.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TV,0.0531,0.001,36.467,0.000,0.050,0.056
radio,0.2188,0.011,20.138,0.000,0.197,0.240
newspaper,0.0239,0.008,3.011,0.003,0.008,0.040


In [30]:
model.summary().tables[0]

0,1,2,3
Dep. Variable:,sales,R-squared (uncentered):,0.982
Model:,OLS,Adj. R-squared (uncentered):,0.982
Method:,Least Squares,F-statistic:,2935.0
Date:,"Wed, 28 Apr 2021",Prob (F-statistic):,1.28e-137
Time:,22:58:01,Log-Likelihood:,-336.65
No. Observations:,160,AIC:,679.3
Df Residuals:,157,BIC:,688.5
Df Model:,3,,
Covariance Type:,nonrobust,,


## Tahmin Başarısı:

scikit-learn model

In [38]:
from sklearn.linear_model import LinearRegression

In [39]:
lm = LinearRegression() 
model = lm.fit(X_train, y_train)

In [41]:
#sabit katsayı 
model.intercept_

2.979067338122629

In [43]:
#diğer tüm kat sayılar için 
model.coef_  #bağımsız değişken katsayıları

array([0.04472952, 0.18919505, 0.00276111])

## Tahmin

model denklemi: 

Sales = 2.97 + TV0.04 + radio0.18 + newspaper*0.002 

Örnegin 30 birim Tv harcaması, 10 birim radio harcaması, 40 birim gazete harcaması olduğunda tahmini değeri ne olur ?

In [45]:
yeni_veri = [[30], [10], [40]] 
yeni_veri = pd.DataFrame(yeni_veri).T

In [48]:
model.predict(yeni_veri) # satışların beklenen değeri

array([6.32334798])

In [58]:
import numpy as np 
from sklearn.metrics import mean_squared_error

In [59]:
#tahmin başarısı nedir ? #karesi alınır
rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))

In [61]:
rmse #eğiitim hatası

1.644727765644337

In [None]:
# test hatası hesaplama 
rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

In [62]:
rmse #test seti hatası

1.644727765644337

## Model Tuning / Model Doğrulama

In [63]:
cross_val_score(model, X, y , cv = 10, scoring = "r2")

array([0.87302696, 0.8581613 , 0.92968723, 0.89013272, 0.93146498,
       0.93138735, 0.7597901 , 0.91217097, 0.83891753, 0.92882311])

In [64]:
cross_val_score(model, X, y , cv = 10, scoring = "r2").mean()

0.8853562237979616

In [66]:
cross_val_score(model, X_train, y_train , cv = 10, scoring = "r2").mean()

0.791354859691634

In [68]:
cross_val_score(model, X_train, y_train , cv = 10, scoring = "neg_mean_squared_error")

array([-5.57303426, -2.86235681, -2.06504684, -1.09186983, -1.66159795,
       -2.50694042, -2.92821679, -2.01207197, -7.2250041 , -1.66156243])

In [70]:
np.sqrt(-cross_val_score(model,  
                         X_train, 
                         y_train ,  
                         cv = 10,  
                         scoring = "neg_mean_squared_error"))

array([2.36072749, 1.69185011, 1.43702708, 1.04492575, 1.28902985,
       1.58333206, 1.71120332, 1.41847523, 2.68793677, 1.28901607])

In [71]:
np.sqrt(-cross_val_score(model,  
                         X_train, 
                         y_train ,  
                         cv = 10,  
                         scoring = "neg_mean_squared_error")).mean()

1.6513523730313335

In [72]:
np.sqrt(-cross_val_score(model,  
                         X_test, 
                         y_test ,  
                         cv = 10,  
                         scoring = "neg_mean_squared_error")).mean()

1.8462778823997095

In [75]:
#train ile test hatası farklılaşıyor 
#eğitim hatası ile test hatası farklı yani