In [1]:
# Common imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

# to make this notebook's output stable across runs
np.random.seed(7)

# To plot pretty figures
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
%matplotlib inline

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.figsize'] = (10, 5)

In [2]:
adv_df = pd.read_csv('/home/lena/Netology/Feature Engineering/Lecture_4_new/Practice_3_media_cut/Advertising.csv')
adv_df.head()

Unnamed: 0.1,Unnamed: 0,TV,radio,newspaper,sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [3]:
adv_df = adv_df.drop('Unnamed: 0', axis = 1)
adv_df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [4]:
adv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
TV           200 non-null float64
radio        200 non-null float64
newspaper    200 non-null float64
sales        200 non-null float64
dtypes: float64(4)
memory usage: 6.3 KB


# Домашнее задание
Разделить дата сет на трейн и тест в отношение 50:50 70:30 80:20 (с перемешиванием)
Обучать наши модели на трейне. Предсказывать и замерять метрику R^2 и на трейне и на тесте
Проверить следующие модели, для каждого разделения: а) sales ~ log_tv + radio б) sales ~ TV + radio в) sales ~ TV + radio + newspaper

# 1. Разделить дата сет на трейн и тест в отношение 50:50 70:30 80:20 (с перемешиванием)

In [5]:
x = adv_df.copy()
x.drop(['sales'], inplace=True, axis=1)
y = adv_df['sales'].copy()

In [6]:
x.head()

Unnamed: 0,TV,radio,newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
x_train_05, x_test_05, y_train_05, y_test_05 = train_test_split(x, y, test_size=0.5, shuffle = True, random_state=7)

In [9]:
x_train_07, x_test_03, y_train_07, y_test_03 = train_test_split(x, y, test_size=0.3, shuffle = True, random_state=7)

In [10]:
x_train_08, x_test_02, y_train_08, y_test_02 = train_test_split(x, y, test_size=0.2, shuffle = True, random_state=7)

# 2. Обучать наши модели на трейне. Предсказывать и замерять метрику R^2 и на трейне и на тесте

In [11]:
from sklearn.linear_model import LinearRegression

In [12]:
model = LinearRegression()

In [13]:
model.fit(x_train_05, y_train_05)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [14]:
from sklearn.metrics import r2_score
train_pred=model.predict(x_test_05)
r2_score(y_test_05, train_pred)

0.8915506500621205

In [15]:
# проверим корректность R2 вручную
import numpy as np
rss = 0
tss = 0
for i in zip(train_pred, y_test_05): 
    rss += (i[1]-i[0])**2
    tss += (i[1] - np.mean(y_test_05))**2   
print((tss-rss)/tss)

0.8915506500621205


In [16]:
print('R2 score for test:', model.score(x_test_05, y_test_05))
print('R2 score for train:', model.score(x_train_05, y_train_05))

R2 score for test: 0.8915506500621205
R2 score for train: 0.8971789551990024


# 3. Проверить следующие модели, для каждого разделения: а) sales ~ log_tv + radio б) sales ~ TV + radio в) sales ~ TV + radio + newspaper

# а) sales ~ log_tv + radio

In [17]:
import math

In [18]:
adv_df['log_tv'] = adv_df.TV.apply(lambda x: math.pow(x, 0.4))

In [19]:
adv_df.head()

Unnamed: 0,TV,radio,newspaper,sales,log_tv
0,230.1,37.8,69.2,22.1,8.805756
1,44.5,39.3,45.1,10.4,4.563983
2,17.2,45.9,69.3,9.3,3.120408
3,151.5,41.3,58.5,18.5,7.450151
4,180.8,10.8,58.4,12.9,7.996121


In [20]:
x = adv_df.copy()
x = x[['log_tv', 'radio']]
y = adv_df['sales'].copy()

In [21]:
# Разделим дата сет на трейн и тест в отношение 50:50

In [22]:
x_train_05, x_test_05, y_train_05, y_test_05 = train_test_split(x, y, test_size=0.5, shuffle = True, random_state=7)

In [23]:
model = LinearRegression()
model.fit(x_train_05, y_train_05)
train_pred=model.predict(x_test_05)
r2_score(y_test_05, train_pred)

0.9153956674965309

In [24]:
# Разделим дата сет на трейн и тест в отношение 70:30

In [25]:
x_train_07, x_test_03, y_train_07, y_test_03 = train_test_split(x, y, test_size=0.3, shuffle = True, random_state=7)

In [26]:
model = LinearRegression()
model.fit(x_train_07, y_train_07)
train_pred=model.predict(x_test_03)
r2_score(y_test_03, train_pred)

0.9279408976069344

In [27]:
# Разделим дата сет на трейн и тест в отношение 80:20

In [28]:
x_train_08, x_test_02, y_train_08, y_test_02 = train_test_split(x, y, test_size=0.2, shuffle = True, random_state=7)

In [29]:
model = LinearRegression()
model.fit(x_train_08, y_train_08)
train_pred=model.predict(x_test_02)
r2_score(y_test_02, train_pred)

0.9331248172508638

# б) sales ~ TV + radio

In [30]:
x2 = adv_df.copy()
x2 = x2[['TV', 'radio']]
y2 = adv_df['sales'].copy()

In [31]:
# Разделим дата сет на трейн и тест в отношение 50:50

In [32]:
x_train_05, x_test_05, y_train_05, y_test_05 = train_test_split(x2, y2, test_size=0.5, shuffle = True, random_state=7)

In [33]:
model = LinearRegression()
model.fit(x_train_05, y_train_05)
train_pred=model.predict(x_test_05)
r2_score(y_test_05, train_pred)

0.8919937852058213

In [34]:
# Разделим дата сет на трейн и тест в отношение 70:30

In [35]:
x_train_07, x_test_03, y_train_07, y_test_03 = train_test_split(x2, y2, test_size=0.3, shuffle = True, random_state=7)

In [36]:
model = LinearRegression()
model.fit(x_train_07, y_train_07)
train_pred=model.predict(x_test_03)
r2_score(y_test_03, train_pred)

0.8894561428492667

In [37]:
# Разделим дата сет на трейн и тест в отношение 80:20

In [38]:
x_train_08, x_test_02, y_train_08, y_test_02 = train_test_split(x2, y2, test_size=0.2, shuffle = True, random_state=7)

In [39]:
model = LinearRegression()
model.fit(x_train_08, y_train_08)
train_pred=model.predict(x_test_02)
r2_score(y_test_02, train_pred)

0.9095363908842858

# в) sales ~ TV + radio + newspaper

In [40]:
x3 = adv_df.copy()
x3 = x3[['TV', 'radio', 'newspaper']]
y3 = adv_df['sales'].copy()

In [41]:
# Разделим дата сет на трейн и тест в отношение 50:50

In [42]:
x_train_05, x_test_05, y_train_05, y_test_05 = train_test_split(x3, y3, test_size=0.5, shuffle = True, random_state=7)

In [43]:
model = LinearRegression()
model.fit(x_train_05, y_train_05)
train_pred=model.predict(x_test_05)
r2_score(y_test_05, train_pred)

0.8915506500621205

In [44]:
# Разделим дата сет на трейн и тест в отношение 70:30

In [45]:
x_train_07, x_test_03, y_train_07, y_test_03 = train_test_split(x3, y3, test_size=0.3, shuffle = True, random_state=7)

In [46]:
model = LinearRegression()
model.fit(x_train_07, y_train_07)
train_pred=model.predict(x_test_03)
r2_score(y_test_03, train_pred)

0.8894586465158203

In [47]:
# Разделим дата сет на трейн и тест в отношение 80:20

In [48]:
x_train_08, x_test_02, y_train_08, y_test_02 = train_test_split(x3, y3, test_size=0.2, shuffle = True, random_state=7)

In [49]:
model = LinearRegression()
model.fit(x_train_08, y_train_08)
train_pred=model.predict(x_test_02)
r2_score(y_test_02, train_pred)

0.9095550600904052

Вывод:
Наилучшая точность прогноза на тесте получилась для модели log_tv + radio при разделении датасета на 80 и 20 - 93.31%.
Интересно, что для моделей TV + radio и TV + radio + newspaper точность при разделении модели на 70 и 30 получается хуже, чем при разделении 50:50 и 80:20