https://bit.ly/FEML-5-PROB

# Влияние предобработки данных на модель

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Build a synthetic sample: three integer features plus a deterministic target
n_samples = 1000

# Fix the global NumPy seed so every run draws exactly the same values
np.random.seed(9)

# np.random.choice(k, n) draws n integers from range(k); the offset shifts the range
age_owner = 21 + np.random.choice(90, n_samples)   # values in 21..110
length = 15 + np.random.choice(120, n_samples)     # values in 15..134
width = 10 + np.random.choice(80, n_samples)       # values in 10..89

# The target is computed from length and width only; age_owner does not enter the formula
price = 126 + 100 * (length * width)

data = pd.DataFrame(dict(
    age_owner=age_owner,
    length=length,
    width=width,
    price=price,
))
data.head()

Unnamed: 0,age_owner,length,width,price
0,75,57,70,399126
1,77,39,60,234126
2,43,30,36,108126
3,86,30,68,204126
4,43,41,52,213326


In [None]:
# Summarise the frame: index range, column dtypes, non-null counts and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   age_owner  1000 non-null   int64
 1   length     1000 non-null   int64
 2   width      1000 non-null   int64
 3   price      1000 non-null   int64
dtypes: int64(4)
memory usage: 31.4 KB


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every column
data.describe()

Unnamed: 0,age_owner,length,width,price
count,1000.0,1000.0,1000.0,1000.0
mean,65.461,74.375,50.207,370986.8
std,25.728981,33.975708,22.682342,248021.6
min,21.0,15.0,10.0,22126.0
25%,43.0,45.0,30.0,168101.0
50%,65.0,73.0,51.0,310626.0
75%,88.0,103.0,69.0,522801.0
max,110.0,134.0,89.0,1139126.0


In [None]:
# Frequency of each distinct `length` value, most frequent first
data['length'].value_counts()

27     15
44     15
55     15
129    14
39     13
       ..
116     4
114     4
18      4
109     4
62      4
Name: length, Length: 120, dtype: int64

In [None]:
# Baseline: fit a linear model on all three raw features
X = data[['age_owner', 'length', 'width']]
y = data['price']

reg = LinearRegression().fit(X, y)
print(f'Weights: {reg.coef_}')
print(f'Bias: {reg.intercept_}')

# Score on the same rows the model was fitted on (no hold-out split in this notebook).
# mean_absolute_error is documented as (y_true, y_pred); MAE is symmetric, so the
# printed value is identical, but the order matters as soon as the metric is swapped.
pred_values = reg.predict(X)
print(f'Error: {mean_absolute_error(y, pred_values)}')

Weights: [-149.11046279 5071.76422338 7366.90372375]
Bias: -366334.87936744984
Error: 56547.371635505435


In [None]:
# Median of the target. NOTE(review): presumably shown to put the MAE printed above in context — confirm
y.median()

310626.0

In [None]:
# Refit without age_owner: only the two features the price formula uses
X = data[['length', 'width']]
y = data['price']

reg = LinearRegression().fit(X, y)
print(f'Weights: {reg.coef_}')
print(f'Bias: {reg.intercept_}')

# Training-set MAE, arguments in sklearn's documented (y_true, y_pred) order.
# Predict on X directly instead of re-selecting the same columns from data.
pred_values = reg.predict(X)
print(f'Error: {mean_absolute_error(y, pred_values)}')

Weights: [5073.23397859 7365.79408501]
Bias: -376149.40078368207
Error: 56518.60518237617


In [None]:
# Create a new feature: the element-wise product of length and width
data['mult'] = data['length'].mul(data['width'])
data.head()

Unnamed: 0,age_owner,length,width,price,mult
0,75,57,70,399126,3990
1,77,39,60,234126,2340
2,43,30,36,108126,1080
3,86,30,68,204126,2040
4,43,41,52,213326,2132


In [None]:
# Refit using only the engineered product feature
X = data[['mult']]
y = data['price']

reg = LinearRegression().fit(X, y)
print(f'Weights: {reg.coef_}')
print(f'Bias: {reg.intercept_}')

# Training-set MAE, arguments in sklearn's documented (y_true, y_pred) order
pred_values = reg.predict(X)
print(f'Error: {mean_absolute_error(y, pred_values)}')

Weights: [100.]
Bias: 126.0
Error: 5.82549546379596e-11


In [None]:
# Generate every polynomial term up to degree 2 from the three raw features.
# Defaults are left unchanged: include_bias=True adds the constant column '1',
# and interaction_only=False keeps the squared terms next to the pairwise products.
# Pass include_bias=False and/or interaction_only=True to drop them.
poly = PolynomialFeatures(degree=2)
X = poly.fit_transform(data[['age_owner', 'length', 'width']])
y = data['price']

# Names of the generated columns, in the same order as the columns of X
poly.get_feature_names_out()

array(['1', 'age_owner', 'length', 'width', 'age_owner^2',
       'age_owner length', 'age_owner width', 'length^2', 'length width',
       'width^2'], dtype=object)

In [None]:
# Fit a linear model on the expanded polynomial feature matrix
reg_poly = LinearRegression().fit(X, y)
print(f'Weights: {reg_poly.coef_}')
print(f'Bias: {reg_poly.intercept_}')

# Training-set MAE, arguments in sklearn's documented (y_true, y_pred) order
pred_values = reg_poly.predict(X)
print(f'Error: {mean_absolute_error(y, pred_values)}')

Weights: [ 0.00000000e+00  1.08624221e-12  2.17883350e-12  2.16048013e-12
 -1.93178806e-14  6.66133815e-15  3.28626015e-14  1.24344979e-14
  1.00000000e+02  1.03472786e-13]
Bias: 125.9999999992433
Error: 2.777560439426452e-10


# **Дополнительные материалы**
1. Типы данных https://youtu.be/c4Cg3TUIH0E 
2. Метрика ROC-AUC https://youtu.be/v6PPG8J_Egs
3. Метрика F1-score https://youtu.be/PeE3Fkt5W3Q
4. Масштабирование данных (StandardScaler, MinMaxScaler) https://youtu.be/XsuCOfpf8Ic