In [None]:
import os
import json

import numpy as np
import seaborn as sns
import pandas as pd

from matplotlib import pyplot as plt

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

Загрузка данных происходит ниже. Если она не срабатывает, самостоятельно скачайте файлы `hw_final_open_data.npy`, `hw_final_open_target.npy` и `hw_final_closed_data.npy` и положите их в ту же директорию, что и ноутбук.

In [None]:
# Open part of the data set: feature matrix and target vector.
assert os.path.exists('hw_final_open_data.npy'), (
    'Please, download `hw_final_open_data.npy` and place it in the working directory'
)
assert os.path.exists('hw_final_open_target.npy'), (
    'Please, download `hw_final_open_target.npy` and place it in the working directory'
)
data = np.load('hw_final_open_data.npy', allow_pickle=False)
target = np.load('hw_final_open_target.npy', allow_pickle=False)

# Closed part: features only; predictions for it are produced further below.
assert os.path.exists('hw_final_closed_data.npy'), (
    'Please, download `hw_final_closed_data.npy` and place it in the working directory'
)
closed_data = np.load('hw_final_closed_data.npy', allow_pickle=False)
# `test_x` is re-bound by later cells; `closed_data` keeps the untouched array.
test_x = closed_data

Разбивка на `train` и `val`, аналогичная разбивке на `train` и `test`

In [None]:
from sklearn.decomposition import PCA

# Order the samples along the first principal component; the ordering is
# computed once and applied to both the features and the target.
tmp_data = PCA(1).fit_transform(data)
sort_order = np.argsort(tmp_data, axis=0).flatten()
data_sorted = data[sort_order]
tmp_target = target[sort_order]

# `f` fixes the two split sizes: the tail of the ordering (a fraction of
# about `f`) is held out as a block, and `inter_size` is the fraction that
# train_test_split additionally takes from the remaining head.
f = (9 - np.sqrt(65)) / 8
over_size = int((1 - f) * tmp_data.shape[0])
inter_size = f / (1 - f)

# Head of the ordering -> provisional train, tail -> first validation part.
train_x_tmp, valid_x_tmp_1 = data_sorted[:over_size], data_sorted[over_size:]
train_y_tmp, valid_y_tmp_1 = tmp_target[:over_size], tmp_target[over_size:]

# Random part of the validation set, drawn from inside the head.
train_x, valid_x_tmp_2, train_y, valid_y_tmp_2 = train_test_split(
    train_x_tmp,
    train_y_tmp,
    test_size=inter_size,
    random_state=42,
)

# Validation = held-out tail + random subsample of the head.
valid_x = np.vstack([valid_x_tmp_1, valid_x_tmp_2])
valid_y = np.hstack([valid_y_tmp_1, valid_y_tmp_2])

### Feature selection

In [None]:
# The same selection can be obtained from plain correlations.
from sklearn.feature_selection import SelectKBest

# NOTE(review): SelectKBest is used with its default score function, which
# scikit-learn documents as `f_classif` (a classification score) — confirm
# this is intended for a regression target before relying on the selection.
select_k = SelectKBest(k=3).fit(train_x, train_y)

# Apply the fitted selector to every split so the columns stay aligned.
train_x = select_k.transform(train_x)
valid_x = select_k.transform(valid_x)
test_x = select_k.transform(test_x)

Посмотрим на оставшиеся признаки

In [None]:
# Pairwise scatter plots of the columns of `data` kept by `select_k`.
selected_columns = pd.DataFrame(data).loc[:, select_k.get_support()]
sns.pairplot(selected_columns)

Как будто бы признак 6 содержит только константные значения.

In [None]:
# Standard deviation of the last selected column of the training split.
train_x[:, -1].std()

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Drop columns whose variance on the training split does not exceed 1e-10.
select_std = VarianceThreshold(threshold=1e-10).fit(train_x)

train_x = select_std.transform(train_x)
valid_x = select_std.transform(valid_x)
test_x = select_std.transform(test_x)

А из оставшихся двух второй это первый в квадрате

In [None]:
# Correlation between the squared first column and the second column.
np.corrcoef(np.square(train_x[:, 0]), train_x[:, 1])

In [None]:
# Keep only the first remaining column; from here on every split is 1-D.
train_x = train_x[:, 0].ravel()
valid_x = valid_x[:, 0].ravel()
test_x = test_x[:, 0].ravel()

### Checking if our `train-val` split is similar to `train-test`

In [None]:
fig, ax = plt.subplots(2)

# Top panel: the whole open set (train + valid) against the closed set.
open_x = np.hstack([train_x, valid_x])
ax[0].scatter(open_x, np.ones_like(open_x), c="k", label="train", zorder=1, s=5)
ax[0].scatter(test_x, np.ones_like(test_x), c="r", label="test", zorder=2, s=5)
ax[0].set_title("open data vs. closed data")

# Bottom panel: our own train / validation split of the open set.
ax[1].scatter(train_x, np.ones_like(train_x), c="k", label="train", zorder=1, s=5)
ax[1].scatter(valid_x, np.ones_like(valid_x), c="r", label="valid", zorder=2, s=5)
ax[1].set_title("train vs. validation")

# plt.legend() only annotates the current (last) axes, which left the top
# panel without a legend; give every panel its own.
for panel in ax:
    panel.legend()

fig.tight_layout();

### Function fitting

In [None]:
from scipy.optimize import curve_fit

def func(x, a, b, c, d):
    """Shifted and scaled softplus: ``a + b * log(1 + exp(c * (x - d)))``.

    Args:
        x: scalar or array of inputs.
        a: vertical offset.
        b: scale of the softplus term.
        c: slope inside the exponent.
        d: horizontal shift.

    Returns:
        Value(s) of the curve at ``x``, with the same shape as ``x``.
    """
    # np.logaddexp(0, z) equals log(1 + exp(z)) but stays finite when z is
    # large, where exp(z) alone would overflow to inf.
    return a + b * np.logaddexp(0, c * (x - d))

In [None]:
# First fit of the curve parameters on the raw training split.
popt, pcov = curve_fit(func, xdata=train_x, ydata=train_y)

### Outlier removal

In [None]:
y_pred = func(train_x, *popt)

# Absolute residuals of the current fit and their mean / spread.
abs_residual = np.abs(y_pred - train_y)
m = abs_residual.mean()
s = abs_residual.std()

# True for points whose absolute residual is below mean + 3 * std.
mask = abs_residual < m + 3 * s

In [None]:
# Keep only the points selected by `mask`. The cell overwrites its own
# inputs, so it must be run exactly once after the mask is computed.
train_x = train_x[mask]
train_y = train_y[mask]

### Refitting after outlier removal

In [None]:
# Refit the curve on the training split with the outliers removed.
popt, pcov = curve_fit(func, xdata=train_x, ydata=train_y)

### Performance evaluation

In [None]:
# Predictions are rounded to 2 decimals, the precision used for the
# submission. The targets were previously rounded with np.round(y), i.e. to
# 0 decimals, so the error was measured against integer-rounded targets.
# Both sides are now rounded to the same 2 decimals.
train_pred = np.round(func(train_x, *popt), 2)
valid_pred = np.round(func(valid_x, *popt), 2)
train_true = np.round(train_y, 2)
valid_true = np.round(valid_y, 2)

# Reported as: mean squared error ± standard deviation of the squared errors.
print(
    'train mse = %.3f ± %.3f' % (mean_squared_error(train_true, train_pred),
                                   np.std((train_pred - train_true)**2)),
    'validation mse =  %.3f ± %.3f' % (mean_squared_error(valid_true, valid_pred),
                                         np.std((valid_pred - valid_true)**2)),
    sep='\n'
)

### Predicting closed dataset

In [None]:
# Fold the validation split back into the training arrays for the final fit.
# Re-running this cell would append the validation points again.
train_x = np.concatenate([train_x, valid_x])
train_y = np.concatenate([train_y, valid_y])

popt, pcov = curve_fit(func, xdata=train_x, ydata=train_y)

In [None]:
y_pred = func(train_x, *popt)

# Same rule as before: keep points whose absolute residual is below
# mean + 3 * std of the absolute residuals.
abs_residual = np.abs(y_pred - train_y)
m = abs_residual.mean()
s = abs_residual.std()
mask = abs_residual < m + 3 * s

train_x = train_x[mask]
train_y = train_y[mask]

In [None]:
# Final curve parameters, fitted on train + validation without outliers.
popt, pcov = curve_fit(func, xdata=train_x, ydata=train_y)

In [None]:
# Closed-set predictions, rounded to 2 decimals.
raw_predictions = func(test_x, *popt)
predicted_values = np.round(raw_predictions, 2)

# predictions should be just one-dimensional array
expected_shape = (closed_data.shape[0],)
assert predicted_values.shape == expected_shape

In [None]:
# Writes `predicted_values` to `submission_dict_final_p01.json` as a single
# comma-separated string stored under the key 'predictions'; every value is
# rounded to 2 decimals before being converted to text.
# NOTE(review): the final message names a `.npy` file while the file actually
# written is `.json`; left as is because the block is marked as not to be changed.
# do not change the code in the block below
# __________start of block__________
def float_list_to_comma_separated_str(_list):
    _list = list(np.round(np.array(_list), 2))
    return ','.join([str(x) for x in _list])

submission_dict = {
    'predictions': float_list_to_comma_separated_str(predicted_values)
}
with open('submission_dict_final_p01.json', 'w') as iofile:
    json.dump(submission_dict, iofile)
    
print('File saved to `submission_dict_final_p01.npy`')
# __________end of block__________

### Feature generation

In [None]:
def my_transformation(feature_matrix: np.ndarray):
    """Turn column 4 of ``feature_matrix`` into one softplus-shaped feature.

    Evaluates ``a + b * log(1 + exp(c * (x - d)))`` with hard-coded constants
    on ``x = feature_matrix[:, 4]``.

    Args:
        feature_matrix: 2-D array with at least 5 columns.

    Returns:
        Array of shape ``(n_samples, 1)`` holding the generated feature.
    """
    a, b = -0.6647707911870174, 9.552041008683448
    c, d = 43.97277925736383, -0.4847561801397369
    # np.logaddexp(0, z) equals log(1 + exp(z)) but does not overflow; with a
    # slope of ~44 the plain exp() becomes inf already for moderately large x.
    pivot_feature = a + b * np.logaddexp(0, c * (feature_matrix[:, 4] - d))
    return pivot_feature.reshape(-1, 1)

In [None]:
# NOTE(review): `train_x` was reduced to a single 1-D column earlier in the
# notebook (and later extended with hstack), whereas `my_transformation`
# indexes `feature_matrix[:, 4]`, which needs a 2-D matrix with at least 5
# columns. On a fresh top-to-bottom run this call raises IndexError — confirm
# which full feature matrix is meant to be passed here.
transformed_train_x = my_transformation(train_x)

In [None]:
# Linear model on top of the single generated feature.
lr = Ridge().fit(transformed_train_x, train_y)

# NOTE(review): `valid_x` is a 1-D array at this point, while
# `my_transformation` indexes column 4 of a 2-D matrix — confirm the input.
# NOTE(review): the validation points were appended to the training arrays
# before the final fit, so the "validation" number below is presumably not a
# held-out estimate — confirm.
train_line = f'train mse =\t {mean_squared_error(lr.predict(transformed_train_x), train_y):.5f}'
valid_line = f'validation mse = {mean_squared_error(lr.predict(my_transformation(valid_x)), valid_y):.5f}'

print(train_line, valid_line, sep='\n')

Обращаем ваше внимание, что параметры линейной модели будут округляться до __четырех знаков после запятой__. Это не должно сильно повлиять на качество предсказаний:

In [None]:
original_predictions = lr.predict(transformed_train_x)

# Rebuild the predictions by hand from parameters rounded to 4 decimals.
rounded_w = np.round(lr.coef_, 4)
rounded_b = np.round(lr.intercept_, 4)
rounded_predictions = transformed_train_x @ rounded_w + rounded_b

# Rounding the parameters must not move any prediction by more than ~1e-3.
assert np.allclose(original_predictions, rounded_predictions, atol=1e-3)

Параметры вашей модели:

In [None]:
# Model parameters rounded to 4 decimals, as they are reported for submission.
rounded_w = np.round(lr.coef_, 4)
rounded_b = np.round(lr.intercept_, 4)

w_list = list(rounded_w)
print(f'w = {w_list}\nb = {rounded_b}')

Напоминаем, ваша модель не должна использовать более 15 параметров (14 весов плюс свободный член).

In [None]:
# The weights plus the intercept must not exceed 15 parameters in total.
n_params = len(w_list) + 1
assert n_params <= 15

##### Сдача второй части соревнования
Для сдачи вам достаточно отправить функцию `my_transformation` и параметры вашей модели в контест в задачу №2. Пример посылки доступен ниже

In [None]:
# __________example_submission_start__________
import numpy as np
def my_transformation(feature_matrix: np.ndarray):
    """Turn column 4 of ``feature_matrix`` into one softplus-shaped feature.

    Evaluates ``a + b * log(1 + exp(c * (x - d)))`` with hard-coded constants
    on ``x = feature_matrix[:, 4]``.

    Args:
        feature_matrix: 2-D array with at least 5 columns.

    Returns:
        Array of shape ``(n_samples, 1)`` holding the generated feature.
    """
    a, b = -0.6647707911870174, 9.552041008683448
    c, d = 43.97277925736383, -0.4847561801397369
    # np.logaddexp(0, z) equals log(1 + exp(z)) but does not overflow; with a
    # slope of ~44 the plain exp() becomes inf already for moderately large x.
    pivot_feature = a + b * np.logaddexp(0, c * (feature_matrix[:, 4] - d))
    return pivot_feature.reshape(-1, 1)

# Weight and intercept of the linear model applied to the generated feature.
w_submission = [1.0012]
b_submission = -0.0063
# __________example_submission_end__________