# Day 08. Exercise 04
# Regression

## 0. Imports

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict
from sklearn import linear_model, metrics
from matplotlib import pyplot as plt
from collections import Counter

## 1. Preprocessing

1. Read the file [`checker_regression.csv`](https://drive.google.com/file/d/1L8auBzJEghWFewznhhFpcrqxW4GqcfRY/view?usp=sharing).
2. Fill the missing values in it with `0`.
3. Split your dataset into train and test sets with the parameter `test_size=0.2`.

In [None]:
# Read the checker dataset through an explicitly managed file handle, then
# print the column/dtype/non-null summary to see what needs cleaning.
with open('../data/checker_regression.csv', mode='r') as csv_file:
    df = pd.read_csv(csv_file, delimiter=',')
df.info()

Заполним недостающие значения нулями.

In [None]:
# Replace every missing value with 0, modifying df in place.
df.fillna(0, inplace=True)
df.info()
# Empty line to visually separate the two info() reports in the cell output.
print()
# NOTE(review): drop_duplicates() returns a new DataFrame and the result is
# discarded here, so df is left unchanged by this line. Assigning it back
# could change the row count and index labels that later cells may rely on —
# confirm before changing.
df.drop_duplicates()
df.info()

Разделим данные на train и test с параметрами test_size=0.2

In [None]:
# Encode each user id as an integer code, numbered in order of first
# appearance (the same numbering the earlier Counter-based loop produced).
uid = {user: code for code, user in enumerate(df['uid'].unique())}
# Map the whole column at once instead of looking up a hard-coded 77 rows by
# label, so the cell works for any number of rows and any index labels.
df["uid_num"] = df['uid'].map(uid)

# Features: every column except the raw uid column and the target itself.
X = df.drop(['uid', 'AVG(diff)'], axis=1).values
Y = df['AVG(diff)']

# StandardScaler only learns per-column statistics from X, so the target is
# not passed to it; fit_transform is fit() followed by transform() on X.
scaller = StandardScaler()
X_scaled = scaller.fit_transform(X)

In [None]:
# Hold out 20% of the scaled samples for the final evaluation; the fixed seed
# keeps the split reproducible between runs.
train_data, test_data, train_labels, test_labels = train_test_split(
    X_scaled,
    Y,
    test_size=0.2,
    random_state=21,
)

## 2. Crossvalidation

1. Write a function `crossval` that takes as arguments: `n_splits` for `KFold()`, `X`, `y`, model instantiated class with the parameters of the model (keep in mind: `random_state=21`) and returns for a given model class a result like this:

```
train -  2696.4496895424836   |   test -  1589.9979527104958
train -  2660.957874001452   |   test -  2075.102636027137
train -  2847.315529246795   |   test -  320.911928168403
train -  2500.7691099659237   |   test -  4132.461382030178
train -  2643.927917295123   |   test -  2237.8140952197878
train -  2396.295678819444   |   test -  4509.650064742476
train -  2003.402267924976   |   test -  8403.491474908551
train -  2531.876094212613   |   test -  3135.944102735099
train -  2683.1795186023123   |   test -  1796.01426292594
train -  2537.1192483996338   |   test -  3439.29824116941
Average RMSE on crossval is 3164.0686140637476
```

2. Run the function for `LinearRegression`, `DecisionTreeRegressor`, and `RandomForestRegressor`. You may choose the parameters yourself, but find ones that are good enough for you.

In [None]:
def crossval(n_splits, X, y, model):
    """Run K-fold cross-validation and print train/test RMSE for every fold.

    The data is cut into ``n_splits`` consecutive folds with ``KFold``. For
    each fold the model is refitted on the remaining folds, and the RMSE is
    reported both on the data it was fitted on ("train") and on the held-out
    fold ("test"). The last line printed is the mean of the per-fold test
    RMSE values.

    Args:
        n_splits: Number of folds passed to ``KFold``.
        X: Feature matrix, array-like with one row per sample.
        y: Target values, array-like with one entry per sample.
        model: Estimator instance exposing ``fit`` and ``predict``. It is
            refitted for every fold, so after the call it holds the fit from
            the last fold.

    Returns:
        None. The results are printed.

    Note:
        Pass only the training portion of the data if a separate hold-out
        set must stay untouched by model selection.
    """
    # Function-scope import: numpy is not imported by the notebook's import cell.
    import numpy as np

    # Positional arrays, so fold indices work for both ndarrays and pandas
    # objects regardless of their index labels.
    features = np.asarray(X)
    targets = np.asarray(y)

    def rmse(actual, predicted):
        # Root of the MSE; computed by hand so it does not depend on which
        # RMSE helper the installed scikit-learn version provides.
        return metrics.mean_squared_error(actual, predicted) ** 0.5

    test_errors = []
    for train_idx, test_idx in KFold(n_splits=n_splits).split(features):
        model.fit(features[train_idx], targets[train_idx])
        train_rmse = rmse(targets[train_idx], model.predict(features[train_idx]))
        test_rmse = rmse(targets[test_idx], model.predict(features[test_idx]))
        test_errors.append(test_rmse)
        print(f'train - {train_rmse} | test - {test_rmse}')
    print(f'Average RMSE on crossval is {sum(test_errors) / len(test_errors)}')

In [None]:
# Linear baseline, cross-validated with 6 folds.
regr = linear_model.LinearRegression()
crossval(n_splits=6, X=X_scaled, y=Y, model=regr)

Запустим функцию для LinearRegression, DecisionTreeRegressor, RandomForestRegressor.

In [None]:
# Single decision tree with a fixed seed, cross-validated with 6 folds.
treeRegr = DecisionTreeRegressor(random_state=21)
crossval(n_splits=6, X=X_scaled, y=Y, model=treeRegr)

In [None]:
# Random forest with a fixed seed, cross-validated with 6 folds.
rFrstRegr = RandomForestRegressor(random_state=21)
crossval(n_splits=6, X=X_scaled, y=Y, model=rFrstRegr)

## 3. Predictions and evaluation

1. Make predictions for the test dataset using each of the three models with the finalized parameters.
2. Draw a plot for each of the models where the `x-axis` is the actual average difference and the `y-axis` is the prediction made by a model.
3. What would the plot look like in the ideal case? Put the answer in a markdown cell at the end of the section.

Сделаем прогнозы для тестового набора данных, используя каждую из трех моделей с окончательными параметрами.

In [None]:
# Fit the linear model on the training split and predict the held-out test
# split with that fitted model. (cross_val_predict would refit clones on folds
# of the test set and never use the model trained here.)
ft_m = regr.fit(train_data, train_labels)
predict = ft_m.predict(test_data)

# Keep actual and predicted values side by side for inspection.
picasso = pd.DataFrame({"test_labels": test_labels, "predict": predict})

# x-axis: actual average difference, y-axis: the model's prediction.
# cmap is dropped: without a `c` argument matplotlib ignores it and warns.
plt.scatter(test_labels, predict)
plt.xlabel('x')
plt.ylabel('y')
plt.show()

In [None]:
# Fit the decision tree on the training split and predict the held-out test
# split with that fitted model. (cross_val_predict would refit clones on folds
# of the test set and never use the model trained here.)
ft_m = treeRegr.fit(train_data, train_labels)
predict = ft_m.predict(test_data)

# Keep actual and predicted values side by side for inspection.
picasso_tree = pd.DataFrame({"test_labels": test_labels, "predict": predict})

# x-axis: actual average difference, y-axis: the model's prediction.
# cmap is dropped: without a `c` argument matplotlib ignores it and warns.
plt.scatter(test_labels, predict)
plt.xlabel('x')
plt.ylabel('y')
plt.show()

In [None]:
# Fit the random forest on the training split and predict the held-out test
# split with that fitted model. (cross_val_predict would refit clones on folds
# of the test set and never use the model trained here.)
ft_m = rFrstRegr.fit(train_data, train_labels)
predict = ft_m.predict(test_data)

# Keep actual and predicted values side by side for inspection.
picasso_rndFrst = pd.DataFrame({"test_labels": test_labels, "predict": predict})

# x-axis: actual average difference, y-axis: the model's prediction.
# cmap is dropped: without a `c` argument matplotlib ignores it and warns.
plt.scatter(test_labels, predict)
plt.xlabel('x')
plt.ylabel('y')
plt.show()

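В идеальном случае все точки лежат на прямой y = x — биссектрисе первой четверти, проходящей под углом 45° к оси абсцисс: предсказание модели совпадает с фактическим значением.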