In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler, RobustScaler, LabelEncoder, PolynomialFeatures, KBinsDiscretizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold 

In [None]:
# Load the diamonds table; the first CSV column becomes the row index.
data_diamonds = pd.read_csv(
    "data/diamonds.csv",
    index_col=0,
)

In [None]:
# Apply seaborn's default theme, then draw pairwise regression plots for every
# column except the last three (KDE on the diagonal).
sns.set()
pairplot_style = {
    'line_kws': {'color': 'red'},
    'scatter_kws': {'alpha': 0.1},
}
sns.pairplot(
    data_diamonds.iloc[:, :-3],
    kind='reg',
    diag_kind="kde",
    plot_kws=pairplot_style,
);

In [None]:
# One 50-bin histogram per selected column position, stacked vertically.
hist_positions = [0, 4, 5, 6, 7, 8, 9]
fig, axes = plt.subplots(7, 1, figsize=(5, 10))
for panel, col_pos in zip(axes.ravel(), hist_positions):
    values = data_diamonds.iloc[:, col_pos]
    _, edges = np.histogram(values, bins=50)
    panel.hist(values, bins=edges, alpha=.5)
    panel.set_title(data_diamonds.columns.values[col_pos])
    panel.set_yticks(())  # hide the count axis
fig.tight_layout()

##### Удалим выбросы и нули по x, y, z

In [None]:
# Keep only the rows whose y value does not exceed 10.
y_in_range = data_diamonds['y'].le(10)
data_wo_out = data_diamonds[y_in_range]

In [None]:
# Keep only the rows whose z value does not exceed 6.
z_in_range = data_wo_out['z'].le(6)
data_wo_out = data_wo_out[z_in_range]

In [None]:
# Drop rows where any of x, y, z equals zero. The explicit .copy() detaches the
# result from the frame it was sliced from, so the column assignments made on
# data_wo_out further down do not trigger pandas' SettingWithCopyWarning.
has_all_dims = (data_wo_out[['x', 'y', 'z']] != 0).all(axis=1)
data_wo_out = data_wo_out[has_all_dims].copy()

In [None]:
# 50-bin histograms of the columns at positions 7, 8 and 9 after filtering.
dim_positions = [7, 8, 9]
fig, axes = plt.subplots(3, 1, figsize=(10, 5))
for panel, col_pos in zip(axes.ravel(), dim_positions):
    values = data_wo_out.iloc[:, col_pos]
    _, edges = np.histogram(values, bins=50)
    panel.hist(values, bins=edges, alpha=.5)
    panel.set_title(data_wo_out.columns.values[col_pos])
    panel.set_yticks(())  # hide the count axis
fig.tight_layout()

##### Создадим фичи объем и плотность алмаза

In [None]:
# Volume feature: product of the three dimensions x, y and z.
data_wo_out['volume'] = data_wo_out['x'].mul(data_wo_out['y']).mul(data_wo_out['z'])

In [None]:
# Density feature: carat divided by the computed volume.
data_wo_out['density'] = data_wo_out['carat'].div(data_wo_out['volume'])

In [None]:
# 50-bin histograms of the last two columns (the ones just added).
tail_positions = [-2, -1]
fig, axes = plt.subplots(2, 1, figsize=(10, 5))
for panel, col_pos in zip(axes.ravel(), tail_positions):
    values = data_wo_out.iloc[:, col_pos]
    _, edges = np.histogram(values, bins=50)
    panel.hist(values, bins=edges, alpha=.5)
    panel.set_title(data_wo_out.columns.values[col_pos])
    panel.set_yticks(())  # hide the count axis
fig.tight_layout()

##### Пометим алмазы с сомнительной плотностью

In [None]:
# Flag rows whose density lies below 0.0055 or above 0.007.
density = data_wo_out['density']
data_wo_out['bad'] = density.lt(0.0055) | density.gt(0.007)

In [None]:
# Same histograms as before, restricted to rows that are not flagged as bad.
# Positions -3 and -2 are used because 'bad' is now the last column.
not_bad = data_wo_out[data_wo_out['bad'] == False]
fig, axes = plt.subplots(2, 1, figsize=(10, 5))
for panel, col_pos in zip(axes.ravel(), [-3, -2]):
    values = not_bad.iloc[:, col_pos]
    _, edges = np.histogram(values, bins=50)
    panel.hist(values, bins=edges, alpha=.5)
    panel.set_title(not_bad.columns.values[col_pos])
    panel.set_yticks(())  # hide the count axis
fig.tight_layout()

In [None]:
# Store the flag as a 0/1 integer column. A direct cast is used instead of
# pd.get_dummies(..., drop_first=True): that call returns a DataFrame (not a
# Series), produces no column at all when the flag holds a single distinct
# value, and its dtype depends on the pandas version.
data_wo_out['bad'] = data_wo_out['bad'].astype(int)

#### [Создадим факторные переменные  ](https://ggplot2.tidyverse.org/reference/diamonds.html)

##### Качество огранки (Fair, Good, Very Good, Premium, Ideal)

In [None]:
# Distinct values present in the 'cut' column.
pd.unique(data_wo_out['cut'])

In [None]:
# Ordinal encoding of 'cut': position in the list is the code (0..4).
cut_grades = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
cut_codes = {grade: code for code, grade in enumerate(cut_grades)}
data_wo_out['cut'] = data_wo_out['cut'].replace(cut_codes)

In [None]:
# Price distribution for each encoded cut grade.
sns.boxplot(
    data=data_wo_out,
    x='cut',
    y='price',
    palette='rainbow',
)

##### Прозрачность (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))
    

In [None]:
# Distinct values present in the 'clarity' column.
pd.unique(data_wo_out['clarity'])

In [None]:
# Ordinal encoding of 'clarity': position in the list is the code (0..7).
clarity_grades = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
clarity_codes = {grade: code for code, grade in enumerate(clarity_grades)}
data_wo_out['clarity'] = data_wo_out['clarity'].replace(clarity_codes)

In [None]:
# Price distribution for each encoded clarity grade.
sns.boxplot(
    data=data_wo_out,
    x='clarity',
    y='price',
    palette='rainbow',
)

#####  Цвет (J (worst) to D (best))

In [None]:
# Distinct values present in the 'color' column.
pd.unique(data_wo_out['color'])

##### [Дамми по группам цвета](https://awesomegems.com/diamondfacts.html)

- D-F - the finest & brightest colorless diamonds, usually for the discriminating customer who can afford the most beautiful and the rare.
- G-H - very white & bright face-up, near-colorless diamonds that are not as expensive; when mounted in jewelry, it is difficult to see the difference between these and the higher color grades.
- I-J - not quite as bright, or with a very slight, hardly noticeable tint.


In [None]:
# Collapse the color letters into three groups; the position of a group in the
# list is its code (I-J -> 0, G-H -> 1, D-F -> 2).
color_groups = [['I', 'J'], ['G', 'H'], ['D', 'E', 'F']]
color_group_codes = {
    letter: code
    for code, letters in enumerate(color_groups)
    for letter in letters
}
data_wo_out['color_group'] = data_wo_out['color'].replace(color_group_codes)

In [None]:
# Price distribution for each color group.
sns.boxplot(
    data=data_wo_out,
    x='color_group',
    y='price',
    palette='rainbow',
)

In [None]:
# Ordinal encoding of 'color': position in the list is the code (J -> 0 ... D -> 6).
color_letters = ['J', 'I', 'H', 'G', 'F', 'E', 'D']
color_codes = {letter: code for code, letter in enumerate(color_letters)}
data_wo_out['color'] = data_wo_out['color'].replace(color_codes)

In [None]:
# Price distribution for each encoded color grade.
sns.boxplot(
    data=data_wo_out,
    x='color',
    y='price',
    palette='rainbow',
)

In [None]:
# Dummy columns for the color groups (first level dropped). The prefix gives
# the new columns string names ('color_group_1', 'color_group_2') instead of
# the bare integers 1 and 2: a frame mixing integer and string column names is
# rejected by recent scikit-learn estimators, and integer names are easy to
# confuse with positions.
dumdum = pd.get_dummies(
    data_wo_out['color_group'],
    prefix='color_group',
    drop_first=True,
)

In [None]:
# Append the dummy columns to the right of the existing frame.
data_wo_out = pd.concat(
    [data_wo_out, dumdum],
    axis='columns',
)

#####  Создадим бины по весу

In [None]:
# Work on an independent deep copy from here on.
data_ext = data_wo_out.copy(deep=True)

In [None]:
# Preview the carat values as a single-row 2-D array.
np.atleast_2d(np.array(data_ext['carat']))

In [None]:
# Discretize carat into 20 ordinal bins whose edges come from k-means.
kb_ord = KBinsDiscretizer(encode='ordinal', strategy='kmeans', n_bins=20)
carat_frame = data_ext[['carat']]
data_ord = kb_ord.fit(carat_frame).transform(carat_frame)
data_ord[:5]

In [None]:
# Number of rows that fell into each carat bin.
pd.Series(data_ord.ravel()).value_counts()

In [None]:
# Attach the bin index as a new column (data_ord has a single column).
data_ext['carat_ord'] = data_ord[:, 0]

In [None]:
# Price distribution for each carat bin.
sns.boxplot(
    data=data_ext,
    x='carat_ord',
    y='price',
    palette='rainbow',
)

##### Комбинация качеств

In [None]:
# Composite quality scores built from the ordinal cut / color / clarity codes.
# NOTE(review): 'score' is written to data_wo_out after data_ext was copied
# from it, so it does not appear in data_ext — confirm whether that is intended.
data_wo_out['score'] = 2*data_wo_out['cut'] + data_wo_out['color'] + data_wo_out['clarity']

# score_2 / score_3 read every input from data_ext itself, so the result does
# not rely on index alignment with a second frame.
quality = 2*data_ext['cut'] + data_ext['color'] + data_ext['clarity']
data_ext['score_2'] = quality + data_ext['carat_ord']/3
data_ext['score_3'] = quality*data_ext['carat_ord']

In [None]:
# Row and column counts of the extended frame.
n_rows, n_cols = data_ext.shape
(n_rows, n_cols)

In [None]:
# Correlation matrix of all columns, drawn as a square-celled heatmap.
plt.figure(figsize=(20, 20))
corr_matrix = data_ext.corr()
sns.heatmap(corr_matrix, square=True, cmap='RdYlGn');

#####  Создадим полиномиальные фичи

In [None]:
# Columns used for the log and polynomial expansions below.
num_feat = [
    'carat',
    'volume',
    'density',
    'depth',
    'table',
]

In [None]:
# Add a natural-log version of every column in num_feat, named '<column>_log'.
log_columns = {col + '_log': np.log(data_ext[col]) for col in num_feat}
data_ext = data_ext.assign(**log_columns)

In [None]:
# Polynomial expansion (up to degree 3, no bias column) of the num_feat columns.
poly = PolynomialFeatures(degree=3, include_bias=False)
data_poly = pd.DataFrame(
    poly.fit_transform(data_ext[num_feat]),
    index=data_ext.index,  # keep row alignment with data_ext for the later concat
)
# get_feature_names() no longer exists in current scikit-learn; prefer
# get_feature_names_out() and fall back for older installations. Passing
# num_feat yields readable names such as 'carat^2' on both code paths.
if hasattr(poly, 'get_feature_names_out'):
    data_poly.columns = poly.get_feature_names_out(num_feat)
else:
    data_poly.columns = poly.get_feature_names(num_feat)

In [None]:
# Skip the first 5 expansion columns and append the rest to data_ext.
higher_order = data_poly.iloc[:, 5:]
data_ext = pd.concat([data_ext, higher_order], axis='columns')

#### Выбираем линейную модель

In [None]:
# Shuffle the rows to avoid any bias that could come from the file's ordering.
# A fixed random_state keeps the permutation (and everything downstream)
# reproducible across kernel restarts.
data_ext = shuffle(data_ext, random_state=42)

In [None]:
# Split the frame into the feature matrix and the target ('price').
X = data_ext.drop(columns='price')
Y = data_ext.loc[:, 'price']

In [None]:
# Hold out 30% of the rows for testing; the seed fixes the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Pipeline searched by the grid: an optional scaling step followed by a linear
# model. The scaler slot defaults to None (step skipped), so plain
# LinearRegression is fitted on the raw features.
pipe = Pipeline([('scaler', None), ('linmod', LinearRegression())])

# Regularization strengths 1e-4 ... 1e5, one per decade.
alphas = list(np.logspace(-4, 5, num=10))

# The `normalize` option of Ridge/Lasso was removed from scikit-learn, so the
# "with / without feature scaling" choice is expressed through the scaler step.
# NOTE(review): StandardScaler rescales by the standard deviation, which is not
# numerically identical to the old normalize=True (L2-norm scaling); the alpha
# grid spans many decades, but confirm the selected alpha after re-running.
param_grid = [{'linmod': [LinearRegression()],
               'linmod__fit_intercept': [True, False]},
              {'scaler': [StandardScaler(), None],
               'linmod': [Ridge()],
               'linmod__alpha': alphas},
              {'scaler': [StandardScaler(), None],
               'linmod': [Lasso()],
               'linmod__alpha': alphas}
             ]

# 10-fold cross-validation with a fixed shuffle seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# Scorer built on mean squared error; greater_is_better=False marks it as a
# loss, so the reported scores are sign-flipped MSE values.
ms_error = make_scorer(
    mean_squared_error,
    greater_is_better=False,
)

In [None]:
# Exhaustive search over param_grid using the K-fold splitter and the MSE
# scorer. The `iid` argument is no longer passed: it was removed from
# GridSearchCV and raises a TypeError on current scikit-learn.
grid = GridSearchCV(
    pipe,
    param_grid,
    cv=kfold,
    scoring=ms_error,
    return_train_score=True,
)
grid.fit(X_train, Y_train)

In [None]:
# Report the best configuration. The scorer returns negated MSE, so every
# score is negated again before it is shown.
print("----------------- Обучили и тестировали -------------------")
print("Наилучшие параметры:\n{}\n".format(grid.best_params_))
cv_error = -grid.best_score_
print("Средняя правильность для наилучшей модели кроссвалидации на валидационных тестовых наборах: {:.6f}\n".format(cv_error))
test_error = -grid.score(X_test, Y_test)
print("Правильность для наилучшей модели на тестовом наборе: {:.6f}\n".format(test_error))

# Flip the sign of the per-split and mean train/test score columns, then show
# the table transposed and ordered by rank.
gridresults = pd.DataFrame(grid.cv_results_)
score_columns = [
    name for name in gridresults.columns.values
    if name.startswith('split') or name.startswith('mean_t')
]
gridresults[score_columns] = -gridresults[score_columns]
display(gridresults.sort_values(["rank_test_score"]).T)