### Область работы 1 (библиотеки)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler, OneHotEncoder, RobustScaler
from sklearn.preprocessing import PolynomialFeatures, QuantileTransformer, PowerTransformer
from sklearn.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.linear_model import ElasticNet, Ridge, Lasso, Lars
from sklearn.compose import ColumnTransformer, make_column_transformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline, make_pipeline
import category_encoders as ce
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder, OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

In [2]:
# Column groups; each list is handed to its own branch of the ColumnTransformer.
num_features = ["size", "depth_percent", "table_percent"]      # iteratively imputed, min-max scaled
axis_features = ["meas_length", "meas_width", "meas_depth"]    # mean-imputed, polynomial, standardized
cat_features = ["cut", "color"]                                # one-hot encoded

In [3]:
# Read the training table and discard exact duplicate rows (original index labels are kept).
df = pd.read_csv("train.csv").drop_duplicates()

# The test table is used as-is; it carries no target column to split off here.
x_test = pd.read_csv("test.csv")

# Split the training frame into the regression target and the predictors.
y_train = df["total_sales_price"]
x_train = df.drop(columns=["total_sales_price"])

In [4]:
# Preview the test features; the bare last expression gets the notebook's rich
# display, which pandas truncates to the first and last rows.
x_test

Unnamed: 0,size,color,clarity,cut,symmetry,polish,depth_percent,table_percent,meas_length,meas_width,meas_depth
0,0.53,E,SI2,Excellent,Excellent,Excellent,62.6,56.0,5.15,5.18,3.24
1,0.31,I,VVS1,Excellent,Excellent,Excellent,61.9,59.0,4.32,4.34,2.68
2,0.52,J,SI1,Excellent,Excellent,Excellent,62.7,56.0,5.15,5.18,3.24
3,0.50,K,VVS2,Very Good,Very Good,Very Good,64.3,58.0,4.92,4.97,3.18
4,0.30,K,VVS1,Excellent,Excellent,Excellent,62.8,57.0,4.26,4.28,2.69
...,...,...,...,...,...,...,...,...,...,...,...
22448,0.31,E,VVS1,Excellent,Excellent,Excellent,61.6,59.0,4.34,4.36,2.68
22449,0.50,H,VS1,Excellent,Excellent,Excellent,62.4,56.0,5.05,5.09,3.16
22450,0.31,F,IF,Excellent,Excellent,Excellent,62.4,57.0,4.34,4.36,2.71
22451,0.50,E,VS2,Excellent,Very Good,Excellent,63.0,57.0,5.00,5.07,3.17


In [5]:
# Preview the training features (target column already removed).
x_train

Unnamed: 0,size,color,clarity,cut,symmetry,polish,depth_percent,table_percent,meas_length,meas_width,meas_depth
0,0.50,K,SI2,Excellent,Excellent,Excellent,61.4,55.0,5.10,5.12,3.14
1,0.50,E,VVS2,Excellent,Excellent,Very Good,61.9,60.0,5.06,5.09,3.14
2,0.35,G,VS2,Excellent,Excellent,Excellent,63.0,55.0,4.47,4.51,2.83
3,0.30,E,SI2,Excellent,Excellent,Excellent,63.2,57.0,4.24,4.27,2.69
4,0.30,F,VS2,Very Good,Very Good,Excellent,63.4,61.0,4.24,4.26,2.69
...,...,...,...,...,...,...,...,...,...,...,...
67593,0.30,D,SI2,Very Good,Very Good,Excellent,64.4,55.0,4.19,4.21,2.71
67594,0.60,H,VS2,Excellent,Excellent,Excellent,62.4,59.0,5.40,5.42,3.38
67595,0.36,L,VVS2,Excellent,Excellent,Excellent,62.3,55.0,4.55,4.59,2.85
67596,0.41,J,SI1,Excellent,Excellent,Excellent,62.7,57.0,4.74,4.78,2.98


In [6]:
# Preview the regression target (`total_sales_price`) for the training rows.
y_train

0         990
1        3384
2        1154
3         886
4         864
         ... 
67593     640
67594    2932
67595     788
67596    1074
67597     646
Name: total_sales_price, Length: 67406, dtype: int64

In [7]:
# Branch for the measurement columns: zeros are treated as missing and replaced
# by the column mean, then degree-2 polynomial terms (no bias column) are
# generated and every resulting feature is standardized.
axis_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(missing_values=0, strategy="mean", add_indicator=False)),
        ("polynom", PolynomialFeatures(degree=2, include_bias=False)),
        ("scaler", StandardScaler()),
    ]
)

# Clarity grades listed from the highest rank (10) down to the lowest (0).
# The mapping is built in this order so key order matches the ranks descending.
_clarity_grades_desc = [
    "FL", "IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1", "I2", "I3",
]

# Mapping spec in the list-of-dicts form passed to ce.OrdinalEncoder(mapping=...).
clarity_map = [
    {
        "col": "clarity",
        "mapping": dict(
            zip(_clarity_grades_desc, range(len(_clarity_grades_desc) - 1, -1, -1))
        ),
    }
]

# Branch for the remaining numeric columns: zeros are treated as missing and
# filled by IterativeImputer (at most 20 rounds), then values are min-max scaled.
num_transformer = Pipeline(
    steps=[
        ("missing_num", IterativeImputer(missing_values=0, max_iter=20)),
        ("scaler", MinMaxScaler()),
    ]
)

# Branch for `clarity`: grades are replaced by the integer ranks defined in
# `clarity_map`, and the resulting numeric column is standardized.
clarity_transformer = Pipeline(
    steps=[
        ("ce", ce.OrdinalEncoder(mapping=clarity_map)),
        ("scaler", StandardScaler()),
    ]
)
    
# Combined preprocessing: each column group goes through its own branch and the
# outputs are concatenated. Columns not listed in any group are dropped
# (ColumnTransformer's default `remainder`).
CT = ColumnTransformer(
    transformers=[
        ("pol_std", axis_transformer, axis_features),
        ("num", num_transformer, num_features),
        # handle_unknown="ignore": a `cut`/`color` level that was not present
        # when fitting is encoded as an all-zero row instead of raising at
        # predict time. Output for known categories is unchanged.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
        ("ordinal_map", clarity_transformer, ["clarity"]),
    ]
)

In [8]:
# Preprocessing followed by a distance-weighted 9-nearest-neighbours regressor
# (Minkowski p=2, i.e. Euclidean distance).
knn_pipeline = Pipeline(
    steps=[
        ("preproc", CT),
        ("estimator", KNeighborsRegressor(n_neighbors=9, p=2, weights="distance")),
    ]
)

# The target is passed through a PowerTransformer before fitting the pipeline;
# TransformedTargetRegressor applies the inverse transform to predictions.
model_target = TransformedTargetRegressor(
    regressor=knn_pipeline,
    transformer=PowerTransformer(),
)

model_target.fit(x_train, y_train)

In [9]:
# Predict the target for the test rows with the fitted model.
y_predict = model_target.predict(x_test)

In [10]:
# Show the predictions (NumPy abbreviates the array repr).
y_predict

array([2112.59919952,  892.33699066, 1793.59952596, ..., 1231.52732132,
       2950.26128412, 1676.33089353])