In [1]:
# Внимание!!! Важно, чтобы файлы с данными и исполняемый файл находились в одной папке,
# тогда пути к тестовым и тренировочным наборам будут содержать только имена файлов.
# 
# В пути к тренировочным и тестовым данным запрещается использовать абсолютную адресацию,
# то есть адресацию, в которой присутствуют имена папок. Путь должен содержать только имя файла.
#
# Напоминание: под моделью машинного обучения понимаются все действия с исходными данными,
# которые необходимо произвести, чтобы сопоставить признаки целевому значению.

### Область работы 1 (библиотеки)

In [2]:
import pandas as pd
from pprint import pprint
import numpy as np
from sklearn.preprocessing import *
from sklearn.neighbors import *
from sklearn.model_selection import *
from sklearn.linear_model import *
from sklearn.ensemble import *
from sklearn.svm import *
from sklearn.metrics import *
from sklearn.tree import *
import shap

### Область работы 2 (поиск лучшей модели)

In [3]:
# Training CSV, referenced by bare file name (the file sits next to the notebook).
path_train = "dim_train.csv"

In [4]:
# Load the training table; it contains the "price" target column used below.
data = pd.read_csv(path_train)

In [5]:
# Separate the regression target ("price") from the feature columns.
Y = data["price"]
X = data.drop(columns="price")

In [6]:
# Display the feature frame (the notebook shows a truncated preview).
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.40,Premium,D,SI1,59.9,60.0,4.77,4.75,2.85
1,0.72,Very Good,D,VS2,61.8,58.0,5.73,5.76,3.55
2,1.10,Very Good,J,SI2,61.9,55.0,6.59,6.63,4.09
3,1.00,Very Good,F,VS1,62.1,60.0,6.33,6.42,3.96
4,0.31,Good,G,VS2,63.6,56.0,4.26,4.29,2.72
...,...,...,...,...,...,...,...,...,...
40450,2.01,Very Good,I,VS1,59.7,63.0,8.24,8.15,4.89
40451,0.53,Ideal,F,VVS1,61.4,57.0,5.20,5.23,3.20
40452,0.30,Ideal,J,IF,61.5,56.0,4.32,4.33,2.66
40453,0.70,Ideal,H,VS1,61.0,56.0,5.75,5.79,3.51


In [7]:
# Columns that are both one-hot encoded and, later, mean-target encoded.
categorical_features = ["cut", "color", "clarity"]

In [8]:
# One-hot encode the categorical columns and append the indicator columns to X.
# The original categorical columns stay in X; they are target-encoded further down.
onehotencoder = OneHotEncoder(categories="auto").fit(X[categorical_features])
onehot = onehotencoder.transform(X[categorical_features]).toarray()

X = pd.concat([X, pd.DataFrame(onehot)], axis="columns")

In [9]:
# Min-max scale the three physical-dimension columns in place.
dimensions = ["x", "y", "z"]
coord_scaler = MinMaxScaler().fit(X[dimensions])

X[dimensions] = coord_scaler.transform(X[dimensions])

In [10]:
# Continuous columns fed to the bin discretizer and to the polynomial expansion.
binging_features = ["depth", "table", "carat"]

In [11]:
kb_table = KBinsDiscretizer(n_bins = 7, strategy='uniform', encode='onehot-dense')

# One-hot bin membership for depth, table and carat, laid out feature by feature.
X_bins = kb_table.fit_transform(X[binging_features])
# Weight every bin indicator by the raw value of the feature it belongs to, so
# each row carries its feature value only in the column of the bin it falls in.
# Each raw column is repeated once per fitted bin (`n_bins_`), so the layout no
# longer depends on hard-coded 7/14 column offsets that must track `n_bins`.
X_bins = np.repeat(X[binging_features].to_numpy(), kb_table.n_bins_, axis=1) * X_bins

X = pd.concat([X, pd.DataFrame(X_bins)], axis=1)

In [12]:
# Expand depth/table/carat into polynomial terms up to degree 2 (no bias column).
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = pd.DataFrame(poly.fit_transform(X[binging_features]))

# Swap the raw columns for the expansion.
X = pd.concat([X.drop(columns=binging_features), X_poly], axis=1)

In [13]:
# Shuffled 3-fold splitter with a fixed seed; shared by every grid search below.
cv = KFold(n_splits=3, random_state=0, shuffle = True)

In [14]:
# Hold out 30% of the rows for model comparison; seeded so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    shuffle=True,
    random_state=0,
)

In [15]:
def create_target_feature_encoding(X, Y, features):
  """Build a mean-target lookup table for each categorical feature.

  Args:
    X: DataFrame containing the columns named in `features`.
    Y: Target values for the rows of `X` (a Series sharing X's index, or an
      array-like in the same row order).
    features: Names of the columns to encode.

  Returns:
    dict of the form {feature: {category value: mean of Y over the rows where
    X[feature] equals that value}}. Missing (NaN) categories get no entry.
  """
  # A plain array carries no index, so attach X's index to keep rows aligned.
  targets = Y if isinstance(Y, pd.Series) else pd.Series(np.asarray(Y), index=X.index)

  # One grouped pass per feature instead of one boolean mask per distinct value.
  # observed=True keeps only categories that actually occur in X.
  return {
      feature: targets.groupby(X[feature], observed=True).mean().to_dict()
      for feature in features
  }

# Build the lookup from the training split only, so hold-out targets never enter the encoding.
features_values_targets = create_target_feature_encoding(x_train, y_train, categorical_features)

In [16]:
# Replace each categorical column with its mean-target encoding.
# Work on explicit copies: the frames returned by train_test_split are derived
# from X, and assigning columns into them can trigger SettingWithCopyWarning.
x_train = x_train.copy()
x_test  = x_test.copy()

for feature in categorical_features:
  # `map` yields a numeric column directly; a category missing from the lookup
  # becomes NaN instead of silently staying a string.
  x_train[feature] = x_train[feature].map(features_values_targets[feature])
  x_test[feature]  = x_test[feature].map(features_values_targets[feature])

In [17]:
# Convert all four splits to NumPy arrays for the estimators below.
x_train, y_train, x_test, y_test = map(np.array, (x_train, y_train, x_test, y_test))

In [18]:
def print_metrics(targets, preds):
  """Print the mean absolute error and the mean squared error of `preds` against `targets`."""
  for label, metric in (("MAE : ", mean_absolute_error), ("MSE : ", mean_squared_error)):
    print(label, metric(targets, preds))

In [19]:
# Baseline: support-vector regressor with default hyperparameters.
svr = SVR().fit(x_train, y_train)
svr_preds = svr.predict(x_test)

print_metrics(targets=y_test, preds=svr_preds)

MAE :  2741.25606746756
MSE :  17621529.713779245


In [20]:
knn = KNeighborsRegressor()

# Search over neighbourhood size and distance metric, scored by negated MAE on
# the shared CV splitter.
# "minkowski" is intentionally not listed: with KNeighborsRegressor's default
# p=2 it is the same distance as "euclidean", so it only duplicated those fits.
parameters_grid = {
    "n_neighbors" : [3, 7, 9],
    "metric"  : ["euclidean", "manhattan"]
}

grid_cv_knn = GridSearchCV(knn, parameters_grid, scoring='neg_mean_absolute_error', cv = cv)
grid_cv_knn.fit(x_train, y_train)

In [21]:
# Score the refitted best KNN configuration on the hold-out split.
knn_preds = grid_cv_knn.best_estimator_.predict(x_test)
print_metrics(targets=y_test, preds=knn_preds)

MAE :  1204.7768806130014
MSE :  4646776.624435839


In [22]:
# Ordinary least-squares baseline on the engineered features.
lin_reg = LinearRegression().fit(x_train, y_train)
lin_reg_preds = lin_reg.predict(x_test)

print_metrics(targets=y_test, preds=lin_reg_preds)

MAE :  694.1072936651248
MSE :  1137058.8835632633


In [23]:
# Seed the tree: split selection is randomised (feature order, tie-breaking),
# so without a seed the search result can change between runs.
tree = DecisionTreeRegressor(random_state = 0)

# Search over depth, minimum leaf size and leaf-count cap, scored by negated MAE.
parameters_grid = {
    "max_depth"        : [11, 13, 15],
    "min_samples_leaf" : [3, 5, 7],
    "max_leaf_nodes"   : [None, 50, 70, 100] 
    }
  
grid_cv_tree = GridSearchCV(tree, parameters_grid, scoring='neg_mean_absolute_error', cv = cv)
grid_cv_tree.fit(x_train, y_train)

In [24]:
# Show the hyperparameter combination the tree grid search selected.
grid_cv_tree.best_params_

{'max_depth': 13, 'max_leaf_nodes': None, 'min_samples_leaf': 7}

In [25]:
# Score the refitted best tree on the hold-out split.
tree_preds = grid_cv_tree.best_estimator_.predict(x_test)
print_metrics(targets=y_test, preds=tree_preds)

MAE :  328.78777965252556
MSE :  403793.0035968912


In [26]:
# Seed the forest: bootstrap sampling and split selection are random, so an
# unseeded search can pick different "best" parameters on every run.
forest = RandomForestRegressor(random_state = 0)

# Search over tree depth and ensemble size, scored by negated MAE.
parameters_grid = {
     'max_depth'        : [10, 30, 50],
     'n_estimators'     : [100, 300, 500]
     }


grid_cv_forest = GridSearchCV(forest, parameters_grid, scoring='neg_mean_absolute_error', cv = cv)
grid_cv_forest.fit(x_train, y_train)

In [27]:
# Show the hyperparameter combination the forest grid search selected.
grid_cv_forest.best_params_

{'max_depth': 50, 'n_estimators': 300}

In [28]:
# Score the refitted best forest on the hold-out split.
forest_preds = grid_cv_forest.best_estimator_.predict(x_test)
print_metrics(targets=y_test, preds=forest_preds)

MAE :  271.7609675288512
MSE :  303956.3556114911


### Область работы 3 (выполнение лучшей модели)

In [29]:
# Path to the training set
path_train = "dim_train.csv"
# Path to the test set
path_test  = "dim_test.csv" # contains only the file name, no folder names

In [30]:
# Reload the training table so this section runs independently of the model search above.
data = pd.read_csv(path_train)

In [31]:
# Load the test features, then split the training table into target and features.
X_test = pd.read_csv(path_test)

Y = data["price"]
X = data.drop(columns="price")

In [32]:
# Columns that are both one-hot encoded and mean-target encoded.
categorical_features = ["cut", "color", "clarity"]

In [33]:
# Fit the one-hot encoder on the training categories and append the indicator columns.
# The original categorical columns stay in X; they are target-encoded next.
onehotencoder = OneHotEncoder(categories="auto").fit(X[categorical_features])
onehot = onehotencoder.transform(X[categorical_features]).toarray()

X = pd.concat([X, pd.DataFrame(onehot)], axis="columns")

In [34]:
def create_target_feature_encoding(X, Y, features):
  """Build a mean-target lookup table for each categorical feature.

  Args:
    X: DataFrame containing the columns named in `features`.
    Y: Target values for the rows of `X` (a Series sharing X's index, or an
      array-like in the same row order).
    features: Names of the columns to encode.

  Returns:
    dict of the form {feature: {category value: mean of Y over the rows where
    X[feature] equals that value}}. Missing (NaN) categories get no entry.
  """
  # A plain array carries no index, so attach X's index to keep rows aligned.
  targets = Y if isinstance(Y, pd.Series) else pd.Series(np.asarray(Y), index=X.index)

  # One grouped pass per feature instead of one boolean mask per distinct value.
  # observed=True keeps only categories that actually occur in X.
  return {
      feature: targets.groupby(X[feature], observed=True).mean().to_dict()
      for feature in features
  }

# For the final model the lookup is built from the whole training table.
features_values_targets = create_target_feature_encoding(X, Y, categorical_features)

In [35]:
# Replace each categorical column with its mean-target encoding.
for feature in categorical_features:
  # `map` yields a numeric column directly, without relying on the dtype
  # downcasting that `replace` does when swapping strings for floats.
  X[feature] = X[feature].map(features_values_targets[feature])

In [36]:
# Min-max scale the three physical-dimension columns; the fitted scaler is
# applied to the test set later.
dimensions = ["x", "y", "z"]
coord_scaler = MinMaxScaler().fit(X[dimensions])

X[dimensions] = coord_scaler.transform(X[dimensions])

In [37]:
# Continuous columns fed to the bin discretizer and to the polynomial expansion.
binging_features = ["depth", "table", "carat"]

In [38]:
kb_table = KBinsDiscretizer(n_bins = 7, strategy='uniform', encode='onehot-dense')

# One-hot bin membership for depth, table and carat, laid out feature by feature.
X_bins = kb_table.fit_transform(X[binging_features])
# Weight every bin indicator by the raw value of the feature it belongs to, so
# each row carries its feature value only in the column of the bin it falls in.
# Each raw column is repeated once per fitted bin (`n_bins_`), so the layout no
# longer depends on hard-coded 7/14 column offsets that must track `n_bins`.
X_bins = np.repeat(X[binging_features].to_numpy(), kb_table.n_bins_, axis=1) * X_bins

X = pd.concat([X, pd.DataFrame(X_bins)], axis=1)

In [39]:
# Expand depth/table/carat into polynomial terms up to degree 2 (no bias column).
# `poly` stays fitted so the same expansion can be applied to the test set.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = pd.DataFrame(poly.fit_transform(X[binging_features]))

# Swap the raw columns for the expansion.
X = pd.concat([X.drop(columns=binging_features), X_poly], axis=1)

In [40]:
# Convert the prepared features and target to NumPy arrays for fitting.
X, Y = np.array(X), np.array(Y)

In [41]:
# Final model. The hyperparameters follow the best combination recorded by the
# forest grid search in section 2 (max_depth=50, n_estimators=300); the previous
# 30/500 setting did not match that result.
# The seed makes the submitted predictions reproducible.
forest = RandomForestRegressor(max_depth = 50, n_estimators = 300, random_state = 0)
forest.fit(X, Y)

In [42]:
# Push the test set through the transformers fitted on the training data,
# in the same order that was used for X so the column layout matches.

# `transform`, not `fit_transform`: re-fitting the encoder on the test set would
# derive the columns from the test categories, which can change their number or
# order relative to what the forest was trained on.
onehot = onehotencoder.transform(X_test[categorical_features]).toarray()
X_test = pd.concat([X_test, pd.DataFrame(onehot)], axis = 1)

# Mean-target encoding uses the lookup built from the training data.
for feature in categorical_features:
  X_test[feature] = X_test[feature].map(features_values_targets[feature])

X_test[dimensions] = coord_scaler.transform(X_test[dimensions])

# Bin indicators weighted by the raw feature value. Each raw column is repeated
# once per fitted bin (`n_bins_`) instead of using hard-coded 7/14 offsets.
X_test_bins = kb_table.transform(X_test[binging_features])
X_test_bins = np.repeat(X_test[binging_features].to_numpy(), kb_table.n_bins_, axis=1) * X_test_bins

X_test = pd.concat([X_test, pd.DataFrame(X_test_bins)], axis=1)

# Swap the raw columns for their polynomial expansion.
X_test_poly = pd.DataFrame(poly.transform(X_test[binging_features]))
X_test = X_test.drop(binging_features, axis=1)
X_test = pd.concat([X_test, X_test_poly], axis=1)

X_test = np.array(X_test)



In [44]:
# Predict the target for the prepared test features with the fitted forest.
y_predict = forest.predict(X_test)