# ML Models

In [2]:
import pandas as pd
import numpy as np

In [6]:
df = pd.read_csv('../data/expanded_data_with_OSM.csv', sep = ',')
df_target = pd.read_csv('../data/train.csv', sep = ',')

df = df.merge(df_target[['id', 'target']], how='left', on='id')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8345 entries, 0 to 8344
Data columns (total 52 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id                               8345 non-null   float64
 1   atm_group                        8345 non-null   float64
 2   address                          8345 non-null   object 
 3   address_rus                      8345 non-null   object 
 4   lat                              8345 non-null   float64
 5   lng                              8345 non-null   float64
 6   test_train_flag                  8345 non-null   object 
 7   geometry                         8345 non-null   object 
 8   distance_to_fast_food            7627 non-null   float64
 9   distance_to_clothes              7656 non-null   float64
 10  distance_to_vending_parking      2006 non-null   float64
 11  distance_to_cafe                 8039 non-null   float64
 12  distance_to_pharmacy

### Заполняем пропуски максимальными значениями

In [4]:
dff = pd.DataFrame(df.isna().sum()/len(df)).reset_index()
dff = dff[dff[0] > 0]

for col in dff[dff[0] > 0]['index']:
    if col != 'regions':
        df[col] = df[col].fillna(df[col].max())

df['regions'].replace(np.nan, 'Southern Federal District', inplace=True)

OneHotEncoding - для категориальных фичей

In [6]:
data = df.copy()
data = data[data['test_train_flag'] == 'train']

data['atm_group'] = data['atm_group'].astype(int)
data = pd.get_dummies(data, columns=['atm_group'], drop_first=True) 
data = data.iloc[:, 7:]
# data = data.iloc[:, 8:]
data.drop(columns=['cities'], inplace=True)
# data = data.iloc[:, 6:]
X = pd.get_dummies(data, drop_first=True) 

y = X['target'].reset_index(drop=True)
X = X.loc[:, X.columns != 'target'].reset_index(drop=True)

### Разбиваем на train и тест

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### StandardScaler по отношению к признакам

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error as MSE
from sklearn.metrics import r2_score, mean_absolute_percentage_error as MAPE
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from category_encoders.leave_one_out import LeaveOneOutEncoder

normalizer = StandardScaler()
X_real_norm_np = normalizer.fit_transform(X_train)
X_train = pd.DataFrame(data=X_real_norm_np)

normalizer = StandardScaler()
X_real_norm_np = normalizer.fit_transform(X_test)
X_test = pd.DataFrame(data=X_real_norm_np)


### Ridge регрессия

In [10]:
model = Ridge()

model.fit(X_train, y_train)

pred_mse_train = model.predict(X_train)
pred_mse_test = model.predict(X_test)

print(f"TRAIN: R2: {r2_score(y_train, pred_mse_train)}, MSE: {MSE(y_train, pred_mse_train)}, \
      RMSE: {MSE(y_train, pred_mse_train, squared=False)}")
print(f"TEST: R2: {r2_score(y_test, pred_mse_test)}, MSE: {MSE(y_test, pred_mse_test)}, \
          RMSE: {MSE(y_test, pred_mse_test, squared=False)}")

# model.coef_

TRAIN: R2: 0.7228252698746593, MSE: 0.0020578808308506173,       RMSE: 0.045363871427057646
TEST: R2: 0.7046131570910739, MSE: 0.0022023493083903785,           RMSE: 0.04692919462754905


### ElasticNet с GridSearchCV

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

lasso_params = {'alpha': np.arange(1, 10, 1), 'l1_ratio': np.arange(0, 1.1, 0.1)}
opt_mod = GridSearchCV(ElasticNet(), param_grid=lasso_params, cv=10, n_jobs=-1)

model_opt_lasso = opt_mod.fit(X_train, y_train)

pred_opt_train = model_opt_lasso.predict(X_train)
pred_opt_test = model_opt_lasso.predict(X_test)

print(f"TRAIN: R2: {r2_score(y_train, pred_mse_train)}, MSE: {MSE(y_train, pred_mse_train)}, \
      RMSE: {MSE(y_train, pred_mse_train, squared=False)}")
print(f"TEST: R2: {r2_score(y_test, pred_mse_test)}, MSE: {MSE(y_test, pred_mse_test)}, \
          RMSE: {MSE(y_test, pred_mse_test, squared=False)}")

TRAIN: R2: 0.7228252698746593, MSE: 0.0020578808308506173,       RMSE: 0.045363871427057646
TEST: R2: 0.7046131570910739, MSE: 0.0022023493083903785,           RMSE: 0.04692919462754905


  model = cd_fast.enet_coordinate_descent(


In [12]:
model_opt_lasso.best_params_

{'alpha': 1, 'l1_ratio': 0.0}

Возспользуемся SelectKBest, чтобы посмотреть на качество модели при разном кол-ве признаков

In [13]:
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import f_classif, chi2, mutual_info_regression, f_regression


pipe = Pipeline([
#     ('cat_encoder_', LeaveOneOutEncoder(cols=[''])),
#     ('poly_featurizer_', PolynomialFeatures(degree=4)),
    ('scaler_', StandardScaler()),
    ('selector_', SelectKBest(score_func=f_regression, k=100)), 
    ('model_', Ridge())]
)

cv_res = cross_validate(pipe, X, y, cv=5, scoring='r2', return_train_score=True)
cv_res

  corr /= X_norms
  corr /= X_norms
  corr /= X_norms
  corr /= X_norms


{'fit_time': array([0.03800011, 0.03800058, 0.03600049, 0.04400015, 0.03925228]),
 'score_time': array([0.00299907, 0.00400114, 0.00599813, 0.01499891, 0.00499916]),
 'test_score': array([0.69838467, 0.7183839 , 0.69739754, 0.71153407, 0.66788723]),
 'train_score': array([0.70812951, 0.70410112, 0.70860478, 0.70472045, 0.71513669])}

In [14]:
print(cv_res['train_score'].mean())
print(cv_res['test_score'].mean())

0.7081385101660376
0.6987174814424961


### Дерево решений

In [15]:
from sklearn.tree import DecisionTreeRegressor

tree = DecisionTreeRegressor()

tree.fit(X_train, y_train)

pred_train = tree.predict(X_train)
pred_test = tree.predict(X_test)

print(f"TRAIN: R2: {r2_score(y_train, pred_train)}, MSE: {MSE(y_train, pred_train)}, \
      RMSE: {MSE(y_train, pred_train, squared=False)}")
print(f"TEST: R2: {r2_score(y_test, pred_test)}, MSE: {MSE(y_test, pred_test)}, \
          RMSE: {MSE(y_test, pred_test, squared=False)}")

TRAIN: R2: 0.98784588750499, MSE: 9.02380787316815e-05,       RMSE: 0.009499372544104242
TEST: R2: 0.4742878308622587, MSE: 0.00391961206095386,           RMSE: 0.06260680522877574


In [16]:
from sklearn.model_selection import GridSearchCV

params = {'max_depth' : np.arange(3, 10, 1),
          'max_features' : ["auto", "sqrt", "log2"],
          'min_samples_leaf': np.arange(3, 8, 1)}

gs = GridSearchCV(DecisionTreeRegressor(), params, cv=5, scoring='r2')

gs.fit(X_train, y_train)

pred_train = gs.predict(X_train)
pred_test = gs.predict(X_test)

print(f"TRAIN: R2: {r2_score(y_train, pred_train)}, MSE: {MSE(y_train, pred_train)}, \
RMSE: {MSE(y_train, pred_train, squared=False)}")
print(f"TEST: R2: {r2_score(y_test, pred_test)}, MSE: {MSE(y_test, pred_test)}, \
RMSE: {MSE(y_test, pred_test, squared=False)}")

TRAIN: R2: 0.7083483277641847, MSE: 0.0021653647333153463, RMSE: 0.046533479703492475
TEST: R2: 0.6841914688932698, MSE: 0.0023546096136757583, RMSE: 0.04852431981672446


In [17]:
gs.best_params_

{'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 7}