## Финальный pipeline модели

In [2]:
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif, SelectFromModel
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error as MSE, mean_absolute_percentage_error as MAPE
from sklearn.preprocessing import StandardScaler
from category_encoders.leave_one_out import LeaveOneOutEncoder
from sklearn.model_selection import GridSearchCV, cross_validate, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_regression, f_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [3]:
# The data is pre-processed from the database and read here from CSV exports.
df = pd.read_csv('../data/expanded_data_with_OSM.csv', sep=',')
df_target = pd.read_csv('../data/train.csv', sep=',')
regions_data = (
    pd.read_csv('../data/regions_data.csv', sep=';')
    .rename(columns={'Unnamed: 0': 'regions'})
)

# Two left joins, so every row of the main table is kept:
# first the per-region columns (keyed on 'regions'), then the target (keyed on 'id').
df = (
    df
    .merge(regions_data, how='left', on='regions')
    .merge(df_target[['id', 'target']], how='left', on='id')
)

# Share of missing values per column; `dff` keeps only the columns that
# contain at least one NaN (column name in 'index', share in column 0).
dff = pd.DataFrame(df.isna().sum() / len(df)).reset_index()
dff = dff[dff[0] > 0]

# Every incomplete column except 'regions' is filled with that column's maximum.
# NOTE(review): this also fills 'target' for rows that got no label from the
# merge; presumably those are the non-train rows dropped later - confirm.
for col in dff['index']:
    if col != 'regions':
        df[col] = df[col].fillna(df[col].max())

# Missing regions get a fixed default label. The result is assigned back to the
# frame: calling replace(..., inplace=True) on the df['regions'] selection is
# chained assignment, which pandas 2.2+ warns about and which does not update
# `df` at all once Copy-on-Write is enabled.
df['regions'] = df['regions'].fillna('Southern Federal District')

# Keep only the labelled rows. The explicit copy makes `df` an independent
# frame, so the column assignment below does not raise SettingWithCopyWarning.
df = df[df['test_train_flag'] == 'train'].copy()

# Binary flag: 1 for Moscow / Saint Petersburg, 0 for every other city.
df['capital'] = np.where(df['cities'].isin(['Moscow', 'Saint Petersburg']), 1, 0)

# Mean target per group, attached to every row as a new feature column.
# NOTE(review): these means are computed from the target of ALL rows, i.e.
# before any train/test split or cross-validation fold is made, so held-out
# rows contribute to their own features (target leakage). Scores measured on
# data that went through this step are optimistic; compute the means on the
# training portion only (or inside the pipeline) to get an honest estimate.
for key, avg_col in [('cities', 'avgC'), ('regions', 'avgR'),
                     ('states', 'avgS'), ('atm_group', 'avgA')]:
    group_means = df.groupby(key)['target'].mean().rename(avg_col).reset_index()
    df = df.merge(group_means, how='left', on=key)

df['atm_group'] = df['atm_group'].astype(int)

# Columns that are excluded from the feature matrix (the target itself plus
# the other listed columns).
excluded_cols = ['target', 'id', 'address', 'address_rus', 'lat', 'lng',
                 'test_train_flag', 'geometry']
keep_mask = ~df.columns.isin(excluded_cols)

# Both objects get a fresh 0..n-1 index so they stay row-aligned.
X = df.loc[:, keep_mask].reset_index(drop=True)
y = df['target'].reset_index(drop=True)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [5]:
# import libraries
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn import set_config

# Transformers return DataFrames, so column names are carried through to the
# final estimator.
set_config(transform_output="pandas")

# NOTE: despite the name, `pca` is not a PCA. It is a univariate selector that
# keeps the 100 features with the highest f_regression score.
pca = SelectKBest(score_func=f_regression, k=100)

# Columns are routed by dtype: object columns go to the categorical branch,
# everything else to the numeric branch.
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
num_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()

# Numeric branch: standardisation only (no imputation step).
num_pipe = make_pipeline(StandardScaler())

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense matrix; categories unseen during fit are ignored.
cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
)

# Apply each branch to its own set of columns.
column_branches = [
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols),
]
full_pipe = ColumnTransformer(column_branches)

# Full model: preprocessing -> feature selection -> random forest.
pipeline_steps = [
    ('coltrans', full_pipe),
    ('pca', pca),
    ('model', RandomForestRegressor(random_state=42, n_jobs=-1)),
]
rff = Pipeline(pipeline_steps)

# Fit the whole pipeline on the training split.
rff.fit(X_train, y_train)

# Predict the held-out split.
y_pred = rff.predict(X_test)

# Quality on the held-out split.
r2_test = r2_score(y_test, y_pred)
mse_test = MSE(y_test, y_pred)
# RMSE is taken as the square root of the MSE instead of passing
# `squared=False`: that argument was deprecated in scikit-learn 1.4 and
# removed in 1.6, where it raises a TypeError.
rmse_test = float(np.sqrt(mse_test))

print(f"TEST: R2: {r2_test}, MSE: {mse_test}, \
RMSE: {rmse_test}")

TEST: R2: 0.7840559103741536, MSE: 0.0016100389298148434, RMSE: 0.040125290401626296


In [6]:
# The fitted estimator is the last pipeline step, registered under the name 'model'.
clf = rff.named_steps['model']

# Pair each input feature name of the estimator with its importance and list
# the pairs from most to least important.
data = list(zip(clf.feature_names_in_, clf.feature_importances_))
df_importances = (
    pd.DataFrame(data, columns=['Feature', 'Importance'])
    .sort_values('Importance', ascending=False)
)
df_importances

Unnamed: 0,Feature,Importance
31,num__avgA,3.372994e-01
0,num__atm_group,3.156379e-01
28,num__avgC,9.800435e-02
9,num__distance_to_mobile_phone_shop,1.574106e-02
14,num__distance_to_bank_Росбанк,1.262276e-02
...,...,...
51,cat__cities_Maloyaroslavets,4.286328e-06
53,cat__cities_Naberezhnye Chelny,1.186140e-06
50,cat__cities_Luchegorsk,1.556823e-07
57,cat__cities_Nizhniy Kuranakh,6.158842e-08


In [7]:
# 5-fold cross-validation of the full pipeline on all rows, scored with R2.
# Train scores are returned as well, and any failing fold raises immediately
# instead of being recorded as a score.
cv_res = cross_validate(
    rff,
    X,
    y,
    cv=5,
    scoring='r2',
    return_train_score=True,
    error_score='raise',
    n_jobs=-1,
)
cv_res

{'fit_time': array([25.26803613, 27.30754209, 26.1176405 , 24.35919476, 26.90969205]),
 'score_time': array([1.67947388, 0.17671871, 0.95072508, 2.32744527, 0.17462707]),
 'test_score': array([0.75758619, 0.75165409, 0.7511676 , 0.7745748 , 0.73611165]),
 'train_score': array([0.95866012, 0.96012634, 0.95829881, 0.95772704, 0.9596965 ])}

In [8]:
import pickle 

# Serialise the pipeline object to 'atm_best.pkl' in the current working directory.
with open('atm_best.pkl', mode='wb') as model_file:
    pickle.dump(rff, model_file)