## МЕГАФОН - курсовой проект - часть 2
Файл, в котором производится загрузка модели, тестовых данных и датасета с признаками. На выходе получается файл answers_test.csv. В этом файле находятся 4 столбца: buy_time, id, vas_id и target. 

In [2]:
import time
# Wall-clock start; the last cell subtracts this from time.time() to report the run time.
start_time = time.time()

In [3]:
# библиотеки
import pandas as pd
import numpy as np
import dask.dataframe as dd
from datetime import date
import pickle 

# разделение данных
from sklearn.model_selection import train_test_split

# графики
import seaborn as sns
from scipy.stats import boxcox, probplot
import matplotlib.pyplot as plt
%matplotlib inline

# метрики
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error

# Модели
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import catboost as catb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

# балансировка классов
from imblearn.under_sampling import RandomUnderSampler

# пайппланы
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [4]:
# Input file locations: the features table and the test rows to be scored.
FEATURES_PATH = 'F:/Megafon/features/features.csv'
users_data_df_PATH = 'F:/Megafon/data_test.csv'

In [5]:
# Load the inputs: the test rows with pandas, the tab-separated features file
# with dask (its 'Unnamed: 0' column is dropped right away).
test_df = pd.read_csv(users_data_df_PATH)
features_ddf = dd.read_csv(FEATURES_PATH, sep="\t").drop('Unnamed: 0', axis=1)

In [6]:
# Derive calendar features (week, day, month) from buy_time.
# NOTE(review): buy_time looks like a Unix timestamp in seconds (values such as
# 1547413200 appear in the printed answers), but no `unit` is passed here, so
# integer input is interpreted as nanoseconds since the epoch. Left unchanged
# because the pickled model was presumably trained on features built the same
# way -- confirm against the training notebook before adding unit="s".
test_df['date'] = pd.to_datetime(test_df['buy_time'], errors ="coerce")
# NOTE(review): Series.dt.week is deprecated in recent pandas; the replacement
# dt.isocalendar().week returns a different dtype, so verify the model accepts
# it before switching.
test_df['week_on_month'] = test_df['date'].dt.week
test_df['day'] = test_df['date'].dt.day
test_df['month'] = test_df['date'].dt.month

In [7]:
# Materialize the dask dataframe as an in-memory pandas DataFrame.
features_pd = features_ddf.compute()

In [8]:
# Order the features by id, then buy_time (both ascending) ahead of the as-of merge.
features_pd = features_pd.sort_values(['id', 'buy_time'])

In [9]:
# Same ordering for the test rows: id first, then buy_time, both ascending.
test_df = test_df.sort_values(['id', 'buy_time'])

In [10]:
# As-of merge of the test rows with the features table.
# As written, rows are first matched exactly on buy_time (`by`), and within that
# group each test row takes the features row whose id is nearest (`on` with
# direction="nearest").
# NOTE(review): on/by look swapped -- the usual intent would be on="buy_time",
# by="id" (nearest snapshot in time for the same user), which would also require
# both frames to be sorted by buy_time. Confirm against the training notebook
# before changing, since the model inputs must be built the same way.
result = pd.merge_asof(test_df , features_pd, on="id", by="buy_time", direction="nearest")

In [11]:
# Keep only the rows that have a vas_id.
# NOTE(review): scores are later written back onto test_df by position; if this
# filter ever removes a row, the lengths will no longer match.
result = result.dropna(subset=['vas_id'])

In [12]:
# Remove the identifier and the date/time columns from the merged frame.
result = result.drop(columns=['id', 'date', 'buy_time'])

In [13]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that returns a fixed subset of a DataFrame's columns.

    Parameters
    ----------
    columns : list
        Column labels to select in ``transform``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]``.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        KeyError
            If any requested column is absent; the message lists the missing
            columns in the order they were requested.
        """
        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                "ColumnSelector expects a pandas DataFrame, got %s" % type(X).__name__
            )

        try:
            return X[self.columns]
        except KeyError as err:
            present = set(X.columns)
            # dict.fromkeys de-duplicates while keeping the requested order, so
            # the message is deterministic (a set difference is not).
            cols_error = list(
                dict.fromkeys(c for c in self.columns if c not in present)
            )
            raise KeyError("DataFrame не содердит следующие колонки: %s" % cols_error) from err
            
class NumberSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks a single column by its label.

    The column is selected with a one-element list, so for a DataFrame input
    the result is a one-column DataFrame rather than a Series.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the one-element list ``[self.key]``."""
        wanted = [self.key]
        return X[wanted]

In [14]:
# Location of the pickled model that is loaded next.
MODEL_FILE_PATH = './model.pkl'

In [15]:
# Load the pickled model.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load a model.pkl that comes from a trusted source.
# Presumably the pickle references the ColumnSelector / NumberSelector classes,
# which is why they are defined before this cell -- confirm.
with open(MODEL_FILE_PATH, 'rb') as file:
    my_model = pickle.load(file)

In [16]:
# Column list stored on the model's first nested step (my_model[0][0]);
# presumably a ColumnSelector holding the training feature names -- confirm.
f_ok = my_model[0][0].columns

In [17]:
# Restrict the merged frame to the columns listed in f_ok.
result = result[f_ok]

In [18]:
# Take the second column of predict_proba -- presumably the positive-class
# probability of a binary classifier (confirm) -- and display it.
y_predict_proba = my_model.predict_proba(result)[:, 1]
y_predict_proba

array([0.01695848, 0.8066052 , 0.01716046, ..., 0.0201329 , 0.015119  ,
       0.02220865], dtype=float32)

In [19]:
# Attach the scores to a copy of the test rows, so that test_df itself is not
# modified through an alias.
# The scores are assigned by position: this relies on `result` having the same
# number of rows, in the same order, as test_df.
answers_test = test_df.copy()
answers_test['target'] = y_predict_proba

In [20]:
# Sanity check: (rows, columns) of the answers frame.
answers_test.shape

(71231, 9)

In [21]:
# Keep only the columns that go into the answers file, in this order, and
# display the result.
col = ['id', 'vas_id', 'buy_time', 'target']
answers_test = answers_test.loc[:, col]
answers_test

Unnamed: 0,id,vas_id,buy_time,target
87,55,2.0,1547413200,0.016958
98,64,4.0,1548018000,0.806605
278,151,2.0,1547413200,0.017160
551,274,2.0,1547413200,0.019500
552,274,4.0,1548018000,0.830441
...,...,...,...,...
69747,4362676,2.0,1548018000,0.020617
69754,4362677,2.0,1547413200,0.019414
70157,4362697,5.0,1546808400,0.020133
70535,4362712,5.0,1547413200,0.015119


In [22]:
# Write the answers file. index=False keeps the output to the data columns only
# (no extra unnamed index column), and the file name matches the one stated in
# the notebook header (answers_test.csv).
answers_test.to_csv('./answers_test.csv', index=False)

In [23]:
# Report the wall-clock time elapsed since start_time was recorded.
print("--- %s seconds ---" % (time.time() - start_time))

--- 604.0477526187897 seconds ---
