In [16]:
import pandas as pd
pd.options.display.max_columns = 999
import numpy as np
from numba import vectorize

from datetime import timedelta
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNetCV

import warnings
warnings.filterwarnings('ignore')

In [32]:
def only_predict(model, data, test_indexes, week):
    """Predict the current target for the given rows and store it in ``df_new``.

    Relies on the notebook-level globals ``train_columns`` (positional feature
    columns), ``target_column`` (a name such as ``"day_3"``) and ``df_new``
    (the frame that collects one prediction column per run).

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    data : pandas.DataFrame containing the feature columns.
    test_indexes : index labels of the rows to predict, as returned by
        ``create_validation``.
    week : value embedded in the name of the output column.
    """
    # test_indexes are index *labels*, so rows are selected with .loc and only
    # the columns are positional. Using .iloc for the rows is correct only
    # while the frame happens to carry a default RangeIndex.
    features = data.loc[test_indexes].iloc[:, train_columns].values
    column_name = "week_{}_day_{}_lags_{}".format(
        week, target_column.split("_")[1], len(train_columns))
    df_new[column_name] = model.predict(features)

def eval_score(model, data, test_indexes, week):
    """Predict the current target for the given rows, store it, and return the RMSE.

    Relies on the notebook-level globals ``train_columns`` (positional feature
    columns), ``target_column`` (a name such as ``"day_3"``) and ``df_new``
    (the frame that collects one prediction column per run).

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    data : pandas.DataFrame containing the feature and target columns.
    test_indexes : index labels of the rows to evaluate, as returned by
        ``create_validation``.
    week : value embedded in the name of the output column.

    Returns
    -------
    Root mean squared error between ``data[target_column]`` and the
    predictions on the selected rows.
    """
    # Select the rows once, by label, so features and targets are guaranteed
    # to come from the same rows (the original mixed .iloc and .loc).
    test_rows = data.loc[test_indexes]
    preds = model.predict(test_rows.iloc[:, train_columns].values)
    column_name = "week_{}_day_{}_lags_{}".format(
        week, target_column.split("_")[1], len(train_columns))
    df_new[column_name] = preds
    return np.sqrt(mean_squared_error(test_rows[target_column].values, preds))

def ts_cv(data, folds):
    """Fit one ElasticNetCV per fold and return the RMSE of every fold but the first.

    Relies on the notebook-level globals ``train_columns`` (positional feature
    columns) and ``target_column`` (name of the column to predict).

    Parameters
    ----------
    data : pandas.DataFrame with feature and target columns (no NaNs expected
        by the estimator; the caller passes ``df.fillna(0)``).
    folds : sequence of ``(train_indexes, test_indexes)`` pairs of index
        labels, as built by ``create_validation``.

    Returns
    -------
    list of float
        One RMSE per fold, excluding the first fold, which is prediction-only.
    """
    scores = []
    for fold_number, fold in enumerate(folds):
        train_indexes, test_indexes = fold[0], fold[1]
        # The week label written into the prediction column name is the fold
        # position minus one, so it matches ``validation_weeks = [-1, 0]``.
        week = fold_number - 1
        en = ElasticNetCV(n_jobs=-1, fit_intercept=True,
                          max_iter=10000, n_alphas=1000,
                          selection='random', cv=5, eps=2e-3,
                          random_state=42)
        # Disabled alternative kept from the original:
        # l1_ratio=[.05, .15, .5, .7, .9, .95, .99, 1]

        # Rows by label, columns by position; plain arrays for both fit and
        # predict so the estimator never sees mismatching feature names.
        train_rows = data.loc[train_indexes]
        en.fit(train_rows.iloc[:, train_columns].values,
               train_rows[target_column].values)
        if fold_number == 0:
            # First fold: only store predictions, no score is computed.
            only_predict(en, data, test_indexes, week)
        else:
            scores.append(eval_score(en, data, test_indexes, week))

    return scores
        

In [18]:
# Load the player price table and the Kaggle sample submission.
# Later cells read `player_id`, `timestamp`, `xbox_price`, `ps_price` and
# `player_name` from `df`, and the `id` column from `sample_subm`.
df = pd.read_csv("data/player_price.csv")
sample_subm = pd.read_csv("short_term_competition_benchmarks/kaggle_sample_submission.csv")

In [19]:
# Split each submission id on "_" once: the first part is the integer player
# id, the second part is parsed as a date.
id_parts = sample_subm["id"].map(lambda raw_id: raw_id.split("_"))
sample_subm["player_id"] = id_parts.map(lambda parts: int(parts[0]))
sample_subm["Date"] = id_parts.map(lambda parts: np.datetime64(parts[1]))

In [20]:
# Keep only players that appear in the submission file. The explicit .copy()
# makes the filtered frame independent of the original, so the column
# assignment below is not a write to a slice (the resulting
# SettingWithCopyWarning was only hidden by the blanket warnings filter).
df = df[df["player_id"].isin(sample_subm["player_id"].unique())].copy()
# `timestamp` is interpreted as milliseconds since the epoch.
df["Date"] = pd.to_datetime(df["timestamp"], unit="ms")
# Drop columns that are not used by the rest of the notebook.
df = df.drop(columns=["timestamp", "ps_price", "player_name"])

In [21]:
# Rank every distinct date from newest (0) to oldest, then cut the ranks into
# blocks of seven: `weekDate` is the block number (0 = most recent block) and
# `wdayDate` is the position inside the block ("day_0" = newest date of it).
# NOTE(review): the numbering counts distinct dates present in the data, not
# calendar days — it presumably assumes there are no missing days; confirm.
unique_dates_desc = df["Date"].sort_values(ascending=False).unique()
dates_dict = {date: rank for rank, date in enumerate(unique_dates_desc)}

date_rank = df["Date"].map(dates_dict)
df["weekDate"] = date_rank // 7
df["wdayDate"] = (date_rank % 7).apply(lambda x: "day_" + str(x))
df["Date"] = df["Date"].astype(str)

In [22]:
# Reshape to one row per (player, week) with one xbox_price column per day
# position, then give the columns plain names.
df = df.pivot_table(
    index=["player_id", "weekDate"],
    columns="wdayDate",
    values="xbox_price",
).reset_index()
df.columns = ['player_id', 'weekDate', 'day_0', 'day_1', 'day_2',
              'day_3', 'day_4', 'day_5', 'day_6']

In [23]:
# Append one placeholder row per player for week -1: it is built from the
# week-0 rows with `weekDate` and every day_* column overwritten by -1.
# .copy() makes `temp` an independent frame, so the overwrite below is not a
# write to a slice of `df`.
temp = df[df["weekDate"] == 0].copy()
temp[["weekDate", 'day_0', 'day_1', 'day_2',
      'day_3', 'day_4', 'day_5', 'day_6']] = -1
# ignore_index avoids duplicated index labels in the combined frame.
df = pd.concat([df, temp], ignore_index=True)

In [24]:
def create_lag_features(lag_week, feature='day_', data=None):
    """Return the frame with the seven ``feature`` columns of a later-numbered week added.

    For every row with week number ``w`` the new columns
    ``lag_<lag_week>_<feature><i>`` (``i`` in 0..6) hold the values of
    ``<feature><i>`` from the same player's row with week number
    ``w + lag_week``. Rows without such a partner get NaN (left join).

    Parameters
    ----------
    lag_week : int
        Offset added to ``weekDate`` to find the source row.
    feature : str
        Prefix of the seven source columns (``feature + "0"`` .. ``feature + "6"``).
    data : pandas.DataFrame, optional
        Frame with ``player_id``, ``weekDate`` and the source columns.
        Defaults to the notebook-level ``df`` so existing calls are unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    frame = df if data is None else data

    # Map every source column to its lagged name in one go.
    renames = {
        "{}".format(feature + str(i)): "lag_{}_{}".format(lag_week, feature + str(i))
        for i in range(7)
    }
    lag_cols = list(renames.values())

    # rename() returns a new frame, so shifting the week key below does not
    # touch `frame` itself.
    shifted = frame.rename(columns=renames)
    shifted['weekDate'] = frame['weekDate'].values - lag_week

    return frame.merge(shifted[["weekDate", "player_id"] + lag_cols],
                       how='left', on=["weekDate", "player_id"])

In [25]:
# Add the lag columns for offsets of 1 to 4 weeks.
# create_lag_features reads the notebook-level `df`, so `df` has to be rebound
# on every iteration for the next call to see the columns added so far.
# Not idempotent: re-running this cell operates on the already-augmented frame.
for lag in range(1, 5):
    df = create_lag_features(lag)

In [26]:
# Week numbers used as test folds: -1 is the placeholder week whose day_*
# values were set to -1, 0 is the most recent week present in the data.
# Order matters: ts_cv treats the first fold as prediction-only and labels
# each fold with (position - 1).
validation_weeks = [-1, 0]

In [27]:
def create_validation(week):
    """Return ``(train_index, test_index)`` of the notebook-level ``df`` for one fold.

    The train index holds the rows whose ``weekDate`` lies strictly between
    ``week`` and ``week + 2``; the test index holds the rows whose
    ``weekDate`` equals ``week``.
    """
    week_numbers = df["weekDate"]
    is_train = (week_numbers > week) & (week_numbers < week + 2)
    is_test = week_numbers == week
    return df[is_train].index, df[is_test].index

In [28]:
# One (train_index, test_index) pair per validation week, in the same order
# as `validation_weeks`.
validation = [create_validation(week) for week in validation_weeks]

In [33]:
%%time
# Grid over (target day, number of feature columns).
# scores_cv[day][k] is the list of RMSE values returned by ts_cv for
# target "day_<day>" trained on the first lags_values[k] lag columns.
scores_cv = []
lags_values = range(1,7)
df_new = pd.DataFrame([])

# NaN filling does not depend on the loop variables, so do it once instead of
# once per (day, n_lag) combination.
data_filled = df.fillna(0)

# Position of the first lag column. With the layout built above
# (player_id, weekDate, day_0..day_6, lag_1_day_0, ...) this is 9.
first_lag_col = df.columns.get_loc("lag_1_day_0")

for day in range(7):
    # Read as a global by only_predict / eval_score / ts_cv.
    target_column = "day_{}".format(day)
    day_scores = []
    for n_lag in lags_values:
        # Read as a global by only_predict / eval_score / ts_cv.
        # NOTE(review): with at most 6 columns this only ever selects
        # lag_1_day_0 .. lag_1_day_5; the lag_2+ columns are never used —
        # confirm that is intended.
        train_columns = list(range(first_lag_col, first_lag_col + n_lag))
        day_scores.append(ts_cv(data_filled, validation))
    scores_cv.append(day_scores)

Wall time: 1min 32s


In [34]:
# For every target day pick the entry of `lags_values` with the lowest RMSE
# averaged over the scored folds.
mean_scores = np.array(scores_cv).mean(axis=2)
best_days = [lags_values[position] for position in mean_scores.argmin(axis=1)]

In [35]:
# Show the selected number of feature columns for day_0 .. day_6.
best_days

[1, 5, 1, 1, 1, 1, 2]

In [36]:
# Average, over the seven target days, of each day's best (lowest) mean RMSE.
# Taking the row minimum directly gives the same values as looking up
# `best_days`, without assuming that `lags_values` starts at 1 with step 1.
np.array(scores_cv).mean(axis=2).min(axis=1).mean()

19078.357214921816

In [37]:
# Label the prediction rows with the player ids of the first fold's test rows
# (the rows of the first entry in `validation_weeks`).
first_fold_test_rows = validation[0][1]
df_new["player_id"] = df.loc[first_fold_test_rows, "player_id"].values.copy()

In [38]:
# For each target day keep the week -1 prediction column made with that day's
# selected number of feature columns, then reshape to long format: one row
# per (player_id, prediction column).
pred_cols = [
    "week_-1_day_{}_lags_{}".format(day_number, n_lags)
    for day_number, n_lags in enumerate(best_days)
]
final_df = df_new.melt(id_vars="player_id", value_vars=pred_cols)

In [39]:
# Calendar date for each prediction column, in day_0 .. day_6 order.
# NOTE(review): the list is descending, which matches day_0 being the newest
# date of a week — confirm these are the dates expected by the submission.
d = ["2017-06-29", "2017-06-28", "2017-06-27", "2017-06-26", 
     "2017-06-25", "2017-06-24", "2017-06-23"]
# Built with zip so that no loop variable shadows (and overwrites) the list
# `d` while it is being iterated.
dates = dict(zip(pred_cols, d))

In [40]:
# Replace the prediction column names by their dates, then order the rows by
# player and date and renumber them from 0.
final_df["variable"] = final_df["variable"].map(dates)
final_df = (
    final_df
    .sort_values(["player_id", "variable"])
    .reset_index(drop=True)
)

In [41]:
# Attach each prediction to its submission row by (player_id, date) key.
# Assigning final_df["value"] directly would pair rows purely by position and
# silently give wrong prices if the sample submission is not ordered by
# player_id and date exactly like the sorted predictions.
prediction_by_key = pd.Series(
    final_df["value"].values,
    index=pd.MultiIndex.from_arrays([
        final_df["player_id"].astype(int),
        final_df["variable"].astype(str),
    ]),
)
submission_keys = pd.MultiIndex.from_arrays([
    sample_subm["player_id"].astype(int),
    # Same "YYYY-MM-DD" text form as the date strings stored in final_df.
    pd.to_datetime(sample_subm["Date"]).dt.strftime("%Y-%m-%d"),
])
# Submission rows without a matching prediction get NaN.
sample_subm["price"] = prediction_by_key.reindex(submission_keys).values

In [42]:
# Write the submission file with the `id` and `price` columns only.
# NOTE: assumes the `csv/` directory already exists.
sample_subm[["id", "price"]].to_csv("csv/prediction_ElasticNetCV_17500.csv", index=False)