In [1]:
import pandas as pd, numpy as np
from tqdm import tqdm_notebook
import time
import urllib.request as ur

In [2]:
train = pd.read_csv('data/train.csv', index_col = 'id')
test = pd.read_csv('data/test.csv', index_col = 'id')

In [3]:
train['date'] = pd.to_datetime(train['date'], format='%Y+AC0-%m+AC0-%d').astype(str)
test['date'] = pd.to_datetime(test['date'], format='%Y+AC0-%m+AC0-%d').astype(str)

In [4]:
def holidays(d, m, year):
    time.sleep(0.2)
    elevations = ur.urlopen("https://kayaposoft.com/enrico/json/v2.0/?action=isPublicHoliday&date="+str(d)+'-'+str(m)+'-'+str(year)+"&country=ru").read()
    data = json.loads(elevations)
    if data['isPublicHoliday'] == True:
        return 1
    else:
        return 0

def extr(df, date_column):
    df['tm_year'] = df[date_column].apply(lambda x: time.strptime(str(x), '%Y-%m-%d')[0])
    df['tm_mon'] = df[date_column].apply(lambda x:  time.strptime(str(x), '%Y-%m-%d')[1])
    df['tm_mday'] = df[date_column].apply(lambda x: time.strptime(str(x), '%Y-%m-%d')[2])
    df['tm_wday'] = df[date_column].apply(lambda x: time.strptime(str(x), '%Y-%m-%d')[6])
    df['tm_yday'] = df[date_column].apply(lambda x: time.strptime(str(x), '%Y-%m-%d')[7])
    del df[date_column]
    return df

In [5]:
train = extr(train, 'date')
test = extr(test, 'date')

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
enc = LabelEncoder()
train['name'] = enc.fit_transform(train['name'])
test['name'] = enc.transform(test['name'])

In [8]:
X = train[[c for c in train.columns if c!= 'market']]
y = train['market']

In [9]:
from sklearn.model_selection import cross_validate, TimeSeriesSplit

In [10]:
import xgboost as xgb

In [11]:
clf = xgb.XGBRegressor()

In [12]:
cv = TimeSeriesSplit(n_splits=8)

In [13]:
round(100.003, 2)

100.0

In [14]:
pd.DataFrame(cross_validate(clf, X, y, cv = cv, scoring='neg_mean_squared_error')).mean().apply(lambda x: round(np.sqrt(abs(x)), 2))



fit_time             0.18
score_time           0.03
test_score     3015939.15
train_score     406802.73
dtype: float64

In [15]:
clf.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [16]:
y_pred = clf.predict(test)

In [17]:
sample = pd.read_csv('data/sample_submission.csv', index_col='id')

In [18]:
sample['market'] = y_pred

In [19]:
sample.to_csv('results.csv')