In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [3]:
# Load `matrix.csv` into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
MATRIX_CSV = 'C:/Users/Dindar/competitive-data-science-predict-future-sales(1)/tables/matrix.csv'
matrix = pd.read_csv(MATRIX_CSV)

In [None]:
# Split rows by `date_block_num`: blocks below 33 train the model, block 33 is
# held out for validation, and block 34 supplies the features to predict on.
# `item_cnt_month` is the target and is removed from every feature frame.
is_train = matrix.date_block_num < 33
is_valid = matrix.date_block_num == 33
is_test = matrix.date_block_num == 34

X_train = matrix.loc[is_train].drop(columns=['item_cnt_month'])
Y_train = matrix.loc[is_train, 'item_cnt_month']
X_valid = matrix.loc[is_valid].drop(columns=['item_cnt_month'])
Y_valid = matrix.loc[is_valid, 'item_cnt_month']
X_test = matrix.loc[is_test].drop(columns=['item_cnt_month'])

In [None]:
%%time
# Gradient-boosted regressor trained on the rows before block 33.
#
# `eval_metric` and `early_stopping_rounds` are configured on the estimator
# rather than passed to `fit()`: the `fit()` keywords were deprecated in
# XGBoost 1.6 and are rejected by later releases. This form requires
# XGBoost >= 1.6.
#
# `learning_rate` / `random_state` are the scikit-learn-API spellings of the
# native `eta` / `seed` parameters; the values are unchanged.
model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,          # upper bound; early stopping usually ends sooner
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    learning_rate=0.3,
    random_state=42,
    eval_metric="rmse",
    early_stopping_rounds=10,   # stop after 10 rounds without improvement
)

# Early stopping monitors the LAST entry of `eval_set`, i.e. the validation
# fold; the training fold is listed first only so its RMSE is logged as well.
model.fit(
    X_train,
    Y_train,
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
    verbose=True,
)

In [None]:
# Predict for the validation and test feature frames, limiting every
# prediction to the closed range [0, 20].
# Note: despite its name, `Y_test` holds model predictions, not ground truth.
Y_pred = np.clip(model.predict(X_valid), 0, 20)
Y_test = np.clip(model.predict(X_test), 0, 20)

In [None]:
# Validation scores as a (MSE, R^2) pair. The first value is the *squared*
# error, whereas the model above was trained with eval_metric "rmse".
(
    mean_squared_error(y_true=Y_valid, y_pred=Y_pred),
    r2_score(y_true=Y_valid, y_pred=Y_pred),
)

In [None]:
# Horizontal bar chart of the fitted model's feature importances, sorted so
# the least important feature sits at the bottom and the most important at the top.
features = X_valid.columns
importances = model.feature_importances_
indices = np.argsort(importances)  # ascending by importance

fig, ax = plt.subplots(figsize=(12, 12))
positions = range(len(indices))
ax.barh(positions, importances[indices], color='b', align='center')
ax.set_yticks(positions)
ax.set_yticklabels([features[k] for k in indices])
ax.set_title('Feature Importances')
ax.set_xlabel('Relative Importance')
plt.show()

In [None]:
# Map each feature name to its importance score.
d = {name: score for name, score in zip(features, importances)}

In [None]:
# Rank (feature, importance) pairs from most to least important.
# `d` is rebound from a dict to a list of pairs here, so the input is first
# normalised with dict(): that accepts both the original dict and the list
# this cell produces, making the cell safe to re-run (previously a second run
# raised AttributeError because a list has no .items()).
d = sorted(dict(d).items(), key=lambda pair: pair[1], reverse=True)

In [None]:
# Names of the ten highest-ranked features (first element of each pair in `d`).
col = [pair[0] for pair in d[:10]]

In [28]:
# Hand-written three-feature subset.
# NOTE(review): when the notebook runs top to bottom, the next cell rebinds
# `col` to a ten-item list, so this assignment has no lasting effect.
col = [
    'item_cnt_month_lag_1',
    'date_item_city_avg_item_cnt_lag_1',
    'item_cnt_month_lag_2',
]

In [4]:
# Hand-written ten-feature subset used to build `ex` below; it replaces
# whatever `col` held before this cell ran.
col = [
    'item_cnt_month_lag_1',
    'date_item_city_avg_item_cnt_lag_1',
    'item_cnt_month_lag_2',
    'item_category_id',
    'item_cnt_month_lag_3',
    'date_shop_cat_avg_item_cnt_lag_1',
    'item_cnt_month_lag_6',
    'date_item_avg_item_cnt_lag_1',
    'item_first_sale',
    'city_code',
]

In [6]:
# Keep every row of `matrix` but only the columns named in `col`.
ex = matrix.loc[:, col]

In [7]:
# Remove the `matrix` name; the cells visible below work on `ex` only
# (presumably done to release memory — confirm).
# After this runs, any cell above that reads `matrix` needs the load cell re-run first.
del matrix

In [25]:
# Show (rows, columns) of the selected-feature frame.
# NOTE(review): the recorded output reports 5 columns, which matches neither
# `col` list visible in this notebook (3 and 10 names); it presumably comes
# from an earlier session — re-run to refresh.
ex.shape

(6639294, 5)

In [None]:
# Scratch arithmetic: 6639294 is the row count shown in the recorded
# `ex.shape` output; presumably sizing a four-way split of the rows — TODO confirm.
6639294/4

In [5]:
from sklearn.neighbors import NearestNeighbors

In [None]:
%%time
# Fit a ball-tree nearest-neighbour index over the rows of `ex`, using the
# Bray-Curtis dissimilarity, 5 neighbours per query, and n_jobs=-1.
nbrs = NearestNeighbors(
    n_neighbors=5,
    algorithm='ball_tree',
    metric='braycurtis',
    n_jobs=-1,
)
nbrs = nbrs.fit(ex.values)  # fit() returns the estimator itself

In [None]:
%%time
# Query the index with the same rows it was fitted on.
# NOTE(review): because the query set is the fitted data, each row's nearest
# neighbour is presumably the row itself; scikit-learn's kneighbors() called
# without X excludes the query point — confirm which behaviour is wanted.
# This also rebinds `indices`, a name the feature-importance cell above uses.
distances, indices = nbrs.kneighbors(X=ex.values, return_distance=True)

In [None]:
# Persist the neighbour-index array to `data1.npy` in the working directory.
np.save(file='data1.npy', arr=indices)

In [None]:
# Read the array stored in `data1.npy` back into `data`.
data = np.load(file='data1.npy')

# -----------------------------------------------------------------

In [12]:
from scipy.spatial import distance

In [33]:
# The integers 0..99 as a list (not referenced by any cell visible here).
q = [*range(100)]

In [20]:
%%time
# Pairwise Bray-Curtis dissimilarities between the first 100 rows of `ex`
# (each row compared against every row, itself included).
a = distance.cdist(
    XA=ex.iloc[:100].values,
    XB=ex.iloc[:100].values,
    metric='braycurtis',
)

Wall time: 0 ns


In [21]:
# Compare the shapes of the two pairwise matrices.
# NOTE(review): `b` is first assigned in a later cell (cosine_similarity on
# 1000 rows), so on a fresh top-to-bottom run this cell raises NameError; the
# recorded (100, 100) for `b` presumably came from an earlier session — confirm.
a.shape, b.shape

((100, 100), (100, 100))

In [8]:
from sklearn.metrics import pairwise

In [25]:
%%time
# Pairwise cosine similarity between the first 1000 rows of `ex`.
b = pairwise.cosine_similarity(X=ex.iloc[:1000].values)

Wall time: 10.2 ms


In [26]:
# Display the cosine-similarity matrix `b`.
b

array([[1.        , 0.95579811, 0.96175107, ..., 0.91838911, 0.9976657 ,
        0.97734484],
       [0.95579811, 1.        , 0.99973222, ..., 0.98857042, 0.97302126,
        0.99613244],
       [0.96175107, 0.99973222, 1.        , ..., 0.98630898, 0.97772748,
        0.99783313],
       ...,
       [0.91838911, 0.98857042, 0.98630898, ..., 1.        , 0.94052259,
        0.97724266],
       [0.9976657 , 0.97302126, 0.97772748, ..., 0.94052259, 1.        ,
        0.98908539],
       [0.97734484, 0.99613244, 0.99783313, ..., 0.97724266, 0.98908539,
        1.        ]])

In [23]:
# Display the Bray-Curtis dissimilarity matrix `a`.
a

array([[0.        , 0.28347899, 0.25237164, ..., 0.33149898, 0.23875493,
        0.23951338],
       [0.28347899, 0.        , 0.03467482, ..., 0.09647263, 0.1426712 ,
        0.14321701],
       [0.25237164, 0.03467482, 0.        , ..., 0.12236541, 0.10879315,
        0.10934372],
       ...,
       [0.33149898, 0.09647263, 0.12236541, ..., 0.        , 0.22087732,
        0.22140727],
       [0.23875493, 0.1426712 , 0.10879315, ..., 0.22087732, 0.        ,
        0.0005572 ],
       [0.23951338, 0.14321701, 0.10934372, ..., 0.22140727, 0.0005572 ,
        0.        ]])