In [164]:
from google.colab import drive

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
import random
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [165]:
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [166]:
# main_dir = "competitive-data-science-predict-future-sales"
main_dir = "/content/drive/My Drive/predict_future_sales/competitive-data-science-predict-future-sales"

In [167]:
X = np.load(f"{main_dir}/X.npy")
# X = np.load(f"{main_dir}/X_(1).npy")
X = np.delete(X, 1, 1).astype('int')
print(X.shape)
X

(214200, 1, 33)


array([[[0, 0, 0, ..., 1, 3, 1]],

       [[0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 1, 3]],

       ...,

       [[1, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0]]])

In [168]:
Y = np.load(f"{main_dir}/Y.npy")
Y = Y.reshape((214200, 1))
print(Y.shape)
Y

(214200, 1)


array([[0],
       [0],
       [1],
       ...,
       [0],
       [0],
       [0]])

In [196]:
# create train and test sets for X and Y

# train_size = int(len(X) * 0.67) 
# test_size = len(X) - train_size 
# train_x, test_x = X[0:train_size,:], X[train_size:len(X), :]
# train_y, test_y = Y[0:train_size,:], Y[train_size:len(Y), :]

bool_arr = np.random.choice([True, False], size=len(X), replace=True, p=[0.67, 0.33])
inverse_bool_arr = np.invert(bool_arr)

print(bool_arr.shape, bool_arr)
unique, counts = np.unique(bool_arr, return_counts=True)
print(np.asarray((unique, counts)).T)

train_x, test_x = X[bool_arr], X[inverse_bool_arr]
train_y, test_y = Y[bool_arr], Y[inverse_bool_arr]

train_x = train_x.reshape(train_x.shape[0], train_x.shape[1]*train_x.shape[2])
test_x = test_x.reshape(test_x.shape[0], test_x.shape[1]*test_x.shape[2])

(214200,) [ True  True  True ...  True False  True]
[[     0  70473]
 [     1 143727]]


In [197]:
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(143727, 33) (70473, 33) (143727, 1) (70473, 1)


In [172]:
reg = xgb.XGBRegressor(n_estimators=250)

In [198]:
reg.fit(train_x, train_y)



XGBRegressor(n_estimators=250)

In [199]:
predicts = reg.predict(test_x)
predicts = ((predicts.round(0)).reshape(len(predicts), 1)).astype('int')
predicts

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [200]:
unique, counts = np.unique(test_y, return_counts=True)
dict(zip(unique, counts))[0]/len(test_y.flatten())

0.8633803016758191

In [201]:
unique, counts = np.unique(predicts, return_counts=True)
dict(zip(unique, counts))[0]/len(predicts.flatten())

0.9197848821534489

In [202]:
# print('Train Mean Absolute Error:', mean_absolute_error(train_y, predicts))
# print('Train Root Mean Squared Error:',np.sqrt(mean_squared_error(train_y, predicts)))
print('Test Mean Absolute Error:', mean_absolute_error(test_y, predicts))
print('Test Root Mean Squared Error:',np.sqrt(mean_squared_error(test_y, predicts)))

Test Mean Absolute Error: 0.25978743632313084
Test Root Mean Squared Error: 1.8952309738239632


In [203]:
normalized_rmse = np.sqrt(mean_squared_error(test_y, predicts))/(max(test_y)-min(test_y))
normalized_rmse

array([0.01148625])

In [204]:
# combine x and y for normalizing

def combine_x_y(x, y):
    ww = []
    ft = X.shape[1]
    for i in tqdm(range(len(x))):
      if ft == 5:
        ggi = [y[i][0], 0, x[i][2][0], x[i][3][0], x[i][4][0]]
      elif ft == 2: 
        ggi = [y[i][0], 0]
      elif ft == 1:
        ggi = [y[i][0]]
      ww.append(ggi)
    print(np.array(ww).shape)
    ww = np.array(ww, dtype='float32').reshape(214200, ft, 1)
    bg = np.concatenate([x, ww], axis=2)
    return bg

In [205]:
combined = combine_x_y(X, Y)
print(combined.shape)
combined

100%|██████████| 214200/214200 [00:00<00:00, 571962.93it/s]


(214200, 1)
(214200, 1, 34)


array([[[0., 0., 0., ..., 3., 1., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 1., 3., 1.]],

       ...,

       [[1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]]])

In [206]:
# combined_predict = model.predict(combined).round(0)
# combined_predict

In [207]:
# combined_predict.astype('int')

In [208]:
good_game = np.delete(combined, 0, 2)
good_game = good_game.reshape(len(good_game), good_game.shape[1]*good_game.shape[2])
good_game.shape

(214200, 33)

In [220]:
gg = reg.predict(good_game)
gg = ((gg.round(0)).reshape(len(gg), 1)).astype('int')
print(gg.shape)
gg

(214200, 1)


array([[0],
       [0],
       [1],
       ...,
       [0],
       [0],
       [0]])

In [221]:
# Sample submission file
df = pd.read_csv(f"{main_dir}/sample_submission.csv")
df

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5
...,...,...
214195,214195,0.5
214196,214196,0.5
214197,214197,0.5
214198,214198,0.5


In [222]:
df['item_cnt_month'] = gg
df

Unnamed: 0,ID,item_cnt_month
0,0,0
1,1,0
2,2,1
3,3,0
4,4,0
...,...,...
214195,214195,0
214196,214196,0
214197,214197,0
214198,214198,0


In [223]:
sav_path = f"{main_dir}/xg_result.csv"

In [224]:
df.to_csv(sav_path, index=False)

In [225]:
pd.read_csv(sav_path)

Unnamed: 0,ID,item_cnt_month
0,0,0
1,1,0
2,2,1
3,3,0
4,4,0
...,...,...
214195,214195,0
214196,214196,0
214197,214197,0
214198,214198,0


In [226]:
# combined = combined.reshape(len(combined), combined.shape[1]*combined.shape[2])

In [227]:
# combined_predict = reg.predict(combined).round(0)

In [228]:
# cc = combined_predict.astype('int')

In [229]:
# np.save(f"{main_dir}/cc.npy", cc)