In [138]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error


In [139]:
# Load the train and test frames: "id" is used as the row index and the
# "date" column is parsed into datetimes so the .dt accessors work later on.
train = pd.read_csv(
    "datasets/train.csv.zip",
    compression="zip",
    index_col="id",
    parse_dates=["date"],
)
test = pd.read_csv(
    "datasets/test.csv",
    index_col="id",
    parse_dates=["date"],
)


In [140]:
# Oil price table indexed by (parsed) date. The raw "dcoilwtico" column is
# re-exposed under the friendlier name "price" and the original is removed.
oil_raw = pd.read_csv(
    "datasets/oil.csv",
    index_col="date",
    parse_dates=["date"],
)
oil = oil_raw.assign(price=oil_raw["dcoilwtico"]).drop(columns="dcoilwtico")


In [141]:
# Fill missing oil prices with the next available quote (backward fill).
# The result is assigned back explicitly: the previous chained in-place call
# (`oil.price.fillna(..., inplace=True)`) operates on a temporary Series under
# pandas Copy-on-Write and leaves `oil` unchanged, and `fillna(method=...)`
# is deprecated in favour of `.bfill()`.
oil["price"] = oil["price"].bfill()
oil.head()

Unnamed: 0_level_0,price
date,Unnamed: 1_level_1
2013-01-01,93.14
2013-01-02,93.14
2013-01-03,92.97
2013-01-04,93.12
2013-01-07,93.2


In [142]:
# Preview the first rows of the training frame.
train.head()


Unnamed: 0_level_0,date,store_nbr,family,sales,onpromotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,2013-01-01,1,BABY CARE,0.0,0
2,2013-01-01,1,BEAUTY,0.0,0
3,2013-01-01,1,BEVERAGES,0.0,0
4,2013-01-01,1,BOOKS,0.0,0


In [143]:
# Preview the first rows of the test frame (same columns as train, minus "sales").
test.head()


Unnamed: 0_level_0,date,store_nbr,family,onpromotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3000888,2017-08-16,1,AUTOMOTIVE,0
3000889,2017-08-16,1,BABY CARE,0
3000890,2017-08-16,1,BEAUTY,2
3000891,2017-08-16,1,BEVERAGES,20
3000892,2017-08-16,1,BOOKS,0


In [144]:
# Build the integer encoder for the product "family" labels.
# It is fitted on the training labels only and reused later to transform
# both the train and the test frames.
encoder = LabelEncoder()
encoder.fit(train["family"])

LabelEncoder()

In [151]:
# Feature engineering for the training frame.
# `start_date` is the origin for the running day counter and is reused by the
# test-set feature cell.
start_date = datetime(2013, 1, 1)

# Calendar features derived from the parsed "date" column.
train["time"] = (train["date"] - start_date).dt.days
train["day_of_week"] = train["date"].dt.dayofweek
train["day_of_month"] = train["date"].dt.day
train["month"] = train["date"].dt.month

# Integer code of the product family (encoder fitted in an earlier cell).
train["category"] = encoder.transform(train["family"])

# Drop any "price" column left over from a previous run of this cell so the
# join below does not produce a suffixed duplicate; then attach the oil price
# for each row's date.
train = train.drop(columns=["price"], errors="ignore")
train = train.join(oil, on="date", how="left", rsuffix="oil")

# Dates without an oil quote come back as NaN after the left join.
# Back-fill from the next quoted date (same policy as the oil table), then
# forward-fill so rows after the last quote cannot remain NaN and break the fit.
# Assigning the result replaces the chained in-place `fillna(method=...)`,
# which is deprecated and is a no-op under pandas Copy-on-Write.
train["price"] = train["price"].bfill().ffill()


In [146]:
# Split the training frame into model inputs and target.
# The target, the raw date and the raw family label are excluded from the
# feature matrix; everything else is used as a feature.
non_feature_columns = ["sales", "date", "family"]
X = train.drop(columns=non_feature_columns)
y = train["sales"]

# Hold out the last 20% of rows for validation; shuffle=False keeps the
# original row order, so the validation block is the tail of the frame.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [147]:
# Alternative model kept for reference:
# model = RandomForestRegressor(n_estimators=50, max_depth=20, n_jobs=-1)

# Plain ordinary-least-squares baseline.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2 (passing it raises TypeError there). For an unregularised linear
# regression the predictions do not depend on feature scaling, so it is
# simply dropped rather than replaced by a scaler.
model = LinearRegression(n_jobs=-1)
model.fit(X_train, y_train)


LinearRegression(n_jobs=-1, normalize=True)

In [148]:
# Score the model on the hold-out split.
y_pred = model.predict(X_val)

# mean_squared_error returns the *mean squared* error; it was previously stored
# under the name `rmse`. Keep the two quantities apart: `mse` is in squared
# sales units, `rmse` is its square root (same units as `sales`).
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

# Reference score from an earlier experiment. It has always been compared
# against the mean_squared_error output, so it is presumably an MSE — confirm.
best = 202496.51205666756
print(f"Error is {mse}, which is {'worse' if best<mse else 'better'}")


Error is 1197263.9740255235, which is worse


In [149]:
# Feature engineering for the test frame — mirrors the training cell, with the
# columns created in the same order as there.
test["time"] = (test["date"] - start_date).dt.days
test["day_of_week"] = test["date"].dt.dayofweek
test["day_of_month"] = test["date"].dt.day
test["month"] = test["date"].dt.month
test["category"] = encoder.transform(test["family"])

# Drop any "price" column left over from a previous run of this cell, then
# attach the oil price for each row's date.
test = test.drop(columns=["price"], errors="ignore")
test = test.join(oil, on="date", how="left", rsuffix="oil")

# Same gap policy as for train: back-fill, then forward-fill so no NaN is left.
# Assigning the result replaces the chained in-place `fillna(method=...)`,
# which is deprecated and is a no-op under pandas Copy-on-Write.
test["price"] = test["price"].bfill().ffill()

# Select the features in exactly the order the model was fitted on.
# Previously "category" sat in a different position than in the training
# matrix, so a positional model would read the wrong columns (and newer
# scikit-learn rejects the frame outright). Indexing by X_train.columns also
# fails loudly with a KeyError if a training feature is missing here.
X_test = test.drop(columns=["date", "family"])[X_train.columns]

In [150]:
# Predict the test period and write the submission file.
# A linear model can extrapolate below zero, but sales cannot be negative;
# clipping at 0 keeps every prediction in the feasible range.
y_test = np.clip(model.predict(X_test), 0, None)

# The frame reuses test's index so each prediction is keyed by its row id.
submission = pd.DataFrame({"sales": y_test}, index=test.index)
submission.to_csv("datasets/submission.csv")
