In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the whole input tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/competitive-data-science-predict-future-sales/items.csv
/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv
/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv
/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv
/kaggle/input/competitive-data-science-predict-future-sales/shops.csv
/kaggle/input/competitive-data-science-predict-future-sales/test.csv


This section imports the necessary libraries:
* `pandas` for data manipulation.
* `numpy` for numerical operations.
* `matplotlib` for plotting graphs and other visualisations.

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

This code loads the training and test datasets from CSV files into pandas DataFrames.

In [5]:
# Read the sales history and the test set from the competition input folder.
train = pd.read_csv(
    "/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv"
)
test = pd.read_csv(
    "/kaggle/input/competitive-data-science-predict-future-sales/test.csv"
)

In [8]:
# Keep only the first 100000 rows of each frame.
# NOTE(review): the `test` output displayed in this notebook has more rows
# than this keeps, so it was apparently rendered before this cell ran --
# confirm whether the subsample is meant to stay for the final run.
train = train.head(100000)
test = test.head(100000)

In [5]:
# Display the test frame (columns: ID, shop_id, item_id).
test

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268
...,...,...,...
214195,214195,45,18454
214196,214196,45,16188
214197,214197,45,15757
214198,214198,45,19648


In [6]:
# Read the sample submission file (columns: ID, item_cnt_month).
sample_submission = pd.read_csv(
    "/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv"
)

In [7]:
# Display the sample submission frame (columns: ID, item_cnt_month).
sample_submission

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5
...,...,...
214195,214195,0.5
214196,214196,0.5
214197,214197,0.5
214198,214198,0.5


### PREPROCESSING THE DATASET

#### 1.Convert Date to Datetime Format
This converts the `date` column in the training set to a datetime format, making it easier to extract date-related features.

In [23]:
# Parse the day.month.year strings in `date` into pandas datetimes.
train['date'] = pd.to_datetime(
    train['date'],
    format='%d.%m.%Y',
)

#### 2. Create New Features
These lines create new columns in the training DataFrame for the month, year, and day extracted from the `date` column.

In [24]:
# Derive calendar columns from the parsed `date` column, one per date part.
for part in ('month', 'year', 'day'):
    train[part] = getattr(train['date'].dt, part)

#### 3. Aggregate to Monthly Sales
This code aggregates daily sales to monthly sales:

* `groupby` groups the data by `date_block_num` (which represents the month), `shop_id`, and `item_id`.
* `agg` applies aggregation functions: summing up `item_cnt_day` to get monthly sales and averaging `item_price` to get the `mean price`.
* `reset_index` flattens the grouped data back into a DataFrame.

In [25]:
# Collapse the daily rows to one row per (month block, shop, item):
# total units sold and the average price over that block.
monthly_sales = (
    train
    .groupby(['date_block_num', 'shop_id', 'item_id'])
    .agg(
        item_cnt_day=('item_cnt_day', 'sum'),
        item_price=('item_price', 'mean'),
    )
    .reset_index()
)

#### 4.Rename the Aggregated Column
This renames the aggregated item_cnt_day column to item_cnt_month for clarity.

In [26]:
# After summing, the count is per month, so give the column a matching name.
monthly_sales = monthly_sales.rename(
    columns={'item_cnt_day': 'item_cnt_month'}
)

#### 5.Add Month and Year Features
These lines add month and year features to the monthly_sales DataFrame:

* `date_block_num` % 12 gives the month (0 to 11).
* `date_block_num` // 12 gives the year as an offset (0, 1, 2, …) counted from the first year in the data, not the calendar year.

In [27]:
# Split the running month counter into a month-of-year (0-11) and a
# year offset (0, 1, 2, ...).
block_number = monthly_sales['date_block_num']
monthly_sales['month'] = block_number.mod(12)
monthly_sales['year'] = block_number.floordiv(12)

#### 6. Add Lag Features
This function creates lag features:

* `tmp` is a temporary DataFrame with the columns of interest.
* For each lag `i` in `lags`, it copies `tmp`, renames `col` to `<col>_lag_<i>`, and adds `i` to `date_block_num` so that each month's value lines up with the month `i` steps later.
* The shifted DataFrame is merged back into the original DataFrame `df`.

In [None]:
# Add lag features
def lag_feature(df, lags, col):
    """Return `df` with one extra column per lag holding earlier values of `col`.

    For every lag ``i`` a column named ``<col>_lag_<i>`` is added. It holds
    the value `col` had for the same ``shop_id``/``item_id`` pair ``i``
    ``date_block_num`` steps earlier. Rows without a matching earlier record
    get NaN, because the join is a left merge.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_block_num``, ``shop_id``, ``item_id`` and `col`.
    lags : iterable of int
        Offsets, in ``date_block_num`` steps, to look back.
    col : str
        Name of the column to lag.

    Returns
    -------
    pandas.DataFrame
        A new frame with the lag columns appended; the input is not modified.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    tmp = df[keys + [col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = keys + [col + '_lag_' + str(i)]
        # Moving the block number forward by i makes the row from block t
        # join onto block t + i, i.e. it becomes that block's lag-i value.
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=keys, how='left')
    # The original returned the undefined name `d`, which raised NameError.
    return df

#### 7.Create Lag Features
This applies the lag_feature function to create lag features for `item_cnt_month` with lags of 1, 2, and 3 months.

In [28]:
# Attach the previous 1, 2 and 3 months of sales as extra columns.
monthly_sales = lag_feature(
    monthly_sales,
    lags=[1, 2, 3],
    col='item_cnt_month',
)

#### 8.Fill NaN Values
This fills any NaN values in the `monthly_sales` DataFrame with 0.

In [29]:
# Replace every NaN in the frame with 0.
monthly_sales = monthly_sales.fillna(0)

#### 9.Define the Features and Target
These lines define:

* `features`: The list of feature columns used for training the model.
* `target`: The target variable that we want to predict.


In [30]:
# Model inputs: identifiers, price, calendar position, then the three lag columns.
features = ['shop_id', 'item_id', 'item_price', 'month', 'year'] + [
    f'item_cnt_month_lag_{k}' for k in (1, 2, 3)
]
# Column the model is trained to predict.
target = 'item_cnt_month'

#### 10. Split the Data
This splits the data into training and validation sets based on `date_block_num`:

* Training data includes months 0 to 32.
* Validation data includes month 33.

In [31]:
# Time-based split: blocks before 33 are used for fitting, block 33 for validation.
in_train_blocks = monthly_sales['date_block_num'] < 33
in_valid_block = monthly_sales['date_block_num'] == 33

X_train = monthly_sales.loc[in_train_blocks, features]
y_train = monthly_sales.loc[in_train_blocks, target]
X_valid = monthly_sales.loc[in_valid_block, features]
y_valid = monthly_sales.loc[in_valid_block, target]

#### 11.Prepare Test Data
This adds `month` and `year` features to the test DataFrame.

In [32]:
# Block 34 is one past the last block used for validation (33); encode it
# with the same month-of-year / year-offset scheme as `monthly_sales`.
test_year, test_month = divmod(34, 12)
test['month'] = test_month
test['year'] = test_year

#### 12. Merge with Monthly Sales to Add Lag Features
These lines prepare the test data:

* Merge the test DataFrame with `monthly_sales` to get `item_price`.
* Add the lag features: the `item_cnt_month` values from 1, 2, and 3 months before the test month.
* Fill NaN values with 0.
* Define `X_test` with the selected features.

In [None]:
# The test frame has no sales history of its own, so every feature is
# looked up from `monthly_sales`. Block 34 is the month being predicted.
test['date_block_num'] = 34

# Use one price per (shop, item). Merging `monthly_sales` directly would
# repeat each test row once per month of history and break the 1:1
# alignment between X_test rows and the test IDs.
avg_price = (
    monthly_sales
    .groupby(['shop_id', 'item_id'], as_index=False)['item_price']
    .mean()
)
test = pd.merge(test, avg_price, on=['shop_id', 'item_id'], how='left')

# `test` has no `item_cnt_month` column to lag, so the lags come from the
# history: lag k for block 34 is the pair's sales in block 34 - k. Shift
# the history forward by k and join on the shifted block number.
sales_history = monthly_sales[['date_block_num', 'shop_id', 'item_id', 'item_cnt_month']]
for lag in [1, 2, 3]:
    shifted = sales_history.rename(
        columns={'item_cnt_month': 'item_cnt_month_lag_' + str(lag)}
    )
    shifted['date_block_num'] += lag
    test = pd.merge(
        test, shifted,
        on=['date_block_num', 'shop_id', 'item_id'],
        how='left',
    )

# Pairs with no price or no sales in the lag months get 0, as in training.
test = test.fillna(0)
X_test = test[features]


### TRAIN THE MODEL