In [10]:
# Import pandas and glob
import pandas as pd
import glob

In [12]:
# Glob pattern matching the filtered CSV files that are merged below.
# The location can be overridden through the SALES_CSV_GLOB environment variable
# so the notebook is not tied to one machine's folder layout; when the variable
# is unset, the original path is used unchanged.
import os

file_pattern = os.environ.get(
    'SALES_CSV_GLOB',
    'D:\\Tarun\\UTS\\Subjects\\ADV ML\\output\\*.csv',
)

In [13]:
# glob.glob returns its matches in arbitrary order; sorting them keeps the row
# order of the merged frame (and everything derived from it) stable across runs.
file_list = sorted(glob.glob(file_pattern))
# Fail here with a clear message instead of later, inside pd.concat.
if not file_list:
    raise FileNotFoundError(f'No CSV files match the pattern: {file_pattern}')

In [14]:
# Holds one DataFrame per input CSV file.
dfs = []

In [15]:
# Read every matched CSV file into its own DataFrame; the frames are merged
# into one combined dataset in the next cell.
# The list is rebuilt from scratch so that re-running this cell cannot append
# the same files a second time and silently duplicate rows.
dfs = [pd.read_csv(csv_path) for csv_path in file_list]

In [16]:
# Stack the per-file frames row-wise and renumber the index from zero.
combined_df = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [17]:
combined_df.shape

(1959980, 15)

In [9]:
combined_df.head(5)

Unnamed: 0.1,Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,date,d,event_name,event_type,id,dept_id,cat_id,state_id,sale,sales_revenue
0,0,CA_1,HOBBIES_1_001,11328,8.26,2013-08-08,923,Eid al-Fitr,Religious,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1,HOBBIES,CA,0,0.0
1,1,CA_1,HOBBIES_1_001,11332,8.26,2013-09-02,948,LaborDay,National,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1,HOBBIES,CA,1,8.26
2,2,CA_1,HOBBIES_1_001,11338,8.26,2013-10-14,990,ColumbusDay,National,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1,HOBBIES,CA,0,0.0
3,3,CA_1,HOBBIES_1_001,11338,8.26,2013-10-15,991,EidAlAdha,Religious,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1,HOBBIES,CA,0,0.0
4,4,CA_1,HOBBIES_1_001,11340,8.26,2013-10-31,1007,Halloween,Cultural,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1,HOBBIES,CA,0,0.0


In [10]:
combined_df.dtypes

Unnamed: 0         int64
store_id          object
item_id           object
wm_yr_wk           int64
sell_price       float64
date              object
d                  int64
event_name        object
event_type        object
id                object
dept_id           object
cat_id            object
state_id          object
sale               int64
sales_revenue    float64
dtype: object

In [11]:
# Keep only the rows whose day number `d` is at most 1541; data beyond that day is not needed.
combined_df = combined_df.loc[combined_df['d'].le(1541)]

In [12]:
combined_df.shape

(1959980, 15)

In [49]:
# Lift pandas' display limits so that neither columns nor rows are truncated in outputs.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [45]:
# Collect the rows whose date is later than 2015-04-18; an empty result means there are none.
combined_df_temp = combined_df.loc[combined_df['date'].gt('2015-04-18')]

In [47]:
combined_df_temp.shape

(0, 15)

In [18]:
# Build the working frame from only the columns required for modelling.
df = combined_df.loc[:, ['store_id', 'item_id', 'date', 'sales_revenue']]

In [32]:
df.head(5)

Unnamed: 0,store_id,item_id,date,sales_revenue
0,CA_1,HOBBIES_1_001,2013-08-08,0.0
1,CA_1,HOBBIES_1_001,2013-09-02,8.26
2,CA_1,HOBBIES_1_001,2013-10-14,0.0
3,CA_1,HOBBIES_1_001,2013-10-15,0.0
4,CA_1,HOBBIES_1_001,2013-10-31,0.0


In [33]:
# Replace the `date` column with its datetime version, working on a fresh copy of the frame.
df = df.assign(date=pd.to_datetime(df['date']))

In [11]:
df.isna().sum()

store_id         0
item_id          0
date             0
sales_revenue    0
dtype: int64

In [34]:
# Derive separate year, month and day-of-month columns from the datetime column.
for date_part in ('year', 'month', 'day'):
    df[date_part] = getattr(df['date'].dt, date_part)

In [35]:
# Set up label encoding for the store_id and item_id features: one encoder per column.
from sklearn.preprocessing import LabelEncoder
le_store_id, le_item_id = LabelEncoder(), LabelEncoder()

In [36]:
# Fit each encoder on its identifier column and store the resulting codes in a new column.
for encoder, raw_column, encoded_column in (
    (le_store_id, 'store_id', 'le_store_id'),
    (le_item_id, 'item_id', 'le_item_id'),
):
    df[encoded_column] = encoder.fit_transform(df[raw_column])

In [55]:
df.head(5)

Unnamed: 0,sales_revenue,year,month,day,le_store_id,le_item_id
0,0.0,2013,8,8,0,1437
1,8.26,2013,9,2,0,1437
2,0.0,2013,10,14,0,1437
3,0.0,2013,10,15,0,1437
4,0.0,2013,10,31,0,1437


In [38]:
# Map every original store_id to its encoded value; the service API uses this
# to translate request parameters into model features.
# LabelEncoder encodes each label as its position in `classes_`, so the mapping
# is built from the fitted classes instead of zipping every row of df. This also
# keeps the cell re-runnable after the raw store_id column has been dropped.
encoding_map_store_id = {
    str(label): code for code, label in enumerate(le_store_id.classes_)
}

In [41]:
encoding_map_store_id['CA_3']

2

In [42]:
# Map every original item_id to its encoded value; the service API uses this
# to translate request parameters into model features.
# LabelEncoder encodes each label as its position in `classes_`, so the mapping
# is built from the fitted classes instead of zipping every row of df. This also
# keeps the cell re-runnable after the raw item_id column has been dropped.
encoding_map_item_id = {
    str(label): code for code, label in enumerate(le_item_id.classes_)
}

In [43]:
encoding_map_item_id['HOBBIES_1_001']

1437

In [44]:
# Remove the raw identifier and date columns now that their encoded/derived versions exist.
df = df.drop(columns=['store_id', 'item_id', 'date'])

In [45]:
# Separate the target column (sales_revenue) from the predictor columns.
df_Y_train = df['sales_revenue']
df_X_train = df.drop(columns=['sales_revenue'])

In [46]:
# Split the data for modelling: 80% of the rows for training, the rest for evaluation.
# random_state is pinned so the same split, and therefore the same metrics, is
# produced on every run; the rows are still shuffled before splitting.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    df_X_train, df_Y_train, train_size=0.8, random_state=42
)
X_test.head(5)

Unnamed: 0,year,month,day,le_store_id,le_item_id
975799,2012,11,11,3,2061
1898079,2014,10,13,9,1755
1397088,2014,4,22,6,477
575586,2015,4,11,2,257
1179115,2013,10,15,4,2902


In [22]:
# Train a Random Forest regressor on the dataset and evaluate it with the MSE and the R^2 score.
from sklearn.ensemble import RandomForestRegressor

In [23]:
# Random forest with 100 trees and a fixed seed.
rf_model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)

In [24]:
# Train the forest on the training split.
rf_model.fit(X=X_train, y=Y_train)

In [25]:
# Predict revenue for the held-out rows.
predictions = rf_model.predict(X=X_test)

In [26]:
from sklearn.metrics import mean_squared_error

In [27]:
# Mean squared error between the held-out targets and the model's predictions.
mse = mean_squared_error(y_true=Y_test, y_pred=predictions)

In [28]:
mse

51.14164121132908

In [29]:
# Score the fitted model on the held-out split.
rf_model.score(X=X_test, y=Y_test)

0.4640421804867597

In [83]:
# pip install fastapi

Note: you may need to restart the kernel to use updated packages.


In [85]:
# pip install "uvicorn[standard]"




In [87]:
# pip install uvicorn

Note: you may need to restart the kernel to use updated packages.


In [1]:
from fastapi import FastAPI

In [2]:
app = FastAPI()

In [3]:
# Root endpoint: a hello-world response used to check that the FastAPI app is running.
@app.get("/")
def read_root():
    """Return a fixed greeting payload."""
    greeting = {"Hello": "World"}
    return greeting

In [22]:
import datetime

In [58]:
# Service endpoint: accepts store_id, item_id and date as input parameters and
# returns the model's prediction for those values.
@app.get("/predict_revenue")
def predict_revenue(
    store_id: str,
    item_id: str,
    date: str
):
    """Predict the sales revenue for one item in one store on one date.

    Args:
        store_id: Store identifier as it appeared in the training data (e.g. 'CA_3').
        item_id: Item identifier as it appeared in the training data
            (e.g. 'HOBBIES_1_001').
        date: Calendar date formatted as YYYY-MM-DD.

    Returns:
        A one-element list containing the predicted revenue as a plain float.
        A list is returned instead of the raw numpy array so that FastAPI can
        serialise the response to JSON.

    Raises:
        HTTPException: 422 if `date` is not a valid YYYY-MM-DD date; 404 if
            `store_id` or `item_id` was not seen when the encoders were fitted.
    """
    from fastapi import HTTPException

    # Reject malformed dates with a client error instead of an unhandled ValueError.
    try:
        datetime_object = datetime.datetime.strptime(date, '%Y-%m-%d')
    except ValueError as exc:
        raise HTTPException(
            status_code=422,
            detail=f"Invalid date '{date}'; expected format YYYY-MM-DD.",
        ) from exc

    # Unknown identifiers have no encoded value, so the model cannot score them.
    if store_id not in encoding_map_store_id:
        raise HTTPException(status_code=404, detail=f"Unknown store_id '{store_id}'.")
    if item_id not in encoding_map_item_id:
        raise HTTPException(status_code=404, detail=f"Unknown item_id '{item_id}'.")

    # The model was fitted on a DataFrame, so pass a frame with the same column
    # names in the same order as the training features.
    features = pd.DataFrame(
        [[
            datetime_object.year,
            datetime_object.month,
            datetime_object.day,
            encoding_map_store_id[store_id],
            encoding_map_item_id[item_id],
        ]],
        columns=['year', 'month', 'day', 'le_store_id', 'le_item_id'],
    )

    pred = rf_model.predict(features)
    # Convert the numpy array to native Python floats for JSON encoding.
    return pred.tolist()

In [63]:
print(predict_revenue('WI_3', 'HOUSEHOLD_1_201', '2015-04-12'))

[2.27132452]


