In [1]:
import pandas as pd
import os
from os import path as osp
from glob import glob

In [2]:
# Display the absolute working directory; the relative paths configured below resolve against it.
osp.abspath(osp.curdir)

'/Users/uyen/simcel/simcel/exploration'

In [3]:
# Project root, relative to the working directory shown above -- change this if the notebook moves.
ROOT_DIR = '..'
# NOTE: DATA_PATH is reassigned to <ROOT_DIR>/data in the data-loading cell further down.
DATA_PATH = osp.join(ROOT_DIR, 'data', 'raw')

# Echo both settings, one per line, so the configuration is visible in the output.
print(ROOT_DIR, DATA_PATH, sep='\n')

..
../data/raw


### Read raw data

In [15]:
# NOTE: this overrides the DATA_PATH defined in the configuration cell (<ROOT_DIR>/data/raw).
# The CSV is read from, and the later cells write their outputs to, <ROOT_DIR>/data.
DATA_PATH = osp.join(ROOT_DIR, 'data')

# Absolute path of the raw training CSV.
raw_data_file = osp.abspath(osp.join(DATA_PATH, 'simcel-6pk70-1jk5iqdp-train_v9rqX0R.csv'))
df = pd.read_csv(raw_data_file)

# Preview five random rows; the fixed seed keeps the preview identical across re-runs.
df.sample(5, random_state=0)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
8182,DRF27,8.93,Low Fat,0.028533,Dairy,151.434,OUT018,2009,Medium,Tier 3,Supermarket Type2,1225.072
7631,FDH45,15.1,Regular,0.105667,Fruits and Vegetables,41.6796,OUT046,1997,Small,Tier 1,Supermarket Type1,495.3552
4747,FDG33,5.365,Regular,0.140458,Seafood,169.7764,OUT049,1999,Medium,Tier 1,Supermarket Type1,3263.7516
3141,FDH16,10.5,Low Fat,0.052637,Frozen Foods,88.583,OUT049,1999,Medium,Tier 1,Supermarket Type1,808.947
7991,FDC60,5.425,Regular,0.114472,Baking Goods,88.3514,OUT046,1997,Small,Tier 1,Supermarket Type1,2833.6448


In [5]:
# Number of rows in the raw dataset.
df.shape[0]

8523

In [6]:
# `print()` returns None, so `print(df.columns), len(df.columns)` evaluated to the
# tuple `(None, 12)`. Print the column index, then let the cell's last expression
# display the column count on its own.
print(df.columns)
len(df.columns)

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')


(None, 12)

In [28]:
# Summary statistics restricted to the floating-point columns.
df.select_dtypes(include=['float']).describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,2181.288914
std,4.643456,0.051598,62.275067,1706.499616
min,4.555,0.0,31.29,33.29
25%,8.77375,0.026989,93.8265,834.2474
50%,12.6,0.053931,143.0128,1794.331
75%,16.85,0.094585,185.6437,3101.2964
max,21.35,0.328391,266.8884,13086.9648


In [29]:
# Summary statistics restricted to the integer columns.
df.select_dtypes(include=['int']).describe()

Unnamed: 0,Outlet_Establishment_Year
count,8523.0
mean,1997.831867
std,8.37176
min,1985.0
25%,1987.0
50%,1999.0
75%,2004.0
max,2009.0


In [30]:
# Count / unique / top / freq for the object (string) columns.
df.select_dtypes(include=['object']).describe()

Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
count,8523,8523,8523,8523,6113,8523,8523
unique,1559,5,16,10,3,3,4
top,FDW13,Low Fat,Fruits and Vegetables,OUT027,Medium,Tier 3,Supermarket Type1
freq,10,5089,1232,935,2793,3350,5577


In [7]:
# Every column name present in the raw data.
attrs = df.columns.to_numpy()

# Columns whose distinct values are inspected in the preprocessing step below.
# NOTE(review): Outlet_Size and Outlet_Location_Type also look categorical but are not
# listed here -- confirm whether that is intentional.
categorized_attrs = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Establishment_Year',
    'Outlet_Type',
]

# Columns exported to outlets.csv.
outlet_attrs = [
    'Outlet_Identifier',
    'Outlet_Establishment_Year',
    'Outlet_Size',
    'Outlet_Type',
    'Outlet_Location_Type',
]

# Columns exported to items.csv.
item_attrs = [
    'Item_Identifier',
    'Item_Weight',
    'Item_Fat_Content',
    'Item_Type',
]

# Remaining columns, which belong to neither the item list nor the outlet list.
cross_attrs = [
    'Item_MRP',
    'Item_Visibility',
    'Item_Outlet_Sales',
]

### Preprocessing & ETL

In [41]:
# Check whether every categorical attribute is written in the same format:
# print the distinct values of each one, in the column order of the frame.
columns_to_inspect = [name for name in df.columns if name in categorized_attrs]
for name in columns_to_inspect:
    print(name, df[name].unique())

Item_Fat_Content ['Low Fat' 'Regular' 'low fat' 'LF' 'reg']
Item_Type ['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
Outlet_Establishment_Year [1999 2009 1998 1987 1985 2002 2007 1997 2004]
Outlet_Type ['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']


In [43]:
# Some values are written in several formats, for example 'Low Fat', 'low fat' and 'LF'.
# Map every variant onto one canonical spelling.
#
# The result is assigned back to the column instead of calling
# `df.Item_Fat_Content.replace(..., inplace=True)`: an inplace method on a column
# accessed through the frame is chained assignment, which warns in recent pandas and
# does not update `df` when Copy-on-Write is enabled.
fat_content_aliases = {
    'low fat': 'Low Fat',
    'LF': 'Low Fat',
    'reg': 'Regular',
}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_aliases)

#### Outlets

In [44]:
# Build the outlet table: one row per distinct combination of the outlet columns.
outlets_unique = df.Outlet_Identifier.unique()

# The frame contains only the outlet columns, so de-duplicating whole rows is the same
# as de-duplicating on `outlet_attrs` and cannot drift from that list.
df_outlets = df[outlet_attrs].drop_duplicates()

# Each outlet identifier must map to exactly one attribute combination; more rows than
# identifiers would mean the raw data describes an outlet inconsistently.
assert len(outlets_unique) == len(df_outlets), (
    f'{len(outlets_unique)} outlet identifiers but {len(df_outlets)} distinct outlet rows'
)

outlets_data_path = osp.join(DATA_PATH, 'outlets.csv')
# index=False: the row index carries no information and is not written to the file.
df_outlets.to_csv(outlets_data_path, index=False)

#### Items

In [129]:
# Build the item table: one row per Item_Identifier.
#
# Item_Weight is missing on some rows, so for every item keep the row with the smallest
# non-null weight (NaN sorts last; an item keeps NaN only when none of its rows has a
# weight). This selects the same row as the previous per-group loop, without using
# positional `.iloc` on index labels and without growing a frame via repeated `concat`.
df_items = df[item_attrs]
_dfs = (
    df_items
    .dropna(subset=['Item_Identifier'])  # groupby() skipped rows with a missing key
    .sort_values(by=['Item_Identifier', 'Item_Weight'], na_position='last')
    .drop_duplicates(subset='Item_Identifier', keep='first')
)

# Sanity check: exactly one row per item.
assert _dfs['Item_Identifier'].is_unique, 'duplicate Item_Identifier in item table'

items_data_path = osp.join(DATA_PATH, 'items.csv')
# index=False: the row index carries no information and is not written to the file.
_dfs.to_csv(items_data_path, index=False)

In [130]:
# All attribute names in the raw data, as a NumPy array.
df.columns.to_numpy()

array(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content',
       'Item_Visibility', 'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'], dtype=object)

In [133]:
# Keep only the two identifiers plus the columns listed in `cross_attrs`; the remaining
# item and outlet attributes were written to items.csv and outlets.csv above.
df_cleaned = df[['Item_Identifier', 'Outlet_Identifier', 'Item_Visibility', 'Item_MRP', 'Item_Outlet_Sales']]

# Preview five random rows; the fixed seed keeps the preview identical across re-runs.
df_cleaned.sample(5, random_state=0)

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Visibility,Item_MRP,Item_Outlet_Sales
1280,FDI04,OUT019,0.12766,198.5426,790.9704
7007,FDC16,OUT017,0.020686,85.054,1731.08
3572,FDR51,OUT045,0.173822,151.4708,2407.5328
5089,FDT59,OUT045,0.015944,231.9668,3225.1352
5019,FDX20,OUT018,0.042734,226.972,3848.324


In [134]:
# Write the reduced table next to the other exported CSV files.
cleaned_data_path = osp.join(DATA_PATH, 'simcel.csv')
# index=False (the documented boolean form of the previous index=None) omits the row index.
df_cleaned.to_csv(cleaned_data_path, index=False)

### Let's try stock data

In [204]:
import yfinance as yf

# Create a yfinance Ticker handle for the symbol "nvda".
# NOTE(review): the variable is named `msft` although the requested symbol is "nvda" --
# confirm which ticker is intended and rename accordingly.
msft = yf.Ticker("nvda")
