In [3]:
import sys
import os
from pathlib import Path

# Make the `src` package importable: append the parent of the current working
# directory to the module search path so the `from src...` imports in the
# next cell can be resolved.
# NOTE: despite its name, `src_dir` holds that parent directory, not `src/` itself.
current_dir = os.getcwd()
src_dir = os.path.abspath(os.path.join(current_dir, os.pardir))

# Re-running the cell must not add the same entry twice.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [4]:
import pandas as pd
import polars as pl
from src.utils.config_loader import config_loader
from src.data_ingestion.loader import DataLoader
import seaborn as sns
import matplotlib.pyplot as plt


# Pandas display settings: no cap on the number of columns or rows shown,
# and wide frames are not wrapped across multiple lines.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.expand_frame_repr = False

In [5]:
# ROOTPATH is the parent of the current working directory; the raw and
# processed data locations are looked up via config_loader.get_data_paths()
# and joined onto it.
ROOTPATH = Path.cwd().parent
RAW_DATA_PATH = ROOTPATH.joinpath(config_loader.get_data_paths()['raw_data_path'])
PROCESSED_DATA_PATH = ROOTPATH.joinpath(config_loader.get_data_paths()['processed_data_path'])

# Dataset Setup: Load, Preprocess, and Save

In [6]:
# Instantiate the project's DataLoader (imported from src.data_ingestion.loader)
dataloader = DataLoader()

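Run the next three cells only once, to build `preprocessed.parquet` from the raw CSV; they are left commented out so that re-running the notebook skips them.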

In [7]:
# # Load the raw .csv
# df = dataloader.load_raw_data(RAW_DATA_PATH / 'UserBehavior.csv')

In [8]:
# # Preprocess the raw DataFrame
# df = dataloader.preprocess_data(df)

In [9]:
# # Save preprocessed DataFrame in the format of .parquet
# dataloader.save_preprocessed_parquet(
#     df=df,
#     folder_path=PROCESSED_DATA_PATH,
#     filename='preprocessed.parquet'
# )

In [10]:
# Read the preprocessed dataset from parquet into a polars DataFrame.
df = pl.read_parquet(PROCESSED_DATA_PATH / 'preprocessed.parquet')
# Pandas equivalent, kept for reference:
# df = pd.read_parquet(PROCESSED_DATA_PATH / 'preprocessed.parquet')

# Exploratory Data Analysis

### (a) Data Inspection

In [11]:
# Preview the first rows to check the column names and dtypes.
df.head()

user_id,item_id,category_id,behavior_type,timestamp,datetime,hour,day_of_week,date
i64,i64,i64,str,i64,datetime[ns],i32,i32,date
1,2576651,149192,"""pv""",1511572885,2017-11-25 01:21:25,1,5,2017-11-25
1,3830808,4181361,"""pv""",1511593493,2017-11-25 07:04:53,7,5,2017-11-25
1,4365585,2520377,"""pv""",1511596146,2017-11-25 07:49:06,7,5,2017-11-25
1,4606018,2735466,"""pv""",1511616481,2017-11-25 13:28:01,13,5,2017-11-25
1,230380,411153,"""pv""",1511644942,2017-11-25 21:22:22,21,5,2017-11-25


### (b) Summary Statistics

In [12]:
# Per-column summary statistics: count, null count, mean, std, min/max and quartiles.
df.describe()

statistic,user_id,item_id,category_id,behavior_type,timestamp,datetime,hour,day_of_week,date
str,f64,f64,f64,str,f64,str,f64,f64,str
"""count""",98914533.0,98914533.0,98914533.0,"""98914533""",98914533.0,"""98914533""",98914533.0,98914533.0,"""98914533"""
"""null_count""",0.0,0.0,0.0,"""0""",0.0,"""0""",0.0,0.0,"""0"""
"""mean""",506866.401576,2579800.0,2696600.0,,1512000000.0,"""2017-11-29 14:36:41.987002""",9.530052,3.682862,"""2017-11-29 04:34:52.154000"""
"""std""",294069.227894,1488100.0,1463200.0,,227051.898454,,5.505344,2.02503,
"""min""",1.0,1.0,80.0,"""buy""",1511600000.0,"""2017-11-25 00:00:00""",0.0,0.0,"""2017-11-25"""
"""25%""",252355.0,1295244.0,1320293.0,,1511800000.0,"""2017-11-27 07:35:16""",5.0,2.0,"""2017-11-27"""
"""50%""",503892.0,2580632.0,2671397.0,,1512000000.0,"""2017-11-29 14:53:26""",10.0,4.0,"""2017-11-29"""
"""75%""",760856.0,3862029.0,4145813.0,,1512200000.0,"""2017-12-02 02:15:28""",14.0,5.0,"""2017-12-02"""
"""max""",1018011.0,5163070.0,5162429.0,"""pv""",1512300000.0,"""2017-12-03 23:52:41""",23.0,6.0,"""2017-12-03"""


### (c) Null Values