In [211]:
import os
import pandas as pd

In [212]:
# Root directory of the products data. The path is relative, so it is
# resolved against the kernel's current working directory.
# NOTE(review): the name `dir` shadows the Python builtin `dir()` for the
# rest of the session.
dir = '../datalake/curated_zone/products/'

In [213]:
# method to get all files in a directory and its subdirectories

def get_all_files(
        dir: str
) -> pd.DataFrame:
    """
    Load every ``.json`` file below a directory into a single DataFrame.

    The directory tree is traversed in sorted order, so the row order of
    the result is reproducible across runs and platforms. A file that
    cannot be parsed is reported on stdout and skipped.

    Args:
        dir(str): The main directory to search (subdirectories included).
            The parameter name shadows the builtin ``dir`` inside this
            function; it is kept for backward compatibility.

    Return:
        pd.DataFrame: The rows of all readable JSON files, concatenated
        with a fresh 0..n-1 index. An empty DataFrame is returned when no
        JSON file is found or none of them could be read.
    """

    all_dataframes = []

    # Walk through the directory tree
    for root, dirs, files in os.walk(dir):
        # os.walk lists entries in arbitrary filesystem order; sorting
        # `dirs` in place also fixes the order in which it descends.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.json'):
                continue

            file_path = os.path.join(root, file)
            # Assuming each JSON file contains a DataFrame, read it
            try:
                all_dataframes.append(pd.read_json(file_path))
            except Exception as e:
                # Best effort: one unreadable file must not abort the load
                print(f"Error reading {file_path}: {e}")

    # pd.concat raises ValueError when given an empty list
    if not all_dataframes:
        return pd.DataFrame()

    # Concatenate all DataFrames into a single DataFrame
    return pd.concat(all_dataframes, ignore_index=True)

In [214]:
# call the method to load all JSON files found under the products directory
df = get_all_files(dir)

In [215]:
# show the data type of each column of the dataframe
df.dtypes

id                        int64
title                    object
price                   float64
description              object
category                 object
image                    object
rating.rate             float64
rating.count              int64
load_date                object
category_tokenize        object
title_tokenize           object
description_tokenize     object
category_stemm           object
title_stemm              object
description_stemm        object
dtype: object

In [216]:
# list the distinct load_date values, newest first
load_date_values = df['load_date'].unique()
unique_load_dates_df = sorted(load_date_values, reverse=True)

# wrap the sorted values in a one-column frame for a tabular printout
load_dates_table = pd.DataFrame({'Unique Load Dates': unique_load_dates_df})
print(load_dates_table)

  Unique Load Dates
0        2024-01-21


In [217]:
# replace the dots in the rating column names with underscores
renamed_columns = {
    'rating.rate': 'rating_rate',
    'rating.count': 'rating_count',
}
df = df.rename(columns=renamed_columns)

In [218]:
# drop the columns that are not needed
# (re-running this cell once they are gone raises KeyError)
df = df.drop(columns=['category', 'title', 'description'])

In [219]:
# rearrange attributes: load_date first, then id and the numeric columns,
# then the tokenized and the stemmed text columns.
# Any column not listed here (e.g. 'image') is dropped by this selection.
column_order = [
    'load_date',
    'id',
    'price',
    'rating_rate',
    'rating_count',
    'category_tokenize',
    'title_tokenize',
    'description_tokenize',
    'category_stemm',
    'title_stemm',
    'description_stemm',
]
df = df[column_order]

In [220]:
# preview the first 5 rows of the dataframe (head() defaults to n=5)
df.head()

Unnamed: 0,load_date,id,price,rating_rate,rating_count,category_tokenize,title_tokenize,description_tokenize,category_stemm,title_stemm,description_stemm
0,2024-01-21,1,109.95,3.9,120,"[men, clothing]","[fjallraven, foldsack, backpack, fits, laptops]","[perfect, pack, everyday, use, walks, forest, ...","[men, cloth]","[fjallraven, foldsack, backpack, fit, laptop]","[perfect, pack, everyday, use, walk, forest, s..."
1,2024-01-21,2,22.3,4.1,259,"[men, clothing]","[mens, casual, premium, slim, fit]","[style, contrast, raglan, long, sleeve, henley...","[men, cloth]","[men, casual, premium, slim, fit]","[style, contrast, raglan, long, sleev, henley,..."
2,2024-01-21,3,55.99,4.7,500,"[men, clothing]","[mens, cotton, jacket]","[great, outerwear, jackets, suitable, many, oc...","[men, cloth]","[men, cotton, jacket]","[great, outerwear, jacket, suitabl, mani, occa..."
3,2024-01-21,4,15.99,2.1,430,"[men, clothing]","[mens, casual, slim, fit]","[color, could, slightly, different, screen, pr...","[men, cloth]","[men, casual, slim, fit]","[color, could, slightli, differ, screen, pract..."
4,2024-01-21,5,695.0,4.6,400,[jewelery],"[john, hardy, women, legends, naga, gold, silv...","[legends, collection, naga, inspired, mythical...",[jeweleri],"[john, hardi, women, legend, naga, gold, silve...","[legend, collect, naga, inspir, mythic, water,..."


In [221]:
# preview the last 5 rows of the dataframe (tail() defaults to n=5)
df.tail()

Unnamed: 0,load_date,id,price,rating_rate,rating_count,category_tokenize,title_tokenize,description_tokenize,category_stemm,title_stemm,description_stemm
15,2024-01-21,16,29.95,2.9,340,"[women, clothing]","[lock, love, women, removable, hooded, faux, l...","[polyurethane, shell, polyester, lining, polye...","[women, cloth]","[lock, love, women, remov, hood, faux, leather...","[polyurethan, shell, polyest, line, polyest, c..."
16,2024-01-21,17,39.99,3.8,679,"[women, clothing]","[rain, jacket, women, windbreaker, striped, cl...","[lightweight, perfet, trip, casual, wear, slee...","[women, cloth]","[rain, jacket, women, windbreak, stripe, climb...","[lightweight, perfet, trip, casual, wear, slee..."
17,2024-01-21,18,9.85,4.7,130,"[women, clothing]","[mbj, women, solid, short, sleeve, boat, neck, v]","[rayon, spandex, made, usa, imported, bleach, ...","[women, cloth]","[mbj, women, solid, short, sleev, boat, neck, v]","[rayon, spandex, made, usa, import, bleach, li..."
18,2024-01-21,19,7.95,4.5,146,"[women, clothing]","[opna, women, short, sleeve, moisture]","[polyester, machine, wash, cationic, polyester...","[women, cloth]","[opna, women, short, sleev, moistur]","[polyest, machin, wash, cation, polyest, inter..."
19,2024-01-21,20,12.99,3.6,145,"[women, clothing]","[danvouy, womens, shirt, casual, cotton, short]","[spandex, features, casual, short, sleeve, let...","[women, cloth]","[danvouy, women, shirt, casual, cotton, short]","[spandex, featur, casual, short, sleev, letter..."


In [222]:
# count the missing values in each column
df.isna().sum()

load_date               0
id                      0
price                   0
rating_rate             0
rating_count            0
category_tokenize       0
title_tokenize          0
description_tokenize    0
category_stemm          0
title_stemm             0
description_stemm       0
dtype: int64