In [4]:
import gzip
import shutil
import pandas as pd
import os

In [10]:
def get_review_data(file_id, category):
  """Download one category's review file from Google Drive.

  Args:
    file_id: Google Drive file ID, interpolated into the download URL.
    category: Category name, used as the prefix of the local file name.

  Returns:
    The file name passed to ``gdown.download`` as the output path,
    i.e. ``"{category}_review.jsonl"``.
  """
  # The notebook's import cell does not import gdown, so import it here.
  # Keeping the import local also means the local-files workflow
  # (CombineMetaData) does not require gdown to be installed.
  import gdown

  file_url = f"https://drive.google.com/uc?id={file_id}"
  output_file = f"{category}_review.jsonl"  # name of the file to save
  gdown.download(file_url, output_file, quiet=False)
  return output_file

# download from google drive and unzip file
def get_meta_data(file_id, category):
  """Download a gzipped metadata file from Google Drive and decompress it.

  Args:
    file_id: Google Drive file ID, interpolated into the download URL.
    category: Category name, used as the prefix of both local file names.

  Returns:
    The name of the decompressed file, ``"{category}_meta.jsonl"``.
    The downloaded ``"{category}_meta.jsonl.gz"`` is left in place.
  """
  # The notebook's import cell does not import gdown, so import it here.
  # Keeping the import local also means the local-files workflow
  # (CombineMetaData) does not require gdown to be installed.
  import gdown

  file_url = f"https://drive.google.com/uc?id={file_id}"
  zipped_file = f"{category}_meta.jsonl.gz"  # name of the compressed download
  gdown.download(file_url, zipped_file, quiet=False)

  output_file = f"{category}_meta.jsonl"  # name of the decompressed file

  # Stream-decompress so the whole archive is never held in memory at once.
  with gzip.open(zipped_file, 'rb') as f_in, open(output_file, 'wb') as f_out:
      shutil.copyfileobj(f_in, f_out)

  print(f"압축 해제 완료: {output_file}")
  return output_file

def FilterCommonKey(review_file_path,meta_file_path):
  """Keep only the metadata rows whose product appears in the review file.

  Both inputs are read as JSON Lines files.

  Args:
    review_file_path: Path to the review file; must contain an ``asin`` column.
    meta_file_path: Path to the metadata file; must contain ``parent_asin``,
      ``title`` and ``main_category`` columns.

  Returns:
    A DataFrame with columns ``parent_asin``, ``title`` and ``main_category``,
    restricted to rows whose ``parent_asin`` occurs among the review ``asin``
    values. The original metadata row index is preserved.

  NOTE(review): review rows are matched by ``asin`` against the metadata's
  ``parent_asin``; confirm these are the same identifier in this dataset.
  """
  wanted_meta_columns = ['parent_asin', 'title', 'main_category']

  # Load both files up front.
  all_reviews = pd.read_json(review_file_path, lines=True)
  all_meta = pd.read_json(meta_file_path, lines=True)

  # Distinct product ids that have at least one review.
  reviewed_ids = all_reviews[['asin']].drop_duplicates()

  # Trim the metadata to the columns of interest, then keep reviewed products.
  slim_meta = all_meta[wanted_meta_columns]
  has_review = slim_meta['parent_asin'].isin(reviewed_ids['asin'])
  return slim_meta[has_review]

In [11]:
# In local
def CombineMetaData(data_array):
    """Build the combined filtered metadata file from files already on disk.

    For each entry, the review file is read from
    ``./amazon_review/{category}_50K.json`` and the metadata file from
    ``./amazon_meta/meta_{category}.jsonl/meta_{category}.jsonl``. Each pair is
    filtered with ``FilterCommonKey`` and the results are written together to
    ``combined_common_meta.json`` as JSON Lines.

    Args:
        data_array: Iterable of dicts; only the ``'category'`` key is read here.

    Returns:
        None. The result is written to ``combined_common_meta.json`` in the
        current working directory.
    """
    per_category_frames = []
    for data in data_array:
        review_file_path = os.path.abspath(f"./amazon_review/{data['category']}_50K.json")
        meta_file_path = os.path.abspath(f"./amazon_meta/meta_{data['category']}.jsonl/meta_{data['category']}.jsonl")
        per_category_frames.append(FilterCommonKey(review_file_path, meta_file_path))

    # Concatenate once instead of growing a DataFrame inside the loop: that
    # re-copies all accumulated rows on every iteration and concatenating onto
    # an empty frame is deprecated in recent pandas. pd.concat rejects an empty
    # list, so fall back to an empty frame to keep the empty-input behaviour.
    if per_category_frames:
        combined_meta_df = pd.concat(per_category_frames, ignore_index=True)
    else:
        combined_meta_df = pd.DataFrame()

    combined_meta_df.to_json("combined_common_meta.json", orient="records", lines=True)

# In Colab
def GetTotalMetaData(data_array):
    """Download every category's files, filter them, and save the combined result.

    For each entry, the review and metadata files are fetched with
    ``get_review_data`` / ``get_meta_data``, filtered with ``FilterCommonKey``,
    and the results are written together to ``combined_common_meta.json`` as
    JSON Lines.

    Args:
        data_array: Iterable of dicts with the keys ``'review_file_id'``,
            ``'meta_file_id'`` and ``'category'``.

    Returns:
        None. The result is written to ``combined_common_meta.json`` in the
        current working directory.
    """
    per_category_frames = []

    for data in data_array:
        review_file_path = get_review_data(data['review_file_id'], data['category'])
        meta_file_path = get_meta_data(data['meta_file_id'], data['category'])

        per_category_frames.append(FilterCommonKey(review_file_path, meta_file_path))

    # Concatenate once instead of growing a DataFrame inside the loop: that
    # re-copies all accumulated rows on every iteration and concatenating onto
    # an empty frame is deprecated in recent pandas. pd.concat rejects an empty
    # list, so fall back to an empty frame to keep the empty-input behaviour.
    if per_category_frames:
        combined_meta_df = pd.concat(per_category_frames, ignore_index=True)
    else:
        combined_meta_df = pd.DataFrame()

    # Save the combined DataFrame to a file.
    combined_meta_df.to_json("combined_common_meta.json", orient="records", lines=True)

In [None]:
# One entry per product category.
#   review_file_id / meta_file_id: Google Drive file IDs, read only by
#     GetTotalMetaData (the download-based workflow).
#   category: used by both workflows to build local file names.
data_array = [
    {
      "review_file_id": "1K7YGusyNKYhovzQVvCjKgXc-85snSut2",
      "meta_file_id": "16Jai-R0OiUQTPS5-zq1p2rzTzQcd-nvK",
      "category": "Baby_Products",
    },
    {
      "review_file_id": "1xR5ZvNcv1RZmGy5INIhAUuj1QeB_bBfx",
      "meta_file_id": "1pBu0mRg5P9Itv0hbS8fCqxWmZnmqeuEP",
      "category": "Amazon_Fashion",
    },
    {
      "review_file_id": "12JkkFhHXMe4OBgqEr_0FgntHzohhtD0m",
      "meta_file_id": "1bwOOyvAFiCEbBQaj98hOFXeqEtsYYXFF",
      "category": "Arts_Crafts_and_Sewing",
    },
    {
      "review_file_id": "1I-nUdx-tOBO9aiJLVNjLQ-EhiqWR0R_H",
      "meta_file_id": "1v9ysd8ir3_imQMw8iUn4SajnoFJvDypY",
      "category": "All_Beauty",
    },
    {
      "review_file_id": "1yXKAIS1BO_FCxOgGsx2JOI8xUcqgxtV-",
      "meta_file_id": "1sW_DQp5azj-y6O5XtLq2Ge6LGlEPNhGe",
      "category": "Video_Games",
    },
    {
      "review_file_id": "1T0GwVI7cPzuLjSJ9lurNShSmwIAuP-Sd",
      "meta_file_id": "1u2wi6CYsN2OP-30St7cBxX2VjPB5UQNQ",
      "category": "Automotive",
    },
    {
      "review_file_id": "1x5DvAY4NKsixMBadm1UCrUScJvtU17Dj",
      # NOTE(review): this meta_file_id is identical to the Amazon_Fashion
      # entry above -- looks like a copy-paste slip. Confirm the correct
      # Drive ID for Sports_and_Outdoors before using GetTotalMetaData.
      "meta_file_id": "1pBu0mRg5P9Itv0hbS8fCqxWmZnmqeuEP",
      "category": "Sports_and_Outdoors",
    },
]

# Download-based alternative (fetches the files from Google Drive first):
# GetTotalMetaData(data_array)
# Local run: reads files already present under ./amazon_review and ./amazon_meta.
CombineMetaData(data_array)