In [1]:
# Load the autoreload extension and reload imported modules before each cell runs,
# so edits to project source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Turn off Jedi-based tab completion in the IPython completer.
%config Completer.use_jedi = False
# Move the working directory up one level; the relative paths used below
# (e.g. data/VG) are resolved from there.
%cd ../

/jupyter-lab/repo/PMGT


# Import Modules

In [118]:
import gzip
import json
import os
from collections import Counter

import backoff
import numpy as np
import pandas as pd
import requests
import timm
import torch
from joblib import Parallel, delayed
from PIL import Image
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
from tqdm.auto import tqdm

# Data Preprocessing

Data source: [Amazon Review Datasets](https://nijianmo.github.io/amazon/index.html). The following category files are downloaded:
- Video Games
- Toys and Games
- Tools and Home Improvement

In [None]:
!wget -P data/VG http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz
!wget -P data/TG http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Toys_and_Games_5.json.gz
!wget -P data/THI http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Tools_and_Home_Improvement_5.json.gz

## Load Raw Data

In [103]:
# Location of the gzip-compressed review dump (one JSON object per line).
data_dir = "./data/VG"
filename = "Video_Games_5.json.gz"
raw_path = os.path.join(data_dir, filename)

# Decode the file line by line; tqdm only reports progress.
data = []
with gzip.open(raw_path) as f:
    for line in tqdm(f):
        data.append(json.loads(line.strip()))

# One DataFrame row per decoded record; the cell displays the row count.
df = pd.DataFrame.from_dict(data)
len(df)

0it [00:00, ?it/s]

497577

## Download Images

In [93]:
def _giveup(e):
    return str(e) == "404"


@backoff.on_exception(
    backoff.expo,
    # ConnectionError, Timeout, ChunkedEncodingError, ... all derive from
    # RequestException, so listing the base class alone covers them.
    requests.exceptions.RequestException,
    max_time=30,
    max_tries=5,
    giveup=_giveup,
)
def download_image(filepath, image_url, timeout=10):
    """Download ``image_url`` to ``filepath`` unless the file already exists.

    The call is retried with exponential backoff (at most 5 tries / 30 seconds)
    whenever a ``requests`` exception is raised.

    Args:
        filepath: Destination path; its parent directory must already exist.
        image_url: URL of the image to fetch.
        timeout: Timeout passed to ``requests.get`` so a stalled server cannot
            block a worker forever.

    Returns:
        None. Nothing is written when the file already exists, when the URL
        has no scheme (``MissingSchema``), or when the server answers 404.

    Raises:
        requests.exceptions.RequestException: For any other non-200 status or
            network failure, once the retries are exhausted.
    """
    if os.path.exists(filepath):
        return

    try:
        r = requests.get(image_url, stream=True, timeout=timeout)
    except requests.exceptions.MissingSchema:
        # Malformed URL: retrying cannot help, so skip it.
        return

    # Closing the response releases the connection on every exit path.
    with r:
        if r.status_code == 404:
            return
        if r.status_code != 200:
            raise requests.exceptions.RequestException(r.status_code)

        # Stream into a temporary file and rename it only once complete, so an
        # interrupted transfer never leaves a truncated file at `filepath`
        # (which the exists() check above would then treat as downloaded).
        tmp_path = filepath + ".part"
        try:
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(1024):
                    f.write(chunk)
            os.replace(tmp_path, filepath)
        except BaseException:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
            raise


# All images are stored under <data_dir>/images/<asin>/<n><ext>.
image_root_path = os.path.join(data_dir, "images")
os.makedirs(image_root_path, exist_ok=True)

# `counter` keeps a running image index per item, so images coming from
# different reviews of the same item get distinct file names.
counter = Counter()
download_list = []

reviews_with_images = df[df["image"].notna()]
for item_id, image_urls in zip(
    reviews_with_images["asin"], reviews_with_images["image"]
):
    for image_url in image_urls:
        ext = os.path.splitext(image_url)[1]
        filepath = os.path.join(image_root_path, item_id, f"{counter[item_id]}{ext}")
        counter[item_id] += 1
        download_list.append((filepath, image_url))

        # The item directory is created only when the item has at least one URL.
        os.makedirs(os.path.dirname(filepath), exist_ok=True)

# Fetch everything with a pool of 50 threads.
download_jobs = (
    delayed(download_image)(path, url) for path, url in tqdm(download_list)
)
Parallel(n_jobs=50, prefer="threads")(download_jobs)

# Summary: number of image URLs, distinct items, item directories on disk.
item_dirs = next(os.walk(image_root_path))[1]
print(len(download_list))
print(df["asin"].nunique(dropna=False))
print(len(item_dirs))

  0%|          | 0/96514 [00:00<?, ?it/s]

96514
73649
21282


## Extract Visual Features

In [135]:
# Pretrained Inception-v4 from timm, switched to evaluation mode.
model = timm.create_model("inception_v4", pretrained=True)
model.eval()

# Build the image preprocessing transform from the data config resolved for this model.
config = resolve_data_config({}, model=model)
transform = create_transform(**config)

In [142]:
# Pick one downloaded image to try the feature extractor on.
# Item directories are created before downloading, so a directory can be empty
# when every URL for that item failed; skip those instead of indexing [0]
# blindly. Sorting makes the choice independent of os.listdir ordering.
for item_id in sorted(os.listdir(image_root_path)):
    item_dir = os.path.join(image_root_path, item_id)
    if not os.path.isdir(item_dir):
        continue
    filenames = sorted(os.listdir(item_dir))
    if filenames:
        filename = filenames[0]
        break
else:
    raise FileNotFoundError(f"no downloaded images found under {image_root_path}")

filepath = os.path.join(image_root_path, item_id, filename)

# convert() returns a new, fully loaded image, so the file can be closed right away.
with Image.open(filepath) as image_file:
    img = image_file.convert("RGB")

# unsqueeze(0) adds a leading batch dimension of size 1.
tensor = transform(img).unsqueeze(0)

# Inference only: no gradients needed.
with torch.no_grad():
    feat = model.forward_features(tensor)
    feat = model.global_pool(feat)

feat

tensor([[0.1881, 0.0117, 0.1273,  ..., 0.1038, 0.2942, 0.2342]])