In [2]:
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# Turn off Jedi-based tab completion in IPython.
%config Completer.use_jedi = False
# Move to the parent directory so the relative "data/..." paths used below resolve.
# NOTE(review): the path is relative, so re-running this cell moves up one more level.
%cd ../

/jupyter-lab/repo/PMGT


# Import Modules

In [3]:
import gzip
import json
import os
from collections import Counter
from datetime import datetime
from functools import partial

import backoff
import numpy as np
import pandas as pd
import requests
import timm
import torch
from joblib import Parallel, delayed
from PIL import Image
from pmgt.datasets import (
    AmazonReviewImageDataset,
    AmazonReviewTextDataset,
    text_collate_fn,
)
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoModel, AutoTokenizer

# Data Preprocessing

[Amazon Review Datasets](https://nijianmo.github.io/amazon/index.html)
- Video Games
- Toys and Games
- Tools and Home Improvement

In [None]:
!wget -P data/VG http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz
!wget -P data/TG i://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Toys_and_Games_5.json.gz
!wget -P data/THIi http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Tools_and_Home_Improvement_5.json.gz

## Load Raw Data

In [4]:
# Where the raw review dump for the category being processed lives.
data_dir = "./data/VG"
filename = "Video_Games_5.json.gz"

# The archive is read line by line; each line is parsed as one JSON review record.
data = []
with gzip.open(os.path.join(data_dir, filename)) as f:
    for raw_line in tqdm(f):
        data.append(json.loads(raw_line.strip()))

# Attach a datetime column derived from the unix timestamp and order the reviews by it.
# datetime.fromtimestamp converts to the local timezone of the machine running the notebook.
df = (
    pd.DataFrame.from_dict(data)
    .assign(reviewDateTime=lambda frame: frame['unixReviewTime'].map(datetime.fromtimestamp))
    .sort_values(by='reviewDateTime')
)
len(df)

0it [00:00, ?it/s]

497577

## Download Images

In [159]:
# Root folder for downloaded review images; created up front if it does not exist yet.
image_root_path = os.path.join(data_dir, "images")
os.makedirs(image_root_path, exist_ok=True)

In [93]:
def _giveup(e):
    return str(e) == "404"


@backoff.on_exception(
    backoff.expo,
    # ConnectionError is a subclass of RequestException, so one entry covers both.
    requests.exceptions.RequestException,
    max_time=30,
    max_tries=5,
    giveup=_giveup,
)
def download_image(filepath, image_url, timeout=10):
    """Download ``image_url`` to ``filepath`` unless the file already exists.

    The body is streamed into a temporary ``<filepath>.part`` file and moved
    into place only after the transfer finished, so an interrupted download
    never leaves a truncated image that the existence check would skip later.

    Args:
        filepath: Destination path of the image file.
        image_url: URL to fetch.
        timeout: Seconds to wait for the server (connect and read) before
            ``requests`` raises; without it a stalled connection blocks the
            worker forever.

    Returns:
        None. Nothing is written when the file already exists, the URL has no
        schema, or the server answers 404.

    Raises:
        requests.exceptions.RequestException: For any status other than
            200/404 or a transport error, once the retries are exhausted.
    """
    if os.path.exists(filepath):
        return

    try:
        r = requests.get(image_url, stream=True, timeout=timeout)
    except requests.exceptions.MissingSchema:
        # Malformed URL in the dataset; retrying cannot help.
        return

    # Using the response as a context manager releases the connection on every exit path.
    with r:
        if r.status_code == 404:
            return
        if r.status_code != 200:
            raise requests.exceptions.RequestException(r.status_code)

        tmp_path = f"{filepath}.part"
        try:
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(1024):
                    f.write(chunk)
            # Publish the file only once it is complete.
            os.replace(tmp_path, filepath)
        except BaseException:
            # Drop the partial file so a retry (or a later run) starts clean.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise


# Build the (destination path, URL) work list.
# Images of one item are numbered consecutively across all of its reviews:
# <image_root_path>/<asin>/<n><ext>.
download_list = []
counter = Counter()

for _, row in df[df["image"].notna()].iterrows():
    item_id = row["asin"]
    for image_url in row["image"]:
        ext = os.path.splitext(image_url)[1]
        filepath = os.path.join(image_root_path, item_id, f"{counter[item_id]}{ext}")
        counter[item_id] += 1
        download_list.append((filepath, image_url))

        # exist_ok=True already makes this a no-op for existing folders.
        os.makedirs(os.path.dirname(filepath), exist_ok=True)

# Downloads are network-bound, hence a thread pool.
Parallel(n_jobs=50, prefer="threads")(
    delayed(download_image)(f, u) for f, u in tqdm(download_list)
)

# Sanity figures: number of image URLs, distinct items, item folders on disk.
print(len(download_list))
print(len(df["asin"].unique()))
print(len(next(os.walk(image_root_path))[1]))

  0%|          | 0/96514 [00:00<?, ?it/s]

96514
73649
21282


## Split Data

In [5]:
# Chronological split: df1 holds reviews strictly before the cut-off, df2 the rest.
# NOTE(review): reviewDateTime is produced by datetime.fromtimestamp (local time), so the
# cut-off depends on the machine's timezone; the 09:00 hour presumably lines up with
# midnight UTC on a UTC+9 machine — confirm.
criterion = datetime(2015, 1, 1, 9)
review_times = df['reviewDateTime']
df1 = df[review_times < criterion]
df2 = df[review_times >= criterion]
for part in (df1, df2):
    print(len(part))

279333
218244


## Extract Visual Features

In [166]:
# Expose only GPU 0 to this process.
# NOTE(review): presumably this has to run before the first CUDA call in the kernel to take effect — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [331]:
# Pretrained Inception-v4 backbone and the preprocessing resolved from its timm data config.
model = timm.create_model("inception_v4", pretrained=True)
config = resolve_data_config({}, model=model)
transform = create_transform(**config)

# Only items that occur in the earlier split (df1) are featurized.
dataset = AmazonReviewImageDataset(
    image_root_path, transforms=transform, item_ids=df1["asin"].unique()
)
dataloader = DataLoader(dataset, batch_size=32, num_workers=8)

model.cuda()
model.eval()

# One pooled feature vector per image, collected on the CPU in dataloader order.
visual_feats = []
with torch.no_grad():
    for batch_x in tqdm(dataloader, total=len(dataloader)):
        feat = model.global_pool(model.forward_features(batch_x.cuda()))
        visual_feats.append(feat.cpu())

visual_feats = torch.cat(visual_feats)

# dataset.num_images is used as an item_id -> image-count mapping.
# NOTE(review): assumes its iteration order matches the order in which the dataset
# yields images — confirm against AmazonReviewImageDataset.
image_counts = list(dataset.num_images.values())

# The offset slicing below is only valid when the counts partition the feature rows;
# otherwise every later item would silently receive another item's features.
assert sum(image_counts) == len(visual_feats), (
    f"feature rows ({len(visual_feats)}) != total image count ({sum(image_counts)})"
)

# Average the image features of each item into a single item-level vector.
item_visual_feats = []
start = 0
for num in tqdm(image_counts):
    end = start + num
    item_visual_feats.append(visual_feats[start:end].mean(dim=0))
    start = end
item_visual_feats = torch.stack(item_visual_feats).numpy()
item_mapping = np.array(list(dataset.num_images.keys()))

# Row i of `feats` belongs to item id `mapping[i]`.
np.savez(
    os.path.join(data_dir, "visual_feats.npz"),
    feats=item_visual_feats,
    mapping=item_mapping,
)

  0%|          | 0/139 [00:02<?, ?it/s]

  0%|          | 0/1307 [00:00<?, ?it/s]

## Extract Textual Features

In [6]:
# Disable parallelism inside the Hugging Face tokenizers library.
# NOTE(review): presumably set because tokenization runs inside DataLoader worker processes — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Expose only GPU 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [7]:
# Collect, per item, the review texts found in the earlier split (df1); rows without text are dropped.
review_text = (
    df1[df1["reviewText"].notna()]
    .groupby("asin")
    .apply(lambda r: r["reviewText"].values)
)
review_text = review_text.to_dict()

dataset = AmazonReviewTextDataset(review_text)
model_name = "bert-base-uncased"

model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model.cuda()
model.eval()

dataloader = DataLoader(
    dataset,
    batch_size=64,
    num_workers=16,
    collate_fn=partial(text_collate_fn, tokenizer=tokenizer),
)

# One vector per review text: position 0 of the model's first output, collected on the CPU.
text_feats = []
with torch.no_grad():
    for batch_x in tqdm(dataloader, total=len(dataloader)):
        batch_x = {k: v.cuda() for k, v in batch_x.items()}
        text_feats.append(model(**batch_x)[0][:, 0].cpu())

text_feats = torch.cat(text_feats)

# dataset.num_texts is used as an item_id -> text-count mapping.
# NOTE(review): assumes its iteration order matches the order in which the dataset
# yields texts — confirm against AmazonReviewTextDataset.
text_counts = list(dataset.num_texts.values())

# The offset slicing below is only valid when the counts partition the feature rows;
# otherwise every later item would silently receive another item's features.
assert sum(text_counts) == len(text_feats), (
    f"feature rows ({len(text_feats)}) != total text count ({sum(text_counts)})"
)

# Average the review features of each item into a single item-level vector.
item_textual_feats = []
start = 0
for num in tqdm(text_counts):
    end = start + num
    item_textual_feats.append(text_feats[start:end].mean(dim=0))
    start = end
item_textual_feats = torch.stack(item_textual_feats).numpy()
item_mapping = np.array(list(dataset.num_texts.keys()))

# Row i of `feats` belongs to item id `mapping[i]`.
np.savez(
    os.path.join(data_dir, "textual_feats.npz"),
    feats=item_textual_feats,
    mapping=item_mapping,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/4365 [00:01<?, ?it/s]

  0%|          | 0/14507 [00:00<?, ?it/s]