<a href="https://colab.research.google.com/github/wuzekai1998/Python_R-Tree/blob/master/Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install kaggle
!pip install utils
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download yelp-dataset/yelp-dataset
!unzip "yelp-dataset"

Collecting utils
  Downloading utils-1.0.1-py2.py3-none-any.whl (21 kB)
Installing collected packages: utils
Successfully installed utils-1.0.1
Downloading yelp-dataset.zip to /content
100% 4.07G/4.07G [02:23<00:00, 40.7MB/s]
100% 4.07G/4.07G [02:23<00:00, 30.5MB/s]
Archive:  yelp-dataset.zip
  inflating: Dataset_User_Agreement.pdf  
  inflating: yelp_academic_dataset_business.json  
  inflating: yelp_academic_dataset_checkin.json  
  inflating: yelp_academic_dataset_review.json  
  inflating: yelp_academic_dataset_tip.json  
  inflating: yelp_academic_dataset_user.json  


In [None]:
import json
import pandas as pd
import datetime
import numpy as np
import re
import copy
import scipy
#from scipy.special import logsumexp
from mpmath import mpf
import math
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
pd.set_option('mode.chained_assignment', None)
pd.set_option('max_colwidth', 50)
%matplotlib inline
plt.style.use('fivethirtyeight')

import seaborn as sns
sns.set()
sns.set_context("talk")
#import kmeans
from sklearn.model_selection import train_test_split
#from sklearn.linear_model import LogisticRegression as lr
from sklearn.linear_model import LogisticRegression 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.neighbors import NearestNeighbors

import networkx as nx

In [None]:
# Open the five Yelp JSON-lines files through pandas. Because `chunksize` is
# given, each call hands back a chunk reader to iterate over instead of a fully
# loaded DataFrame.
# Note: `reviews`, `business` and `users` are rebound to other objects by later
# cells of this notebook.
path = ''
reviews = pd.read_json(f"{path}yelp_academic_dataset_review.json", lines=True, chunksize=100_000)
business = pd.read_json(f"{path}yelp_academic_dataset_business.json", lines=True, chunksize=10_000)
checkin = pd.read_json(f"{path}yelp_academic_dataset_checkin.json", lines=True, chunksize=100_000)
tip = pd.read_json(f"{path}yelp_academic_dataset_tip.json", lines=True, chunksize=100_000)
users = pd.read_json(f"{path}yelp_academic_dataset_user.json", lines=True, chunksize=100_000)

In [None]:
# --- Input files (Yelp JSON-lines dumps in the working directory) ---
FILEPATH_USER = 'yelp_academic_dataset_user.json'
FILEPATH_REVIEW = 'yelp_academic_dataset_review.json'
FILEPATH_BUSINESS = 'yelp_academic_dataset_business.json'

# --- Model ---
# Checkpoint used for the tokenizer and for both halves of the encoder-decoder.
PRETRAINED_MODEL_NAME = 'bert-base-uncased'
# Directory handed to the trainer as `output_dir`.
TRAINED_MODEL_OUTPUT_DIR = 'model'

# --- Sizes ---
# Per-device batch size for training and evaluation.
BATCH_SIZE = 16
# Maximum tokenized length of the conditioning string (encoder side).
ENCODER_MAX_LEN = 32
# Maximum length of the review text (decoder side / generation).
DECODER_MAX_LEN = 128

In [None]:
!pip install datasets==1.5.0
!pip install transformers==4.5.1



In [None]:
import json
import random
from typing import Dict, List

from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    EncoderDecoderConfig, 
    EncoderDecoderModel, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments
)

In [None]:
# Load the tokenizer that matches the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
# Expose the CLS / SEP tokens under the BOS / EOS names; the model config set up
# further down reads `tokenizer.bos_token_id` and `tokenizer.eos_token_id`.
tokenizer.bos_token, tokenizer.eos_token = tokenizer.cls_token, tokenizer.sep_token

In [None]:
from random import randrange

In [None]:
# Build `businesses`: business_id -> {"name", "city", "categories"} for a random
# ~10% sample of businesses that have categories and 6..39 reviews.
#
# A dedicated, seeded generator makes the sample identical on every
# Restart & Run All and leaves the global `random` state untouched.
BUSINESS_SAMPLE_SEED = 42
business_rng = random.Random(BUSINESS_SAMPLE_SEED)

businesses = {}
# The Yelp dumps are JSON, i.e. UTF-8; do not depend on the platform default.
with open(FILEPATH_BUSINESS, 'r', encoding='utf-8') as f:
    for line in f:
        # One draw per line, taken before parsing so that the ~90% of lines
        # that are discarded never pay for json.loads.
        if business_rng.randrange(10) > 0:
            continue
        business = json.loads(line)
        if business["categories"] and 5 < business["review_count"] < 40:
            # categories is stored as a comma separated str. Convert to a list.
            categories_list = business["categories"].split(", ")

            businesses[business["business_id"]] = {
                "name": business["name"],
                "city": business["city"],
                "categories": categories_list
            }

print(f"num businesses: {len(businesses)}")

num businesses: 9886


In [None]:
# Build `users`: user_id -> {"elite_level": 0 or 1} for every record in the
# user file.
users = {}
# The Yelp dumps are JSON, i.e. UTF-8; do not depend on the platform default
# encoding when decoding lines.
with open(FILEPATH_USER, 'r', encoding='utf-8') as f:
    for line in f:
        user = json.loads(line)
        # 1 when the record's `elite` field is truthy, otherwise 0.
        # NOTE(review): assumes non-elite users carry an empty value here; a
        # literal placeholder string would count as elite - verify against the
        # dataset version in use.
        elite_level = 1 if user["elite"] else 0
        users[user["user_id"]] = {"elite_level": elite_level}

In [None]:
# Build the seq2seq training pairs:
#   input_text  - conditioning string (stars, funny votes, elite flag, business
#                 name / city / categories)
#   output_text - the review text, cut to at most DECODER_MAX_LEN
#                 whitespace-separated words
# Only reviews whose business was sampled into `businesses` and whose author is
# present in `users` are kept.
#
# A dedicated, seeded generator keeps the category order reproducible across
# runs without touching the global `random` state.
REVIEW_SHUFFLE_SEED = 42
category_rng = random.Random(REVIEW_SHUFFLE_SEED)

reviews = {"input_text": [], "output_text": []}
# The Yelp dumps are JSON, i.e. UTF-8; review text regularly contains non-ASCII
# characters, so do not depend on the platform default encoding.
with open(FILEPATH_REVIEW, 'r', encoding='utf-8') as f:
    for line in f:
        review = json.loads(line)

        business = businesses.get(review["business_id"])
        if business is None:
            continue
        user = users.get(review["user_id"])
        if user is None:
            continue

        # shuffle categories each time to prevent model from memorizing order;
        # shuffle a copy so the lists stored in `businesses` are left unchanged
        categories = list(business["categories"])
        category_rng.shuffle(categories)
        categories_str = ", ".join(categories)

        input_text = (
            f"stars {int(review['stars'])}"
            f"; funny {review['funny']}"
            f"; elite level {user['elite_level']}"
            f"; name {business['name']}"
            f"; city {business['city']}"
            f"; categories {categories_str}"
        )
        reviews["input_text"].append(input_text)

        # trim off excess words to reduce memory (whitespace split, not
        # tokenizer tokens)
        output_tokens = review["text"].split()[:DECODER_MAX_LEN]
        output_text = " ".join(output_tokens)
        reviews["output_text"].append(output_text)

In [None]:
# Wrap the collected pairs in a Dataset and hold out 5% for validation.
# A fixed seed makes the shuffle-and-split identical on every run.
SPLIT_SEED = 42

ds = Dataset.from_dict(reviews)
ds = ds.train_test_split(train_size=0.95, seed=SPLIT_SEED)
train_ds, val_ds = ds["train"], ds["test"]
ds

DatasetDict({
    train: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 146915
    })
    test: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 7733
    })
})

In [None]:
def process_data_to_model_inputs(batch):
    """Tokenize a batch of text pairs into encoder/decoder model inputs.

    Reads ``batch["input_text"]`` and ``batch["output_text"]``, pads/truncates
    them to ``ENCODER_MAX_LEN`` and ``DECODER_MAX_LEN`` respectively, and adds
    ``input_ids``, ``attention_mask``, ``decoder_input_ids``,
    ``decoder_attention_mask`` and ``labels`` to ``batch``.

    Returns the same ``batch`` object that was passed in.
    """

    def _encode(texts, limit):
        # Fixed-length encoding: pad up to `limit`, cut anything longer.
        return tokenizer(texts, padding="max_length", truncation=True, max_length=limit)

    source_enc = _encode(batch["input_text"], ENCODER_MAX_LEN)
    target_enc = _encode(batch["output_text"], DECODER_MAX_LEN)

    # Labels mirror the decoder input ids exactly (per the original author's
    # note, BERT shifts the labels itself), except that padding positions
    # become -100 so the PAD token is ignored.
    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for sequence in target_enc.input_ids:
        masked_labels.append([-100 if token == pad_id else token for token in sequence])

    batch["input_ids"] = source_enc.input_ids
    batch["attention_mask"] = source_enc.attention_mask
    batch["decoder_input_ids"] = target_enc.input_ids
    batch["decoder_attention_mask"] = target_enc.attention_mask
    batch["labels"] = masked_labels
    return batch

In [None]:
# Columns the model consumes; they are exposed as torch tensors.
MODEL_INPUT_COLUMNS = [
    "input_ids",
    "attention_mask",
    "decoder_input_ids",
    "decoder_attention_mask",
    "labels",
]


def _to_model_inputs(split):
    """Tokenize one split, drop its raw text columns and switch it to torch format."""
    encoded = split.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=64,
        remove_columns=["input_text", "output_text"],
    )
    encoded.set_format(type="torch", columns=list(MODEL_INPUT_COLUMNS))
    return encoded


# Same preparation for both splits, training split first.
train_ds = _to_model_inputs(train_ds)
val_ds = _to_model_inputs(val_ds)

HBox(children=(FloatProgress(value=0.0, max=2296.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=121.0), HTML(value='')))




In [None]:
# Build the encoder-decoder model; the same pretrained checkpoint initialises
# both the encoder and the decoder.
enc_dec_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path=PRETRAINED_MODEL_NAME,
    decoder_pretrained_model_name_or_path=PRETRAINED_MODEL_NAME,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer

In [None]:
model_config = enc_dec_model.config

# Special-token ids are taken straight from the tokenizer.
model_config.decoder_start_token_id = tokenizer.bos_token_id
model_config.eos_token_id = tokenizer.eos_token_id
model_config.pad_token_id = tokenizer.pad_token_id

# Mirror the decoder's vocabulary size on the top-level config.
model_config.vocab_size = model_config.decoder.vocab_size

# Text-generation settings stored on the model config.
# NOTE(review): do_sample=True together with top_p selects sampling, and
# num_beams is not set anywhere in this notebook; early_stopping and
# length_penalty look like beam-search options and presumably have no effect
# with these settings - confirm whether beam search was intended.
model_config.max_length = DECODER_MAX_LEN
model_config.no_repeat_ngram_size = 3
model_config.early_stopping = True
model_config.length_penalty = 2.0
model_config.top_p = 0.95
model_config.do_sample = True

In [None]:
# set training arguments - these params are not really tuned, feel free to change
training_args = Seq2SeqTrainingArguments(
    output_dir=TRAINED_MODEL_OUTPUT_DIR,
    save_total_limit=2,
    overwrite_output_dir=True,
    save_steps=5000,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="steps",
    logging_steps=5000,
    logging_first_step=True,
    warmup_ratio=0.05,
    num_train_epochs=1,
    fp16=True
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=enc_dec_model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds
)

In [None]:
# Run fine-tuning with the trainer built in the previous cell. The call's return
# value is the last expression of the cell, so it is displayed as the output.
trainer.train()



Step,Training Loss,Validation Loss,Runtime,Samples Per Second
5000,3.955,3.529369,52.3953,147.589


TrainOutput(global_step=9183, training_loss=3.7771916673500967, metrics={'train_runtime': 5008.9483, 'train_samples_per_second': 1.833, 'total_flos': 3.48877361800224e+16, 'epoch': 1.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 990154240, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': -1884688384, 'train_mem_gpu_alloc_delta': 3054872576, 'train_mem_cpu_peaked_delta': 1887121408, 'train_mem_gpu_peaked_delta': 3048481280})

In [None]:
def generate_reviews(test_ds: Dataset, decoder_max_length: int = DECODER_MAX_LEN) -> List[str]:
    """Generate one review per conditioning string in ``test_ds``.

    Args:
        test_ds: Dataset with an ``input_text`` column holding conditioning
            strings (see ``build_input`` for the format).
        decoder_max_length: ``max_length`` passed to ``generate``.

    Returns:
        The ``generated_reviews`` column of the mapped dataset: the decoded
        review texts, in the same order as the rows of ``test_ds``.
    """
    # The model may still be in training mode after trainer.train(); switch to
    # eval mode so dropout does not perturb generation.
    enc_dec_model.eval()
    # Follow the model to whatever device it lives on instead of assuming CUDA.
    device = enc_dec_model.device

    def generate_reviews_batch(batch):
        # Tokenizer will automatically set [BOS] <text> [EOS]
        inputs = tokenizer(
            batch["input_text"], padding="max_length", truncation=True, max_length=ENCODER_MAX_LEN, return_tensors="pt"
        )
        outputs = enc_dec_model.generate(
            inputs.input_ids.to(device),
            attention_mask=inputs.attention_mask.to(device),
            max_length=decoder_max_length,
        )

        batch["generated_reviews"] = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        return batch

    results = test_ds.map(generate_reviews_batch, batched=True)

    return results["generated_reviews"]

In [None]:
def build_input(
        stars: int,
        name: str, 
        city: str, 
        categories: List[str],
        funny: int = 50, 
        elite_level: int = 0
) -> str:
    """Render one example's features as the model's conditioning string.

    The fields appear in a fixed order - stars, funny, elite level, name, city,
    categories - each as ``<label> <value>`` and separated by ``"; "``. The
    categories are joined with ``", "``.
    """
    template = (
        "stars {stars}; funny {funny}; elite level {elite_level}"
        "; name {name}; city {city}; categories {categories}"
    )
    return template.format(
        stars=stars,
        funny=funny,
        elite_level=elite_level,
        name=name,
        city=city,
        categories=", ".join(categories),
    )

In [38]:
# Demo: one prompt per star rating (1-5) for the same fictional burger place,
# written by a non-elite user.
burger_place = dict(
    name="Krusty Burger",
    city="Springfield",
    categories=["Burgers", "Fast Food"],
    elite_level=0,
)
test_input_text = {
    "input_text": [build_input(stars=star, **burger_place) for star in range(1, 6)]
}
test_ds = Dataset.from_dict(test_input_text)

generate_reviews(test_ds)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




['when i first walked in, there were no employees waiting in line even after the lady working was so friendly. there was another staff, i ordered something and the girl came back back with my order. i took it home and i waited another twenty minutes and got there. i waited for a while ( if this is the case now ) and my order was never given again. she just walked out and took the food inside the door. i asked the waitress how the order was and then she apologized! i was told she had forgotten my order and to go inside. i will never go back!',
 "i have a hard time not wanting to get some good fries at this burger king. i've gone to drive in and get a burger that was pretty decent. as i am taking a break from my drive up and the place is pretty empty. i don't like getting food on the menu, but the people working here are really polite and knowledgeable. i didn't really know the name of the place, but it's obvious it'll be easier, especially since there isn't a lot of one staff there. i r

In [41]:
# Demo: one prompt per star rating (1-5) with the categories switched to
# Chinese / Fast Food and the reviewer flagged as elite.
chinese_place = dict(
    name="Krusty Burger",
    city="Springfield",
    categories=["Chinese", "Fast Food"],
    elite_level=1,
)
test_input_text = {
    "input_text": [build_input(stars=star, **chinese_place) for star in range(1, 6)]
}
test_ds = Dataset.from_dict(test_input_text)

generate_reviews(test_ds)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




["terrible at times. it is terrible! i got chicken and rice with rice and beans beans. they asked for chicken and we didn't get any. so i had to wait at least 3 hours for the waitress to let me take my order to wait more before they came out. now, there is a girl there that works. the manager was just rude. she was in the middle of a class game and that girl was trying to get away at her job and never getting her food wrong. my son was driving up there for the night and he wanted to pay for his food! she went to the bathroom and asked if",
 'i\'ve only tried this one when there are more options and there is no one near me or me except for me. i\'m not sure what the first time i went to the " last visit " so it is pretty disappointing that this particular location did have a lot of experience to make a large stop. the restaurant has the usual food that could get but some of the employees are not friendly too. the only thing i hate about this place is the food. however the menu is not ri