# FastT5


In [6]:
def identify_string_type(s):
    """
    give the unique know of a paper
    """
    pattern1 = r"^\d+\.\d+$"
    pattern2 = r"^[a-zA-Z]+\/\w+$"

    if re.match(pattern1, s):
        return f"ArXiv:{s}"
    elif re.match(pattern2, s):
        return f"ArXiv:{s}"
    else:
        return s
def read_jsonl(path):
    try:
        with open(path,'r') as f:
            data = [json.load(f)]
    except json.JSONDecodeError:
        with open(path,'r') as f:
            data = []
            for line in f:
                data.append(json.loads(line))
    return data

def structure_dict(_dict):
    paper_id = _dict['paper_id']
    abstract = _dict['abstract']
    key_words= _dict['question']
    pattern = r'\d+\.\s*(.*?)(?=\s*\d+\.\s*|$)'
    words   = re.findall(pattern, key_words)
    if len(words)<1:
        words = key_words.split(',')
    return [paper_id] + [abstract] + words[:10]
import os
ROOTPATH='data/unarXive/'
all_json_list = []
for level_1_name in os.listdir(ROOTPATH):
    level_1_path = os.path.join(ROOTPATH, level_1_name)
    for level_2_name in os.listdir(level_1_path):
        level_2_path = os.path.join(level_1_path, level_2_name)
        all_json_list.append(level_2_path)

In [4]:
from tqdm.notebook import tqdm

all_data = [] 
for path in tqdm(all_json_list):
    all_data+=read_jsonl(path)

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="7"

In [2]:
import torch
import json
import argparse

from fastchat.utils import is_partial_stop, is_sentence_complete, get_context_length

from fastchat.model.model_adapter import (
    load_model,get_model_adapter,
    get_conversation_template,
)
from fastchat.serve.inference import *
from typing import Iterable, Optional, Dict

In [35]:
def generate_result(
    model,
    tokenizer,
    params: Dict,
    device:str,
    context_len: int,
    stream_interval: int = 2,
):
    # Read parameters
    prompt = params["prompt"]
    len_prompt = len(prompt)
    temperature = float(params.get("temperature", 1.0))
    repetition_penalty = float(params.get("repetition_penalty", 1.0))
    top_p = float(params.get("top_p", 1.0))
    top_k = int(params.get("top_k", -1))  # -1 means disable
    max_new_tokens = int(params.get("max_new_tokens", 256))
    echo = bool(params.get("echo", True))
    stop_str = params.get("stop", None)
    stop_token_ids = params.get("stop_token_ids", None) or []
    stop_token_ids.append(tokenizer.eos_token_id)

    logits_processor = prepare_logits_processor(
        temperature, repetition_penalty, top_p, top_k
    )

    input_ids = tokenizer(prompt).input_ids
    output_ids = list(input_ids)

    if model.config.is_encoder_decoder:
        max_src_len = context_len
    else:  # truncate
        max_src_len = context_len - max_new_tokens - 8

    input_ids = input_ids[-max_src_len:]
    input_echo_len = len(input_ids)

    if model.config.is_encoder_decoder:
        encoder_output = model.encoder(
            input_ids=torch.as_tensor([input_ids], device=device)
        )[0]
        start_ids = torch.as_tensor(
            [[model.generation_config.decoder_start_token_id]],
            dtype=torch.int64,
            device=device,
        )

    past_key_values = out = None
    sent_interrupt = False
    for i in range(max_new_tokens):
        if i == 0:  # prefill
            if model.config.is_encoder_decoder:
                out = model.decoder(
                    input_ids=start_ids,
                    encoder_hidden_states=encoder_output,
                    use_cache=True,
                )
                logits = model.lm_head(out[0])
            else:
                out = model(torch.as_tensor([input_ids], device=device), use_cache=True)
                logits = out.logits
            past_key_values = out.past_key_values
        else:  # decoding
            if model.config.is_encoder_decoder:
                out = model.decoder(
                    input_ids=torch.as_tensor(
                        [[token] if not sent_interrupt else output_ids], device=device
                    ),
                    encoder_hidden_states=encoder_output,
                    use_cache=True,
                    past_key_values=past_key_values if not sent_interrupt else None,
                )
                sent_interrupt = False

                logits = model.lm_head(out[0])
            else:
                out = model(
                    input_ids=torch.as_tensor(
                        [[token] if not sent_interrupt else output_ids], device=device
                    ),
                    use_cache=True,
                    past_key_values=past_key_values if not sent_interrupt else None,
                )
                sent_interrupt = False
                logits = out.logits
            past_key_values = out.past_key_values

        if logits_processor:
            if repetition_penalty > 1.0:
                tmp_output_ids = torch.as_tensor([output_ids], device=logits.device)
            else:
                tmp_output_ids = None
            last_token_logits = logits_processor(tmp_output_ids, logits[:, -1, :])[0]
        else:
            last_token_logits = logits[0, -1, :]


        if temperature < 1e-5 or top_p < 1e-8:  # greedy
            _, indices = torch.topk(last_token_logits, 2)
            tokens = [int(index) for index in indices.tolist()]
        else:
            probs = torch.softmax(last_token_logits, dim=-1)
            indices = torch.multinomial(probs, num_samples=2)
            tokens = [int(token) for token in indices.tolist()]
        token = tokens[0]
        output_ids.append(token)

        if token in stop_token_ids:
            stopped = True
        else:
            stopped = False
        if stopped: break

    tmp_output_ids = output_ids[input_echo_len:]
    rfind_start = 0

    output = tokenizer.decode(
        tmp_output_ids,
        skip_special_tokens=True,
        spaces_between_special_tokens=False,
        clean_up_tokenization_spaces=True,
    )
    return output

model_path = 'pretrain_weights/fast_t5'
model, tokenizer = load_model(
    model_path,
    "cuda",
    1,
    load_8bit=False
)
context_len = get_context_length(model.config)

In [1]:
import pandas as pd
import h5py

In [2]:
whole_sentense = pd.read_csv("data/unarXive.clear/unarXive.clear.sections.csv")

In [18]:
print(whole_sentense.iloc[249]['section'])

Input Flipping. We flip an image along its horizontal axis. 


In [5]:
lengths = [len(l.split()) for l in whole_sentense['section']]

In [24]:
from mltool.visualization import *

In [22]:
sum((length>128) * (length<2048))

3523960

In [8]:
import numpy as np

In [9]:
length = np.array(lengths)

In [10]:
sum(length>800)

30076

In [5]:
print("loading csv files...........")
sentense_ids = pd.read_csv("data/unarXive.clear/unarXive.clear.sections.id.csv")
print("done~!")

loading csv files...........
done~!


In [10]:
hdf5_file = 'data/unarXive.clear/unarXive.clear.sections.h5'
f = h5py.File(hdf5_file, 'r')

In [13]:
sentenses = []
for _id in range(1000):
    data  = sentense_ids.iloc[_id]
    paper_id   = data['paper_id']
    sentence_id= data['section_num']
    sentense = f.get(f'{paper_id}/{sentence_id}')[()].decode('utf-8')
    sentenses.append(sentense)

In [18]:
import time

In [21]:
from tqdm.notebook import tqdm

In [100]:
tokenizer.padding_side='left'
prompts    = [" ".join(get_prompt(i).split()) for i in range(500,500+3)]
input_ids_f= tokenizer(prompts).input_ids
inputs_ids = tokenizer.pad({'input_ids': input_ids_f},padding='longest',
                            max_length=512,pad_to_multiple_of=8,return_attention_mask=False,
                            return_tensors='pt').input_ids.cuda()

In [115]:
output_ids.cpu().numpy().tolist()

[[0, 7185, 5, 32103, 1]]

In [142]:
def get_prompt(_id,max_length=128):
    data  = sentense_ids.iloc[_id]
    paper_id   = data['paper_id']
    sentence_id= data['section_num']
    sentense = f.get(f'{paper_id}/{sentence_id}')[()].decode('utf-8')
    sentense = " ".join(sentense.split(" ")[:max_length])
    conv = get_conversation_template(model_path)
    qs = f"""Read below sentence and tell me its type. The answer should be one word and is one of type from ['Author List', 'Reference List', 'Content']. There is the sentence \"{sentense}\" """
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()
    return prompt

In [144]:
start = time.time()
B = 1
idxes = range(5000,5100)
outputs = []
for _id in tqdm(idxes):
    tokenizer.padding_side='left'
    prompts    = [" ".join(get_prompt(i).split()) for i in range(B*_id,B*(_id+1))]
    input_ids_f= tokenizer(prompts).input_ids
    inputs_ids = tokenizer.pad({'input_ids': input_ids_f},padding='longest',
                                max_length=512,pad_to_multiple_of=8,return_attention_mask=False,
                                return_tensors='pt').input_ids.cuda()
    with torch.no_grad():
        output_ids = model.generate(inputs_ids,max_length=1)
    outputs.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))
end = time.time()
cost= (end - start)/(B*len(idxes))
print(cost)

  0%|          | 0/100 [00:00<?, ?it/s]

0.213057804107666


In [None]:
# import pandas as pd
# from tqdm.notebook import tqdm
# import pandas as pd
# import h5py

# sentenses = pd.read_csv("data/unarXive.clear/unarXive.clear.sections.csv")

# lengths   = [len(l.split()) for l in sentenses['section']]
# sentenses['length'] = lengths
# sentenses = sentenses[sentenses['length']>100]
# sentenses = sentenses[sentenses['length']<1000]

# df = sentenses[['paper_id','section_num']]
# df.to_csv("data/unarXive.clear/unarXive.clear.sections.good.id.csv", index=True)

In [24]:
len(df)

4755699

In [11]:
sentenses = []
for _id in range(1000):
    data  = df.iloc[_id]
    paper_id   = data['paper_id']
    sentence_id= data['section_num']
    sentense = f.get(f'{paper_id}/{sentence_id}')[()].decode('utf-8')
    sentenses.append(sentense)

In [23]:
sentenses[20]

'Next, we study RMSE of SSH for different antenna spacing in the presence of intense water vapor. We set SNR = 5 dB, DoA = 60o, and water vapor density to $10 \\frac{g}{m^3}$ . As Fig. REF demonstrates SSH with wider antenna spacing performs significantly better in the presence of intense water vapor. Nonetheless, as range increases the performance of SSH regardless of antenna spacing deterioration due to the harsh frequency selective attenuation of the channel. Additionally, even in the presence of expensively harsh frequency selective channel, SSH with 5 mm antenna spacing can achieve better than 3o accuracy at the range of 1800 m. [Figure.16 of ArXiv:2108.04932]'

In [1]:
# df = sentenses

# hdf5_file = 'data/unarXive.clear/unarXive.clear.sections.good.h5'
# with h5py.File(hdf5_file, 'w') as f:
#     for _, row in tqdm(df.iterrows(),total = len(df)):
#         paper_id    = row['paper_id']
#         sentence_id = row['section_num']
#         sentence    = row['section']

#         # Create a group for each paper_id if it doesn't exist
#         if str(paper_id) not in f:
#             f.create_group(str(paper_id))

#         # Store the sentence as a dataset
#         f[f'{paper_id}'].create_dataset(str(sentence_id), data=sentence)

# Vicuna

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [9]:
import torch
from transformers import AutoTokenizer,AutoModelForCausalLM,LlamaForCausalLM
from fastchat.conversation import get_conv_template

In [4]:
model = LlamaForCausalLM.from_pretrained(
        "pretrain_weights/vicuna-7b-v1.1", low_cpu_mem_usage=True, torch_dtype=torch.float16
).cuda()

pretrain_weights/vicuna-7b-v1.1
pretrain_weights/vicuna-7b-v1.1/pytorch_model.bin
False
pretrain_weights/vicuna-7b-v1.1/pytorch_model.bin.index.json
True


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
from art.bits4llama.llama import load_quant

In [4]:
device = f"cuda:0"

In [5]:
model = load_quant("pretrain_weights/vicuna-7b-v1.1", "pretrain_weights/vicuna-7b-v1.1/vicuna7b-4bit-128g.pt", 4, 128,device=device).to(device)

Loading model ...
Found 3 unique KN Linear values.
Warming up autotune cache ...


100%|██████████| 12/12 [00:29<00:00,  2.49s/it]


Found 1 unique fused mlp KN values.
Warming up autotune cache ...


100%|██████████| 12/12 [00:17<00:00,  1.50s/it]


Done.


In [7]:
import transformers

In [10]:
tokenizer = AutoTokenizer.from_pretrained("pretrain_weights/vicuna-7b-v1.1", use_fast=False)
conv = get_conv_template('vicuna_v1.1').copy()
conv.append_message(conv.roles[0], 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions. USER: What happens to you if you eat watermelon seeds? ASSISTANT:')
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer([prompt,prompt]).input_ids
input_ids = torch.as_tensor(input_ids).cuda()

In [7]:
label_ids = tokenizer(["Who is the main character of anarcho-capitalists"+tokenizer.eos_token,
                       "Where is the important place of anarcho-capitalists"+tokenizer.eos_token]).input_ids
label_ids = torch.as_tensor(label_ids).cuda()

In [5]:
# from fastchat.model.compression import load_compress_model
# model = load_compress_model(model_path="pretrain_weights/vicuna-7b-v1.1", device=f'cuda:0', torch_dtype=torch.float16)

In [13]:
with torch.no_grad():
    output_ids = model.generate(input_ids,max_length=512)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: What happens to you if you eat watermelon seeds? ASSISTANT: ASSISTANT: Eating watermelon seeds is generally safe, but they can cause some minor discomfort and potential health issues if consumed in large quantities.

The seeds of watermelon are hard and can cause teeth and gum irritation if they are not properly spit out or swallowed. They can also get stuck in the throat and cause discomfort or difficulty swallowing.

Additionally, watermelon seeds contain a small amount of cyanide, which is a toxic compound. However, the amount of cyanide in watermelon seeds is very small, and the body is able to metabolize and eliminate it quickly.

It is important to note that consu

In [13]:
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
#print(tokenizer.decode(output_ids[1], skip_special_tokens=True))

A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: What happens to you if you eat watermelon seeds? ASSISTANT: ASSISTANT: Eating watermelon seeds is generally safe for most people. However, there are a few potential side effects that some people may experience.

One potential side effect of eating watermelon seeds is digestive discomfort, such as gas and bloating. This is because watermelon seeds contain oils that can irritate the digestive system and cause these symptoms.

Another potential side effect of eating watermelon seeds is choking. This is because the seeds can get stuck in the throat or esophagus and cause difficulty swallowing or breathing.

It's important to note that these side effects are generally mild a