In [2]:
!pip install datasets bitsandbytes spacy trl

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12=

In [None]:
!pip install -U bitsandbytes transformers trl

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer
from datasets import load_dataset, Dataset
import torch
import os
import random
import spacy
import json
from tqdm import tqdm

In [18]:
random.seed(10)
torch.cuda.empty_cache()
local_dir = './drive/MyDrive'
model_name = "meta-llama/Llama-2-7b-hf"
sample100rows = True  # 100rows train data

In [3]:
hf_access_token = "hf_aaa"
os.environ["HF_ACCESS_TOKEN"] = hf_access_token

from huggingface_hub import login

# 通过代码输入 Token
login(token=hf_access_token)

In [None]:
# Step1: datasets

In [4]:
dataset = load_dataset("PaulAdversarial/all_news_finance_sm_1h2023")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

all_news.csv:   0%|          | 0.00/1.63M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5062 [00:00<?, ? examples/s]

In [5]:
fields = dataset["train"].features.keys()
fields

dict_keys(['_id', 'main_domain', 'title', 'description', 'created_at'])

In [6]:
dataset["train"][:2]

{'_id': ['6453d70d358e80adbfc4cb2b', '6453cf909a78e3af538abe44'],
 'main_domain': ['cnbc.com', 'cointelegraph.com'],
 'title': ['Dow drops 400 points, turns negative for the year as bank fears grow: Live updates',
  'Bitcoin drops with stocks as analyst warns of banking crisis ‘endgame’'],
 'description': ['Regional banks led the broader market lower as contagion fears resurfaced.',
  'Bitcoin dips as the U.S. banking crisis engulfs more lenders, BTC price falling in line with stocks.'],
 'created_at': ['2023-05-04T16:01:46.448Z', '2023-05-04T15:25:28.809Z']}

In [7]:
# check
from collections import Counter
count = Counter(dataset["train"]["main_domain"])
reversed_count = sorted(count.items(), key=lambda x:x[1], reverse=True)
print(len(reversed_count))
reversed_count[:5]

397


[('bloomberg.com', 675),
 ('reuters.com', 584),
 ('wsj.com', 388),
 ('cointelegraph.com', 362),
 ('theguardian.com', 333)]

In [8]:
# refer to main_domain_class.txt
# domain_class_dict is map from main_domain to class
with open(os.path.join(local_dir, 'main_domain_class.txt'), 'r') as f:
    lines = f.readlines()
print(lines[:3])
domain_class_dict = {}  # k:domain, value:class
for line in lines:
    dom, cls = line.split(' - ')
    domain_class_dict[dom.strip()] = cls.strip()
print(len(domain_class_dict))

['cnbc.com - Finance and Business News\n', 'cointelegraph.com - Cryptocurrency and Blockchain\n', 'co.uk - General News\n']
397


In [9]:
# class_count
from collections import defaultdict
class_count = defaultdict(int)

for item in reversed_count:
    if item[0] not in domain_class_dict:
        print('ERROR:', item[0])
        continue
    else:
        class_count[domain_class_dict[item[0]]] +=  item[1]
class_count

defaultdict(int,
            {'Finance and Business News': 1963,
             'General News': 1815,
             'Cryptocurrency and Blockchain': 1012,
             'Politics and Government': 102,
             'Technology and Innovation': 66,
             'Health and Medicine': 15,
             'Law and Policy Analysis': 8,
             'Real Estate': 6,
             'Environment and Sustainability': 22,
             'Society and Culture': 49,
             'Military and Defense': 4})

In [10]:
class_count.keys()

dict_keys(['Finance and Business News', 'General News', 'Cryptocurrency and Blockchain', 'Politics and Government', 'Technology and Innovation', 'Health and Medicine', 'Law and Policy Analysis', 'Real Estate', 'Environment and Sustainability', 'Society and Culture', 'Military and Defense'])

In [None]:
# As above, choose the top three topics as the evaluate topics.
# 'Finance and Business News': 1963, 'General News': 1815, 'Cryptocurrency and Blockchain': 1012

In [None]:
# Step2: collate data

In [None]:
# example:
# {"prompt": "<prompt text>", "completion": "<ideal generated text>"}

In [11]:
# template
# format 0-words, 1-topic
prompt_list = [
    "Generate a {0}-word article related to {1}",
    "Compose a news article about {1}, with around {0} words.",
    "Draft a news article on the topic of {1}, about {0} words.",
    "Write a {0}-word brief on {1}.",
    "Create a {0}-word summary on {1}.",
    "Produce a {0}-word analysis of {1}.",
    "Generate a news article snippet about {1}, around {0} words.",
    "Write a {0}-word commentary on {1}."
]


In [12]:
data_train = dataset['train']
print(data_train)
print(data_train.features.keys())
for field in data_train.features:
    assert len(data_train[field]) == len(data_train)
    print(field, data_train[field])

Dataset({
    features: ['_id', 'main_domain', 'title', 'description', 'created_at'],
    num_rows: 5062
})
dict_keys(['_id', 'main_domain', 'title', 'description', 'created_at'])
_id ['6453d70d358e80adbfc4cb2b', '6453cf909a78e3af538abe44', '6453cb87ccab8508100df076', '6453afd269f3c1643cf0a4f6', '645399d92471d73ea0976d27', '645392d7ce23fd2c7506a731', '645392d7ce23fd2c7506a730', '64539232a82412570ef28912', '645388f771e800157b64b5d5', '64537f7c90cd64f4e6d7e7b1', '6453722f01285f8f887e28ea', '6453471f81634d372a19d6fd', '64531213aef4b8dc21938f33', '6452f37390713db2f5fe79c9', '6452f1207742e7bff78f09e6', '6452dd8c5a12b5b4335b8bbe', '6452c6f9a4de3100e87534ca', '6452b065fb365e1723d97dac', '645290a755295a194e6a59e3', '64526363d0eeb9a5d5e92510', '645262b8f8fbc14bee553c5a', '645257073147864ef9895986', '645254ffbbdf40027f093c21', '64523a4f60cedbd91d590e14', '64522edf4dee96a241c2b410', '64521e54d5044ab70844b71c', '644f883c33af54b5c08445f0', '644e4487195fc43e8f130111', '644daa0e8c19f2b906eda8c4', '64

In [13]:
import pandas as pd

processed_data = []
for i in range(len(data_train)):
    row_data = []
    for field in data_train.features.keys():
        row_data.append(data_train[field][i])
    processed_data.append(row_data)

df = pd.DataFrame(processed_data, columns=list(data_train.features.keys()))

print(df.head()) #check the data


                        _id        main_domain  \
0  6453d70d358e80adbfc4cb2b           cnbc.com   
1  6453cf909a78e3af538abe44  cointelegraph.com   
2  6453cb87ccab8508100df076              co.uk   
3  6453afd269f3c1643cf0a4f6     bitcoinist.com   
4  645399d92471d73ea0976d27   seekingalpha.com   

                                               title  \
0  Dow drops 400 points, turns negative for the y...   
1  Bitcoin drops with stocks as analyst warns of ...   
2  Bitcoin Price Analysis:  29370 Tested After Su...   
3  Bitcoin Is 75% To Halving, Here's How Past Cyc...   
4  Iron Mountain FFO of $0.71 beats by $0.03, rev...   

                                         description                created_at  
0  Regional banks led the broader market lower as...  2023-05-04T16:01:46.448Z  
1  Bitcoin dips as the U.S. banking crisis engulf...  2023-05-04T15:25:28.809Z  
2  Bitcoin (BTC/USD) sought to add to recent gain...  2023-05-04T15:12:00.971Z  
3  The current Bitcoin cycle is now 75

In [14]:
# the word count of generation text
nlp = spacy.load('en_core_web_sm')
def count_word_punc(text):
    doc = nlp(text)
    words = [token.text for token in doc if token.is_alpha]
    punctuation = [token.text for token in doc if token.is_punct]
    return len(words) + len(punctuation)

In [17]:
# collection of the prompt and generation text.
def compose_sample(sample):
    main_domain = sample['main_domain']
    title = sample['title']
    description = sample['description']
    created_at = sample['created_at']

    counts = count_word_punc(title + description)

    prompt_template = random.choice(prompt_list)
    prompt = prompt_template.format(counts, domain_class_dict[main_domain])

    complete = title + '\n' + description + '\n' + 'created at: ' + created_at

    return prompt, complete

In [16]:
# format dict type.
def format_sample(sample):
    prompt_text, completion_text = compose_sample(sample)
    return {'prompt': prompt_text, 'completion': completion_text}

In [19]:
# train_data
content_samples = []
count_finan, count_cryto = 0, 0
prompt_answer_text = 'prompt_answer_100.txt' if sample100rows else 'prompt_answer.txt'

with open(os.path.join(local_dir, prompt_answer_text), 'w') as fw_pa:
    for i in range(len(df)):
        row = df.iloc[i]
        if sample100rows:
            if len(content_samples) >= 100:
               break
            if domain_class_dict[row['main_domain']] not in ['Finance and Business News', 'Cryptocurrency and Blockchain']:
                continue
            elif domain_class_dict[row['main_domain']] == 'Finance and Business News':
                if count_finan <= 50: count_finan += 1
                else: continue
            elif domain_class_dict[row['main_domain']] == 'Cryptocurrency and Blockchain':
                if count_cryto <= 50: count_cryto += 1
                else: continue
        formatted_text = format_sample(row)
        content_samples.append(formatted_text)
    if sample100rows:
        assert len(content_samples) == 100
    json.dump(content_samples, fw_pa)

print(len(content_samples))

100


In [None]:
# quantitative mode

In [None]:
# # tokenizer
# tokenized_samples = []
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# for i in range(len(df)):
#     row = df.iloc[i]
#     formatted_text = format_sample(row)

#     tokenized_sample = tokenizer(formatted_text, truncation=True, padding="max_length", max_length=2048)
#     tokenized_samples.append(tokenized_sample)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-132-fed1160225b0>", line 7, in <cell line: 0>
    formatted_text = format_sample(row)
                     ^^^^^^^^^^^^^^^^^^
  File "<ipython-input-128-a7c0180de35b>", line 3, in format_sample
    prompt_text, completion_text = compose_sample(sample)
                                   ^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-127-d122bd55bb53>", line 7, in compose_sample
    counts = count_word_punc(title + description)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-122-85b713afebef>", line 2, in count_word_punc
    nlp = spacy.load('en_core_web_sm')
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/spacy/__init__.py", line 51, in load
    return util.load_model(
           ^^^^^^^^^^^^^^^^
  File "/usr/l

TypeError: object of type 'NoneType' has no len()

In [None]:
# step4 tokenizer

In [20]:
with open(os.path.join(local_dir, prompt_answer_text), 'r') as fr_pa:
    prompt_complete_dict_list = json.load(fr_pa)
    print(len(prompt_complete_dict_list), type(prompt_complete_dict_list[0]))

100 <class 'dict'>


In [None]:
# !!!!加一个数据分析，统计prompt和completion的字的个数
# 小于一定比例的过滤

In [21]:
# train_data
system_prompt = 'You are a financial news editor'
def format_llama2_chat(system_prompt, prompt, completion):
    return f"""<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{prompt} [/INST] {completion} </s>"""

In [22]:
# 数据集，只需跑一次
train_data = []
for sample in tqdm(prompt_complete_dict_list):
    prompt_text = sample['prompt']
    completion_text = sample['completion']
    prompt_completion_str = format_llama2_chat(system_prompt, prompt_text, completion_text)
    train_data.append(prompt_completion_str)

sorted_len_train_data = sorted(train_data, key=len)

assert len(train_data) == len(prompt_complete_dict_list) == len(sorted_len_train_data)

train_data_txt = 'train_data_100.txt' if sample100rows else 'train_data.txt'
train_data_sorted_len_txt = 'train_data_sorted_len_100.txt' if sample100rows else 'train_data_sorted_len.txt'

with open(os.path.join(local_dir, train_data_txt), 'w') as fw_td:
    json.dump(train_data, fw_td)

with open(os.path.join(local_dir, train_data_sorted_len_txt), 'w') as fw_tdsl:
    json.dump(sorted_len_train_data, fw_tdsl)


100%|██████████| 100/100 [00:00<00:00, 179243.76it/s]
