In [23]:
import os
import torch
from datasets import load_dataset, Dataset
import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    AutoModel,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi
import random
import numpy as np
from typing import Literal, Optional, TypedDict
from sklearn.model_selection import train_test_split
import json

In [12]:
class URLCiteDataset(torch.utils.data.Dataset):
    '''
    create dataset
    - init
    - len
    - getitem
    '''
    def __init__(self, texts: list[str]):
        self.texts = texts

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx]

In [11]:
!nvidia-smi

Mon Dec 30 02:16:29 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-PCIE-32GB           On  | 00000000:18:00.0 Off |                    0 |
| N/A   31C    P0              22W / 250W |      0MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE-32GB           On  | 00000000:AF:00.0 Off |  

In [13]:
csv_dataset = pd.read_csv("/data/group1/z40436a/ME/URL_Citation_Classification_Intermediate/data/all_data.csv", encoding="utf-8")

seed = 111 # fixed
train_df, eval_df = train_test_split(csv_dataset, test_size = 0.1, random_state=seed)
print("train_data_size:::", len(train_df))
print("test_data_size:::", len(eval_df))

train_data_size::: 2690
test_data_size::: 299


In [14]:
import nltk
import re

CITE_TOKEN = "[URL_CITE]"

def replace_tag(sentences: pd.Series) -> list[str]:
    # replace [Cite_****] to [Cite] token
    rule = re.compile(r'\[Cite[^\[\] ]*\]')
    sentences_replaced:list[str] = list()
    for sentence in sentences:
        sentences_replaced.append(rule.sub(CITE_TOKEN, sentence))

    return sentences_replaced

def get_3sent(paragraphs:list[str]) -> list[str]:
    ret:list[list[str]] = list()
    for paragraph in paragraphs:
        sentences: list[str] = nltk.sent_tokenize(paragraph)
        if not len(sentences):
            print('!!!')
        if len(sentences) < 4:
            ret.append(sentences)
            continue
        else:
            for i in range(len(sentences)):
                if CITE_TOKEN in sentences[i]:
                    if i == 0:
                        ret.append(sentences[i:i+2])
                    elif i == len(sentences)-1:
                        ret.append(sentences[i-1:i+1])
                    else:
                        ret.append(sentences[i-1:i+2])
                    break
                if i == len(sentences)-1:
                    # print(sentences)
                    pass
    cont_3sent = [" ".join(sent) for sent in ret]
    return cont_3sent

In [26]:
def read_icl(file_path:str) -> list[list[int]]:
    '''
    return icl_idx top-k (from left)
    '''
    icl_idxs = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line == '\n':
                break
            icl_idxs.append(json.loads(line))
    return icl_idxs

In [30]:
icl_path = f"/data/group1/z40436a/ME/URL_Citation_Classification_Intermediate/icl/random/{str(seed)}.txt"
icl_idxs = read_icl(icl_path)

In [80]:
def create_inst(train_df:pd.DataFrame, test_df:pd.DataFrame, icl_method:str, k:int=5) -> list[str]:
    texts: list[str] = []

    icl_idxs = read_icl(f"/data/group1/z40436a/ME/URL_Citation_Classification_Intermediate/icl/{icl_method}/{str(seed)}.txt")
    
    train_replaced_sentences = replace_tag(train_df['citation-paragraph'])
    train_conts = get_3sent(train_replaced_sentences)

    test_replaced_sentences = replace_tag(test_df['citation-paragraph'])
    test_conts = get_3sent(test_replaced_sentences)

    for test_cont, (i, row) in zip(test_conts, test_df.iterrows()):
        reset_idx = 0
        instruction = [
            {"role":"System", "content": f"""Your task is to classify the type of artifact (TYPE) reffered to the URL and the citation reason (FUNCTION). I will provide you with a URL and citation context, section titles.\n
Here is the classification schema for the artifact type:
1. Tool: toolkit, software, system
2. Code: codebase, library, API
3. Dataset: corpus, image, sets
4. Knowledge: lexicon, knowledge graph
5. DataSource: source data for the Dataset/Knowledge
6. Document: specifications, guidelines
7. Paper: scholarly papers
8. Media: games, music, videos
9. Website: services, homepages
10. Mixed: citations referring to multiple resources
    
Here is the classification schema for the citation reason:
1. Use: Used in the citing paper’s research
2. Produce: First produced or released by the citing paper’s research
3. Compare: Compared with other resources
4. Extend: Used in the citing paper’s research but are improved, upgraded, or changed during the research
5. Introduce: The resources or the related information
6. Other: The URL citation does not belong to the above categories"""}
        ]

        if k == 0:
            pass
        elif k > 0 and k <=5:
            for top_k in range(k):
                icl_idx = icl_idxs[reset_idx][top_k]
                icl_df = train_df.iloc[icl_idx]
                # print(icl_df)
                icl_input = f"""Please classify the artifact type and the citation reason for the following URL and citation sentence.
URL: {icl_df['url']}
Citation Context: {train_conts[icl_idx]}
Footnote or Reference text (if exists): {icl_df['citation-info']}
Section Titles (if exists): {icl_df['passage-title']}"""
                instruction.append({"role":"user", "content": icl_input})
                instruction.append({"role":"assistant", "content": f"""TYPE: {icl_df['type']}\nFUNCTION: {row['function'].split("（")[0]}"""})
        else:
            print("error")

        test_input = f"""Please classify the artifact type and the citation reason for the following URL and citation sentence.
URL: {row['url']}
Citation Context: {test_cont}
Footnote or Reference text (if exists): {row['citation-info']}
Section Titles (if exists): {row['passage-title']}"""
        instruction.append({"user": test_input})

        reset_idx += 1

        texts.append(instruction)
    return texts

In [84]:
### test_code create_inst
k = 5
random_icl = create_inst(train_df, eval_df, "random", k)
bm25_icl = create_inst(train_df, eval_df, "bm25", k)
encoder_icl = create_inst(train_df, eval_df, "encoder", k)

if len(random_icl) == len(bm25_icl) == len(encoder_icl):
    print(f"len is OK!, len is {len(random_icl)}")

if

len is OK!, len is 299


In [82]:
texts = create_inst(train_df, eval_df, "random", k=5)[0]
print(texts)
for part in texts:
    print(part['content'])

[{'role': 'System', 'content': 'Your task is to classify the type of artifact (TYPE) reffered to the URL and the citation reason (FUNCTION). I will provide you with a URL and citation context, section titles.\n\nHere is the classification schema for the artifact type:\n1. Tool: toolkit, software, system\n2. Code: codebase, library, API\n3. Dataset: corpus, image, sets\n4. Knowledge: lexicon, knowledge graph\n5. DataSource: source data for the Dataset/Knowledge\n6. Document: specifications, guidelines\n7. Paper: scholarly papers\n8. Media: games, music, videos\n9. Website: services, homepages\n10. Mixed: citations referring to multiple resources\n    \nHere is the classification schema for the citation reason:\n1. Use: Used in the citing paper’s research\n2. Produce: First produced or released by the citing paper’s research\n3. Compare: Compared with other resources\n4. Extend: Used in the citing paper’s research but are improved, upgraded, or changed during the research\n5. Introduce: 

KeyError: 'content'

In [None]:
model_name = "meta-llama/Llama-3.1-8B-Instruct"
pipeline = pipeline(
    "text-generation",
    model=model_name,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
    max_new_tokens = 100
)

In [None]:
pipeline

In [90]:
import json
output_dir = f"/data/group1/z40436a/ME/URL_Citation_Classification_Intermediate/icl/{ICL_METHOD}"
with open(f"{output_dir}/{str(seed)}.txt", "w") as jsonl_file:
    for icl in icls:
        json.dump(icl, jsonl_file)
        jsonl_file.write("\n")