In [1]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import pandas as pd
import torch.optim as optim

In [2]:
device = ("cuda" if torch.cuda.is_available() else "cpu" )

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL="microsoft/phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL).to(device)

In [4]:
from transformers import pipeline, set_seed

set_seed(32)
generator = pipeline('text-generation', model="microsoft/phi-1_5", do_sample=True, num_return_sequences=5, device=device)
generator("The man worked as a")




[{'generated_text': 'The man worked as a mail carrier and delivered letters and packages to different homes and businesses. He had'},
 {'generated_text': 'The man worked as a librarian, and he loved his job.\n\nHe enjoyed curating'},
 {'generated_text': 'The man worked as a weaver, crafting cloth from the fibers of plants and animals. He was'},
 {'generated_text': 'The man worked as a doctor and was passionate about healing people. But lately, he had been facing'},
 {'generated_text': 'The man worked as a salesman, but the position was lost.\n\nThe cat was able'}]

In [5]:
class textDataset(Dataset):
    def __init__(self, tokenizer, text, max_len):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        tokenizer.pad_token = tokenizer.eos_token
        self.text = text
        self.sequence = []
        for book in text:
            tokenized_output=self.tokenizer(str(book) + self.eos,
                                        truncation=True,
                                        padding='max_length',
                                        max_length=max_len)
            self.sequence.append(torch.tensor(tokenized_output['input_ids']))

    def __len__(self):
        return len(self.sequence)

    def __getitem__(self, item):
        return self.sequence[item]

In [6]:
df = pd.read_excel("data-excel.xlsx")
required_columns = ['headline', 'headline sentiment analysis', 'text', 'byline location','editorial notes','news value [nv] assessment']
df = df[required_columns]
df = df.fillna("")
df = df.astype(str)


In [7]:
df

Unnamed: 0,headline,headline sentiment analysis,text,byline location,editorial notes,news value [nv] assessment
0,"Pregnant woman, baby die after Russian bombing...",Negative,A wounded pregnant woman who was taken on a st...,"Mariupol, Ukraine","This story holds high news value, as the Assoc...",3
1,"Inside North Korea: “We are stuck, waiting to ...",Negative,"For months, the BBC has been communicating in ...","Seoul, South Korea","This story was of extremely high value, due to...",3
2,Blair: Why Saddam and his weapons have to be s...,Positive,The prime minister cites intelligence in setti...,"London, UK","This article, written by the Prime Minister, i...",1
3,Andrew Tate and brother Tristan can be extradi...,Neutral,Controversial influencer Andrew Tate and his b...,,The coverage of the court case involving these...,-3
4,The secret deal that saved the Barclays Was th...,Negative,In March 13th Britain’s government said that i...,,This extensively researched article by The Eco...,2
...,...,...,...,...,...,...
1058,Inside Biden’s Anti-Trump Battle Plan (and Whe...,Negative,As former President Donald J. Trump speeds tow...,Washington DC and New York,Journalists must prioritize substance over noi...,-1
1059,The 85-year-old student doing her fourth degree,Positive,An 85-year-old student who is working towards ...,,Considering the involvement of Cotswold PR com...,-1
1060,Fresh volcanic eruption triggers evacuation in...,Negative,A volcanic eruption has begun on the Reykjanes...,"Reykjavík, Iceland",The relevance of the volcanic eruption in Icel...,1
1061,A Dream of Secular Utopia in ISIS’ Backyard At...,Positive,SUMMARY: This article chronicles the author's ...,"Qamishli, Syria","This long form article ""A Dream of Secular Uto...",-1


In [8]:
DEFAULT_SYSTEM_PROMPT = """
Below is an instruction that describes a task. Write a response that appropriately completes the request.\n
""".strip()


def generate_training_prompt(
    text: str, sentiment: str, notes: str, rank:str, location:str
) -> str:
    return f"""{DEFAULT_SYSTEM_PROMPT}

### Article:
{text}

### Sentiment:
{sentiment}

### editorial-notes:
{notes}

### Ranking:
{rank}
### Location:
{location}
""".strip()

In [9]:
traindata = df.iloc[:980]
testdata = df.iloc[980:]

In [10]:
textTrain = []
for i in range(len(traindata)):
    input_prompt = generate_training_prompt(
        text=traindata['text'].iloc[i],
        sentiment=traindata['headline sentiment analysis'].iloc[i],
        notes=traindata['editorial notes'].iloc[i],
        rank=traindata['news value [nv] assessment'].iloc[i],
        location=traindata['byline location'].iloc[i]
    )
    textTrain.append(input_prompt)

In [11]:
len(textTrain)

980

In [12]:
textTest = []
for i in range(len(testdata)):
    input_prompt = generate_training_prompt(
        text=testdata['text'].iloc[i],
        sentiment=testdata['headline sentiment analysis'].iloc[i],
        notes=testdata['editorial notes'].iloc[i],
        rank=testdata['news value [nv] assessment'].iloc[i],
        location=testdata['byline location'].iloc[i]
    )
    textTest.append(input_prompt)

In [13]:
len(textTest)

83

In [14]:
len(textTest[0].split())

1335

In [15]:
def train(dataloader, model, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, X in enumerate(dataloader):
        X = X.to(device)
        pred = model(X, labels=X)
        loss = pred.loss

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if batch % 1000 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            try:
                file = open('loss_pretrain.txt', 'a')
                file.write(f"Training loss: {loss:>7f}  [{current:>5d}/{size:>5d}]\n")
                file.close()
            except:
                print('file not found')
        del X
        torch.cuda.empty_cache()
    model.save_pretrained("optnews")
    

In [16]:
def test(dataloader, model):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X in dataloader:
            X = X.to(device)
            pred = model(X, labels=X)
            test_loss += pred.loss
    test_loss /= num_batches
    try:
        file = open('loss_pretrain.txt', 'a')
        file.write(f"Validation Error: \nAvg loss: {test_loss:>8f} \n")
        file.close()
    except:
        print('file not found')
    print(f"Validation Error: \nAvg loss: {test_loss:>8f} \n")

In [17]:
class ClearCache:
    def __enter__(self):
        torch.cuda.empty_cache()

    def __exit__(self, exc_type, exc_val, exc_tb):
        torch.cuda.empty_cache()

In [18]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [19]:
datasetTrain = textDataset(tokenizer, textTrain, 1800)
datasetTest = textDataset(tokenizer, textTest, 1800)

In [20]:
train_dataloader = DataLoader(datasetTrain, batch_size=2, shuffle=True)
test_dataloader = DataLoader(datasetTest, batch_size=2, shuffle=True)

In [None]:
with ClearCache():
    epochs = 20
    for t in range(epochs):
        file = open('loss_pretrain.txt', 'a')
        file.write(f"Epoch {t+1}\n-------------------------------\n")
        file.close()
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_dataloader, model, optimizer)
        test(test_dataloader, model)

    print("Done!")

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Epoch 1
-------------------------------
loss: 5.896377  [    2/  980]
Validation Error: 
Avg loss: 1.551274 

Epoch 2
-------------------------------
loss: 0.984500  [    2/  980]
Validation Error: 
Avg loss: 1.558285 

Epoch 3
-------------------------------
loss: 0.978207  [    2/  980]


In [28]:

def generate_query_prompt(
    text: str) -> str:
    return f"""{DEFAULT_SYSTEM_PROMPT}

### Article:
{text}

### Sentiment:

""".strip()



In [34]:
qry = generate_query_prompt(testdata['text'].iloc[11])

In [35]:
qry



In [36]:
generation_config = model.generation_config
generation_config.max_new_tokens = 700
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id


In [38]:

# model.save_pretrained("optnews")


ValueError: The generation config instance is invalid -- `.validate()` throws warnings and/or exceptions. Fix these issues to save the configuration.

Thrown during validation:
[UserWarning('`do_sample` is set to `False`. However, `temperature` is set to `0.7` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.'), UserWarning('`do_sample` is set to `False`. However, `top_p` is set to `0.7` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.')]

In [37]:
%%time

prompt = qry
device = "cuda"
encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
  outputs = model.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      generation_config = generation_config
  )

print(tokenizer.decode(outputs[0], skip_special_tokens=True, skip_prompt=True))

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Article:

### Sentiment:
Negative

### editorial-notes:
The Telegraph article on the Conservative party conference on gender and trans issues is primarily a commentary piece that lacks depth and substance. It primarily serves as a form of entertainment and diversion for the public, diverting attention away from more important and pressing matters. While it may provide brief amusement or distraction, it does not offer valuable insights or contribute significantly to public discourse or understanding.

### Ranking:
-2
### Location:
CPU times: user 1.7 s, sys: 160 ms, total: 1.86 s
Wall time: 1.86 s
