In [None]:
# Mount Google Drive at /content/drive; the dataset and output paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so that padding="max_length" works.
tokenizer.pad_token = tokenizer.eos_token

# Load and preprocess data
json_directory = "/content/drive/MyDrive/focus"

# os.listdir returns entries in arbitrary order. Sort them so the concatenated frame,
# and therefore the seeded train/validation split below, is the same on every run.
json_paths = sorted(
    os.path.join(json_directory, file_name)
    for file_name in os.listdir(json_directory)
)

# One frame per regular file (sub-directories are skipped). Rows without a summary are
# dropped, then columns 1 and 2 are kept by position.
# NOTE(review): the rename below expects those two columns to be 'Section Text' and
# 'Gemini Summary' -- confirm every input file shares that layout.
dfs = [
    pd.read_json(json_path, encoding='latin-1').dropna(subset=['Gemini Summary']).iloc[:, 1:3]
    for json_path in json_paths
    if os.path.isfile(json_path)
]
consolidated_df = pd.concat(dfs, ignore_index=True)

# Rename columns: 'text' is the target summary, 'ctext' is the source section.
consolidated_df = consolidated_df.rename(columns={'Gemini Summary': 'text', 'Section Text': 'ctext'})

# Add prefix to source text
consolidated_df.ctext = 'summarize: ' + consolidated_df.ctext

# Split data into train and validation sets (80/20, fixed seed).
train_data, val_data = train_test_split(consolidated_df, train_size=0.8, random_state=42)
# Take independent copies so that adding columns to the splits later does not raise
# pandas' chained-assignment warning.
train_data, val_data = train_data.copy(), val_data.copy()


# Define dataset class
class CustomDataset(Dataset):
    """Tokenise (source, summary) rows of a DataFrame into fixed-length tensors.

    Each item is a dict with ``input_ids`` and ``attention_mask`` built from the
    row's ``'ctext'`` value and ``labels`` built from its ``'text'`` value. Every
    tensor is 1-D with ``max_length`` entries.

    NOTE(review): the labels are tokenised independently of ``input_ids`` and are
    padded with the tokenizer's pad id rather than an ignore index. Confirm this
    is what the training objective expects.

    Args:
        data: DataFrame with ``'ctext'`` and ``'text'`` columns.
        tokenizer: Callable that accepts ``(text, max_length=, padding=,
            truncation=, return_tensors=)`` and returns a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors of shape
            ``(1, max_length)``.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenise one string, padded or truncated to ``max_length``."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        # Fetch the row once instead of once per column.
        row = self.data.iloc[idx]
        input_encoding = self._encode(row['ctext'])
        target_encoding = self._encode(row['text'])

        # squeeze(0) removes only the leading batch dimension added by
        # return_tensors="pt". A bare squeeze() would also drop the sequence
        # dimension when max_length == 1 and return a 0-d tensor.
        return {
            "input_ids": input_encoding["input_ids"].squeeze(0),
            "attention_mask": input_encoding["attention_mask"].squeeze(0),
            "labels": target_encoding["input_ids"].squeeze(0),
        }

# Wrap both splits in the tokenising dataset; every sequence is padded or truncated to 512 tokens.
train_dataset = CustomDataset(data=train_data, tokenizer=tokenizer, max_length=512)
val_dataset = CustomDataset(data=val_data, tokenizer=tokenizer, max_length=512)

# Batches of two samples; only the training split is reshuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=2, shuffle=False)

# Start from the pretrained base GPT-2 checkpoint.
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Keep the embedding matrix in step with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Optimiser over all model parameters, plus a stand-alone cross-entropy criterion.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-4)
loss_fn = torch.nn.CrossEntropyLoss()


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Put the model's parameters on the chosen device.
model.to(device)

# Report which device was selected.
print(f"Using device: {device}")


Using device: cuda


### Compute ROUGE scores during each training epoch

In [None]:
# Installing packages
!pip install rouge-score

import datetime
from openpyxl import Workbook, load_workbook


# Normalise a summary before it is compared with ROUGE
def rouge_preprocessing(summary_text):
    """Lower-case, de-punctuate, tokenise and stop-word-filter a summary.

    Args:
        summary_text: Raw summary string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Lower-case first, then delete every character that is neither a word
    # character nor whitespace.
    cleaned_text = re.sub(r'[^\w\s]', '', summary_text.lower())

    tokens = word_tokenize(cleaned_text)

    # English stop words are looked up on every call, after tokenisation.
    english_stop_words = set(stopwords.words('english'))

    return ' '.join(token for token in tokens if token not in english_stop_words)

# Generate rouge scores from summaries
from rouge_score import rouge_scorer

def generate_rouge_scores(reference_summary, candidate_summary):
    """Score a candidate summary against a reference with ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        reference_summary: Ground-truth summary text.
        candidate_summary: Generated summary text to evaluate.

    Returns:
        A 9-tuple ``(rouge1_precision, rouge1_recall, rouge1_fmeasure,
        rouge2_precision, rouge2_recall, rouge2_fmeasure,
        rougeL_precision, rougeL_recall, rougeL_fmeasure)``.
    """
    # Initialize Rouge scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # RougeScorer.score expects (target, prediction). Passing the candidate first
    # swaps precision and recall in every reported metric.
    scores = scorer.score(reference_summary, candidate_summary)

    # Flatten to a fixed order: metric family first, then precision, recall, F-measure.
    return tuple(
        getattr(scores[rouge_type], metric_name)
        for rouge_type in ('rouge1', 'rouge2', 'rougeL')
        for metric_name in ('precision', 'recall', 'fmeasure')
    )

# Export ROUGE Scores to Excel
# Function to create or load workbook and add data
def update_rouge_excel(output_directory, rouge_scores_list):
    """Append ROUGE score rows to ``<output_directory>.xlsx``, creating it if missing.

    Args:
        output_directory: Path stem of the workbook (no extension); ``".xlsx"``
            is appended. Its parent folder is created when it has one.
        rouge_scores_list: Iterable of 9-element sequences in the order returned
            by ``generate_rouge_scores``. Items 6-8 must be the ROUGE-L
            precision, recall and F-measure.

    A header row is written only when the workbook is newly created. Each data
    row gets a tenth column holding the sum of the three ROUGE-L values.
    """
    # A bare file stem has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create the parent when there is one.
    parent_directory = os.path.dirname(output_directory)
    if parent_directory:
        os.makedirs(parent_directory, exist_ok=True)

    # Define file path
    file_path = f"{output_directory}.xlsx"

    try:
        # Load existing workbook
        workbook = load_workbook(file_path)
        worksheet = workbook.active
    except FileNotFoundError:
        # If workbook doesn't exist, create a new one and add the header row
        workbook = Workbook()
        worksheet = workbook.active
        header = ["rouge1_precision", "rouge1_recall", "rouge1_fmeasure", "rouge2_precision",
                  "rouge2_recall", "rouge2_fmeasure", "rougeL_precision", "rougeL_recall", "rougeL_fmeasure", "rougeL_sum"
                 ]
        worksheet.append(header)

    for rouge_scores in rouge_scores_list:
        # Normalise to a tuple so list rows work too (list + tuple would raise TypeError).
        score_row = tuple(rouge_scores)
        # Sum of Rouge-L precision, recall, and F-measure
        rougeL_sum = score_row[6] + score_row[7] + score_row[8]
        worksheet.append(score_row + (rougeL_sum,))
    workbook.save(file_path)


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=61264377570bafeb0a7f99953924725477b39af6ecab3a0934e580bce0339fb1
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
# Drive folder that receives this run's outputs (the ROUGE workbook; the model is saved to the same path later).
# NOTE(review): this rebinds `json_directory`, which the data-loading cell uses as its input folder; re-running that cell after this one would read from here instead.
json_directory = "/content/drive/MyDrive/focus/model_3"

# Path stem of the ROUGE workbook; update_rouge_excel appends ".xlsx" and creates the file on first use.
# Columns written are the nine values from generate_rouge_scores plus a "rougeL_sum" column.
output_file_name = 'GPT2_finetuned_rouge_score'
output_directory = os.path.join(json_directory, output_file_name)


In [None]:
from tqdm import tqdm
import re  # Import the re module for regular expressions
from nltk.tokenize import word_tokenize  # Import the word_tokenize function from nltk
import nltk  # Import the nltk library
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords  # Import the stopwords from nltk

# Number of passes over the training set.
TRAIN_EPOCHS = 10

# Per-sample decoded outputs and ROUGE tuples. These lists are never reset, so after
# training they hold one entry per sample per epoch, in the order batches were seen.
generated_outputs_train = []
generated_outputs_val = []
rouge_scores_train = []
rouge_scores_val = []


def _mask_padding(label_ids):
    """Return labels with padding positions replaced by -100 so the loss ignores them.

    The dataset pads targets with ``tokenizer.pad_token_id``. Left as-is, those
    positions are scored like real tokens and dominate the average loss for
    short summaries.
    """
    masked = label_ids.masked_fill(label_ids == tokenizer.pad_token_id, -100)
    # The causal-LM loss drops the first label when it shifts. If no other position
    # survives masking the mean would be NaN, so such a batch keeps its raw labels.
    if (masked[..., 1:] != -100).any():
        return masked
    return label_ids


def _decode_and_score(logits, label_ids):
    """Decode the arg-max token at every position and ROUGE-score it against the labels.

    Returns ``(decoded_texts, rouge_tuples)`` with one entry per row of the batch.
    """
    predicted_ids = logits.detach().argmax(dim=-1)
    decoded_texts = [tokenizer.decode(row, skip_special_tokens=True) for row in predicted_ids]
    rouge_tuples = []
    for decoded_text, reference_ids in zip(decoded_texts, label_ids):
        # The unmasked labels are decoded here; -100 is not a valid token id.
        reference_text = tokenizer.decode(reference_ids, skip_special_tokens=True)
        reference_summary_processed = rouge_preprocessing(reference_text)
        generated_summary_processed = rouge_preprocessing(decoded_text)
        rouge_tuples.append(generate_rouge_scores(reference_summary_processed, generated_summary_processed))
    return decoded_texts, rouge_tuples


# Training loop
for epoch in range(TRAIN_EPOCHS):
    model.train()
    total_loss = 0.0
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}")
    for batch_idx, batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=_mask_padding(labels))
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix({"loss": batch_loss})

        # Decode this batch's predictions and score them against the reference summaries.
        generated_texts, batch_rouge = _decode_and_score(outputs.logits, labels)
        generated_outputs_train.extend(generated_texts)
        rouge_scores_train.extend(batch_rouge)

    # Validation loop (no gradients needed for the whole pass)
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for batch_idx, batch in enumerate(val_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=_mask_padding(labels))
            total_val_loss += outputs.loss.item()

            generated_texts, batch_rouge = _decode_and_score(outputs.logits, labels)
            generated_outputs_val.extend(generated_texts)
            rouge_scores_val.extend(batch_rouge)

    avg_train_loss = total_loss / len(train_loader)
    avg_val_loss = total_val_loss / len(val_loader)
    print(f"Epoch {epoch+1}, Average Train Loss: {avg_train_loss:.4f}, Average Val Loss: {avg_val_loss:.4f}")

# Attach the outputs of the FINAL epoch. The lists span every epoch, so slicing from the
# front would attach the first (least-trained) epoch instead.
# NOTE(review): if train_loader shuffles, these outputs are in batch order, not in
# train_data row order -- confirm before relying on per-row alignment.
train_data["generated_output"] = generated_outputs_train[-len(train_data):]
val_data["generated_output"] = generated_outputs_val[-len(val_data):]

# Append the ROUGE scores (training rows first, then validation rows) to the Excel file.
update_rouge_excel(output_directory, rouge_scores_train)
update_rouge_excel(output_directory, rouge_scores_val)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Epoch 1: 100%|██████████| 516/516 [03:46<00:00,  2.28it/s, loss=2.91]


Epoch 1, Average Train Loss: 3.1626, Average Val Loss: 2.8732


Epoch 2: 100%|██████████| 516/516 [03:46<00:00,  2.28it/s, loss=3.27]


Epoch 2, Average Train Loss: 2.8425, Average Val Loss: 2.7828


Epoch 3: 100%|██████████| 516/516 [03:48<00:00,  2.26it/s, loss=1.23]


Epoch 3, Average Train Loss: 2.7622, Average Val Loss: 2.7581


Epoch 4: 100%|██████████| 516/516 [03:48<00:00,  2.26it/s, loss=2.7]


Epoch 4, Average Train Loss: 2.6822, Average Val Loss: 2.7603


Epoch 5: 100%|██████████| 516/516 [03:49<00:00,  2.25it/s, loss=4.23]


Epoch 5, Average Train Loss: 2.6279, Average Val Loss: 2.7167


Epoch 6: 100%|██████████| 516/516 [03:49<00:00,  2.25it/s, loss=2.94]


Epoch 6, Average Train Loss: 2.5661, Average Val Loss: 2.7175


Epoch 7: 100%|██████████| 516/516 [03:49<00:00,  2.25it/s, loss=2.08]


Epoch 7, Average Train Loss: 2.5140, Average Val Loss: 2.6995


Epoch 8: 100%|██████████| 516/516 [03:50<00:00,  2.24it/s, loss=1.02]


Epoch 8, Average Train Loss: 2.4640, Average Val Loss: 2.7690


Epoch 9: 100%|██████████| 516/516 [03:50<00:00,  2.24it/s, loss=0.909]


Epoch 9, Average Train Loss: 2.4141, Average Val Loss: 2.7704


Epoch 10: 100%|██████████| 516/516 [03:49<00:00,  2.24it/s, loss=3.05]


Epoch 10, Average Train Loss: 2.3549, Average Val Loss: 2.8181


In [None]:
# The output lists hold one entry per sample per epoch, so the trailing len(split) entries
# are the final epoch's outputs. Using the split sizes instead of hard-coded counts keeps
# this cell working when the dataset size or split ratio changes.
train_data["generated_output"] = generated_outputs_train[-len(train_data):]
val_data["generated_output"] = generated_outputs_val[-len(val_data):]

# Combine train_data and val_data into a single DataFrame
consolidated_df = pd.concat([train_data, val_data], ignore_index=True)

# Save consolidated_df as JSON Lines (one record per line)
consolidated_df.to_json("output_with_generated.json", orient="records", lines=True)

In [None]:
# Directory on Drive where the fine-tuned model is written
model_directory = "/content/drive/MyDrive/focus/model_3"

# Create the directory if needed; exist_ok avoids the check-then-create race.
os.makedirs(model_directory, exist_ok=True)

# Save the model weights and config
model.save_pretrained(model_directory)

# Save the tokenizer next to the model (it carries the pad-token setting), so the
# directory can be reloaded on its own with from_pretrained.
tokenizer.save_pretrained(model_directory)

In [None]:
# Source text and target summary of the first row in each split
original_text_train, target_summary_train = train_data.iloc[0][['ctext', 'text']]
original_text_val, target_summary_val = val_data.iloc[0][['ctext', 'text']]

# Show only the training target summary; the source text and validation sample are kept for ad-hoc inspection
print("Target Summary (Training):", target_summary_train)


Target Summary (Training): **Summary of Amazon.com, Inc. Financial Statements and Supplementary Data**

**Item 8: Consolidated Financial Statements and Supplementary Data**

**Financial Statements:**

* Consolidated Statements of Cash Flows
* Consolidated Statements of Operations
* Consolidated Statements of Comprehensive Income (Loss)
* Consolidated Balance Sheets
* Consolidated Statements of Stockholders' Equity

**Supplementary Data:**

* Notes to Consolidated Financial Statements

**Key Financials:**

**Revenue:**
* 2020: $215,915 million
* 2021: $241,787 million
* 2022: $242,901 million

**Operating Income:**
* 2020: $22,899 million
* 2021: $24,879 million
* 2022: $12,248 million

**Net Income (Loss):**
* 2020: $21,331 million
* 2021: $33,364 million
* 2022: $(2,722) million

**Adjusted EBITDA:**
* 2020: $39.7 billion
* 2021: $44.2 billion
* 2022: $55.8 billion

**Key Metrics:**

* **Diluted EPS:**
    * 2020: $2.09
    * 2021: $3.24
    * 2022: $(0.27)
* **Operating Margin:**
   

In [None]:
# First entry of the accumulated training outputs, i.e. the first sample of the first batch seen in epoch 1.
# NOTE(review): the training loader is created with shuffle=True, so this need not correspond to train_data.iloc[0].
print(generated_outputs_train[0])


-x" ":,0:

 first goal officer is in the, PA York, is headquarters in in Newo, New, and have which
 are have, and located principal important shareholders officers.
We addition with our this operating, and, marketing our products and we of of atheuring and distribution, andling, packaging facilities. and, and centers, and facilities, andandices, and the stores, and and development,, facilities facilities.
 of which we locatedowned located or controlled by
Weificant business are our include
 follows:

: Property Locationed PropertyOwnased

O
 and Development facilities
o, Texas,ed/
west Research,crete
 distribution
 City, Texas Owned

& Con and development facility Plan, New York Owned
PBNA Researchrated, and, Iowa Owned
PBitude,venient food plant Austineste, Texas Owned
PBAm Con- food plant Celerojo, California Owned
Lat Twovenient food plant Cel, New Kingdom Owned
and)
Q Europevenient food plant plantima, Mexico Owned
Europe Con facility Kman,, Russia Owned
Europe Con plant,, Russia O

In [None]:
# First entry of the accumulated validation outputs: the first validation sample as decoded during epoch 1.
print(generated_outputs_val[0])


 of of reserved******** the the,,,,, stock stock,,,,,,,,,, $ in,.,.. stock,, in the,,,,, the, to


In [None]:
import pandas as pd

# Reload the JSON Lines export (one record per line) into a DataFrame
df = pd.read_json(path_or_buf="/content/output_with_generated.json", lines=True)

# Preview the first five rows
print(df.head(n=5))


                                               ctext  \
0  summarize: Item 8. Financial Statements and Su...   
1  summarize: Item 2. Properties.\nOur principal ...   
2  summarize: ITEM 1A. Risk Factors.\nWe have lis...   
3  summarize: ITEM 8. FINANCIAL STATEMENTS AND SU...   
4  summarize: ITEM 1A. RISK FACTORS\nCAUTIONARY S...   

                                                text  \
0  **Summary of Amazon.com, Inc. Financial Statem...   
1  PepsiCo's significant corporate properties inc...   
2  Ford and Ford Credit face operational risks fr...   
3  The financial section of the report presents:\...   
4  **Summary of Risk Factors**\n\nUnitedHealth Gr...   

                                    generated_output  
0   company of reserved**\n safetyosures Dis not ...  
1   company of the**\n\n********\n****\n**.\n**\n...  
2   6 of**\n\n****\n**\n\n******\n\n\n\n\n\n\n\n*...  
3   of of.\n\n\n\n\n**\n\n\n\n\n\n,isks and,\n\n,...  
4   6 of****\n\n\n\n**\n\n**\n\n\n**\n\n\n\n\n\n\..