In [None]:
# Colab-only step: mount Google Drive at /content/drive so files stored under
# MyDrive can be read through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd

# Dataset root on the mounted Drive; it contains the 'News Articles' and
# 'Summaries' directory trees that are read further down.
base_folder = '/content/drive/MyDrive/BBC News Summary'

# Parallel accumulators: position i in `summaries` holds the summary that
# belongs to the article at position i in `article_texts`.
article_texts, summaries = list(), list()

Read every category subfolder, decoding each file as UTF-8 so that special symbols are read correctly, and skip any file whose encoding is corrupt.

In [None]:
# Pair every article under 'News Articles/<category>/' with the summary of the
# same file name under 'Summaries/<category>/' and collect both texts.
articles_root = os.path.join(base_folder, 'News Articles')
summaries_root = os.path.join(base_folder, 'Summaries')

# Start from empty lists so that re-running this cell does not append the whole
# dataset a second time to lists left over from a previous run.
article_texts.clear()
summaries.clear()

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# (and therefore any seeded split derived from it) identical between runs.
for folder_name in sorted(os.listdir(articles_root)):
    article_folder = os.path.join(articles_root, folder_name)
    summary_folder = os.path.join(summaries_root, folder_name)

    # Ignore stray non-directory entries; os.listdir() on them would raise.
    if not os.path.isdir(article_folder):
        continue

    for file_name in sorted(os.listdir(article_folder)):
        article_file = os.path.join(article_folder, file_name)
        summary_file = os.path.join(summary_folder, file_name)

        # Keep only articles that have a matching summary file.
        if not (os.path.exists(article_file) and os.path.exists(summary_file)):
            continue

        try:
            # Context managers close both handles promptly, even when decoding fails.
            with open(article_file, 'r', encoding='utf-8') as article_handle:
                article_text = article_handle.read()
            with open(summary_file, 'r', encoding='utf-8') as summary_handle:
                summary_text = summary_handle.read()
        except UnicodeDecodeError:
            # Files that are not valid UTF-8 are reported and skipped.
            print(f"File with incorrect encoding: {article_file}")
            continue

        # Append only after both reads succeeded so the two lists stay aligned.
        article_texts.append(article_text)
        summaries.append(summary_text)

# Create a Pandas DataFrame with the collected data
data = {'Article Text': article_texts, 'Summary': summaries}
df = pd.DataFrame(data)

File with incorrect encoding: /content/drive/MyDrive/BBC News Summary/News Articles/sport/199.txt


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Work on a copy of the DataFrame assembled from the text files, so assigning
# column names here cannot mutate `df` for other cells.
data = df.copy()

# Ensure the column names are the ones the feature-encoding step looks up
data.columns = ['Article Text', 'Summary']

# 70% of the rows go to training; the held-out 30% is then halved into
# validation and test sets. The fixed random_state makes the partition repeatable.
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

Preparing the data as input for T5


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from tqdm import tqdm  # tqdm use for progress bar

# Load the pretrained 't5-small' checkpoint: first its tokenizer, then the
# conditional-generation model that the later cells train and query.
tokenizer, model = (
    T5Tokenizer.from_pretrained('t5-small'),
    T5ForConditionalGeneration.from_pretrained('t5-small'),
)

def convert_to_features(index, row): #Row has article and corresponding summary
    """Tokenise one article/summary pair into the tensors T5 is trained on.

    Args:
        index: Row label as yielded by ``DataFrame.iterrows()``. It is accepted
            so the function can be called directly on ``iterrows()`` items and
            is not used.
        row: Mapping with an ``'Article Text'`` string and a ``'Summary'`` string.

    Returns:
        dict of tensors; ``return_tensors="pt"`` on a single string keeps a
        leading axis of size 1 on each of them:
            ``input_ids``      -- article token ids, truncated/padded to 512.
            ``attention_mask`` -- 1 on real article tokens, 0 on padding.
            ``labels``         -- summary token ids, truncated/padded to 150,
                                  with every padding position set to -100.
    """
    # Encode the articles and summaries to the format expected by T5
    input_encodings = tokenizer(row['Article Text'], truncation=True, padding='max_length', max_length=512, return_tensors="pt")
    target_encodings = tokenizer(row['Summary'], truncation=True, padding='max_length', max_length=150, return_tensors="pt")

    # The model's cross-entropy loss skips label positions equal to -100.
    # Leaving the pad id in place would train (and score) the model on
    # predicting padding after the end of each summary.
    labels = target_encodings['input_ids'].clone()
    labels[labels == tokenizer.pad_token_id] = -100

    return {
        'input_ids': input_encodings['input_ids'], #Encoded articles
        'attention_mask': input_encodings['attention_mask'], #Marks real tokens vs. padding
        'labels': labels #Encoded summaries, padding masked out
    }

def _encode_split(frame, desc):
    """Apply convert_to_features to every row of `frame` behind a tqdm bar labelled `desc`."""
    rows = tqdm(frame.iterrows(), total=len(frame), desc=desc)
    return [convert_to_features(index, row) for index, row in rows]

# Encode the training and validation splits, one feature dict per row
train_features = _encode_split(train_data, 'Processing Training Data')
val_features = _encode_split(val_data, 'Processing Validation Data')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Processing Training Data: 100%|██████████| 1556/1556 [00:05<00:00, 275.95it/s]
Processing Validation Data: 100%|██████████| 334/334 [00:01<00:00, 278.41it/s]


Train the Model




In [None]:
#Create DatasetClass
from torch.utils.data import Dataset, DataLoader

class TextSummarizationDataset(Dataset):
    """Thin Dataset wrapper around an already-built collection of feature records.

    Records are handed back exactly as they were stored; no tokenisation or
    transformation happens here.
    """

    def __init__(self, features):
        """Store `features`, an indexable collection that supports ``len()``."""
        self.features = features

    def __len__(self):
        """Return how many records the dataset holds."""
        record_count = len(self.features)
        return record_count

    def __getitem__(self, idx):
        """Return the record stored at position `idx`."""
        record = self.features[idx]
        return record

# Wrap each list of encoded features so PyTorch can index into it
train_dataset, val_dataset = (
    TextSummarizationDataset(split_features)
    for split_features in (train_features, val_features)
)
# Batches of 8: training batches are reshuffled, validation keeps a fixed order
train_dataloader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)

Training Loop

In [None]:
import torch.optim as optim
from tqdm import tqdm
import torch

# Specify the device: GPU when one is available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Transfer the model to the selected device
model = model.to(device)

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Specify the number of epochs
num_epochs = 3


def _forward_batch(batch):
    """Move one collated batch to `device`, run the model and return its output.

    The returned output carries the loss computed against ``batch['labels']``.
    """
    # squeeze(dim=1) removes the size-1 axis that each per-example tensor
    # contributes when examples are stacked into a batch.
    inputs = batch['input_ids'].squeeze(dim=1).to(device)
    labels = batch['labels'].squeeze(dim=1).to(device)

    # Articles are padded to a fixed length, so the encoder must be told which
    # positions are padding; otherwise it attends to pad tokens as if they were
    # text. Use the mask shipped with the batch when there is one, else rebuild
    # it from the model's pad token id.
    if 'attention_mask' in batch:
        attention_mask = batch['attention_mask'].squeeze(dim=1).to(device)
    else:
        attention_mask = inputs.ne(model.config.pad_token_id).long()

    return model(input_ids=inputs, attention_mask=attention_mask, labels=labels)


# Training loop
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    for batch in tqdm(train_dataloader, desc=f'Training Epoch {epoch + 1}'):
        optimizer.zero_grad()
        loss = _forward_batch(batch).loss
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    # Validation loop: evaluation mode and no gradient tracking
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f'Validation Epoch {epoch + 1}'):
            total_val_loss += _forward_batch(batch).loss.item()

    # Report the mean per-batch loss for both splits
    print(f'Epoch {epoch + 1}, Training Loss: {total_train_loss / len(train_dataloader)}, Validation Loss: {total_val_loss / len(val_dataloader)}')

Training Epoch 1: 100%|██████████| 195/195 [01:08<00:00,  2.86it/s]
Validation Epoch 1: 100%|██████████| 42/42 [00:05<00:00,  8.30it/s]


Epoch 1, Training Loss: 1.0745473059324118, Validation Loss: 0.6611065328830764


Training Epoch 2: 100%|██████████| 195/195 [01:09<00:00,  2.80it/s]
Validation Epoch 2: 100%|██████████| 42/42 [00:05<00:00,  8.31it/s]


Epoch 2, Training Loss: 0.7091918324812865, Validation Loss: 0.6135898353088469


Training Epoch 3: 100%|██████████| 195/195 [01:09<00:00,  2.80it/s]
Validation Epoch 3: 100%|██████████| 42/42 [00:05<00:00,  8.29it/s]

Epoch 3, Training Loss: 0.6446317163797525, Validation Loss: 0.5915449847068105





In [None]:
# Write the model weights and the tokenizer files into the same directory so
# the pair can be reloaded together later.
output_dir = 'Model-Files'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('Model-Files/tokenizer_config.json',
 'Model-Files/special_tokens_map.json',
 'Model-Files/spiece.model',
 'Model-Files/added_tokens.json')

Inference or Testing

In [None]:
# Example of generating a summary
def summarize(text, max_input_length=512, max_summary_length=150, num_beams=4,
              length_penalty=2.0, no_repeat_ngram_size=2):
    """Generate a summary of `text` with the notebook's `model` and `tokenizer`.

    Args:
        text: Article text to summarise.
        max_input_length: Inputs longer than this many tokens are truncated.
        max_summary_length: Upper bound on the generated sequence length.
        num_beams: Number of beams kept during beam search.
        length_penalty: Exponent applied to sequence length when scoring beams;
            values above 0 favour longer outputs, values below 0 shorter ones.
        no_repeat_ngram_size: No n-gram of this size may occur twice in the
            output, which helps reduce repetition.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Place the inputs on whatever device the model currently lives on, so the
    # function does not depend on a `device` variable set by another cell.
    inputs = tokenizer(
        text, return_tensors="pt", max_length=max_input_length, truncation=True
    ).to(model.device)

    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_summary_length,
        num_beams=num_beams,
        length_penalty=length_penalty,
        early_stopping=True,
        no_repeat_ngram_size=no_repeat_ngram_size,
    )

    # generate() returns a batch of sequences; only one text was passed in.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Use the function to summarize a new article
article= '''
Title: "Tech Trends Shaping the Future: Innovations, Challenges, and Opportunities"

Introduction

Technology has become an integral part of our daily lives, revolutionizing the way we work, communicate, and interact with the world around us. With each passing year, we witness remarkable advancements in the tech industry that not only change the way we live but also offer new avenues for growth and development. In this article, we will explore some of the most significant tech trends that are shaping the future, as well as the challenges and opportunities they present.

1. Artificial Intelligence (AI) and Machine Learning

Artificial intelligence and machine learning have seen unprecedented growth in recent years. These technologies have the potential to transform industries such as healthcare, finance, and transportation. AI-powered chatbots, autonomous vehicles, and predictive analytics are just a few examples of how AI and machine learning are changing the way we work and live. However, ethical concerns, data privacy issues, and the need for responsible AI development are challenges that must be addressed as these technologies continue to advance.

2. Internet of Things (IoT)

The Internet of Things is connecting everyday objects and devices to the internet, enabling them to communicate and share data. From smart thermostats and wearable fitness trackers to smart cities' infrastructure, IoT is enhancing efficiency, convenience, and sustainability. However, the massive influx of data generated by IoT devices poses security and privacy concerns, requiring robust cybersecurity measures to protect sensitive information.

3. 5G Technology

The rollout of 5G networks is set to revolutionize connectivity, offering faster speeds and lower latency. This will unlock new possibilities for applications such as augmented reality (AR), virtual reality (VR), and the Internet of Things. The increased bandwidth will pave the way for innovations in telemedicine, autonomous vehicles, and remote work. However, concerns about the deployment of 5G infrastructure and potential health risks associated with prolonged exposure to high-frequency electromagnetic radiation are subjects of ongoing debate.

4. Cybersecurity

As technology continues to advance, the threat landscape in the digital world is also evolving. Cybersecurity is a critical concern for individuals, businesses, and governments. The rise of sophisticated cyberattacks, ransomware incidents, and data breaches underscores the need for robust security measures. Organizations must invest in cybersecurity technologies and develop proactive strategies to protect sensitive data and infrastructure.

5. Green Technology and Sustainability

The tech industry is increasingly focused on sustainability and environmental responsibility. Renewable energy sources, energy-efficient data centers, and eco-friendly products are at the forefront of this movement. As consumers become more environmentally conscious, companies are finding that sustainable practices are not only ethical but also profitable. Investing in green technology and adopting eco-friendly practices can lead to cost savings and a positive brand image.

6. Remote Work and Collaboration Tools

The COVID-19 pandemic accelerated the adoption of remote work and collaboration tools. Video conferencing, project management software, and cloud-based services have become essential for businesses to maintain productivity and adapt to remote work environments. While remote work offers flexibility and accessibility, it also presents challenges related to employee well-being, cybersecurity, and maintaining a sense of company culture.

7. Quantum Computing

Quantum computing is on the horizon and has the potential to revolutionize computing as we know it. With the ability to perform complex calculations at speeds unattainable by classical computers, quantum computers could solve problems in fields like cryptography, drug discovery, and climate modeling. However, building and maintaining stable quantum systems remains a significant technological challenge.

Conclusion

The tech landscape is continually evolving, presenting both exciting opportunities and complex challenges. As we embrace innovations in artificial intelligence, IoT, 5G, cybersecurity, sustainability, remote work, and quantum computing, it is crucial to approach these developments with a thoughtful and ethical perspective. Balancing the potential for progress with responsible development and addressing the associated risks will be key to harnessing the full potential of technology in shaping our future. By staying informed and proactive, individuals, businesses, and society as a whole can navigate the ever-changing tech landscape with confidence and resilience.'''
# Run the sample article through summarize() and print the generated summary.
new_summary = summarize(article)
print(new_summary)

In this article, we will explore some of the most significant tech trends that are shaping the future, as well as the challenges and opportunities they present.The Internet of Things is connecting everyday objects and devices to the internet, enabling them to communicate and share data.Although, ethical concerns, data privacy issues, and the need for responsible AI development are challenges that must be addressed as these technologies continue to advance.
