# PubMed Summarization
### Umema Ashar 22i-2036

This code loads the PubMed summarization dataset (`ccdv/pubmed-summarization`) from Hugging Face and processes it as follows:

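1. Each article is preprocessed (lowercased, tokenized, and stripped of stopwords, punctuation, and tokens containing digits).

2. Extractive summarization is performed on each article.

3. A placeholder image is generated for each article.

4. All articles and their summaries are stored in a text file.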


In [31]:
import datasets

In [24]:
import numpy as np
from PIL import Image
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import string
import re
import time
from datasets import load_dataset

In [25]:
# Lazily-built cache of the tokens to discard; see _get_stop_words().
_STOP_WORDS = None

# Matches any token that contains at least one digit.
_DIGIT_RE = re.compile(r'\d')


def _get_stop_words():
    """Return the set of English stopwords plus punctuation, building it once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Built on first use rather than at import time so that a missing
        # NLTK corpus still surfaces on the first preprocess() call.
        _STOP_WORDS = frozenset(stopwords.words('english')) | frozenset(string.punctuation)
    return _STOP_WORDS


def preprocess(text):
    """Tokenize ``text`` and drop tokens that carry no content.

    The text is lowercased, runs of whitespace are collapsed to single
    spaces, and the result is split with ``word_tokenize``. Tokens that are
    English stopwords, single punctuation characters, or that contain a
    digit are removed.

    Args:
        text: The raw string to process.

    Returns:
        A list of the remaining lowercase tokens, in their original order.
    """
    # Convert to lowercase and normalise whitespace before tokenizing.
    normalized = ' '.join(text.lower().split())
    tokens = word_tokenize(normalized)

    # The stop-word set is cached: this function is called once per sentence
    # during summarization, so rebuilding it on every call is wasted work.
    stop_words = _get_stop_words()

    # Remove stopwords, punctuation, and tokens containing numbers.
    return [
        word for word in tokens
        if word not in stop_words and not _DIGIT_RE.search(word)
    ]

In [26]:
def generate_image_from_text(text):
    """Return a blank white 400x300 three-channel image.

    Note that ``text`` is accepted but never read: the same white canvas is
    produced regardless of the input.

    Args:
        text: Unused.

    Returns:
        A ``PIL.Image.Image`` that is 400 pixels wide and 300 pixels high,
        with every channel of every pixel set to 255.
    """
    width, height = 400, 300
    # Array layout is (rows, columns, channels), hence height comes first.
    white_pixels = np.full((height, width, 3), 255, dtype=np.uint8)
    return Image.fromarray(white_pixels)

In [27]:
def extractive_summarization(article, summary_style='brief'):
    """Build an extractive summary from the highest-scoring sentences.

    Each sentence is scored by summing, over its preprocessed tokens, how
    often each token occurs in the preprocessed article as a whole. The
    top-scoring sentences are joined with single spaces, highest score
    first; sentences with equal scores keep their order of first appearance.

    Args:
        article: The full text to summarize.
        summary_style: ``'brief'`` selects up to 3 sentences; any other
            value selects up to 5.

    Returns:
        The selected sentences joined into one string.
    """
    sentences = sent_tokenize(article)
    frequency = Counter(preprocess(article))

    # Keyed by sentence text, so a sentence that appears more than once in
    # the article is only scored (and can only be selected) once.
    scores = {}
    for sentence in sentences:
        scores[sentence] = sum(frequency[token] for token in preprocess(sentence))

    if summary_style == 'brief':
        limit = 3
    else:
        limit = 5

    # sorted() is stable, so ties stay in insertion order even with reverse=True.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ' '.join(sentence for sentence, _ in ranked[:limit])

In [30]:


def generate_summary_and_image(article):
    """Produce the summary and the image for a single article.

    Args:
        article: The article text.

    Returns:
        A ``(summary, image)`` tuple: the result of
        ``extractive_summarization`` in the ``'brief'`` style, followed by
        the result of ``generate_image_from_text``.
    """
    # The style is fixed here; switch to 'detailed' for a longer summary.
    style = 'brief'
    # Tuple elements evaluate left to right: summary first, then the image.
    return extractive_summarization(article, style), generate_image_from_text(article)

def process_dataset(dataset):
    """Summarize every article in ``dataset`` and save the results to a file.

    Each example is read via its ``'article'`` key, summarized with
    ``generate_summary_and_image`` (the generated image is discarded), and
    written to a UTF-8 text file named ``summaries_<unix-timestamp>.txt`` in
    the current working directory. Entries are numbered from 1.

    Results are written as they are produced instead of being collected in a
    list first, so memory use does not grow with the number of articles. As a
    consequence the timestamp in the file name reflects when processing
    started, and a partially written file is left behind if processing fails
    midway.

    Args:
        dataset: An iterable of mappings, each with an ``'article'`` string.

    Returns:
        None. The output path is reported on standard output.
    """
    output_file = f"summaries_{int(time.time())}.txt"
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for idx, example in enumerate(dataset, start=1):
            article = example['article']
            summary, _ = generate_summary_and_image(article)
            outfile.write(f"Article {idx}:\n{article}\n\nSummary:\n{summary}\n\n")

    print(f"Summaries saved successfully to {output_file}")



In [None]:
# Script entry point: summarize the training split of the PubMed dataset.
if __name__ == '__main__':
    # Load the "train" split of the ccdv/pubmed-summarization dataset.
    dataset = load_dataset("ccdv/pubmed-summarization", split="train")
    # Summarize every article in the split and write the results to a text file.
    process_dataset(dataset)
