In [6]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
from datasets import load_dataset
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources this notebook needs: the Punkt tokenizer models
# used by word_tokenize and the English stop-word list.
# Recent NLTK releases (3.9+) load Punkt from 'punkt_tab' rather than 'punkt',
# so both are fetched to keep a fresh-kernel run working on either version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Fetch every split of the PubMed summarization corpus as a DatasetDict.
# NOTE(review): trust_remote_code=True allows the dataset repository's loading
# script to run locally -- confirm the source is trusted before executing.
data = load_dataset(
    "ccdv/pubmed-summarization",
    trust_remote_code=True,
)

In [9]:
# Display the DatasetDict repr to check the available splits, their columns and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 119924
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6633
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6658
    })
})

In [10]:
# Load the full training split, plus a small mixed sample (first 1000 training
# rows followed by the first 200 validation rows) for quick experiments.
# trust_remote_code=True matches the earlier full-dataset load of the same
# repository, so all three calls opt in to the loading script consistently.
data_tr = load_dataset(
    "ccdv/pubmed-summarization",
    split="train",
    trust_remote_code=True,
)
data_tr_sample = load_dataset(
    "ccdv/pubmed-summarization",
    split="train[:1000]+validation[:200]",
    trust_remote_code=True,
)

print(data_tr)
print(data_tr_sample)

Dataset({
    features: ['article', 'abstract'],
    num_rows: 119924
})
Dataset({
    features: ['article', 'abstract'],
    num_rows: 1200
})


In [11]:
# Peek at the first training record (a dict keyed by the 'article' and 'abstract' columns).
data_tr[0]

{'article': "a recent systematic analysis showed that in 2011 , 314 ( 296 - 331 ) million children younger than 5 years were mildly , moderately or severely stunted and 258 ( 240 - 274 ) million were mildly , moderately or severely underweight in the developing countries . \n in iran a study among 752 high school girls in sistan and baluchestan showed prevalence of 16.2% , 8.6% and 1.5% , for underweight , overweight and obesity , respectively . \n the prevalence of malnutrition among elementary school aged children in tehran varied from 6% to 16% . \n anthropometric study of elementary school students in shiraz revealed that 16% of them suffer from malnutrition and low body weight . \n snack should have 300 - 400 kcal energy and could provide 5 - 10 g of protein / day . nowadays , school nutrition programs are running as the national programs , world - wide . national school lunch program in the united states \n there are also some reports regarding school feeding programs in developi

In [12]:
# Read each text column in a single columnar access instead of iterating the
# dataset row by row twice (which builds a full dict per record just to pick
# one field). list() keeps both results as plain Python lists.
articles = list(data_tr['article'])
summaries = list(data_tr['abstract'])
articles[0]

"a recent systematic analysis showed that in 2011 , 314 ( 296 - 331 ) million children younger than 5 years were mildly , moderately or severely stunted and 258 ( 240 - 274 ) million were mildly , moderately or severely underweight in the developing countries . \n in iran a study among 752 high school girls in sistan and baluchestan showed prevalence of 16.2% , 8.6% and 1.5% , for underweight , overweight and obesity , respectively . \n the prevalence of malnutrition among elementary school aged children in tehran varied from 6% to 16% . \n anthropometric study of elementary school students in shiraz revealed that 16% of them suffer from malnutrition and low body weight . \n snack should have 300 - 400 kcal energy and could provide 5 - 10 g of protein / day . nowadays , school nutrition programs are running as the national programs , world - wide . national school lunch program in the united states \n there are also some reports regarding school feeding programs in developing countries

In [13]:
def lowercase_text(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_punctuation(text):
    """Delete every character that is neither a word character nor whitespace.

    Word characters follow the regex word class, so letters, digits and
    underscores are kept.
    """
    non_word_non_space = r'[^\w\s]'
    cleaned = re.sub(non_word_non_space, '', text)
    return cleaned

def remove_numbers(text):
    """Delete every run of digits from ``text``, leaving all other characters in place."""
    digit_run = r'\d+'
    stripped = re.sub(digit_run, '', text)
    return stripped

# English stop-word lookup, built once as a set for membership tests.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Tokenize ``text`` with NLTK and drop every token found in ``stop_words``.

    Matching is an exact membership test against ``stop_words``. The surviving
    tokens are returned joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

def remove_extra_spaces(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def preprocess_text(text):
    """Run the full cleaning pipeline on one document and return the result.

    Steps, applied in order: lowercase, strip punctuation, strip digits,
    drop English stop words, normalise whitespace.
    """
    pipeline = (
        lowercase_text,
        remove_punctuation,
        remove_numbers,
        remove_stopwords,
        remove_extra_spaces,
    )
    for step in pipeline:
        text = step(text)
    return text


In [14]:
# Clean every article body with the shared pipeline
preprocessed_articles = list(map(preprocess_text, articles))

# Clean every reference abstract the same way
preprocessed_abstracts = list(map(preprocess_text, summaries))

In [15]:
# Assemble the cleaned texts column-wise: one row per (article, abstract) pair.
# NOTE(review): `data` is rebound here from the DatasetDict loaded earlier to a
# plain dict -- consider a distinct name if later cells still need the dataset.
data = {
    'article': preprocessed_articles,
    'abstract': preprocessed_abstracts,
}
df = pd.DataFrame(data=data)

# Persist the frame as CSV, leaving out the index column
df.to_csv(path_or_buf='preprocessed_pubmed_data.csv', index=False)


In [16]:
# Spot-check the cleaned text of the first article.
print(preprocessed_articles[0])

recent systematic analysis showed million children younger years mildly moderately severely stunted million mildly moderately severely underweight developing countries iran study among high school girls sistan baluchestan showed prevalence underweight overweight obesity respectively prevalence malnutrition among elementary school aged children tehran varied anthropometric study elementary school students shiraz revealed suffer malnutrition low body weight snack kcal energy could provide g protein day nowadays school nutrition programs running national programs world wide national school lunch program united states also reports regarding school feeding programs developing countries vietnam school base program showed improvement nutrient intakes iran national free food program nffp implemented elementary schools deprived areas cover poor students however program conducted slums poor areas big cities many malnourished children low socio economic situation covered nffp although rate povert