In [13]:
import random
import datasets
from transformers import BertTokenizer, BertForMaskedLM
import torch
import spacy
import seaborn as sns
import matplotlib.pyplot as plt


In [14]:
ENDMARK = '<<<END>>>'
DATASETS = ['pubmed', 'writing']

tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

data = datasets.load_dataset('pubmed_qa', 'pqa_labeled', split='train')
data = [f'Question: {q} Answer:{ENDMARK}{a}' for q, a in zip(data['question'], data['long_answer'])]

Found cached dataset pubmed_qa (C:/Users/atana/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924)


In [4]:
print(data)



In [17]:
nlp = spacy.load("en_core_web_sm")

data = datasets.load_dataset('pubmed_qa', 'pqa_labeled', split='train')
filtered_data = [(q, (nlp(a), 0)) for q, a in zip(data['question'], data['long_answer']) if
                         len(tokenizer(a)['input_ids']) <= 512]

Found cached dataset pubmed_qa (C:/Users/atana/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/dd4c39f031a958c7e782595fa4dd1b1330484e8bbadd4d9212e5046f27e68924)


In [18]:
##As mentioned earlier, traditional perplexity cannot be computed directly with BERT models including SciBERT,
##because these models don't model the entire sequence probability. 
## However, you can approximate a kind of perplexity score by taking the negative 
##log-likelihood of each word given its context in the sentence, summing these up, and taking the exponential of the result.

In [20]:
filtered_pos_tags = []
for question, (answer_doc, _) in filtered_data:
    pos_tags = [token.pos_ for token in answer_doc]
    filtered_pos_tags.append(pos_tags)


In [21]:
for i, pos_tags in enumerate(filtered_pos_tags):
    print(f"Filtered Long Answer {i+1} POS tags: {pos_tags}")

Filtered Long Answer 1 POS tags: ['NOUN', 'VERB', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'SCONJ', 'PROPN', 'VERB', 'ADP', 'DET', 'NOUN', 'NOUN', 'PUNCT', 'CCONJ', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'ADP', 'ADJ', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'ADP', 'PRON', 'NOUN', 'PUNCT', 'PRON', 'AUX', 'DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'NOUN', 'VERB', 'ADP', 'ADJ', 'NOUN', 'PART', 'VERB', 'DET', 'NOUN', 'NOUN', 'VERB', 'DET', 'NOUN', 'ADP', 'ADJ', 'PROPN', 'PUNCT', 'ADV', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT', 'PRON', 'AUX', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', 'PUNCT', 'ADV', 'PUNCT', 'PRON', 'NOUN', 'VERB', 'DET', 'PROPN', 'ADP', 'VERB', 'DET', 'ADJ', 'CCONJ', 'ADJ', 'NOUN', 'ADP', 'ADV', 'VERB', 'PROPN', 'ADP', 'DET', 'NOUN', 'NOUN', 'PUNCT']
Filtered Long Answer 2 POS tags: ['VERB', 'DET', 'NOUN', 'VERB', 'PUNCT', 'PRON', 'VERB', 'ADV', 'DET', 'ADJ', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'ADP

In [22]:
from collections import Counter

conjunction_counts = Counter()

for question, (answer_doc, _) in filtered_data:
    pos_tags = [token.pos_ for token in answer_doc]
    conjunctions = [tag for tag in pos_tags if tag == "ADJ"]
    conjunction_counts.update(conjunctions)

# Print the count of conjunctions
print("Conjunction Counts:")
for conjunction, count in conjunction_counts.items():
    print(f"{conjunction}: {count}")

Conjunction Counts:
ADJ: 5748


In [None]:
data = datasets.load_dataset('pubmed_qa', 'pqa_labeled', split='train')
data = [f'Question: {q} Answer:{ENDMARK}{a}' for q, a in zip(data['question'], data['long_answer'])]


def strip_newlines(text):
    return ' '.join(text.split())


# remove duplicates from the data
data = list(dict.fromkeys(data))  # deterministic, as opposed to set()

# strip whitespace around each example
data = [x.strip() for x in data]

# remove newlines from each example
data = [strip_newlines(x) for x in data]

random.seed(0)
random.shuffle(data)

# Label the data
data = [(x, 0) for x in data]

# POS and punctuation count dictionaries
pos_counts = Counter()
punctuation_counts = Counter()
function_word_counts = Counter()

# Sentence length list
sentence_lengths = []

# Process answers in the data
for text, label in data:
    question, answer = text.split(f'{ENDMARK}')
    doc = nlp(answer)
    
    # Update POS, punctuation and function word counts
    pos_counts.update([token.pos_ for token in doc])
    punctuation_counts.update([token.text for token in doc if token.is_punct])
    function_word_counts.update([token.text for token in doc if token.is_stop])
    
    # Compute sentence lengths
    sentence_lengths.extend([len(sentence) for sentence in doc.sents])

# Calculate average sentence length
avg_sentence_length = sum(sentence_lengths) / len(sentence_lengths)

# Print statistics
print(f"Frequency of adjectives: {pos_counts['ADJ']}")
print(f"Frequency of adverbs: {pos_counts['ADV']}")
print(f"Frequency of conjunctions: {pos_counts['CONJ']}")
print(f"Frequency of nouns: {pos_counts['NOUN']}")
print(f"Frequency of numbers: {pos_counts['NUM']}")
print(f"Frequency of pronouns: {pos_counts['PRON']}")
print(f"Frequency of verbs: {pos_counts['VERB']}")
print(f"Frequency of commas: {punctuation_counts[',']}")
print(f"Frequency of fullstops: {punctuation_counts['.']}")
print(f"Frequency of special character '-': {punctuation_counts['-']}")
print(f"Frequency of function word 'a': {function_word_counts['a']}")
print(f"Frequency of function word 'in': {function_word_counts['in']}")
print(f"Frequency of function word 'of': {function_word_counts['of']}")
print(f"Frequency of function word 'the': {function_word_counts['the']}")
print(f"Average sentence length: {avg_sentence_length}")