In [None]:
!pip install datasets
!pip install torch
!pip install spacy
!python -m spacy download en_core_web_trf
!pip install spacy-transformers

In [2]:
from datasets import load_dataset

flore_200 = load_dataset("facebook/flores", "eng_Latn", split='devtest')

  from .autonotebook import tqdm as notebook_tqdm
Using the latest cached version of the module from /home/liu/.cache/huggingface/modules/datasets_modules/datasets/facebook--flores/2a1174c8c4991ca09a9cb5b9a367cb2e049b073852cb4097456164d4612391ef (last modified on Thu Jan 25 20:20:03 2024) since it couldn't be found locally at facebook/flores, or remotely on the Hugging Face Hub.


In [3]:
len(flore_200)

1012

In [4]:
flore_200['sentence'][0:5]

['"We now have 4-month-old mice that are non-diabetic that used to be diabetic," he added.',
 'Dr. Ehud Ur, professor of medicine at Dalhousie University in Halifax, Nova Scotia and chair of the clinical and scientific division of the Canadian Diabetes Association cautioned that the research is still in its early days.',
 'Like some other experts, he is skeptical about whether diabetes can be cured, noting that these findings have no relevance to people who already have Type 1 diabetes.',
 'On Monday, Sara Danius, permanent secretary of the Nobel Committee for Literature at the Swedish Academy, publicly announced during a radio program on Sveriges Radio in Sweden the committee, unable to reach Bob Dylan directly about winning the 2016 Nobel Prize in Literature, had abandoned its efforts to reach him.',
 'Danius said, "Right now we are doing nothing. I have called and sent emails to his closest collaborator and received very friendly replies. For now, that is certainly enough."']

In [5]:
import spacy
from spacy.matcher import Matcher

In [29]:
def extract_general_subjects(doc):
    named_entities = set(ent.text for ent in doc.ents) # exclude named entities
    general_subjects = []
    for chunk in doc.noun_chunks:
        if chunk.root.dep_ in ["nsubj", "nsubjpass"]:
            if not any(ne in chunk.text or chunk.text in ne for ne in named_entities): # exclude partial inclusion
                general_subjects.append(chunk.text)
    return general_subjects

In [9]:
def extract_verbs_phrase(doc, nlp):
    verb_phrases = {"transitive": {}, "intransitive": {}}
    matcher = Matcher(nlp.vocab)

    # Pattern for transitive verb phrase (e.g. crash a car)
    transitive_pattern  = [{"POS": "VERB"}, {"POS": "DET", "OP": "?"}, {"POS": "NOUN"}]
    matcher.add("transitive", [transitive_pattern])

    # Pattern for intransitive verbs phrase (e.g. I smiled at her)
    intransitive_pattern = [{"POS": "VERB"}, {"POS": "ADP"}, {"POS": "DET", "OP": "?"}, {"POS": "NOUN"}]
    matcher.add("intransitive", [intransitive_pattern])

    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        match_type = nlp.vocab.strings[match_id]
        verb = span[0].text.lower()
        phrase = span[1:].text
        
        if verb in verb_phrases[match_type]:
            if phrase not in verb_phrases[match_type][verb]:
                verb_phrases[match_type][verb].append(phrase)
        else:
            verb_phrases[match_type][verb] = [phrase]
        
    return verb_phrases


In [10]:
# Remove the duplicates
def add_to_set(case_insensitive_set, items):
    for item in items:
        lower_item = item.lower()
        if lower_item not in case_insensitive_set:
            case_insensitive_set[lower_item] = item

In [12]:
def add_to_dict(v_large, v_add):
    for vp_type in ["transitive", "intransitive"]:
        for verb, phrases in v_add[vp_type].items():  # 'phrases' instead of 'phrase' for clarity
            if verb in v_large[vp_type]:
                for phrase in phrases:
                    if phrase not in v_large[vp_type][verb]:
                        v_large[vp_type][verb].append(phrase)
            else:
                v_large[vp_type][verb] = phrases.copy()  # Copy the whole list of phrases
    return v_large


In [13]:
# Load the transformer-based Spacy model
nlp = spacy.load("en_core_web_trf")

In [14]:
sentence_batch = flore_200['sentence']
total_len = len(sentence_batch)
len(sentence_batch)

1012

In [22]:
subjects = {}
verb_dict = {"transitive": {}, "intransitive": {}}

i = 1
for sentence in sentence_batch:
    doc = nlp(sentence)
    # Add items to the respective sets
    add_to_set(subjects, extract_general_subjects(doc))
    add_to_dict(verb_dict, extract_verbs_phrase(doc, nlp))
    i += 1

    if (i % 100 == 0):
        print(f"Processed {i} / {total_len}")
print(f"Processed {i-1} / {total_len}")

Processed 100 / 1012
Processed 200 / 1012
Processed 300 / 1012
Processed 400 / 1012
Processed 500 / 1012
Processed 600 / 1012
Processed 700 / 1012
Processed 800 / 1012
Processed 900 / 1012
Processed 1000 / 1012
Processed 1013 / 1012


In [27]:
subj_list = list(subjects.values())
subj_list.sort()
with open('subjects.txt', 'w') as file:
    for subject in subj_list:
        file.write(subject + '\n')
print("Subjects writing complete.")

Subjects writing complete.


In [28]:
import json

# Sort the keys within the nested dictionaries
sorted_verb_dict = {
    "transitive": {k: v for k, v in sorted(verb_dict["transitive"].items())},
    "intransitive": {k: v for k, v in sorted(verb_dict["intransitive"].items())},
}

with open('verb_phrases.json', 'w') as file:
    json.dump(sorted_verb_dict, file)
print("Transitive Verbs writing complete.")

Transitive Verbs writing complete.


### TEST AREA

In [None]:
def get_pos_tags(sentence):
    # Process the sentence
    doc = nlp(sentence)

    # Extract words and their POS tags
    word_info = [(token.text, token.pos_, token.dep_) for token in doc]

    return word_info

sent = "a man is eating pizza in the restaurant."

get_pos_tags(sent)

[('a', 'DET', 'det'),
 ('man', 'NOUN', 'nsubj'),
 ('is', 'AUX', 'aux'),
 ('eating', 'VERB', 'ROOT'),
 ('pizza', 'NOUN', 'dobj'),
 ('in', 'ADP', 'prep'),
 ('the', 'DET', 'det'),
 ('restaurant', 'NOUN', 'pobj'),
 ('.', 'PUNCT', 'punct')]

In [3]:
import json
with open('../raw_results/flore_200/verb_phrases.json', 'r') as f:
    j_dict = json.load(f)

tran_dict = j_dict['transitive']
intran_dict = j_dict['intransitive']
with open('verb_phrases.txt', 'w') as out:
    out.write("===Transitive Verb Phrases===\n")
    for verb, phrase_list in tran_dict.items():
        for phrase in phrase_list:
            out.write(f"{verb} {phrase}\n")
    out.write("===Intransitive Verb Phrases===\n")
    for verb, phrase_list in intran_dict.items():
        for phrase in phrase_list:
            out.write(f"{verb} {phrase}\n")