In [None]:
!pip install datasets
!pip install torch
!pip install spacy
!pip install spacy-transformers

In [None]:
!python -m spacy download en_core_web_trf

In [None]:
import spacy
from spacy.matcher import Matcher

In [None]:
def extract_general_subjects(doc):
    """Collect generic sentence subjects that are introduced by 'a' or 'the'.

    A noun chunk is kept when all of the following hold:
      * its root token's dependency label is ``nsubj`` or ``nsubjpass``;
      * its root token is not tagged ``PRON``;
      * its text starts (case-insensitively) with ``"a "`` or ``"the "``;
      * its text neither contains, nor is contained in, the text of any
        entity in ``doc.ents``.

    Args:
        doc: An object exposing ``ents`` and ``noun_chunks`` (a parsed spaCy
            ``Doc`` in this notebook).

    Returns:
        list[str]: The kept chunk texts with their first word (the
        determiner) removed, in document order. Original casing is preserved.
    """
    entity_texts = {ent.text for ent in doc.ents}
    subjects = []

    for chunk in doc.noun_chunks:
        head = chunk.root
        # Only grammatical subjects, and never pronouns.
        if head.dep_ not in ("nsubj", "nsubjpass") or head.pos_ == "PRON":
            continue

        phrase = chunk.text
        # Only chunks opened by the determiners 'a' / 'the'.
        if not phrase.lower().startswith(("a ", "the ")):
            continue

        # Drop anything overlapping a named entity (substring test in both directions).
        if any(name in phrase or phrase in name for name in entity_texts):
            continue

        # The prefix check above guarantees a space, so the split yields two parts.
        _determiner, remainder = phrase.split(" ", 1)
        subjects.append(remainder)

    return subjects

In [None]:
def extract_verbs_phrase(doc, nlp):
    """Extract lower-cased verb phrases from ``doc`` using POS-based patterns.

    Two token patterns are registered under the key ``"verb_phrases"``:
      * VERB, then the noun tail;
      * VERB, ADP, then the noun tail;
    where the noun tail is: optional DET, optional PRON, any number of ADJ,
    one or more NOUN.

    Matches are folded into the result list in the order the matcher returns
    them. When a new match strictly contains (as a character substring) a
    phrase that is already in the list, every such shorter phrase is replaced
    in place by the new match; otherwise the new match is appended.

    Args:
        doc: Parsed document to search; sliced as ``doc[start:end].text``.
        nlp: Pipeline whose ``vocab`` is handed to the ``Matcher``.

    Returns:
        list[str]: The collected phrases, lower-cased.
    """

    def noun_tail():
        # Fresh dicts on every call so the two patterns never share objects.
        return [
            {"POS": "DET", "OP": "?"},
            {"POS": "PRON", "OP": "?"},
            {"POS": "ADJ", "OP": "*"},
            {"POS": "NOUN", "OP": "+"},
        ]

    verb_then_object = [{"POS": "VERB"}] + noun_tail()
    verb_prep_object = [{"POS": "VERB"}, {"POS": "ADP"}] + noun_tail()

    phrase_matcher = Matcher(nlp.vocab)
    phrase_matcher.add("verb_phrases", [verb_then_object, verb_prep_object])

    collected = []
    for _match_id, start, end in phrase_matcher(doc):
        candidate = doc[start:end].text.lower()

        # Upgrade every strictly shorter phrase that the candidate contains.
        absorbed = False
        for idx, existing in enumerate(collected):
            if len(existing) < len(candidate) and existing in candidate:
                collected[idx] = candidate
                absorbed = True

        if not absorbed:
            collected.append(candidate)

    return collected

In [None]:
# Case-insensitive frequency counting (duplicates are merged into one key).
def add_to_set(case_insensitive_set, items):
    """Tally ``items`` into a case-insensitive frequency mapping, in place.

    Despite the name, ``case_insensitive_set`` is a dict: each item is
    lower-cased and the count stored under that key is incremented
    (starting from 0 when the key is new).

    Args:
        case_insensitive_set: Mutable mapping of lower-cased item -> count.
        items: Iterable of strings to count.

    Returns:
        None. The mapping is updated in place.
    """
    for entry in items:
        key = entry.lower()
        if key in case_insensitive_set:
            case_insensitive_set[key] += 1
        else:
            case_insensitive_set[key] = 1

In [None]:
from collections import Counter
def get_top_n_items(dictionary, n=150):
    """Return the ``n`` highest-count entries of a frequency mapping.

    Args:
        dictionary: Mapping of item -> count (anything ``Counter`` accepts).
        n: Maximum number of entries to return; defaults to 150.

    Returns:
        list[tuple]: ``(item, count)`` pairs ordered from most to least
        frequent, as produced by ``Counter.most_common``.
    """
    return Counter(dictionary).most_common(n)

In [None]:
# Load spaCy's transformer-based English pipeline ("en_core_web_trf"); the
# resulting `nlp` object is reused by every later cell that parses text.
nlp = spacy.load("en_core_web_trf")

In [None]:
# Sanity check on a sample sentence: print every noun chunk together with its
# root token, the root's part-of-speech tag and the root's dependency label.
doc = nlp("You like New York in Autumn.")
for chunk in doc.noun_chunks:
    print(f"Chunk is [{chunk.text}] and Chunk's root is [{chunk.root.text}] and Chunk's root POS is [{chunk.root.pos_}] and Chunk's root DEP is [{chunk.root.dep_}]")


In [None]:
# Input corpus: a plain-text file with one sentence per line.
# Each line is stripped of surrounding whitespace (including the newline).
with open("../../test_space/omcs-sentences-more-en.txt", 'r') as file1:
    sentences = [line.strip() for line in file1]

In [None]:
# total_len = len(sentences)
# total_len

In [None]:
# Work on the first 100 sentences only; `total_len` is used by the progress
# messages of the extraction loop.
sentence_batch = sentences[:100]
total_len = len(sentence_batch)
total_len

In [None]:
# Frequency tables (lower-cased phrase -> number of occurrences), filled by add_to_set.
subjects = {}
verb_phrases = {}

# `i` is the number of sentences processed so far. It is initialised to 0 so the
# final summary still works when `sentence_batch` is empty, and it is taken from
# enumerate(start=1) so the progress message reports the true count (the previous
# hand-maintained counter announced "1000" after only 999 sentences).
i = 0
for i, sentence in enumerate(sentence_batch, start=1):
    doc = nlp(sentence)
    # Tally the subjects and verb phrases found in this sentence.
    add_to_set(subjects, extract_general_subjects(doc))
    add_to_set(verb_phrases, extract_verbs_phrase(doc, nlp))

    if i % 1000 == 0:
        print(f"Processed {i} / {total_len}")
print(f"Processed {i} / {total_len}")

In [None]:
# Keep the 500 most frequent entries of each category, as (item, count) pairs.
top_subjects = get_top_n_items(subjects, 500)
top_verb_phrases = get_top_n_items(verb_phrases, 500)

In [None]:
# Persist the ranked subjects, one "<subject>: <count>" entry per line.
with open('../raw_results/omcs/subjects.txt', 'w') as file:
    file.writelines(f"{subject}: {freq}\n" for subject, freq in top_subjects)
print("Subjects writing complete.")

In [None]:
# Persist the ranked verb phrases, one "<phrase>: <count>" entry per line.
with open('../raw_results/omcs/verb_phrases.txt', 'w') as file:
    file.writelines(f"{vp}: {freq}\n" for vp, freq in top_verb_phrases)
print("Verb phrases writing complete.")

In [None]:
import requests
from collections import defaultdict

def get_related_verbs(nouns, top_k=5, timeout=10):
    """Look up what each noun is "CapableOf" according to the ConceptNet API.

    For every noun, the ``/query`` endpoint is asked for edges with relation
    ``/r/CapableOf`` starting at the noun's English concept. The label of each
    edge's ``end`` node is paired with the edge weight, and the ``top_k``
    heaviest pairs are kept.

    Args:
        nouns: Iterable of noun strings (iterating a dict uses its keys).
        top_k: Number of highest-weight results to keep per noun (default 5).
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot block the notebook forever.

    Returns:
        defaultdict[str, list[tuple[str, float]]]: Maps each successfully
        queried noun (as originally spelled) to its ``(label, weight)`` pairs
        sorted by descending weight. Nouns whose request failed are absent.
    """
    base_url = "http://api.conceptnet.io/query"
    results = defaultdict(list)

    for noun in nouns:
        # Concept URIs are lower-case and use underscores instead of spaces,
        # so "guide dog" must be requested as /c/en/guide_dog.
        term = "_".join(noun.lower().split())
        if not term:
            continue  # nothing to look up for an empty/whitespace-only noun

        params = {
            "rel": "/r/CapableOf",
            "start": f"/c/en/{term}",
            "limit": 1000  # Adjust limit as needed to fetch more results initially
        }

        try:
            response = requests.get(base_url, params=params, timeout=timeout)
            payload = response.json() if response.status_code == 200 else None
        except (requests.RequestException, ValueError) as exc:
            # Network failure, timeout or a body that is not valid JSON:
            # report it and move on instead of aborting the whole run.
            print(f"Failed to fetch data for {noun}: {exc}")
            continue

        if payload is None:
            print(f"Failed to fetch data for {noun}")
            continue

        pairs = [
            (edge.get("end", {}).get("label", ""), edge.get("weight", 0))
            for edge in payload.get("edges", [])
        ]
        # Sort by weight in descending order and keep only the top_k entries.
        results[noun] = sorted(pairs, key=lambda pair: pair[1], reverse=True)[:top_k]

    return results

In [None]:
# Query ConceptNet for every key of `subjects`, then list, per noun, the
# returned verb labels together with their edge weights.
related_verbs_ = get_related_verbs(subjects)
for noun, verbs in related_verbs_.items():
    print(f"{noun.capitalize()} can:")
    for verb, weight in verbs:
        print(f"  - {verb} (Weight: {weight})")

In [None]:
# Print only the verb labels, one per line, without the nouns or weights.
for verbs in related_verbs_.values():
    for verb, weight in verbs:
        print(f"{verb}")

In [None]:
# Build a sentence "<Noun> can <label>" for every ConceptNet result, parse it
# and collect the verb phrases extracted from it (duplicates collapse in the set).
filtered_verbs = set()
for noun, verbs in related_verbs_.items():
    for verb in verbs:
        # Each entry is a (label, weight) tuple; only the label is used here.
        sent = f"{noun.capitalize()} can {verb[0]}"
        doc = nlp(sent)
        # extract_verbs_phrase returns a list, which is unhashable, so
        # set.add() raises TypeError; update() merges the individual phrases.
        filtered_verbs.update(extract_verbs_phrase(doc, nlp))

# NOTE(review): absolute, user-specific output path; the other cells write to
# the relative '../raw_results/omcs/' directory. Confirm which is intended.
with open('/home/liu/test_prior/raw_results/omcs/filtered/verb_phrases_filtered_v2_add_5.txt', "w") as file:
    # Sorted so the file content is stable across runs (set order is arbitrary).
    for verb in sorted(filtered_verbs):
        file.write(verb + "\n")