In [238]:
import spacy
import pandas as pd
from string import punctuation

# Load spaCy's small English pipeline once; the extraction helpers in the
# following cells call it through the module-level name `nlp`.
nlp = spacy.load("en_core_web_sm")


In [239]:
# Load the preprocessed input; the following cells read its 'title' column.
df = pd.read_csv('data/test/preprocess_output.csv')

## A. Get contextual features

In [240]:
def get_contextual_features(title):
    """Extract per-token linguistic features from a title with spaCy.

    Tokens that consist only of punctuation characters are skipped, so the
    five returned lists are aligned with each other (one entry per kept
    token) but not necessarily with the raw spaCy token positions.

    Args:
        title: Text to analyse; passed unchanged to the module-level ``nlp``
            pipeline.

    Returns:
        A 5-tuple ``(lemma, pos, tag, dep, label)`` of equal-length lists
        holding, for every kept token, ``token.lemma_``, ``token.pos_``,
        ``token.tag_``, ``token.dep_`` and ``token.ent_type_``.
    """
    lemma = []
    pos = []
    tag = []
    dep = []
    label = []

    for token in nlp(title):
        # `token.text in punctuation` is a substring test against the
        # punctuation string, so multi-character tokens such as "..." or
        # "--" were not recognised; test every character instead.
        if all(char in punctuation for char in token.text):
            continue
        lemma.append(token.lemma_)
        pos.append(token.pos_)
        tag.append(token.tag_)
        dep.append(token.dep_)
        label.append(token.ent_type_)

    return lemma, pos, tag, dep, label

In [241]:
# Run the extractor once per title; every result is the 5-tuple
# (lemma, pos, tag, dep, label) returned by get_contextual_features.
features_per_title = [get_contextual_features(title) for title in df['title']]

# Split the tuples into one list per feature, keeping row order.
lemma = [features[0] for features in features_per_title]
pos = [features[1] for features in features_per_title]
tag = [features[2] for features in features_per_title]
dep = [features[3] for features in features_per_title]
label = [features[4] for features in features_per_title]

# Attach each feature list as its own column.
df['lemma'] = lemma
df['pos'] = pos
df['tag'] = tag
df['dep'] = dep
df['label'] = label

## B. Get trigger words

In [242]:
def contains_digit(word):
    """Return True when at least one character of ``word`` is a digit.

    Args:
        word: String to inspect.

    Returns:
        ``True`` if ``str.isdigit`` holds for any character, else ``False``
        (including for the empty string).
    """
    return any(character.isdigit() for character in word)

def extract_trigger_words(title):
    """Return the lower-cased candidate trigger words of a title.

    A token is kept when it is not a stop word, not punctuation, contains no
    digit, has a part-of-speech in ``pos_tag`` or a fine-grained tag in
    ``tag_ls``, and its entity type is not one of the numeric/time types in
    ``label_type``.

    Args:
        title: Title text. Empty or non-string values (for example NaN read
            from a CSV) yield an empty list instead of raising.

    Returns:
        List of kept token texts, lower-cased, in document order.
    """
    # `title[0]` below raised IndexError on "" and TypeError on NaN/None;
    # such titles simply have no trigger words.
    if not isinstance(title, str) or not title:
        return []

    result = []
    # NOTE(review): 'NNP' looks like a fine-grained tag rather than a coarse
    # POS value, so it probably never matches token.pos_; kept unchanged.
    pos_tag = ['ADJ', 'NOUN', 'VERB', 'ADV', 'NNP']
    tag_ls = ['NN', 'NNP']
    label_type = ['TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']
    stop_words = nlp.Defaults.stop_words

    # Only the first character is lower-cased before parsing -- presumably so
    # a sentence-initial capital does not influence tagging (TODO confirm).
    doc = nlp(title[0].lower() + title[1:])

    for token in doc:
        text = token.text
        # NOTE(review): the stop-word test is case-sensitive, so capitalised
        # stop words mid-title are not filtered here; left as-is because
        # lowering it would also drop tokens such as "US".
        if text in stop_words:
            continue
        # Character-wise test: `text in punctuation` is a substring check
        # and misses multi-character punctuation such as "..." or "--".
        if all(char in punctuation for char in text):
            continue
        if contains_digit(text):
            continue
        if (token.pos_ in pos_tag or token.tag_ in tag_ls) and token.ent_type_ not in label_type:
            result.append(text.lower())

    return result

In [243]:
# Sanity check of the extractor on a sample title.
extract_trigger_words("Michael Jackson memorial service")

['michael', 'jackson', 'memorial', 'service']

In [244]:
# Store the trigger words of every title in a new 'trigger_words' column.
df['trigger_words'] = df['title'].apply(extract_trigger_words)

In [245]:
# Keep only rows that produced at least one trigger word. The explicit copy
# makes the result an independent frame, so adding columns to it in later
# cells does not trigger pandas' SettingWithCopyWarning.
df = df[df['trigger_words'].map(len) > 0].copy()

## B (cont.). Score trigger words with TF-IDF computed over the titles

In [246]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit TF-IDF on the raw titles (not on the extracted trigger words).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['title'])
words = vectorizer.get_feature_names_out()
# Sum every term's weight over all titles and flatten the result to 1-D.
tfidf_scores = tfidf_matrix.sum(axis=0).A1
# Vocabulary term -> summed TF-IDF weight, looked up per trigger word below.
word_scores = dict(zip(words, tfidf_scores))

In [247]:
df['context_score'] = None
for index, trigger_words in df['trigger_words'].items():
    # Raw summed TF-IDF weight per trigger word; words missing from the
    # vectorizer vocabulary count as 0.
    raw_scores = {word: word_scores.get(word, 0) for word in trigger_words}

    # Scale by the row's best score so the top word gets 1.0; rows whose
    # words all score 0 are left unscaled.
    peak = max(raw_scores.values(), default=0)
    if peak > 0:
        raw_scores = {word: value / peak for word, value in raw_scores.items()}

    # Keep only the words scoring above half of the row's best word.
    df.at[index, 'context_score'] = {
        word: value for word, value in raw_scores.items() if value > 0.5
    }

In [248]:
# Keep only rows whose context_score dict is non-empty. The explicit copy
# makes the result an independent frame, so adding a column to it later
# does not trigger pandas' SettingWithCopyWarning.
df = df[df['context_score'].map(len) > 0].copy()

In [249]:
# Persist the intermediate result (features, trigger words, context scores)
# without the index column.
df.to_csv('data/test/output_with_context_score.csv', index=False)

## C. Get category of trigger words 

In [250]:
import nltk
# Make sure the WordNet corpus used by the similarity helpers below is
# available locally.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jeremychua/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [251]:
from nltk.corpus import wordnet
from collections import Counter

In [252]:
def words_relatedness(word1, word2):
    """Return the highest ``wup_similarity`` between any senses of two words.

    Args:
        word1: First word to look up in WordNet.
        word2: Second word to look up in WordNet.

    Returns:
        The largest similarity over all synset pairs, or ``0`` when either
        word has no synsets or no pair yields a positive score.
    """
    # Look both words up once: the original re-queried WordNet for `word2`
    # on every iteration of the outer loop although the result never changes.
    synsets1 = wordnet.synsets(word1)
    synsets2 = wordnet.synsets(word2)

    max_similarity = 0
    for synset1 in synsets1:
        for synset2 in synsets2:
            similarity = synset1.wup_similarity(synset2)
            # wup_similarity may return None; such pairs are ignored.
            if similarity is not None and similarity > max_similarity:
                max_similarity = similarity

    return max_similarity


In [253]:
def find_representative_word(words_to_check):
    """Pick the hypernym shared most often by the senses of the given words.

    Args:
        words_to_check: Iterable of words to look up in WordNet.

    Returns:
        The first lemma name of the most frequent direct hypernym across all
        synsets of all words, or ``None`` when no synset has a hypernym.
    """
    # Tally every direct hypernym of every sense of every word.
    hypernym_counts = Counter(
        hypernym
        for word in words_to_check
        for synset in wordnet.synsets(word)
        for hypernym in synset.hypernyms()
    )

    if len(hypernym_counts) == 0:
        return None

    top_hypernym, _occurrences = hypernym_counts.most_common(1)[0]
    return top_hypernym.lemma_names()[0]


In [254]:
def get_category_row(current_categories, df_row, similarity_threshold=0.8):
    """Assign a category to one row, creating a new category when needed.

    The row's scored trigger words are compared against every known
    category with ``words_relatedness``. If the best pair scores above
    ``similarity_threshold`` that category is reused. Otherwise a new
    category is derived with ``find_representative_word`` (falling back to
    the row's highest-scoring trigger word) and appended to the list.

    Args:
        current_categories: List of known category names; mutated in place
            when a new category is created.
        df_row: Mapping-like row exposing a ``'context_score'`` dict of
            trigger word -> score.
        similarity_threshold: Minimum relatedness (exclusive) required to
            reuse an existing category. Defaults to the original 0.8.

    Returns:
        Tuple ``(current_categories, category)``. ``category`` is ``None``
        when the row has no scored trigger words.
    """
    context_score = df_row['context_score']

    # With no scored words there is nothing to compare or to fall back on;
    # previously max() raised ValueError on the empty collection here.
    if not context_score:
        return current_categories, None

    if len(current_categories) > 0:
        cat_word_scores = {
            (category, word): words_relatedness(word, category)
            for category in current_categories
            for word in context_score.keys()
        }
        # max() returns the first best pair in insertion order, matching the
        # original "first key whose value equals the maximum" selection.
        best_pair = max(cat_word_scores, key=cat_word_scores.get)
        if cat_word_scores[best_pair] > similarity_threshold:
            return current_categories, best_pair[0]

    new_category = find_representative_word(context_score.keys())
    if new_category is None:
        # No usable hypernym: use the row's highest-scoring trigger word.
        new_category = max(context_score, key=context_score.get)

    if new_category not in current_categories:
        current_categories.append(new_category)

    return current_categories, new_category


def get_category_df(df, current_categories):
    """Assign a category to every row of ``df``.

    Rows are processed in order with ``get_category_row``, so categories
    created for earlier rows can be reused by later ones. On the first
    error the failing index and the categories so far are printed and
    processing stops; the remaining rows keep ``None`` as their category.

    Args:
        df: Frame with a ``'context_score'`` column; a ``'category'`` column
            is added to it in place.
        current_categories: Initial list of category names.

    Returns:
        Tuple ``(df, current_categories)`` with the same frame object and
        the (possibly extended) category list.
    """
    df['category'] = None
    for idx, row in df.iterrows():
        try:
            current_categories, category = get_category_row(current_categories, row)
            df.at[idx, 'category'] = category
        except Exception as e:
            # %s instead of %d: index labels are not guaranteed to be
            # integers, and %d raised TypeError inside this handler,
            # hiding the original error.
            print("Error at %s: %s" % (idx, e))
            print(current_categories)
            break

    return df, current_categories

In [255]:
# Categorise every row starting from an empty category list. get_category_df
# adds the 'category' column to `df` in place and returns that same frame.
category_df, current_categories = get_category_df(df, [])

In [256]:
# Rebind `df` to the feature columns only; `category_df` still refers to the
# full frame, including 'category'.
df = df[['title', 'lemma', 'pos', 'tag', 'dep', 'label', 'context_score', 'trigger_words']]

In [257]:
# Persist the categorised frame without the index column.
category_df.to_csv('data/test/output_with_category.csv', index=False)