In [None]:
import spacy
import pandas as pd
from string import punctuation

import nltk
from nltk import ngrams
from nltk.corpus import wordnet
from collections import Counter
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources used below (the recorded log shows this is a no-op once
# they are up to date). 'punkt_tab' is the tokenizer data that word_tokenize loads on
# NLTK >= 3.8.2; older releases use 'punkt', so both are requested to keep a fresh
# "Restart & Run All" working across versions.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Large English spaCy pipeline: supplies the lemmas, POS tags, dependency labels and
# named entities used throughout this notebook.
nlp = spacy.load("en_core_web_lg")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jeremychua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeremychua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jeremychua/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load the preprocessed titles; every cell below reads the 'title' column of this frame.
df = pd.read_csv('data/test/preprocess_output.csv')

## A. Get contextual features

In [None]:
# Length features of each title: number of whitespace-separated words and number of characters.
df['word_count'] = df['title'].str.split().str.len()
df['character_count'] = df['title'].str.len()

In [None]:
# Size of the n-grams built from each title (2 -> bigrams).
n = 2


def generate_ngrams(text, n):
    """Return the n-grams of `text` as a list.

    Args:
        text: String to split into tokens with NLTK's ``word_tokenize``.
        n: Number of consecutive tokens per n-gram.

    Returns:
        A list holding every n-gram produced by ``nltk.ngrams`` over the tokens.
    """
    words = word_tokenize(text)
    grams = ngrams(words, n)
    return list(grams)

# One list of n-grams per title; `n` is forwarded to generate_ngrams as its second argument.
df['bigrams'] = df['title'].apply(generate_ngrams, args=(n,))

In [None]:
def get_contextual_features(title):
    """Collect per-token linguistic annotations for a title.

    The title is run through the module-level spaCy pipeline ``nlp``. Tokens whose
    text is contained in ``string.punctuation`` are left out; for every remaining
    token the lemma, coarse POS, fine-grained tag, dependency label and entity
    type are recorded.

    Args:
        title: Text to analyse.

    Returns:
        A tuple ``(lemma, pos, tag, dep, label)`` of five parallel lists, one
        entry per kept token. ``label`` holds ``token.ent_type_``, which is an
        empty string for tokens outside any entity.
    """
    kept_tokens = [token for token in nlp(title) if token.text not in punctuation]

    lemma = [token.lemma_ for token in kept_tokens]
    pos = [token.pos_ for token in kept_tokens]
    tag = [token.tag_ for token in kept_tokens]
    dep = [token.dep_ for token in kept_tokens]
    label = [token.ent_type_ for token in kept_tokens]

    return lemma, pos, tag, dep, label

In [None]:
# Run the spaCy feature extraction once per title and store each kind of annotation
# in its own column, one list per row.
lemma, pos, tag, dep, label = [], [], [], [], []
feature_lists = (lemma, pos, tag, dep, label)

for title in df['title']:
    # get_contextual_features returns its five lists in this same order.
    for collected, values in zip(feature_lists, get_contextual_features(title)):
        collected.append(values)

for column_name, collected in zip(('lemma', 'pos', 'tag', 'dep', 'label'), feature_lists):
    df[column_name] = collected

## B. Get trigger words

In [None]:
def contains_digit(word):
    """Return True when at least one character of `word` is a digit (``str.isdigit``)."""
    return any(character.isdigit() for character in word)

def extract_trigger_words(title):
    """Extract lower-cased trigger words from a title.

    The title (with its first character lower-cased) is parsed by the module-level
    spaCy pipeline ``nlp``. A token is ignored when its text is a spaCy stop word
    (exact, case-sensitive match), is contained in ``string.punctuation`` or
    contains a digit. Of the remaining tokens, only those whose coarse POS is in
    ``pos_tag`` or whose fine-grained tag is in ``tag_ls`` are kept, and only if
    their entity type is not one of the numeric/temporal labels in ``label_type``.

    Kept tokens that follow each other with the same non-empty entity type are
    merged into a single trigger joined with ``'_'`` (ignored tokens in between do
    not break a run). Every other kept token becomes a trigger on its own.

    Args:
        title: Title text. Anything that is not a non-empty string (for example
            NaN from a missing CSV cell) produces no triggers.

    Returns:
        List of lower-cased trigger words in order of appearance; may be empty.
    """
    # Missing or empty titles have no first character to lower-case below.
    if not isinstance(title, str) or not title:
        return []

    result = []
    # NOTE(review): 'NNP' looks like a fine-grained tag rather than a coarse POS value,
    # so it presumably never matches token.pos_; NNP tokens are still admitted via tag_ls.
    pos_tag = ['ADJ', 'NOUN', 'VERB', 'ADV', 'NNP']
    tag_ls = ['NN', 'NNP']
    label_type = ['TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']
    doc = nlp(title[0].lower() + title[1:])

    prev_label = None      # entity type of the run being accumulated, if any
    trigger_combi = None   # text of that run, joined with '_'
    for token in doc:
        if token.text in nlp.Defaults.stop_words or token.text in punctuation or contains_digit(token.text):
            continue
        qualifies = (token.pos_ in pos_tag) or (token.tag_ in tag_ls)
        if not qualifies or token.ent_type_ in label_type:
            continue

        label = token.ent_type_

        # Same entity type as the pending run: extend it.
        if trigger_combi is not None and label == prev_label:
            trigger_combi += '_' + token.text
            continue

        # Anything else closes the pending run first, so output order is preserved.
        if trigger_combi is not None:
            result.append(trigger_combi.lower())
            trigger_combi = None
            prev_label = None

        if label:
            # An entity token always opens a new run, including when it directly
            # follows a run with a different label (previously it was emitted alone,
            # so 'London Bridge' after a PERSON was split into two triggers).
            prev_label = label
            trigger_combi = token.text
        else:
            result.append(token.text.lower())

    # Flush a run that reaches the end of the title.
    if trigger_combi is not None:
        result.append(trigger_combi.lower())

    return result

In [None]:
# Spot-check on a single title.
# NOTE(review): the output recorded below shows 'st enoch' joined with a space and a per-token
# debug print, while the current function joins entity runs with '_' and has the print
# commented out — the saved output appears to predate the current code; re-run to refresh it.
extract_trigger_words("Glasgow St Enoch rail accident")

glasgow 
St PERSON
Enoch PERSON
rail 
accident 


['glasgow', 'st enoch', 'rail', 'accident']

In [None]:
# Extract the trigger words of every title; each call runs the spaCy pipeline once.
df['trigger_words'] = df['title'].apply(extract_trigger_words)

2006 DATE
Pangandaran GPE
earthquake 
and 
tsunami 
battle 
of 
Santa GPE
Clara GPE
( 
1927 DATE
) 
siege 
of 
Pondicherry PERSON
( 
1793 DATE
) 
battle 
of 
Leuthen ORG
glasgow 
St PERSON
Enoch PERSON
rail 
accident 
murder 
of 
Lee PERSON
Rigby PERSON
siege 
of 
Bayonne ORG
air 
Florida GPE
Flight 
90 
2015 DATE
ASEAN ORG
Para 
Games 
lufthansa 
CityLine PRODUCT
Flight PRODUCT
5634 PRODUCT
first ORDINAL
Taiwan ORG
Strait ORG
Crisis ORG
night TIME
of 
Champions GPE
( 
2014 DATE
) 
kursk PRODUCT
submarine 
disaster 
death 
of 
Christopher PERSON
Alder PERSON
death 
of 
Eric PERSON
Garner PERSON
gas 
attacks 
at 
Hulluch ORG
crazy 
Nights WORK_OF_ART
World WORK_OF_ART
Tour 
death 
of 
Michael PERSON
Jackson PERSON
battle 
of 
Saint LOC
- LOC
Mihiel LOC
2011 DATE
Sri ORG
Lanka ORG
Premier 
League 
chappaquiddick ORG
incident ORG
2012–15 CARDINAL
unrest 
in 
Romania GPE
2016 DATE
Masters 
Tournament 
operation 
Nasr PERSON
thanh PERSON
Hóa PERSON
Bridge PERSON
birmingham GPE
campaign 
we

In [None]:
# Drop titles that produced no trigger words. `.copy()` makes the result an independent
# frame, so the column assignments in later cells do not act on a slice of the original
# frame (which is what raises pandas' SettingWithCopyWarning).
df = df[df['trigger_words'].map(len) > 0].copy()

## B2. Get TF-IDF of trigger words

In [None]:
# Fit TF-IDF on the extracted trigger words themselves instead of on the raw titles.
# The default analyzer re-tokenises each title on word boundaries, so entity triggers
# that were joined with '_' (e.g. 'st_enoch' from "St Enoch") never appeared in the
# vocabulary and were always looked up as 0 afterwards. With a callable analyzer each
# row's trigger-word list is used as its token sequence as-is (already lower-cased).
vectorizer = TfidfVectorizer(analyzer=lambda trigger_words: trigger_words)
tfidf_matrix = vectorizer.fit_transform(df['trigger_words'])
words = vectorizer.get_feature_names_out()
# Corpus-level weight of each term: its TF-IDF value summed over all rows.
tfidf_scores = tfidf_matrix.sum(axis=0).A1
word_scores = dict(zip(words, tfidf_scores))

In [None]:
# Per-row relevance of each trigger word, expressed relative to the strongest
# trigger of the same row.
df['context_score'] = None
for index, trigger_words in df['trigger_words'].items():
    # Corpus-level TF-IDF weight per trigger; words missing from the vocabulary get 0.
    score = {word: word_scores.get(word, 0) for word in trigger_words}

    if len(score) == 1:
        # A lone trigger is always kept with full weight.
        score = dict.fromkeys(score, 1.0)
    elif len(score) > 1:
        max_score = max(score.values())
        # Scale to the row maximum; left untouched when every weight is 0.
        if max_score > 0:
            score = {word: value / max_score for word, value in score.items()}

    # Keep only the triggers whose (scaled) score exceeds 0.5.
    df.at[index, 'context_score'] = {
        word: value for word, value in score.items() if value > 0.5
    }

In [None]:
# Drop rows whose context_score dict ended up empty. `.copy()` detaches the result from
# the frame it was sliced from, so get_category_df can later add its 'category' column
# without triggering pandas' SettingWithCopyWarning.
df = df[df['context_score'].map(len) > 0].copy()

In [None]:
# Checkpoint the frame, including the per-row context scores, without the index column.
# NOTE(review): the list- and dict-valued columns are presumably written as their string
# representation in CSV — confirm how they are parsed before reading this file back.
df.to_csv('data/test/output_with_context_score.csv', index=False)

## C. Get category of trigger words 

In [None]:
def words_relatedness(word1, word2):
    """Return the highest Wu-Palmer similarity between any senses of two words.

    Every WordNet synset of `word1` is compared with every synset of `word2`
    using ``wup_similarity``; pairs for which no similarity is available
    (``None``) are ignored.

    Args:
        word1: First word to look up in WordNet.
        word2: Second word to look up in WordNet.

    Returns:
        The largest similarity found, or 0 when either word has no synsets or no
        pair yields a positive score.
    """
    pair_scores = (
        first_sense.wup_similarity(second_sense)
        for first_sense in wordnet.synsets(word1)
        for second_sense in wordnet.synsets(word2)
    )
    # Start from 0 so that "no usable pair" and "nothing above 0" both yield 0.
    return max([0] + [score for score in pair_scores if score is not None])


In [None]:
def find_representative_word(words_to_check):
    """Pick a word that represents a group of words via their WordNet hypernyms.

    The direct hypernyms of every synset of every word are tallied, and the
    hypernym seen most often wins.

    Args:
        words_to_check: Iterable of words to look up in WordNet.

    Returns:
        The first lemma name of the most frequent hypernym, or ``None`` when no
        hypernym was found for any of the words.
    """
    hypernym_counts = Counter(
        hypernym
        for word in words_to_check
        for synset in wordnet.synsets(word)
        for hypernym in synset.hypernyms()
    )

    if len(hypernym_counts) == 0:
        return None

    top_hypernym, _occurrences = hypernym_counts.most_common(1)[0]
    return top_hypernym.lemma_names()[0]


In [None]:
# NOTE(review): presumably the minimum context score a trigger word should reach before
# it is matched against the categories. It is currently not applied: get_category_row
# compares every word of the row's 'context_score' — confirm the intended behaviour.
context_threshold = 0.8

def get_category_row(current_categories, df_row):
    """Choose the category most related to a row's trigger words.

    Every (category, word) pair, with words taken from the keys of
    ``df_row['context_score']``, is scored with ``words_relatedness``. The category
    of the highest-scoring pair wins; on ties the earliest pair wins, iterating
    categories in the given order and, within a category, words in dict order.

    Args:
        current_categories: Sequence of candidate category names.
        df_row: Mapping-like row exposing a ``'context_score'`` dict keyed by
            trigger word.

    Returns:
        ``(current_categories, category)``: the category list exactly as passed in
        and the chosen category, or ``None`` for the category when there is nothing
        to compare (no categories or no words) instead of failing on an empty
        ``max()``.
    """
    words = list(df_row['context_score'].keys())

    best_category = None
    best_score = None
    for category in current_categories:
        for word in words:
            relatedness = words_relatedness(word, category)
            # Strict '>' keeps the first pair that reaches the maximum.
            if best_score is None or relatedness > best_score:
                best_category, best_score = category, relatedness

    return current_categories, best_category


def get_category_df(df, current_categories):
    """Assign a category to every row of `df`.

    A ``'category'`` column is added to `df` itself (the frame is modified in
    place) and filled row by row with the result of ``get_category_row``. On the
    first row that raises, the error and the current category list are printed and
    processing stops; that row and all later rows keep ``None`` as category.

    Args:
        df: DataFrame whose rows are accepted by ``get_category_row``.
        current_categories: Candidate categories passed to ``get_category_row``.

    Returns:
        ``(df, current_categories)``: the same frame with the new column and the
        category list as last returned by ``get_category_row``.
    """
    df['category'] = None
    for idx, row in df.iterrows():
        try:
            current_categories, category = get_category_row(current_categories, row)
            df.at[idx, 'category'] = category
        except Exception as e:
            # %s rather than %d: index labels are not necessarily integers, and a
            # failing format here would mask the original error with a TypeError.
            print("Error at %s: %s" % (idx, e))
            print(current_categories)
            break

    return df, current_categories

In [None]:
# Candidate categories a title can be assigned to.
categories = [
    "business",
    "politics",
    "technology",
    "entertainment",
    "sports",
    "world",
    "lifestyle",
    "health",
    "science",
    "education",
    "editorial",
    "international",
    "environment",
]

In [None]:
# Spot-check the categorisation on the first row.
# NOTE(review): the output recorded below lists 'opinions' and 'disaster', which are not
# in `categories` as defined above — the saved output appears stale; re-run to refresh it.
get_category_row(categories, df.iloc[0])

(['business',
  'politics',
  'technology',
  'entertainment',
  'sports',
  'world',
  'lifestyle',
  'health',
  'science',
  'opinions',
  'disaster'],
 'disaster')

In [None]:
# Categorise every row; get_category_df also returns the category list it ended up with.
df, current_categories = get_category_df(df, categories)

In [None]:
# Inspect the columns available before selecting the ones to export.
df.columns

Index(['title', 'word_count', 'character_count', 'bigrams', 'lemma', 'pos',
       'tag', 'dep', 'label', 'trigger_words', 'context_score', 'category'],
      dtype='object')

In [None]:
# Fix the column set and order for export. 'category' is included so that the file
# written next (output_with_category.csv) actually contains the categories computed
# above; the previous selection dropped that column.
df = df[['title', 'word_count', 'character_count', 'bigrams', 'lemma', 'pos', 'tag', 'dep', 'label', 'context_score', 'trigger_words', 'category']]

In [None]:
# Write the final frame (columns as selected in the previous cell) without the index column.
df.to_csv('data/test/output_with_category.csv', index=False)