In [2]:
import spacy
import pandas as pd
from string import punctuation

import nltk
from nltk import ngrams
from nltk.corpus import wordnet
from collections import Counter
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

nlp = spacy.load("en_core_web_lg")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jeremychua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeremychua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jeremychua/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
df = pd.read_csv('data/test/preprocess_output.csv')

## A. Get contextual features

In [32]:
df['word_count'] = df['title'].str.split().str.len()
df['character_count'] = df['title'].str.len()

In [33]:
n = 2
# Function to generate n-grams for the title
def generate_ngrams(text, n):
    # Tokenize the text into words
    tokens = word_tokenize(text)
    # Generate n-grams
    return list(ngrams(tokens, n))

df['bigrams'] = df['title'].apply(lambda title: generate_ngrams(title, n))

In [34]:
def get_contextual_features(title):
    doc = nlp(title)
    lemma = []
    pos = []
    tag = []
    dep = []
    label = []
    
    for token in doc:
        if token.text in punctuation:
            continue
        lemma.append(token.lemma_)
        pos.append(token.pos_)
        tag.append(token.tag_)
        dep.append(token.dep_)
        label.append(token.ent_type_)
        
    return lemma, pos, tag, dep, label

In [35]:
lemma = []
pos = []
tag = []
dep = []
label = []

for idx, row in df.iterrows():
    title = row['title']
    l, p, t, d, la = get_contextual_features(title)
    lemma.append(l)
    pos.append(p)
    tag.append(t)
    dep.append(d)
    label.append(la)

df['lemma'] = lemma
df['pos'] = pos
df['tag'] = tag
df['dep'] = dep
df['label'] = label

## B. Get trigger words

In [83]:
def contains_digit(word):
    for char in word:
        if char.isdigit():
            return True
    return False

def extract_trigger_words(title):
    result = []
    pos_tag = ['ADJ', 'NOUN', 'VERB', 'ADV', 'NNP', 'PROPN'] 
    # tag_ls = ['NN', 'NNP', 'NNPS']
    label_type = ['TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']
    doc = nlp(title[0].lower()+title[1:])

    prev_label = None
    trigger_combi = None
    for token in doc:
        # print(token.text, token.ent_type_, token.pos_, token.tag_)
        if token.text in nlp.Defaults.stop_words or token.text in punctuation or contains_digit(token.text):
            continue
        if (token.pos_ in pos_tag) and (token.ent_type_ not in label_type): # or (token.tag_ in tag_ls)
            if prev_label:
                if prev_label == token.ent_type_:
                    trigger_combi += '_' + token.text
                    continue
                else:
                    if trigger_combi:
                        result.append(trigger_combi.lower())
                        trigger_combi = None
                        prev_label = None
                    result.append(token.text.lower())
                    continue
            
            if len(token.ent_type_) > 0:
                prev_label = token.ent_type_
                trigger_combi = token.text
            else:
                if trigger_combi:
                    result.append(trigger_combi.lower())
                    trigger_combi = None
                    prev_label = None
                result.append(token.text.lower())
        
    if trigger_combi:
        result.append(trigger_combi.lower())
        trigger_combi = None
        prev_label = None

    return result

In [84]:
extract_trigger_words("2019 ASEAN Para Games")

['asean_para_games']

In [85]:
df['trigger_words'] = df['title'].apply(extract_trigger_words)

In [86]:
df = df[df['trigger_words'].map(len) > 0]

## B. Get TF-IDF of trigger words

In [87]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['title'])
words = vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.sum(axis=0).A1
word_scores = dict(zip(words, tfidf_scores))

In [88]:
df['context_score'] = None
for index, row in df.iterrows():
    score = {}
    for word in row['trigger_words']:
        if word in word_scores:
            score[word] = word_scores[word]
        else:
            score[word] = 0
            
    if len(score) > 1:
        max_score = max(score.values())
        if max_score > 0:
            for key in score:
                score[key] = score[key] / max_score
    
    elif len(score) == 1:
        for key in score:
            score[key] = 1.0
        
    score = {k: v for k, v in score.items() if v > 0.5}
    
    df.at[index, 'context_score'] = score

In [89]:
df = df[df['context_score'].map(len) > 0]

In [90]:
df.to_csv('data/test/output_with_context_score.csv', index=False)

## C. Get category of trigger words 

In [110]:
def words_relatedness(word1, word2):
    # Process the words with spaCy
    token1 = nlp(word1)
    token2 = nlp(word2)

    max_similarity = 0
    
    # Iterate through all tokens of each word
    for t1 in token1:
        for t2 in token2:
            similarity = t1.similarity(t2)
            if similarity > max_similarity:
                max_similarity = similarity
    
    return max_similarity


In [113]:
word1 = words_relatedness('asean', 'sports')
word2 = words_relatedness('para', 'sports')
word3 = words_relatedness('games', 'sports')

avg = (word1 + word2 + word3) / 3
print(avg)

0.23937943577766418


  similarity = t1.similarity(t2)


In [114]:
print(words_relatedness('earthquake', 'environment'))
print(words_relatedness('earthquake', 'health'))

0.2699144184589386
0.16935458779335022


In [115]:
def find_representative_word(words_to_check):
    common_hypernyms = Counter()

    for word in words_to_check:
        synsets = wordnet.synsets(word)
        for synset in synsets:
            common_hypernyms.update(synset.hypernyms())

    if not common_hypernyms:
        return None
    
    return common_hypernyms.most_common(1)[0][0].lemma_names()[0]


In [164]:
context_threshold = 0.7

def get_category_row(current_categories, df_row):
    print(df_row['title'])
    cat_word_scores = {}
    scores = {word:score for word, score in df_row['context_score'].items() if score > context_threshold}
    for category in current_categories:
        for word in scores.keys():
            if len(word.split("_")) > 1:
                subwords = word.split("_")
                for subword in subwords:
                    cat_word_scores[(category, subword)] = words_relatedness(subword, category)
            else:
                cat_word_scores[(category, word)] = words_relatedness(word, category)
    
    print(sorted(cat_word_scores.items(), key=lambda x: x[1], reverse=True))
    max_score = max(cat_word_scores.values())
    new_category = [k[0] for k, v in cat_word_scores.items() if v == max_score][0]
    return current_categories, new_category


def get_category_df(df, current_categories):
    df['category'] = None
    for idx, row in df.iterrows():
        try:
            current_categories, category = get_category_row(current_categories, row)
            df.at[idx, 'category'] = category
        except Exception as e:
            print("Error at %d: %s" % (idx, e))
            print(current_categories)
            break
    
    return df, current_categories

In [169]:
categories = ["business", "politics", "technology", "entertainment", "sports", "lifestyle", "health", "science", "education", "editorial", "international", "environment", "crime"]

In [172]:
get_category_row(categories, df.iloc[45])

Western Air Express Flight 7
[(('international', 'western'), 0.4049071967601776), (('politics', 'western'), 0.3474322557449341), (('entertainment', 'western'), 0.30807703733444214), (('education', 'western'), 0.3065800964832306), (('environment', 'western'), 0.28821176290512085), (('science', 'western'), 0.266686350107193), (('sports', 'western'), 0.26235148310661316), (('technology', 'western'), 0.2398119568824768), (('lifestyle', 'western'), 0.23076753318309784), (('business', 'western'), 0.20046818256378174), (('editorial', 'western'), 0.1918196827173233), (('crime', 'western'), 0.1251412034034729), (('health', 'western'), 0.12266942858695984)]


(['business',
  'politics',
  'technology',
  'entertainment',
  'sports',
  'lifestyle',
  'health',
  'science',
  'education',
  'editorial',
  'international',
  'environment',
  'crime'],
 'international')

In [30]:
df, current_categories = get_category_df(df, categories)

KeyboardInterrupt: 

In [None]:
df.columns

Index(['title', 'word_count', 'character_count', 'bigrams', 'lemma', 'pos',
       'tag', 'dep', 'label', 'trigger_words', 'context_score', 'category'],
      dtype='object')

In [None]:
df = df[['title', 'word_count', 'character_count', 'bigrams', 'lemma', 'pos', 'tag', 'dep', 'label', 'context_score', 'trigger_words']]

In [None]:
df.to_csv('data/test/output_with_category.csv', index=False)