In [1]:
# Execute the shared utilities notebook inside this kernel's namespace (-i).
# NOTE(review): `spacy` is used further down without an import in this notebook,
# so it is presumably provided by lang_utils — confirm.
%run -i "../util/lang_utils.ipynb"

In [2]:
from datasets import load_dataset
from nltk import word_tokenize
from math import ceil
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [3]:
# Load the train and test splits of SetFit/bbc-news and convert each to a DataFrame.
train_dataset, test_dataset = (
    load_dataset("SetFit/bbc-news", split=split_name)
    for split_name in ("train", "test")
)
train_df, test_df = train_dataset.to_pandas(), test_dataset.to_pandas()
train_df.head()

Unnamed: 0,text,label,label_text
0,wales want rugby league training wales could f...,2,sport
1,china aviation seeks rescue deal scandal-hit j...,1,business
2,rock band u2 break ticket record u2 have smash...,3,entertainment
3,markets signal brazilian recovery the brazilia...,1,business
4,tough rules for ringtone sellers firms that fl...,0,tech


In [4]:
# Unigram TF-IDF model fitted on the training articles. scikit-learn's built-in
# English stop words are dropped; min_df=2 / max_df=0.95 bound how rare or how
# common a term may be across documents.
vectorizer = TfidfVectorizer(
    stop_words='english', min_df=2, max_df=0.95
).fit(train_df['text'])
vectorizer

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [5]:
# Sort a coordinate matrix by TF-IDF score
def sort_data_tfidf_score(coord_matrix):
    """Rank the stored entries of a coordinate-format matrix by descending score.

    Args:
        coord_matrix: object exposing parallel ``col`` and ``data`` sequences,
            where ``data[i]`` is the score stored at column ``col[i]``
            (callers in this notebook pass the result of ``.tocoo()``).

    Returns:
        list of ``(column_index, score)`` tuples, highest score first; entries
        with equal scores are ordered by descending column index.
    """
    def by_score_then_column(entry):
        column_index, score = entry
        return (score, column_index)

    ranked = list(zip(coord_matrix.col, coord_matrix.data))
    ranked.sort(key=by_score_then_column, reverse=True)
    return ranked

In [6]:
# Retrieve top keywords from sorted_vector
def get_keyword_strings(vectorizer, num_words, sorted_vector):
    """Return the feature names of the leading ``num_words`` entries.

    Args:
        vectorizer: object whose ``get_feature_names_out()`` returns a sequence
            in which position is the feature index and value is the feature name.
        num_words: number of leading entries of ``sorted_vector`` to keep.
        sorted_vector: list of ``(index, score)`` tuples, already ordered.

    Returns:
        list of feature names, in the same order as the kept entries.
    """
    feature_names = vectorizer.get_feature_names_out()
    top_entries = sorted_vector[:num_words]
    return [feature_names[feature_index] for feature_index, _score in top_entries]

In [7]:
# Extract top keywords from input text
def get_keywords_simple(vectorizer, input_text, num_words=10):
    """Return up to ``num_words`` highest-scoring vocabulary terms for one text.

    Args:
        vectorizer: fitted vectorizer; its ``transform`` is called on a
            one-element list and the result converted with ``.tocoo()``.
        input_text: the document to extract keywords from.
        num_words: maximum number of keywords to return (default 10).

    Returns:
        list of feature names ordered by descending score.
    """
    tfidf_row = vectorizer.transform([input_text]).tocoo()
    ranked_entries = sort_data_tfidf_score(tfidf_row)
    return get_keyword_strings(vectorizer, num_words, ranked_entries)

In [8]:
# Try the unigram extractor on the first article of the test split.
example = test_df["text"].iloc[0]
print(example)
keywords_example = get_keywords_simple(vectorizer, example, 10)
print(keywords_example)

carry on star patsy rowlands dies actress patsy rowlands  known to millions for her roles in the carry on films  has died at the age of 71.  rowlands starred in nine of the popular carry on films  alongside fellow regulars sid james  kenneth williams and barbara windsor. she also carved out a successful television career  appearing for many years in itv s well-loved comedy bless this house. rowlands died in hove on saturday morning  her agent said.  born in january 1934  rowlands won a scholarship to the guildhall school of speech and drama scholarship when she was just 15.  after spending several years at the players theatre in london  she made her film debut in 1963 in tom jones  directed by tony richardson. she made her first carry on film in 1969 where she appeared in carry on again doctor. rowlands played the hard-done-by wife or the put-upon employee as a regular carry on star. she also appeared in carry on at your convenience  carry on matron and carry on loving  as well as othe

In [9]:
# Preview the first five training rows.
train_df.head(n=5)


Unnamed: 0,text,label,label_text
0,wales want rugby league training wales could f...,2,sport
1,china aviation seeks rescue deal scandal-hit j...,1,business
2,rock band u2 break ticket record u2 have smash...,3,entertainment
3,markets signal brazilian recovery the brazilia...,1,business
4,tough rules for ringtone sellers firms that fl...,0,tech


In [10]:
# A more sophisticated example: TF-IDF over 1- to 3-grams, using the NLTK
# English stop-word list with "the" taken out of it (so "the" stays a token).
stop_words = [*stopwords.words('english')]
stop_words.remove("the")
trigram_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3), stop_words=stop_words, min_df=2, max_df=0.95
).fit(train_df['text'])
trigram_vectorizer

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,"['a', 'about', ...]"
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [14]:
# Map every (index, score) entry of sorted_vector to its feature name, keeping the TF-IDF order
def get_keyword_strings_all(vectorizer, sorted_vector):
    """Translate every entry of ``sorted_vector`` into its feature name.

    Args:
        vectorizer: object whose ``get_feature_names_out()`` returns a sequence
            in which position is the feature index and value is the feature name.
        sorted_vector: iterable of ``(index, score)`` tuples, already ordered.

    Returns:
        list of feature names, one per entry, in the order given.
    """
    feature_names = vectorizer.get_feature_names_out()
    return [feature_names[feature_index] for feature_index, _score in sorted_vector]

In [17]:
def get_keywords_complex(
    vectorizer, input_text, spacy_model, num_words=70):
    """Return top-scoring n-grams of ``input_text`` that are also noun chunks.

    The text is scored with ``vectorizer``; the ``num_words`` highest-scoring
    n-grams are taken as candidates, and a candidate is kept only if it equals
    (case-insensitively) the text of one of the noun chunks found by
    ``spacy_model``, is not made up purely of digits, and has not been kept
    already.

    Args:
        vectorizer: fitted vectorizer providing ``transform`` and
            ``get_feature_names_out``.
        input_text: the document to extract keywords from.
        spacy_model: callable that parses ``input_text`` into an object with a
            ``noun_chunks`` iterable whose items have a ``text`` attribute.
        num_words: how many of the top-scoring n-grams to consider (default
            70). Texts with fewer scored n-grams are handled: all of them are
            considered.

    Returns:
        list of keyword strings ordered by descending score; it may be
        shorter than ``num_words``.
    """
    doc = spacy_model(input_text)
    vector = vectorizer.transform([input_text])
    sorted_vector = sort_data_tfidf_score(vector.tocoo())
    ngrams = get_keyword_strings_all(vectorizer, sorted_vector)
    # A set gives constant-time membership checks for the loop below.
    noun_chunk_texts = {chunk.text.lower() for chunk in doc.noun_chunks}
    keywords = []
    # Slice rather than index: a text with fewer than num_words scored n-grams
    # used to raise IndexError. max(..., 0) keeps a negative num_words meaning
    # "no candidates" instead of slicing from the end.
    for keyword in ngrams[:max(num_words, 0)]:
        if keyword.isdigit() or keyword in keywords:
            continue
        if keyword.lower() in noun_chunk_texts:
            keywords.append(keyword)
    return keywords
    

In [18]:
# Run the noun-chunk-filtered extractor on the first article of the test split.
small_model = spacy.load("en_core_web_sm")
example = test_df["text"].iloc[0]
keywords_example2 = get_keywords_complex(
    trigram_vectorizer, example, spacy_model=small_model
)
print(keywords_example2)

['carry', 'films', 'stage', 'several years', 'saturday morning', 'star', 'film', 'london', 'beauty', 'the good', 'many years', 'directors']
