In [None]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import pandas as pd
import re
import spacy

### Load data

In [None]:
# Read the BBC news CSV (path is relative to the working directory) and
# preview the first rows.
bbc_news_df = pd.read_csv(filepath_or_buffer='bbc_news.csv')
bbc_news_df.head(n=5)

### Keep only the title column (the rest of the notebook works on titles only)

In [None]:
# Keep just the 'title' column as a one-column DataFrame; the preprocessing
# cells below add their derived columns to this frame.
titles_only_df = bbc_news_df['title'].to_frame()
titles_only_df.head(n=5)

### Lowercase

In [None]:
# Lowercase every title; the stop-word removal step reads this column.
lowercase_titles = titles_only_df['title'].str.lower()
titles_only_df['title_lowercase'] = lowercase_titles

### Stop word removal

In [None]:
en_stopwords = stopwords.words('english')
# Set copy for O(1) membership tests; `en_stopwords` itself stays a list so
# any code that treats it as a list keeps working.
en_stopwords_set = frozenset(en_stopwords)


def remove_stopwords(text):
    """Return `text` with English stop words removed.

    The text is split on whitespace, words found in the NLTK English
    stop-word list are dropped, and the remaining words are re-joined with
    single spaces. Matching is exact, so a word with punctuation attached
    (e.g. "work?") is never treated as a stop word.

    Non-string input (e.g. NaN for a missing title) yields an empty string
    instead of raising on `.split()`.
    """
    if not isinstance(text, str):
        return ''
    return ' '.join(word for word in text.split() if word not in en_stopwords_set)


titles_only_df['title_no_stopwords'] = titles_only_df['title_lowercase'].apply(remove_stopwords)

# Preview only the first rows, like the other cells, rather than the whole frame.
titles_only_df.head()

In [None]:
# Punctuation removal: delete every character that is neither a word
# character nor whitespace.
# The vectorised `.str.replace` works on the column directly instead of a
# Python-level row-by-row `DataFrame.apply(axis=1)`, and it leaves missing
# values as NaN rather than raising. The pattern is pre-compiled so Python's
# `re` semantics (Unicode-aware `\w`) are used.
punctuation_pattern = re.compile(r"([^\w\s])")
titles_only_df['title_no_punc'] = titles_only_df['title_no_stopwords'].str.replace(
    punctuation_pattern, "", regex=True
)
titles_only_df.head()

### Tokenise

In [10]:
def tokenizing(word):
    """Split a piece of text into tokens with NLTK's `word_tokenize`.

    `None` and any float (NaN, pandas' marker for a missing value, is a
    float) produce an empty list. Every other input is converted with
    `str()` before being tokenised.
    """
    is_missing = word is None or isinstance(word, float)
    return [] if is_missing else word_tokenize(str(word))

# Tokenise both the cleaned titles and the untouched originals. The raw
# tokens keep the original casing, stop words and punctuation.
tokenise_targets = (
    ('title_no_punc', 'title_tokenised'),
    ('title', 'title_raw_tokenised'),
)
for source_column, token_column in tokenise_targets:
    titles_only_df[token_column] = titles_only_df[source_column].apply(tokenizing)
titles_only_df.head()

Unnamed: 0,title,title_lowercase,title_no_stopwords,title_no_punc,title_tokenised,title_lemmatised,title_raw_tokenised
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work,"[refuse, work]","[refuse, work]","[Can, I, refuse, to, work, ?]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"[liz, truss, brief, world, reacts, uk, politic...","[liz, truss, brief, world, reacts, uk, politic...","['Liz, Truss, the, Brief, ?, ', World, reacts,..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[rationing, energy, nothing, new, offgrid, com...","[rationing, energy, nothing, new, offgrid, com...","[Rationing, energy, is, nothing, new, for, off..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[hunt, superyachts, sanctioned, russian, oliga...","[hunt, superyachts, sanctioned, russian, oliga...","[The, hunt, for, superyachts, of, sanctioned, ..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[platinum, jubilee, 70, years, queen, 70, seco...","[platinum, jubilee, 70, year, queen, 70, second]","[Platinum, Jubilee, :, 70, years, of, the, Que..."


### Lemmatization

In [9]:
lemmatizer = WordNetLemmatizer()


def lemmatization(word, pos='n'):
    """Lemmatise every token in a list with the WordNet lemmatiser.

    Args:
        word: iterable of token strings (one tokenised title). The parameter
            name is kept for backward compatibility even though it holds a
            whole list of tokens.
        pos: WordNet part-of-speech code forwarded to `lemmatize`
            (e.g. 'n' noun, 'v' verb, 'a' adjective, 'r' adverb).
            Defaults to 'n', which is what `lemmatize` itself uses when no
            POS is given, so existing calls behave exactly as before.

    Returns:
        A list of lemmas in the same order as the input tokens.

    Note:
        With the default noun POS, verb forms are left unchanged (e.g.
        "reacts" stays "reacts"); pass `pos='v'` to lemmatise verbs.
    """
    return [lemmatizer.lemmatize(token, pos) for token in word]

# Lemmatise the cleaned tokens of every title and store them alongside.
lemmatised_titles = titles_only_df['title_tokenised'].apply(lemmatization)
titles_only_df['title_lemmatised'] = lemmatised_titles
titles_only_df.head()

Unnamed: 0,title,title_lowercase,title_no_stopwords,title_no_punc,title_tokenised,title_lemmatised
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work,"[refuse, work]","[refuse, work]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"[liz, truss, brief, world, reacts, uk, politic...","[liz, truss, brief, world, reacts, uk, politic..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[rationing, energy, nothing, new, offgrid, com...","[rationing, energy, nothing, new, offgrid, com..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[hunt, superyachts, sanctioned, russian, oliga...","[hunt, superyachts, sanctioned, russian, oliga..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[platinum, jubilee, 70, years, queen, 70, seco...","[platinum, jubilee, 70, year, queen, 70, second]"


In [11]:
# Flatten the per-title token lists into one flat list each.
# A nested comprehension runs in linear time, whereas `sum(lists, [])`
# builds a new, ever-longer list for every title (quadratic in total tokens).
tokens_raw_list = [
    token
    for title_tokens in titles_only_df['title_raw_tokenised']
    for token in title_tokens
]
tokens_clean_list = [
    token
    for title_tokens in titles_only_df['title_tokenised']
    for token in title_tokens
]

### POS Tagging

In [12]:
# Load the spaCy pipeline used below for POS tagging and entity recognition.
spacy_model_name = 'en_core_web_sm'
nlp = spacy.load(spacy_model_name)

In [13]:
# Run the spaCy pipeline once over all raw tokens joined by single spaces.
# NOTE(review): every title ends up in one long text, so title boundaries are
# lost; this may affect tagging/NER quality near the joins - confirm whether
# per-title processing is wanted.
raw_corpus_text = ' '.join(tokens_raw_list)
spacy_doc = nlp(raw_corpus_text)

In [14]:
# Extract each token and its coarse POS tag into a dataframe.
# The records are collected first and the frame is built once: calling
# `pd.concat` per token copies the whole frame on every iteration (quadratic).
# `columns=` keeps the two expected columns even if the doc has no tokens.
pos_df = pd.DataFrame(
    [{'token': token.text, 'pos_tag': token.pos_} for token in spacy_doc],
    columns=['token', 'pos_tag'],
)

In [15]:
# Display the token / POS-tag table (one row per spaCy token).
pos_df

Unnamed: 0,token,pos_tag
0,Can,AUX
1,I,PRON
2,refuse,VERB
3,to,PART
4,work,VERB
...,...,...
11742,sale,NOUN
11743,scams,NOUN
11744,",",PUNCT
11745,consumers,NOUN


In [16]:
# token frequency count
pos_df_counts = pos_df.groupby(['token','pos_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)
pos_df_counts.head(10)

Unnamed: 0,token,pos_tag,counts
95,:,PUNCT,543
8,',PUNCT,300
2897,in,ADP,187
4082,to,PART,175
3268,of,ADP,172
22,-,PUNCT,166
4043,the,DET,163
1856,and,CCONJ,147
15,'s,PART,143
97,?,PUNCT,130


In [17]:
# see most common nouns
nouns = pos_df_counts[pos_df_counts.pos_tag == "NOUN"][0:10]
nouns

Unnamed: 0,token,pos_tag,counts
4267,war,NOUN,35
3552,record,NOUN,15
3416,police,NOUN,14
4356,year,NOUN,14
4316,win,NOUN,14
3061,living,NOUN,13
4009,tax,NOUN,13
2326,day,NOUN,12
3368,people,NOUN,12
2031,boss,NOUN,11


### Named entity recognition (NER)

In [18]:
# Extract every named-entity span and its label into a dataframe.
# The records are collected first and the frame is built once: calling
# `pd.concat` per entity copies the whole frame on every iteration (quadratic).
# The former `pd.isna(label) is False` guard is dropped: `label_` is a string,
# so the check never filtered anything, and an identity test against `False`
# is fragile. `columns=` keeps the expected columns when no entities are found.
ner_df = pd.DataFrame(
    [{'token': entity.text, 'ner_tag': entity.label_} for entity in spacy_doc.ents],
    columns=['token', 'ner_tag'],
)

In [19]:
# Display the entity table (one row per entity span found by spaCy).
ner_df

Unnamed: 0,token,ner_tag
0,Liz Truss,PERSON
1,UK,GPE
2,Rationing,PRODUCT
3,superyachts,CARDINAL
4,Russian,NORP
...,...,...
1665,Frenkie de Jong,PERSON
1666,Manchester United,PERSON
1667,Barcelona,GPE
1668,Dominic Raab,PERSON


In [20]:
# token frequency count
ner_df_counts = ner_df.groupby(['token','ner_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)
ner_df_counts.head(10)

Unnamed: 0,token,ner_tag,counts
965,Ukraine,GPE,47
955,UK,GPE,36
329,England,GPE,32
819,Russian,NORP,20
957,US,GPE,19
1031,World Cup 2022,EVENT,18
1058,first,ORDINAL,13
918,The Papers,WORK_OF_ART,13
378,France,GPE,12
226,China,GPE,11
