In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import re
import pandas as pd
import matplotlib as plt

nltk.download('stopwords')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Load Data

In [5]:
bbc_data=pd.read_csv('/content/bbc_news.csv')
bbc_data.head()

Unnamed: 0.1,Unnamed: 0,index,title,pubDate,guid,link,description
0,0,6684,Can I refuse to work?,"Wed, 10 Aug 2022 15:46:18 GMT",https://www.bbc.co.uk/news/business-62147992,https://www.bbc.co.uk/news/business-62147992?a...,With much of the UK enduring another period of...
1,1,9267,'Liz Truss the Brief?' World reacts to UK poli...,"Mon, 17 Oct 2022 11:35:12 GMT",https://www.bbc.co.uk/news/world-63285480,https://www.bbc.co.uk/news/world-63285480?at_m...,The UK's political chaos has been watched arou...
2,2,7387,Rationing energy is nothing new for off-grid c...,"Wed, 31 Aug 2022 05:20:18 GMT",https://www.bbc.co.uk/news/uk-scotland-highlan...,https://www.bbc.co.uk/news/uk-scotland-highlan...,Scoraig in the north west Highlands has long h...
3,3,767,The hunt for superyachts of sanctioned Russian...,"Tue, 22 Mar 2022 14:37:01 GMT",https://www.bbc.co.uk/news/60739336,https://www.bbc.co.uk/news/60739336?at_medium=...,"Wealthy Russians sanctioned by the US, EU and ..."
4,4,3712,Platinum Jubilee: 70 years of the Queen in 70 ...,"Wed, 01 Jun 2022 23:17:33 GMT",https://www.bbc.co.uk/news/uk-61660128,https://www.bbc.co.uk/news/uk-61660128?at_medi...,A quick look back at the Queen's 70 years on t...


In [7]:
bbc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1000 non-null   int64 
 1   index        1000 non-null   int64 
 2   title        1000 non-null   object
 3   pubDate      1000 non-null   object
 4   guid         1000 non-null   object
 5   link         1000 non-null   object
 6   description  1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [8]:
titles=pd.DataFrame(bbc_data['title'])
titles.head()

Unnamed: 0,title
0,Can I refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...
2,Rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...


# Clean Data


In [9]:
# Lower Case
titles['title']=titles['title'].str.lower()

In [10]:
# Stop word removal
en_stopwords=stopwords.words('english')
titles['no_stop_words']=titles['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (en_stopwords)]))

In [17]:
# Punctation removal
titles['no_stopwords_no_punct']=titles.apply(lambda x: re.sub(r'[^\w\s]','',x['no_stop_words']), axis=1)

In [18]:
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [19]:
# Tokenization
titles['tokens_raw']=titles.apply(lambda x: word_tokenize(x['no_stopwords_no_punct']), axis=1)
titles['tokens_clean']=titles.apply(lambda x: word_tokenize(x['no_stopwords_no_punct']), axis=1)

In [20]:
# Lemmatization
nltk.download('wordnet')
lemmatizer=WordNetLemmatizer()
titles['tokens_clean_lammatized']=titles['tokens_clean'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
titles.head(10)

Unnamed: 0,title,no_stop_words,no_stopwords_no_punct,tokens_raw,tokens_clean,tokens_clean_lammatized
0,can i refuse to work?,refuse work?,refuse work,"[refuse, work]","[refuse, work]","[refuse, work]"
1,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"[liz, truss, brief, world, reacts, uk, politic...","[liz, truss, brief, world, reacts, uk, politic...","[liz, truss, brief, world, reacts, uk, politic..."
2,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[rationing, energy, nothing, new, offgrid, com...","[rationing, energy, nothing, new, offgrid, com...","[rationing, energy, nothing, new, offgrid, com..."
3,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[hunt, superyachts, sanctioned, russian, oliga...","[hunt, superyachts, sanctioned, russian, oliga...","[hunt, superyachts, sanctioned, russian, oliga..."
4,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[platinum, jubilee, 70, years, queen, 70, seco...","[platinum, jubilee, 70, years, queen, 70, seco...","[platinum, jubilee, 70, year, queen, 70, second]"
5,red bull found guilty of breaking formula 1's ...,red bull found guilty breaking formula 1's bud...,red bull found guilty breaking formula 1s budg...,"[red, bull, found, guilty, breaking, formula, ...","[red, bull, found, guilty, breaking, formula, ...","[red, bull, found, guilty, breaking, formula, ..."
6,world triathlon championship series: flora duf...,world triathlon championship series: flora duf...,world triathlon championship series flora duff...,"[world, triathlon, championship, series, flora...","[world, triathlon, championship, series, flora...","[world, triathlon, championship, series, flora..."
7,terry hall: coventry scooter ride-out pays tri...,terry hall: coventry scooter ride-out pays tri...,terry hall coventry scooter rideout pays tribu...,"[terry, hall, coventry, scooter, rideout, pays...","[terry, hall, coventry, scooter, rideout, pays...","[terry, hall, coventry, scooter, rideout, pay,..."
8,post office and fujitsu to face inquiry over h...,post office fujitsu face inquiry horizon scandal,post office fujitsu face inquiry horizon scandal,"[post, office, fujitsu, face, inquiry, horizon...","[post, office, fujitsu, face, inquiry, horizon...","[post, office, fujitsu, face, inquiry, horizon..."
9,'pavement parking frightens me','pavement parking frightens me',pavement parking frightens me,"[pavement, parking, frightens, me]","[pavement, parking, frightens, me]","[pavement, parking, frightens, me]"


In [22]:
# Create list for just our tokens
tokens_raw_list= sum (titles['tokens_raw'], [])
tokens_clean_list= sum (titles['tokens_clean'], [])

# POS Tagging

In [28]:
nlp=spacy.load('en_core_web_sm')

In [29]:
spacy_doc=nlp(' '.join(tokens_raw_list))

In [30]:
pos_df=pd.DataFrame(columns=['token','pos_tag'])

In [31]:
for token in spacy_doc:
  pos_df= pd.concat([pos_df,
                     pd.DataFrame.from_records([{'token':token.text,
                                                 'pos_tag':token.pos_}])],ignore_index=True)

In [32]:
pos_df_counts=pos_df.groupby(['token','pos_tag']).size().reset_index(name='count').sort_values(by='count',ascending=False)
pos_df_counts.head(10)

Unnamed: 0,token,pos_tag,count
31,2022,NUM,47
1237,england,PROPN,40
935,cup,PROPN,36
3946,uk,PROPN,33
4096,war,NOUN,32
2544,new,ADJ,32
3262,says,VERB,30
4210,world,NOUN,28
3951,ukraine,VERB,28
4211,world,PROPN,26


In [33]:
nouns=pos_df_counts[pos_df_counts['pos_tag']=='NOUN'][0:10]
nouns

Unnamed: 0,token,pos_tag,count
4096,war,NOUN,32
4210,world,NOUN,28
2275,man,NOUN,23
2707,papers,NOUN,18
1233,energy,NOUN,17
3049,record,NOUN,17
2845,police,NOUN,16
971,day,NOUN,15
4128,week,NOUN,15
934,cup,NOUN,14


# NER

In [35]:
ner_df=pd.DataFrame(columns=['token','ner_tag'])
for token in spacy_doc.ents:
  if pd.isna(token.label_) is False:
    ner_df= pd.concat([ner_df,
                     pd.DataFrame.from_records([{'token':token.text,
                                                 'ner_tag':token.label_}])],ignore_index=True)

In [36]:
ner_df.head()

Unnamed: 0,token,ner_tag
0,russian,NORP
1,70 years,DATE
2,70 seconds,TIME
3,bull,ORG
4,1s,CARDINAL


In [37]:
ner_df_counts=ner_df.groupby(['token','ner_tag']).size().reset_index(name='count').sort_values(by='count',ascending=False)

In [38]:
ner_df_counts.head(10)

Unnamed: 0,token,ner_tag,count
34,2022,CARDINAL,30
427,russian,NORP,22
219,first,ORDINAL,15
426,russia,GPE,10
35,2022,DATE,10
207,england,GPE,10
498,uk,GPE,10
154,china,GPE,9
227,france,GPE,9
490,tory,NORP,9


In [39]:
people = ner_df_counts[ner_df_counts['ner_tag']=='PERSON'][0:10]
people

Unnamed: 0,token,ner_tag,count
405,putin,PERSON,5
129,boris johnson,PERSON,5
102,andy murray,PERSON,3
494,tyre nichols,PERSON,2
242,harry kane,PERSON,2
243,harry meghan,PERSON,2
246,hodgkinson,PERSON,2
241,harry,PERSON,2
283,john caldwell,PERSON,2
297,jurgen klopp,PERSON,2
