# NLP Module 22 - Practical Exercise
## Ingest, cleanse, and analyze BBC news

### Tokenization

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
import pandas as pd

In [12]:
data = pd.read_csv('resources/bbc_news.csv')
data.info()
data.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1000 non-null   int64 
 1   index        1000 non-null   int64 
 2   title        1000 non-null   object
 3   pubDate      1000 non-null   object
 4   guid         1000 non-null   object
 5   link         1000 non-null   object
 6   description  1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


Unnamed: 0.1,Unnamed: 0,index,title,pubDate,guid,link,description
0,0,6684,Can I refuse to work?,"Wed, 10 Aug 2022 15:46:18 GMT",https://www.bbc.co.uk/news/business-62147992,https://www.bbc.co.uk/news/business-62147992?a...,With much of the UK enduring another period of...
1,1,9267,'Liz Truss the Brief?' World reacts to UK poli...,"Mon, 17 Oct 2022 11:35:12 GMT",https://www.bbc.co.uk/news/world-63285480,https://www.bbc.co.uk/news/world-63285480?at_m...,The UK's political chaos has been watched arou...
2,2,7387,Rationing energy is nothing new for off-grid c...,"Wed, 31 Aug 2022 05:20:18 GMT",https://www.bbc.co.uk/news/uk-scotland-highlan...,https://www.bbc.co.uk/news/uk-scotland-highlan...,Scoraig in the north west Highlands has long h...
3,3,767,The hunt for superyachts of sanctioned Russian...,"Tue, 22 Mar 2022 14:37:01 GMT",https://www.bbc.co.uk/news/60739336,https://www.bbc.co.uk/news/60739336?at_medium=...,"Wealthy Russians sanctioned by the US, EU and ..."
4,4,3712,Platinum Jubilee: 70 years of the Queen in 70 ...,"Wed, 01 Jun 2022 23:17:33 GMT",https://www.bbc.co.uk/news/uk-61660128,https://www.bbc.co.uk/news/uk-61660128?at_medi...,A quick look back at the Queen's 70 years on t...
5,5,8967,Red Bull found guilty of breaking Formula 1's ...,"Mon, 10 Oct 2022 15:23:47 GMT",https://www.bbc.co.uk/sport/formula1/63204082,https://www.bbc.co.uk/sport/formula1/63204082?...,Red Bull are found guilty of breaking Formula ...
6,6,10858,World Triathlon Championship Series: Flora Duf...,"Fri, 25 Nov 2022 14:22:09 GMT",https://www.bbc.co.uk/sport/triathlon/63756049,https://www.bbc.co.uk/sport/triathlon/63756049...,Bermuda's Flora Duffy gets the better of Brita...
7,7,14914,Terry Hall: Coventry scooter ride-out pays tri...,"Sun, 19 Mar 2023 16:42:11 GMT",https://www.bbc.co.uk/news/uk-england-coventry...,https://www.bbc.co.uk/news/uk-england-coventry...,Hundreds of people ride through Coventry on wh...
8,8,8934,Post Office and Fujitsu to face inquiry over H...,"Mon, 10 Oct 2022 21:00:45 GMT",https://www.bbc.co.uk/news/business-63205335,https://www.bbc.co.uk/news/business-63205335?a...,The public inquiry is taking further evidence ...
9,9,4879,'Pavement parking frightens me',"Fri, 01 Jul 2022 23:02:09 GMT",https://www.bbc.co.uk/news/uk-england-nottingh...,https://www.bbc.co.uk/news/uk-england-nottingh...,"Ted, who is blind, hopes people will reconside..."


In [13]:
# cleanse 1. lowercase 2. remove stopwords 3. remove punct
data['title_lowercase'] = data['title'].apply(lambda x: x.lower())
data['title_lowercase'][:5]

0                                can i refuse to work?
1    'liz truss the brief?' world reacts to uk poli...
2    rationing energy is nothing new for off-grid c...
3    the hunt for superyachts of sanctioned russian...
4    platinum jubilee: 70 years of the queen in 70 ...
Name: title_lowercase, dtype: object

In [38]:
stop_words = set(stopwords.words('english'))
stop_words.remove('not')
data['title_lowercase_no_stopwords'] = data['title_lowercase'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))
data['title_lowercase_no_stopwords'][:10]

0                                         refuse work?
1    'liz truss brief?' world reacts uk political t...
2      rationing energy nothing new off-grid community
3        hunt superyachts sanctioned russian oligarchs
4          platinum jubilee: 70 years queen 70 seconds
5    red bull found guilty breaking formula 1's bud...
6    world triathlon championship series: flora duf...
7    terry hall: coventry scooter ride-out pays tri...
8     post office fujitsu face inquiry horizon scandal
9                      'pavement parking frightens me'
Name: title_lowercase_no_stopwords, dtype: object

In [18]:
data['title_lowercase_no_punct'] = data['title_lowercase'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
data['title_lowercase_no_punct'][:5]

0                                 can i refuse to work
1    liz truss the brief world reacts to uk politic...
2    rationing energy is nothing new for offgrid co...
3    the hunt for superyachts of sanctioned russian...
4    platinum jubilee 70 years of the queen in 70 s...
Name: title_lowercase_no_punct, dtype: object

In [19]:
# tokenization
data['tokenized'] = data.apply(lambda row: word_tokenize(row['title_lowercase_no_punct']), axis=1)
data['tokenized'][:5]

0                           [can, i, refuse, to, work]
1    [liz, truss, the, brief, world, reacts, to, uk...
2    [rationing, energy, is, nothing, new, for, off...
3    [the, hunt, for, superyachts, of, sanctioned, ...
4    [platinum, jubilee, 70, years, of, the, queen,...
Name: tokenized, dtype: object

In [21]:
# lemmartization
wl = WordNetLemmatizer()
data['lemmatized'] = data['tokenized'].apply(lambda tokens: [wl.lemmatize(token) for token in tokens])
data['lemmatized'][:5]

0                           [can, i, refuse, to, work]
1    [liz, truss, the, brief, world, reacts, to, uk...
2    [rationing, energy, is, nothing, new, for, off...
3    [the, hunt, for, superyachts, of, sanctioned, ...
4    [platinum, jubilee, 70, year, of, the, queen, ...
Name: lemmatized, dtype: object

In [22]:
# extract all lemmars into a list
lemmars_clean = sum(data['lemmatized'], [])
print(lemmars_clean[:50])

['can', 'i', 'refuse', 'to', 'work', 'liz', 'truss', 'the', 'brief', 'world', 'reacts', 'to', 'uk', 'political', 'turmoil', 'rationing', 'energy', 'is', 'nothing', 'new', 'for', 'offgrid', 'community', 'the', 'hunt', 'for', 'superyachts', 'of', 'sanctioned', 'russian', 'oligarch', 'platinum', 'jubilee', '70', 'year', 'of', 'the', 'queen', 'in', '70', 'second', 'red', 'bull', 'found', 'guilty', 'of', 'breaking', 'formula', '1', 'budget']


In [27]:
# statistics
unigram = pd.Series(nltk.ngrams(lemmars_clean, 1)).value_counts()
bigram = pd.Series(nltk.ngrams(lemmars_clean, 2)).value_counts()
trigram = pd.Series(nltk.ngrams(lemmars_clean, 3)).value_counts()
#print(unigram[:10])
#print(bigram[:10])
print(trigram[:10])

(world, cup, 2022)       21
(cost, of, living)       15
(world, cup, england)     9
(what, is, the)           7
(of, the, week)           7
(quiz, of, the)           5
(week, in, picture)       5
(england, v, south)       5
(v, south, africa)        5
(t20, world, cup)         5
Name: count, dtype: int64


### Tagging

In [28]:
import spacy

In [29]:
nlp = spacy.load('en_core_web_sm')

In [36]:
spacy_doc = nlp(' '.join(lemmars_clean))
print(spacy_doc)



In [37]:
pos_df = pd.DataFrame(columns=['token', 'pos_tag'])

In [43]:
for token in spacy_doc:
    pos_df = pd.concat([pos_df, pd.DataFrame.from_records([{'token': token.text, 'pos_tag': token.pos_}])], ignore_index=True)
pos_df.head(10)

Unnamed: 0,token,pos_tag
0,can,AUX
1,i,PRON
2,refuse,VERB
3,to,PART
4,work,VERB
5,liz,PROPN
6,truss,VERB
7,the,DET
8,brief,ADJ
9,world,NOUN


In [None]:
# counts
count_b

In [44]:
from spacy import displacy

In [45]:
displacy.render(spacy_doc, style='ent', jupyter=True)