In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import re
import pandas as pd

In [2]:
# Load the news articles from a CSV file; the relative path is resolved
# against the notebook's current working directory.
data = pd.read_csv('true_news.csv')

In [3]:
# Preview the first ten rows to see which columns are available.
data.head(10)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017"
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,"December 29, 2017"
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,"December 28, 2017"


In [4]:
# Summarise column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


We are going to create a new DataFrame that contains only the news titles.


In [5]:
# Single-column working frame holding only the headlines; every later
# preprocessing step adds its result to this frame as a new column.
titles = data['title'].to_frame()

In [6]:
# Check that the new frame contains just the `title` column.
titles.head(10)

Unnamed: 0,title
0,"As U.S. budget fight looms, Republicans flip t..."
1,U.S. military to accept transgender recruits o...
2,Senior U.S. Republican senator: 'Let Mr. Muell...
3,FBI Russia probe helped by Australian diplomat...
4,Trump wants Postal Service to charge 'much mor...
5,"White House, Congress prepare for talks on spe..."
6,"Trump says Russia probe will be fair, but time..."
7,Factbox: Trump on Twitter (Dec 29) - Approval ...
8,Trump on Twitter (Dec 28) - Global Warming
9,Alabama official to certify Senator-elect Jone...


Now, let's change everything to lowercase

In [10]:
# Store a lowercased copy of each title in its own column; the original
# `title` column is left untouched.
titles['lowercase'] = titles['title'].str.lower()

In [11]:
# Inspect the first ten lowercased titles.
titles['lowercase'].head(10)

0    as u.s. budget fight looms, republicans flip t...
1    u.s. military to accept transgender recruits o...
2    senior u.s. republican senator: 'let mr. muell...
3    fbi russia probe helped by australian diplomat...
4    trump wants postal service to charge 'much mor...
5    white house, congress prepare for talks on spe...
6    trump says russia probe will be fair, but time...
7    factbox: trump on twitter (dec 29) - approval ...
8           trump on twitter (dec 28) - global warming
9    alabama official to certify senator-elect jone...
Name: lowercase, dtype: object

Now, for better understanding, we are going to remove the stopwords

In [13]:
# `en_stopwords` stays a plain list (as returned by NLTK) for any other cell
# that uses it; the frozenset copy gives constant-time membership checks
# instead of scanning the whole list once per word.
en_stopwords = stopwords.words('english')
_stopword_lookup = frozenset(en_stopwords)


def _drop_stopwords(text):
    """Return ``text`` with English stopwords removed.

    The text is split on whitespace and each piece is compared to the
    stopword list exactly as it appears, so a stopword with punctuation
    attached (for example ``"to,"``) is kept. Remaining words are joined
    with single spaces.
    """
    return ' '.join(word for word in text.split() if word not in _stopword_lookup)


titles['no_stopwords'] = titles['lowercase'].apply(_drop_stopwords)

In [14]:
# Compare the lowercased titles with their stopword-free versions.
titles.head()

Unnamed: 0,title,lowercase,no_stopwords
0,"As U.S. budget fight looms, Republicans flip t...","as u.s. budget fight looms, republicans flip t...","u.s. budget fight looms, republicans flip fisc..."
1,U.S. military to accept transgender recruits o...,u.s. military to accept transgender recruits o...,u.s. military accept transgender recruits mond...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,senior u.s. republican senator: 'let mr. muell...,senior u.s. republican senator: 'let mr. muell...
3,FBI Russia probe helped by Australian diplomat...,fbi russia probe helped by australian diplomat...,fbi russia probe helped australian diplomat ti...
4,Trump wants Postal Service to charge 'much mor...,trump wants postal service to charge 'much mor...,trump wants postal service charge 'much more' ...


We also remove punctuation

In [15]:
# Strip every character that is neither a word character nor whitespace.
# The transform only needs the `no_stopwords` column, so it is applied to that
# Series directly (instead of row-by-row over the whole frame with axis=1) and
# the pattern is compiled once up front.
# NOTE(review): dots and hyphens inside a word are removed without inserting a
# space, so "u.s." becomes "us" - confirm this is acceptable downstream.
punctuation_pattern = re.compile(r"[^\w\s]")
titles['no_stopwords_no_punctuation'] = titles['no_stopwords'].apply(
    lambda text: punctuation_pattern.sub("", text)
)

In [16]:
# Check the punctuation-free column next to the earlier steps.
titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punctuation
0,"As U.S. budget fight looms, Republicans flip t...","as u.s. budget fight looms, republicans flip t...","u.s. budget fight looms, republicans flip fisc...",us budget fight looms republicans flip fiscal ...
1,U.S. military to accept transgender recruits o...,u.s. military to accept transgender recruits o...,u.s. military accept transgender recruits mond...,us military accept transgender recruits monday...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,senior u.s. republican senator: 'let mr. muell...,senior u.s. republican senator: 'let mr. muell...,senior us republican senator let mr mueller job
3,FBI Russia probe helped by Australian diplomat...,fbi russia probe helped by australian diplomat...,fbi russia probe helped australian diplomat ti...,fbi russia probe helped australian diplomat ti...
4,Trump wants Postal Service to charge 'much mor...,trump wants postal service to charge 'much mor...,trump wants postal service charge 'much more' ...,trump wants postal service charge much more am...


Now, we can tokenize

In [18]:
# Tokenise both the untouched titles and the cleaned titles. `word_tokenize`
# is applied to each source column directly; a row-wise `titles.apply(..., axis=1)`
# builds a full row object per title just to read one field from it.
titles['tokens_raw'] = titles['title'].apply(word_tokenize)
titles['tokens_clean'] = titles['no_stopwords_no_punctuation'].apply(word_tokenize)

In [19]:
# Inspect the raw and clean token lists.
titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punctuation,tokens_raw,tokens_clean
0,"As U.S. budget fight looms, Republicans flip t...","as u.s. budget fight looms, republicans flip t...","u.s. budget fight looms, republicans flip fisc...",us budget fight looms republicans flip fiscal ...,"[As, U.S., budget, fight, looms, ,, Republican...","[us, budget, fight, looms, republicans, flip, ..."
1,U.S. military to accept transgender recruits o...,u.s. military to accept transgender recruits o...,u.s. military accept transgender recruits mond...,us military accept transgender recruits monday...,"[U.S., military, to, accept, transgender, recr...","[us, military, accept, transgender, recruits, ..."
2,Senior U.S. Republican senator: 'Let Mr. Muell...,senior u.s. republican senator: 'let mr. muell...,senior u.s. republican senator: 'let mr. muell...,senior us republican senator let mr mueller job,"[Senior, U.S., Republican, senator, :, 'Let, M...","[senior, us, republican, senator, let, mr, mue..."
3,FBI Russia probe helped by Australian diplomat...,fbi russia probe helped by australian diplomat...,fbi russia probe helped australian diplomat ti...,fbi russia probe helped australian diplomat ti...,"[FBI, Russia, probe, helped, by, Australian, d...","[fbi, russia, probe, helped, australian, diplo..."
4,Trump wants Postal Service to charge 'much mor...,trump wants postal service to charge 'much mor...,trump wants postal service charge 'much more' ...,trump wants postal service charge much more am...,"[Trump, wants, Postal, Service, to, charge, 'm...","[trump, wants, postal, service, charge, much, ..."


We now lemmatize the tokens

In [20]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return a new list holding the lemma of each token, in the same order.

    `lemmatize` is called with the token only (no part-of-speech argument).
    NOTE(review): in the displayed output the token "us" (from "u.s.") comes
    back as "u" - confirm whether that is acceptable.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas


titles['tokens_clean_lemmatized'] = titles['tokens_clean'].apply(_lemmatize_tokens)

In [21]:
# Compare the clean tokens with their lemmatized forms.
titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punctuation,tokens_raw,tokens_clean,tokens_clean_lemmatized
0,"As U.S. budget fight looms, Republicans flip t...","as u.s. budget fight looms, republicans flip t...","u.s. budget fight looms, republicans flip fisc...",us budget fight looms republicans flip fiscal ...,"[As, U.S., budget, fight, looms, ,, Republican...","[us, budget, fight, looms, republicans, flip, ...","[u, budget, fight, loom, republican, flip, fis..."
1,U.S. military to accept transgender recruits o...,u.s. military to accept transgender recruits o...,u.s. military accept transgender recruits mond...,us military accept transgender recruits monday...,"[U.S., military, to, accept, transgender, recr...","[us, military, accept, transgender, recruits, ...","[u, military, accept, transgender, recruit, mo..."
2,Senior U.S. Republican senator: 'Let Mr. Muell...,senior u.s. republican senator: 'let mr. muell...,senior u.s. republican senator: 'let mr. muell...,senior us republican senator let mr mueller job,"[Senior, U.S., Republican, senator, :, 'Let, M...","[senior, us, republican, senator, let, mr, mue...","[senior, u, republican, senator, let, mr, muel..."
3,FBI Russia probe helped by Australian diplomat...,fbi russia probe helped by australian diplomat...,fbi russia probe helped australian diplomat ti...,fbi russia probe helped australian diplomat ti...,"[FBI, Russia, probe, helped, by, Australian, d...","[fbi, russia, probe, helped, australian, diplo...","[fbi, russia, probe, helped, australian, diplo..."
4,Trump wants Postal Service to charge 'much mor...,trump wants postal service to charge 'much mor...,trump wants postal service charge 'much more' ...,trump wants postal service charge much more am...,"[Trump, wants, Postal, Service, to, charge, 'm...","[trump, wants, postal, service, charge, much, ...","[trump, want, postal, service, charge, much, m..."


Now, we can create lists of the tokens

In [22]:
def _flatten(token_lists):
    """Concatenate an iterable of token lists into one flat list, keeping order."""
    flat = []
    for doc_tokens in token_lists:
        flat.extend(doc_tokens)
    return flat


# One flat list per tokenisation variant, covering every title in order.
tokens_raw_list = _flatten(titles['tokens_raw'])
tokens_clean_list = _flatten(titles['tokens_clean'])

## POS Tagging

In [23]:
# Load the spaCy pipeline named 'en_core_web_sm'; the model package must
# already be installed in this environment.
npl = spacy.load('en_core_web_sm')

Here we have to raise the `max_length` property of the spaCy pipeline (`npl`), because the text we are about to process is longer than the default limit of 1,000,000 characters.

In [28]:
# Raise the pipeline's text-length limit above its default.
# NOTE(review): 1,500,000 is a hard-coded value, presumably sized for this
# dataset - a larger input may need a higher limit.
npl.max_length = 1500000

In [30]:
# Join every raw token from every title into one space-separated string and
# run the whole pipeline over it as a single document.
# NOTE(review): title boundaries are lost in the joined text, so tagging and
# entity detection may span adjacent titles - confirm this is intended.
spacy_doc = npl(' '.join(tokens_raw_list)) #this could take a while if the list is large like this one

In [32]:
# Start from an empty frame with the two columns used for POS results.
pos_df = pd.DataFrame(
    data=None,
    columns=['token', 'pos_tag'],
)

In [33]:
# Collect one (text, POS tag) record per spaCy token and build the frame in a
# single call. Growing `pos_df` with pd.concat inside a loop copies the whole
# frame on every iteration (quadratic time) and appends duplicate rows whenever
# the cell is re-run; assigning a fresh frame keeps the cell idempotent.
pos_records = [(token.text, token.pos_) for token in spacy_doc]
pos_df = pd.DataFrame(pos_records, columns=['token', 'pos_tag'])

Now we count how often each word appears with each part-of-speech tag, i.e. its grammatical function.

In [36]:
# Frequency of every (token, POS tag) pair, most frequent pairs first.
pos_df_counts = (
    pos_df
    .groupby(['token', 'pos_tag'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
pos_df_counts.head()

Unnamed: 0,token,pos_tag,counts
17233,to,PART,5746
25,",",PUNCT,5287
497,:,PUNCT,5164
11552,in,ADP,4483
5557,Trump,PROPN,4409


Now that we have the counts, we can filter the words by their function in the sentence.

In [37]:
# Ten most frequent tokens tagged as nouns (the counts frame is already
# sorted by descending count).
nouns = pos_df_counts.loc[pos_df_counts['pos_tag'] == 'NOUN'].head(10)
nouns

Unnamed: 0,token,pos_tag,counts
5555,Trump,NOUN,723
7084,bill,NOUN,653
16976,tax,NOUN,594
9658,election,NOUN,560
8723,deal,NOUN,504
8471,court,NOUN,461
16932,talks,NOUN,444
12147,leader,NOUN,408
10875,government,NOUN,405
17969,vote,NOUN,394


In [38]:
# Ten most frequent tokens tagged as verbs (the counts frame is already
# sorted by descending count).
verbs = pos_df_counts.loc[pos_df_counts['pos_tag'] == 'VERB'].head(10)
verbs

Unnamed: 0,token,pos_tag,counts
15531,says,VERB,2960
7519,calls,VERB,364
5558,Trump,VERB,284
17808,urges,VERB,279
12638,meet,VERB,240
15526,say,VERB,239
15678,seeks,VERB,207
18025,wants,VERB,202
11997,killed,VERB,195
17025,tells,VERB,190


## NER

Here we extract the named entities in the text together with the category (person, organization, place, ...) assigned to each one.

In [40]:
# One (text, label) record per entity span, turned into a frame in a single
# call instead of concatenating a one-row frame per entity (quadratic copying).
# The former `pd.isna(token.label_) is False` guard is dropped: `label_` is a
# string, and pd.isna() of a string is always False, so it never excluded
# anything. An empty entity list still yields a frame with both columns.
ner_records = [(ent.text, ent.label_) for ent in spacy_doc.ents]
ner_df = pd.DataFrame(ner_records, columns=['token', 'ner_tag'])

In [41]:
# Inspect the first few entities and their labels.
ner_df.head()

Unnamed: 0,token,ner_tag
0,U.S.,GPE
1,Republicans,NORP
2,U.S.,GPE
3,Monday,DATE
4,Pentagon,ORG


In [42]:
# Frequency of every (entity text, entity label) pair, most frequent first.
ner_df_counts = (
    ner_df
    .groupby(['token', 'ner_tag'])
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)

In [43]:
# Show the ten most frequent (entity, label) pairs.
ner_df_counts.head(10)

Unnamed: 0,token,ner_tag,counts
6873,U.S.,GPE,3332
6539,Trump,ORG,2564
6540,Trump,PERSON,1391
5672,Russia,GPE,886
1645,China,GPE,691
7248,White House,ORG,642
4804,North Korea,GPE,610
5532,Republican,NORP,578
1707,Clinton,PERSON,513
4909,Obama,PERSON,512


And we can also filter

In [44]:
# Ten most frequent entities labelled PERSON (the counts frame is already
# sorted by descending count).
people = ner_df_counts.loc[ner_df_counts['ner_tag'] == 'PERSON'].head(10)
people

Unnamed: 0,token,ner_tag,counts
6540,Trump,PERSON,1391
1707,Clinton,PERSON,513
4909,Obama,PERSON,512
1347,Brexit,PERSON,238
5386,Putin,PERSON,223
4265,Merkel,PERSON,131
5714,Ryan,PERSON,128
6829,Twitter,PERSON,113
3988,Macron,PERSON,109
1767,Comey,PERSON,91
