In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import re
import pandas as pd
import matplotlib as plt

# Download the NLTK data packages used below: 'stopwords' backs the
# stopwords.words('english') call; 'punkt' is presumably needed by
# word_tokenize (confirm against the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')



# Load Data

In [None]:
# Read the BBC news CSV into a DataFrame and preview its first rows.
bbc_data = pd.read_csv(filepath_or_buffer='/content/bbc_news.csv')
bbc_data.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) to check for missing values.
bbc_data.info()

In [None]:
# Work on a single-column frame that holds only the headline text.
titles = bbc_data['title'].to_frame()
titles.head()

# Clean Data


In [None]:
# Lower Case
# Normalise case so later comparisons (e.g. against the stop-word list) do not
# depend on capitalisation; the 'title' column is replaced by its lower-cased text.
titles = titles.assign(title=titles['title'].str.lower())

In [None]:
# Stop word removal
# Drop English stop words from each title and store the result in
# 'no_stop_words'. `en_stopwords` stays the plain NLTK list; a set copy is used
# for the per-word membership test so each lookup is constant-time instead of a
# scan over the whole list.
en_stopwords = stopwords.words('english')
en_stopwords_set = set(en_stopwords)
titles['no_stop_words'] = titles['title'].apply(
    lambda title: ' '.join(word for word in title.split() if word not in en_stopwords_set)
)

In [None]:
# Punctuation removal
# Delete every character that is neither a word character nor whitespace from
# the stop-word-free titles.
titles['no_stopwords_no_punct'] = titles['no_stop_words'].map(
    lambda text: re.sub(r'[^\w\s]', '', text)
)

In [None]:
# Download the 'punkt_tab' NLTK resource before tokenizing
# (presumably required by word_tokenize in newer NLTK releases — confirm).
nltk.download('punkt_tab')


In [None]:
# Tokenization
# The two token columns must come from different inputs:
#   - 'tokens_raw'   tokenizes the 'title' text, i.e. with stop words and
#                    punctuation still present;
#   - 'tokens_clean' tokenizes 'no_stopwords_no_punct', the fully cleaned text.
# Tokenizing the cleaned column for both would make 'tokens_raw' a duplicate of
# 'tokens_clean'.
titles['tokens_raw'] = titles['title'].apply(word_tokenize)
titles['tokens_clean'] = titles['no_stopwords_no_punct'].apply(word_tokenize)

In [None]:
# Lemmatization
# Fetch the WordNet data, then map every clean token to its lemma, keeping one
# list of lemmas per title.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
titles['tokens_clean_lammatized'] = titles['tokens_clean'].map(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)

In [None]:
# Preview the first ten rows to compare the derived columns side by side.
titles.head(10)

In [None]:
# Create list for just our tokens
# Flatten the per-title token lists into one flat list each. A nested
# comprehension visits every token exactly once, whereas sum(lists, []) builds
# a new, longer list on every addition and is quadratic in the token count.
tokens_raw_list = [token for tokens in titles['tokens_raw'] for token in tokens]
tokens_clean_list = [token for tokens in titles['tokens_clean'] for token in tokens]

# Part-of-Speech (POS) Tagging

In [None]:
# Load the spaCy 'en_core_web_sm' pipeline; its output supplies the POS tags
# (token.pos_) and entities (doc.ents) read in the cells below.
nlp=spacy.load('en_core_web_sm')

In [None]:
# Run the pipeline once over all raw tokens joined into a single
# space-separated string, producing one Doc for the whole corpus.
# NOTE(review): spaCy limits the length of a single input text (nlp.max_length);
# confirm the joined corpus stays within it for larger datasets.
spacy_doc=nlp(' '.join(tokens_raw_list))

In [None]:
# Start from an empty frame that already carries the token / POS-tag columns.
pos_df = pd.DataFrame(data=None, columns=['token', 'pos_tag'])

In [None]:
# Collect one {'token', 'pos_tag'} record per spaCy token and build the frame
# with a single constructor call. Growing the frame with pd.concat inside the
# loop copies the whole frame on every iteration (quadratic in the token count)
# and appends the same tokens again whenever the cell is re-run; rebuilding
# from a list of records is linear and idempotent.
pos_df = pd.DataFrame(
    [{'token': token.text, 'pos_tag': token.pos_} for token in spacy_doc],
    columns=['token', 'pos_tag'],
)

In [None]:
# Frequency table: one row per distinct (token, POS tag) pair with its number
# of occurrences, most frequent first.
pos_df_counts = (
    pos_df
    .groupby(['token', 'pos_tag'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
pos_df_counts.head(10)

In [None]:
# Keep the first ten rows of the frequency table whose POS tag is NOUN.
nouns = pos_df_counts.loc[pos_df_counts['pos_tag'].eq('NOUN')].head(10)
nouns

# Named Entity Recognition (NER)

In [None]:
# Build the entity table in one pass: one {'token', 'ner_tag'} record per
# entity span in spacy_doc.ents, skipping any span whose label is missing.
# A single DataFrame constructor call replaces the per-entity pd.concat, which
# copied the growing frame on every iteration; `not pd.isna(...)` replaces the
# brittle `is False` identity comparison.
ner_records = [
    {'token': ent.text, 'ner_tag': ent.label_}
    for ent in spacy_doc.ents
    if not pd.isna(ent.label_)
]
ner_df = pd.DataFrame(ner_records, columns=['token', 'ner_tag'])

In [None]:
# Preview the first rows of the entity table (entity text and its NER label).
ner_df.head()

In [None]:
# Frequency table: one row per distinct (entity text, NER label) pair with its
# number of occurrences, most frequent first.
ner_df_counts = (
    ner_df
    .groupby(['token', 'ner_tag'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

In [None]:
# Show the ten most frequent (entity, label) pairs.
ner_df_counts.head(10)

In [None]:
# Keep the first ten rows of the entity frequency table labelled PERSON.
people = ner_df_counts.loc[ner_df_counts['ner_tag'].eq('PERSON')].head(10)
people