## NLP Class Assignment 5

In [1]:
import pandas as pd
import numpy as np
import os
import requests
import nltk as nltk
import nltk.corpus  
from nltk.text import Text
import pandas as pd
import re
import sys
import random
import spacy 
from spacy import displacy
from spacy.util import minibatch, compounding
from tqdm import tqdm
# spacy.prefer_gpu()
spacy.require_gpu()

print(spacy.__version__)

3.5.2


In [2]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

#### Read news data

In [3]:
news_path = 'https://storage.googleapis.com/msca-bdp-data-open/news/nlp_a_5_news.json'
news_df = pd.read_json(news_path, orient='records', lines=True)

print(f'Sample contains {news_df.shape[0]:,.0f} news articles')
news_df.head(2)

Sample contains 10,012 news articles


Unnamed: 0,url,date,language,title,text
0,http://kokomoperspective.com/obituaries/jon-w-horton/article_b6ba8e1e-cb9c-11eb-9868-fb11b88b9778.html,2021-06-13,en,Jon W. Horton | Obituaries | kokomoperspective.com,Jon W. Horton | Obituaries | kokomoperspective.comYou have permission to edit this article. EditCloseSign Up Log In Dashboard LogoutMy Account Dashboard Profile Saved items LogoutCOVID-19Click here for the latest local news on COVID-19HomeAbout UsContact UsNewsLocalOpinionPoliticsNationalStateAgricultureLifestylesEngagements/Anniversaries/WeddingsAutosEntertainmentHealthHomesOutdoorsSportsNFLNCAAVitalsObituariesAutomotivee-EditionCouponsGalleries74°...
1,https://auto.economictimes.indiatimes.com/news/auto-components/birla-precision-to-ramp-up-capacity-to-tap-emerging-opportunities-in-india/81254902,2021-02-28,en,"Birla Precision to ramp up capacity to tap emerging opportunities in India, Auto News, ET Auto","Birla Precision to ramp up capacity to tap emerging opportunities in India, Auto News, ET Auto We have updated our terms and conditions and privacy policy Click ""Continue"" to accept and continue with ET AutoAccept the updated privacy & cookie policyDear user, ET Auto privacy and cookie policy has been updated to align with the new data regulations in European Union. Please review and accept these changes below to continue using the website.You can see our privacy policy & our cookie ..."


#### Read Tweets data

In [4]:
tweets_path = 'https://storage.googleapis.com/msca-bdp-data-open/tweets/nlp_a_5_tweets.json'
tweets_df = pd.read_json(tweets_path, orient='records', lines=True)
print(f'Sample contains {tweets_df.shape[0]:,.0f} tweets')
tweets_df.head(2)

Sample contains 10,105 tweets


Unnamed: 0,id,lang,date,name,retweeted,text
0,1534565117614084096,en,2022-06-08,Low Orbit Tourist 🌍📷,,"Body &amp; Assembly - Halewood - United Kingdom\n🌍53.3504,-2.8352296,402m\n\nHalewood Body &amp; Assembly is a Jaguar Land Rover factory in Halewood, England, and forms the major part of the Halewood complex which is shared with Ford who manufacture transmissions at the site. [Wikipedia] https://t.co/LPmCnZIaVt"
1,1534565743429394439,en,2022-06-08,CompleteCar.ie,RT,"Land Rover Ireland has announced that the new Range Rover Sport starts at €114,150, now on @completecar:\n\nhttps://t.co/TjGUkL3FYr https://t.co/QdVaEiJkjO"


## Using NLTK for NER on Tweets

In [5]:
tweets_df_nltk = tweets_df[['text']].copy()
news_df_nltk = news_df[['title','text']].copy()

In [6]:
tqdm.pandas()

In [7]:
def clean(string):
    url_pattern = r'(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])'
    mention_pattern = r'[\s]*@[\w]+'
    hashtag_pattern = r'[\s]*#[\w]+'
    string_fixed = re.sub(url_pattern,"",string)
    string_fixed = re.sub(hashtag_pattern,"",string_fixed)
    string_fixed = re.sub(mention_pattern,"",string_fixed)
    string_fixed = string_fixed.replace('\n', '')
    tokens = string_fixed.split(" ")
    tokens = [word for word in tokens if len(word) > 1]
    tokens = [word for word in tokens if word.isalpha()]
    return(" ".join(tokens))

In [8]:
tweets_df_nltk['text'] = tweets_df_nltk['text'].progress_apply(clean)

100%|██████████| 10105/10105 [00:00<00:00, 95738.30it/s]


In [9]:
def find_company_nltk(string):
    tokens = nltk.word_tokenize(string)    
    entities = []
    for chunk in nltk.ne_chunk(nltk.pos_tag(tokens), binary = False):
        if hasattr(chunk, 'label'):
            entity = ' '.join(c[0] for c in chunk)
            label = chunk.label()
            if label == 'ORGANIZATION':
                entities.append(entity)
    return(entities)
    

In [10]:
def find_company_by_sentences_nltk(string):
    entities = []
    for sent in nltk.sent_tokenize(string):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary = False):
            if hasattr(chunk, 'label'):
                entity = ' '.join(c[0] for c in chunk)
                label = chunk.label()
                if label == 'ORGANIZATION':
                    entities.append(entity)
    return(entities)

In [11]:
tweets_df_nltk['companies'] = tweets_df_nltk['text'].progress_apply(find_company_nltk)

100%|██████████| 10105/10105 [00:24<00:00, 416.00it/s]


In [12]:
tweets_df_nltk['companies_sentences'] = tweets_df_nltk['text'].progress_apply(find_company_by_sentences_nltk)

100%|██████████| 10105/10105 [00:24<00:00, 415.01it/s]


Comparing the, per token and per sentence approach to NER

In [13]:
tweets_df_nltk[tweets_df_nltk['companies_sentences'].str.len() != 0][['companies','companies_sentences']].head(15)

Unnamed: 0,companies,companies_sentences
0,[Halewood],[Halewood]
4,[Land],[Land]
10,[ROVER],[ROVER]
11,[TEKNO],[TEKNO]
12,"[coversFull Service, ONO]","[coversFull Service, ONO]"
13,"[performanceAutumn Pride, Land Rover Handicap]","[performanceAutumn Pride, Land Rover Handicap]"
15,"[Land Rover Discovery, Jaguar]","[Land Rover Discovery, Jaguar]"
17,[ANYONE],[ANYONE]
18,"[Land Rover, CHUUMA]","[Land Rover, CHUUMA]"
20,"[And Buick Top, RAM And Land Rover Finish]","[And Buick Top, RAM And Land Rover Finish]"


### Let us Find out the Most frequent companies with and without sentence segmentation

In [14]:
tweets_df_nltk['companies'].explode().value_counts().reset_index().head(20)

Unnamed: 0,companies,count
0,Land Rover,980
1,Land,526
2,General Motors,283
3,Land Citroen,275
4,eBay,233
5,LAND,187
6,Duke,162
7,Duchess,145
8,ROVER,142
9,Jaguar Land,141


In [15]:
tweets_df_nltk['companies_sentences'].explode().value_counts().reset_index().head(20)

Unnamed: 0,companies_sentences,count
0,Land Rover,980
1,Land,526
2,General Motors,283
3,Land Citroen,275
4,eBay,233
5,LAND,187
6,Duke,162
7,Duchess,145
8,ROVER,142
9,Jaguar Land,141


As we can see, the company we are discussing about is clearly Land Rover. 

In [16]:
def company_filter(string):
    if len(string) >1 and 'Land Rover' in string or 'Jaguar Land Rover' in string:
        return string
    else:
        return pd.NA

### Finding Companies that accompany the main company

In [17]:
tweets_df_nltk['companies_sentences'].apply(company_filter).dropna().explode().value_counts().reset_index().head(21)

Unnamed: 0,companies_sentences,count
0,Land Rover,395
1,Jaguar Land Rover,103
2,Clent Hills Vets,62
3,Duchess,40
4,Duke,36
5,Defender,27
6,ZANUPF,23
7,UK,18
8,Invictus,15
9,GOLD,14


In [18]:
tweets_df_nltk['companies'].apply(company_filter).dropna().explode().value_counts().reset_index().head(21)

Unnamed: 0,companies,count
0,Land Rover,395
1,Jaguar Land Rover,103
2,Clent Hills Vets,62
3,Duchess,40
4,Duke,36
5,Defender,27
6,ZANUPF,23
7,UK,18
8,Invictus,15
9,GOLD,14


We can see that ebay or BMW accompanies Land Rover the most.

In [49]:
def find_location_nltk(string):
    entities = []
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(string)), binary = False):
        if hasattr(chunk, 'label'):
            entity = ' '.join(c[0] for c in chunk)
            label = chunk.label()
            if label == 'GPE':
                entities.append(entity)
    return(entities)
    

In [50]:
def find_location_by_sentence_nltk(string):
    entities = []
    for sent in nltk.sent_tokenize(string):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary = False):
            if hasattr(chunk, 'label'):
                entity = ' '.join(c[0] for c in chunk)
                label = chunk.label()
                if label == 'GPE':
                    entities.append(entity)
    return(entities)

In [21]:
tweets_df_nltk['locations'] = tweets_df_nltk['text'].progress_apply(find_location_nltk)

100%|██████████| 10105/10105 [00:24<00:00, 414.78it/s]


In [22]:
tweets_df_nltk['locations_sentences'] = tweets_df_nltk['text'].progress_apply(find_location_by_sentence_nltk)

100%|██████████| 10105/10105 [00:24<00:00, 413.98it/s]


In [23]:
tweets_df_nltk[(tweets_df_nltk['locations_sentences'].str.len() != 0) & (tweets_df_nltk['locations'].str.len() != 0) ][['locations','locations_sentences']].head(15)

Unnamed: 0,locations,locations_sentences
1,[Land],[Land]
2,[New Land],[New Land]
9,"[Russia, Bentley]","[Russia, Bentley]"
10,[LAND],[LAND]
13,[Fine],[Fine]
19,[Land],[Land]
20,[Kia],[Kia]
25,[Land],[Land]
26,[Land],[Land]
27,[LAND],[LAND]


As we can see, Land is confusing the nltk NER algorithm. 

## Using spacy for NER on Tweets

In [107]:
nlp = spacy.load("en_core_web_sm",exclude=['tok2vec','lemmatizer'])
nlp_senter = spacy.load("en_core_web_sm",exclude=['tok2vec','lemmatizer'])

In [25]:
nlp_senter.enable_pipe('senter')

In [26]:
nlp_senter.pipe_names

['tagger', 'parser', 'senter', 'attribute_ruler', 'ner']

In [27]:
tweets_df_spacy = tweets_df[['text']].copy()
news_df_spacy = news_df[['title','text']].copy()

In [28]:
tweets_df_spacy['text'] = tweets_df_spacy['text'].progress_apply(clean)

100%|██████████| 10105/10105 [00:00<00:00, 93276.91it/s]


In [29]:
title_entities = []
for doc in nlp.pipe(
    texts = tweets_df_spacy['text'],
):
    ents = []
    for ent in doc.ents:
        if ent.label_ in ['ORG']:
            ents.append(ent.text)
    title_entities.append(ents)

In [30]:
title_entities_senter = []
for doc in nlp_senter.pipe(
    texts = tweets_df_spacy['text'],
):
    ents = []
    for ent in doc.ents:
        if ent.label_ in ['ORG']:
            ents.append(ent.text)
    title_entities_senter.append(ents)

In [31]:
tweets_df_spacy['companies'] = title_entities
tweets_df_spacy['companies_sentences'] = title_entities_senter

In [32]:
tweets_df_spacy[tweets_df_spacy['companies_sentences'].str.len() != 0][['companies','companies_sentences']].head(15)

Unnamed: 0,companies,companies_sentences
0,"[Jaguar Land Rover, Ford]","[Jaguar Land Rover, Ford]"
3,[Please],[Please]
4,[Land Rover],[Land Rover]
5,"[Land Rover, Mainstream Property Services]","[Land Rover, Mainstream Property Services]"
6,"[Land Rover, Mainstream Property Services]","[Land Rover, Mainstream Property Services]"
7,"[Land Know, uache]","[Land Know, uache]"
8,[Land Rover],[Land Rover]
9,"[Land Lamborghini, Bentley]","[Land Lamborghini, Bentley]"
10,[eBay],[eBay]
11,[Land Rover],[Land Rover]


In [None]:
tweets_df_spacy['companies'].explode().value_counts().reset_index().head(20)

Unnamed: 0,companies,count
0,Land Rover,964
1,Jaguar Land Rover,549
2,General Motors,282
3,Jaguar Land Citroen,277
4,eBay,248
5,Jaguar Land,144
6,Ford,90
7,Volvo,78
8,the Jaguar Land Rover,77
9,Land,69


In [34]:
tweets_df_spacy['companies_sentences'].explode().value_counts().reset_index().head(20)

Unnamed: 0,companies_sentences,count
0,Land Rover,964
1,Jaguar Land Rover,549
2,General Motors,282
3,Jaguar Land Citroen,277
4,eBay,248
5,Jaguar Land,144
6,Ford,90
7,Volvo,78
8,the Jaguar Land Rover,77
9,Land,69


We can see that spacy also identifies land rover to be the company.

In [35]:
tweets_df_spacy['companies'].apply(company_filter).dropna().explode().value_counts().reset_index().head(21)

Unnamed: 0,companies,count
0,Jaguar Land Rover,549
1,Land Rover,351
2,Volvo,58
3,Mainstream Property Services,40
4,TEKNOOFFICIAL,34
5,Ford,27
6,Tata Motors,25
7,Cayman,16
8,JDF,16
9,Nvidia,16


The company that seems to appear the most along with Land Rover is Mainstream Property Services

In [36]:
title_entities = []
for doc in nlp.pipe(
    texts = tweets_df_spacy['text'],
):
    ents = []
    for ent in doc.ents:
        if ent.label_ in ['GPE']:
            ents.append(ent.text)
    title_entities.append(ents)

In [37]:
title_entities_senter = []
for doc in nlp_senter.pipe(
    texts = tweets_df_spacy['text'],
):
    ents = []
    for ent in doc.ents:
        if ent.label_ in ['GPE']:
            ents.append(ent.text)
    title_entities_senter.append(ents)

In [38]:
tweets_df_spacy['locations'] = title_entities
tweets_df_spacy['locations_sentences'] = title_entities_senter

In [39]:
tweets_df_spacy[(tweets_df_spacy['locations_sentences'].str.len() != 0) & (tweets_df_spacy['locations'].str.len() != 0) ][['locations','locations_sentences']].head(15)

Unnamed: 0,locations,locations_sentences
3,[UK],[UK]
9,[Russia],[Russia]
18,[CHUUMA],[CHUUMA]
46,[Dublin],[Dublin]
51,"[india, india, india]","[india, india, india]"
57,[Land],[Land]
59,"[UP, UP]","[UP, UP]"
102,"[Tucson, Skoda]","[Tucson, Skoda]"
108,[Rome],[Rome]
113,[REPLACEMENT],[REPLACEMENT]


In [40]:
tweets_df_spacy['locations'].explode().value_counts().reset_index().head(20)

Unnamed: 0,locations,count
0,UK,175
1,Russia,108
2,Meghan,77
3,India,48
4,Land,39
5,Jamaica,35
6,China,24
7,London,23
8,Hollywood,23
9,Nyeri,23


In [41]:
tweets_df_spacy['locations_sentences'].explode().value_counts().reset_index().head(20)

Unnamed: 0,locations_sentences,count
0,UK,175
1,Russia,108
2,Meghan,77
3,India,48
4,Land,39
5,Jamaica,35
6,China,24
7,London,23
8,Hollywood,23
9,Nyeri,23


As we can see, while the spacy algorithm does confuse a few names, with Locations, it is on average able, to find locations much better.

### Thus for the Twitter Data the Spacy Algorithm is much better. There seems to be no difference between using a sentence segmenter.

##### The Top 20 companies with the Spacy Algorithm and with no sentence segmenter.

In [42]:
tweets_df_spacy['companies'].explode().value_counts().reset_index().head(20)

Unnamed: 0,companies,count
0,Land Rover,964
1,Jaguar Land Rover,549
2,General Motors,282
3,Jaguar Land Citroen,277
4,eBay,248
5,Jaguar Land,144
6,Ford,90
7,Volvo,78
8,the Jaguar Land Rover,77
9,Land,69


##### The Top 20 locations with the Spacy Algorithm and with no sentence segmenter.

In [43]:
tweets_df_spacy['locations'].explode().value_counts().reset_index().head(20)

Unnamed: 0,locations,count
0,UK,175
1,Russia,108
2,Meghan,77
3,India,48
4,Land,39
5,Jamaica,35
6,China,24
7,London,23
8,Hollywood,23
9,Nyeri,23


## Using NLTK for NER on News

In [60]:
stopwords = set(nltk.corpus.stopwords.words('english'))
def remove_stop_words(string):
    words = string.split(" ")
    words = [x for x in words if x not in stopwords]
    return " ".join(words)

In [61]:
news_df_nltk['text'] = news_df_nltk['text'].progress_apply(clean)
news_df_nltk['text'] = news_df_nltk['text'].progress_apply(remove_stop_words)

100%|██████████| 10012/10012 [00:06<00:00, 1563.50it/s]
100%|██████████| 10012/10012 [00:01<00:00, 5135.05it/s]


In [62]:
import polars as pl
news_df_nltk_pl = pl.from_pandas(news_df_nltk)

In [63]:
news_df_nltk_pl = news_df_nltk_pl.with_columns(
    (
        pl.col(["text"]).apply(find_company_nltk)
    ).alias("companies")
)

In [64]:
news_df_nltk_pl = news_df_nltk_pl.with_columns(
    (
        pl.col(["text"]).apply(find_company_by_sentences_nltk)
    ).alias("companies_sentences")
)

In [65]:
news_df_nltk_pl = news_df_nltk_pl.with_columns(
    (
        pl.col(["text"]).apply(find_location_nltk)
    ).alias("locations")
)

In [66]:
news_df_nltk_pl = news_df_nltk_pl.with_columns(
    (
        pl.col(["text"]).apply(find_location_by_sentence_nltk)
    ).alias("locations_sentences")
)

### The top 20 Companies on the article

In [67]:
news_df_nltk_pl['companies'].explode().value_counts().sort(by=['counts'],descending=True).head(20)

companies,counts
str,u32
"""MailOnline""",6865
"""VERY""",4993
"""NYC""",4383
"""COVID""",3942
"""LA""",3038
"""Princess""",2380
"""insuranceCar""",2062
"""usHow""",1976
"""Conditions Bac…",1961
"""MailMail""",1948


In [71]:
news_df_nltk_pl['companies_sentences'].explode().value_counts().sort(by=['counts'],descending=True).head(20)

companies_sentences,counts
str,u32
"""MailOnline""",6865
"""VERY""",4993
"""NYC""",4383
"""COVID""",3942
"""LA""",3038
"""Princess""",2380
"""insuranceCar""",2062
"""usHow""",1976
"""Conditions Bac…",1961
"""PrintsOur""",1948


In [88]:
news_df_nltk_pd = news_df_nltk_pl.to_pandas()

In [86]:
def company_filter_news(string):
    if len(string) >1 and 'MailOnline' in string:
        return string
    else:
        return None

In [89]:
news_df_nltk_pd['companies'].apply(company_filter_news).dropna().explode().value_counts().reset_index().head(21)

Unnamed: 0,companies,count
0,MailOnline,6865
1,NYC,3848
2,VERY,3655
3,COVID,3263
4,LA,2453
5,Princess,2001
6,NOT,1527
7,PDA,1464
8,Princess Diana,1387
9,MailMail,1351


In [90]:
news_df_nltk_pd['companies_sentences'].apply(company_filter_news).dropna().explode().value_counts().reset_index().head(21)

Unnamed: 0,companies_sentences,count
0,MailOnline,6865
1,NYC,3848
2,VERY,3655
3,COVID,3263
4,LA,2453
5,Princess,2001
6,NOT,1527
7,PDA,1464
8,Princess Diana,1387
9,MailMail,1351


In [91]:
news_df_nltk_pd['locations_sentences'].explode().value_counts().reset_index().head(20)

Unnamed: 0,locations_sentences,count
0,Los Angeles,7261
1,New York,5218
2,New York City,4989
3,British,4883
4,New,3689
5,London,2968
6,Prince,2942
7,American,2773
8,Australian,2663
9,China,2197


In [92]:
news_df_nltk_pd['locations'].explode().value_counts().reset_index().head(20)

Unnamed: 0,locations,count
0,Los Angeles,7261
1,New York,5218
2,New York City,4989
3,British,4883
4,New,3689
5,London,2968
6,Prince,2942
7,American,2773
8,Australian,2663
9,China,2197


## Using Spacy with News Text Data

In [105]:
spacy.require_cpu()

True

In [109]:
title_entities = []
for doc in nlp.pipe(
    texts = news_df_spacy['text'],
):
    ents = []
    for ent in doc.ents:
        if ent.label_ in ['ORG']:
            ents.append(ent.text)
    title_entities.append(ents)

In [111]:
news_df_spacy['companies'] = title_entities

In [112]:
news_df_spacy['companies'].explode().value_counts().reset_index().head(20)

Unnamed: 0,companies,count
0,MailOnline,8849
1,COVID-19,6373
2,Ford,5624
3,Toyota,5475
4,Hyundai,4333
5,Instagram,4010
6,Trump,3877
7,Honda,3786
8,BMW,3783
9,Amazon,3595


We can see that spacy also identifies land rover to be the company.

In [113]:
news_df_spacy['companies'].apply(company_filter_news).dropna().explode().value_counts().reset_index().head(21)

Unnamed: 0,companies,count
0,MailOnline,8849
1,Instagram,3106
2,Trump,2982
3,COVID-19,2918
4,Netflix,2753
5,COVID,2662
6,Palace,2659
7,Amazon,2484
8,Britney Spears,2370
9,House,1967


The company that seems to appear the most along with Land Rover is Mainstream Property Services

In [139]:
title_entities = []
for doc in nlp.pipe(
    texts = news_df_spacy['text'],
):
    ents = []
    for ent in doc.ents:
        if ent.label_ in ['GPE']:
            ents.append(ent.text)
    title_entities.append(ents)

In [140]:
news_df_spacy['locations'] = title_entities

In [141]:
news_df_spacy['locations'].explode().value_counts().reset_index().head(20)

Unnamed: 0,locations,count
0,LA,18382
1,UK,10682
2,London,10504
3,Los Angeles,9930
4,US,9855
5,New York City,7231
6,Hollywood,5907
7,Australia,5104
8,India,5016
9,Miami,4842


## As we can see, the Spacy model is the best for the news text as well

## Using NLTK for NER on News Titl

In [114]:
news_df_nltk['title'] = news_df_nltk['title'].progress_apply(clean)

100%|██████████| 10012/10012 [00:00<00:00, 125109.85it/s]


In [115]:
news_df_nltk['companies_title'] = news_df_nltk['title'].progress_apply(find_company_nltk)

100%|██████████| 10012/10012 [00:16<00:00, 589.07it/s]


In [116]:
news_df_nltk['companies_title_sentences'] = news_df_nltk['title'].progress_apply(find_company_by_sentences_nltk)

100%|██████████| 10012/10012 [00:17<00:00, 578.33it/s]


### The top 20 Companies on the article

In [118]:
news_df_nltk['companies_title'].explode().value_counts().reset_index().head(20)

Unnamed: 0,companies_title,count
0,Star News,170
1,Daily Mail Online,155
2,Shropshire Star,94
3,Automotive News,85
4,BMW,79
5,Ford,68
6,RAM,53
7,Business Live,52
8,SUVs,44
9,NewsBreak,43


In [124]:
news_df_nltk['companies_title_sentences'].explode().value_counts().reset_index().head(20)

Unnamed: 0,companies_title_sentences,count
0,Star News,170
1,Daily Mail Online,155
2,Shropshire Star,94
3,Automotive News,85
4,BMW,79
5,Ford,68
6,RAM,53
7,Business Live,52
8,SUVs,44
9,NewsBreak,43


In [120]:
def company_filter_news_title(string):
    if len(string) >1 and 'Star News' in string:
        return string
    else:
        return None

In [122]:
news_df_nltk['companies_title'].apply(company_filter_news_title).dropna().explode().value_counts().reset_index().head(21)

Unnamed: 0,companies_title,count
0,Star News,19
1,Canterbury,3
2,Christchurch,2
3,Addington,1
4,OCR,1
5,Bank,1
6,NZ,1
7,Emotional,1
8,ACC,1
9,Parliament,1


In [125]:
news_df_nltk['companies_title_sentences'].apply(company_filter_news).dropna().explode().value_counts().reset_index().head(21)

Unnamed: 0,companies_title_sentences,count


In [127]:
news_df_nltk['locations_title'] = news_df_nltk['title'].progress_apply(find_location_nltk)

100%|██████████| 10012/10012 [00:17<00:00, 582.50it/s]


In [128]:
news_df_nltk['locations_sentences_title'] = news_df_nltk['title'].progress_apply(find_location_by_sentence_nltk)

100%|██████████| 10012/10012 [00:17<00:00, 573.09it/s]


In [129]:
news_df_nltk['locations_sentences_title'].explode().value_counts().reset_index().head(20)

Unnamed: 0,locations_sentences_title,count
0,Sale,1968
1,Ontario,1183
2,Manitoba,179
3,Prince,177
4,British Columbia,160
5,New,140
6,North,99
7,Alberta,95
8,India,89
9,Land,89


In [130]:
news_df_nltk['locations_title'].explode().value_counts().reset_index().head(20)

Unnamed: 0,locations_title,count
0,Sale,1968
1,Ontario,1183
2,Manitoba,179
3,Prince,177
4,British Columbia,160
5,New,140
6,North,99
7,Alberta,95
8,India,89
9,Land,89


## Using Spacy with News Title Data

#### As we saw before. We do not have to use the senter, as it has a minimal impact on the output.

In [131]:
title_entities = []
for doc in nlp.pipe(
    texts = news_df_spacy['title'],
):
    ents = []
    for ent in doc.ents:
        if ent.label_ in ['ORG']:
            ents.append(ent.text)
    title_entities.append(ents)

In [132]:
news_df_spacy['companies_title'] = title_entities

In [133]:
news_df_spacy['companies_title'].explode().value_counts().reset_index().head(20)

Unnamed: 0,companies_title,count
0,Ford,270
1,Daily Mail Online,212
2,Star News,209
3,Hyundai,205
4,Toyota,162
5,Chevrolet,160
6,Honda,146
7,Shropshire Star,126
8,Express & Star,120
9,Automotive News,108


We can see that spacy also identifies land rover to be the company.

In [134]:
def company_filter_news_spacy(string):
    if len(string) >1 and 'Ford' in string:
        return string
    else:
        return None

In [135]:
news_df_spacy['companies_title'].apply(company_filter_news_spacy).dropna().explode().value_counts().reset_index().head(21)

Unnamed: 0,companies_title,count
0,Ford,78
1,Automotive News,6
2,VW,5
3,EV,5
4,Audi,4
5,NAV,4
6,GM,3
7,Local Trade,3
8,Specs & Features,3
9,BMW,3


The company that seems to appear the most along with Land Rover is Mainstream Property Services

In [136]:
title_entities = []
for doc in nlp.pipe(
    texts = news_df_spacy['title'],
):
    ents = []
    for ent in doc.ents:
        if ent.label_ in ['GPE']:
            ents.append(ent.text)
    title_entities.append(ents)

In [137]:
news_df_spacy['locations_title'] = title_entities

In [138]:
news_df_spacy['locations_title'].explode().value_counts().reset_index().head(20)

Unnamed: 0,locations_title,count
0,Ontario,1316
1,British Columbia,198
2,UK,194
3,Manitoba,181
4,Winnipeg,137
5,India,121
6,Toronto,118
7,Alberta,116
8,London,112
9,Cambridge,90


# Thus as we can see for the title, The Spacy Library has the best methods