## NLP Class Assignment 5

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import pdist, squareform
from nltk.probability import FreqDist
import re


pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
#!pip install pandarallel
import multiprocessing

num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

import pandarallel
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=num_processors-1, use_memory_fs=False)

Available CPUs: 8
INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


#### Read news data

In [4]:
news_path = 'https://storage.googleapis.com/msca-bdp-data-open/news/nlp_a_5_news.json'
news_df = pd.read_json(news_path, orient='records', lines=True)

print(f'Sample contains {news_df.shape[0]:,.0f} news articles')
news_df.head(2)

Sample contains 10,012 news articles


Unnamed: 0,url,date,language,title,text
0,http://kokomoperspective.com/obituaries/jon-w-horton/article_b6ba8e1e-cb9c-11eb-9868-fb11b88b9778.html,2021-06-13,en,Jon W. Horton | Obituaries | kokomoperspective.com,Jon W. Horton | Obituaries | kokomoperspective.comYou have permission to edit this article. EditCloseSign Up Log In Dashboard LogoutMy Account Dashboard Profile Saved items LogoutCOVID-19Click here for the latest local news on COVID-19HomeAbout UsContact UsNewsLocalOpinionPoliticsNationalStateAgricultureLifestylesEngagements/Anniversaries/WeddingsAutosEntertainmentHealthHomesOutdoorsSportsNFLNCAAVitalsObituariesAutomotivee-EditionCouponsGalleries74°...
1,https://auto.economictimes.indiatimes.com/news/auto-components/birla-precision-to-ramp-up-capacity-to-tap-emerging-opportunities-in-india/81254902,2021-02-28,en,"Birla Precision to ramp up capacity to tap emerging opportunities in India, Auto News, ET Auto","Birla Precision to ramp up capacity to tap emerging opportunities in India, Auto News, ET Auto We have updated our terms and conditions and privacy policy Click ""Continue"" to accept and continue with ET AutoAccept the updated privacy & cookie policyDear user, ET Auto privacy and cookie policy has been updated to align with the new data regulations in European Union. Please review and accept these changes below to continue using the website.You can see our privacy policy & our cookie ..."


In [5]:
news_df.shape

(10012, 5)

In [6]:
news_df['text'].nunique()

9984

In [7]:
news_df = news_df.drop_duplicates(subset=['text']).reset_index(drop=True)
news_df.shape

(9984, 5)

#### Read Tweets data

In [8]:
tweets_path = 'https://storage.googleapis.com/msca-bdp-data-open/tweets/nlp_a_5_tweets.json'
tweets_df = pd.read_json(tweets_path, orient='records', lines=True)
print(f'Sample contains {tweets_df.shape[0]:,.0f} tweets')
tweets_df.head(2)

Sample contains 10,105 tweets


Unnamed: 0,id,lang,date,name,retweeted,text
0,1534565117614084096,en,2022-06-08,Low Orbit Tourist 🌍📷,,"Body &amp; Assembly - Halewood - United Kingdom\n🌍53.3504,-2.8352296,402m\n\nHalewood Body &amp; Assembly is a Jaguar Land Rover factory in Halewood, England, and forms the major part of the Halewood complex which is shared with Ford who manufacture transmissions at the site. [Wikipedia] https://t.co/LPmCnZIaVt"
1,1534565743429394439,en,2022-06-08,CompleteCar.ie,RT,"Land Rover Ireland has announced that the new Range Rover Sport starts at €114,150, now on @completecar:\n\nhttps://t.co/TjGUkL3FYr https://t.co/QdVaEiJkjO"


In [9]:
tweets_df.shape

(10105, 6)

In [10]:
tweets_df.retweeted.value_counts()

retweeted
      5094
RT    5011
Name: count, dtype: int64

In [11]:
tweets_df['text'].nunique()

6696

In [12]:
# Removing retweets from this analysis
tweets_df=tweets_df[tweets_df.retweeted!='RT']
tweets_df = tweets_df.drop_duplicates(subset=['text']).reset_index(drop=True)
tweets_df.shape

(4957, 6)

In [13]:
#!pip install langdetect
# Discarding non-English results
from langdetect import detect

def is_english(text):
    try:
        lang = detect(text)
        return lang == 'en'
    except:
        return False

    
news_df = news_df[news_df['text'].parallel_apply(is_english)]
tweets_df = tweets_df[tweets_df['text'].parallel_apply(is_english)]

In [71]:
# Define a function to clean the text data
def clean_text(text, type):
    # Remove URLs, mentions, hashtags and newline characters
    text = re.sub(r'http\S+|www.\S+|@\S+|\n', '', text)
    text = re.sub(r'#\w+', '', text)
    
    if type == "tweet" :
        # Use TweetTokenizer to tokenize
        tweet_tokenizer = nltk.tokenize.TweetTokenizer()
        tokens = tweet_tokenizer.tokenize(text)
    else:
        tokens = nltk.word_tokenize(text)
    
    # Remove single-character tokens (mostly punctuation)
    tokens = [token for token in tokens if len(token) > 1]

    # Remove numbers
    tokens = [token for token in tokens if not token.isnumeric()]

    # Remove punctuation
    # tokens = [token for token in tokens if token.isalpha()]

    # Remove all punctuation except periods, exclamation marks, and question marks
    tokens = [re.sub(r'[^\w\s.!?]', '', token) for token in tokens]

    # Remove stop words and lemmatize the words
    stop_words = set(nltk.corpus.stopwords.words('english'))   
    
    wnl = nltk.WordNetLemmatizer()
    tokens=[wnl.lemmatize(token) for token in tokens if not token in stop_words]
    
    # Join the tokens back into a string
    clean_text = ' '.join(tokens)
    return clean_text



In [79]:
# Apply the clean_text function
news_df['clean_text'] = news_df['text'].parallel_apply(lambda x: clean_text(x, "text"))
news_df['clean_title'] = news_df['title'].parallel_apply(lambda x: clean_text(x, "text"))
tweets_df['clean_text'] = tweets_df['text'].parallel_apply(lambda x: clean_text(x, "tweet"))


Dataframe sizes post cleaning and before NER :

In [73]:
tweets_df.shape

(4538, 7)

In [18]:
news_df.shape

(9983, 7)

##### Common Functions :

In [104]:
#NLTK TOKENISE

def nltk_word(df, col):
    entities_op = []
    for doc in df[col]:
        tokens = word_tokenize(doc)
        entities = nltk.chunk.ne_chunk(nltk.pos_tag(tokens), binary = False)
        for entity in entities:
            if isinstance(entity, nltk.tree.Tree) and entity.label() in ['ORGANIZATION']:
                entities_op.append(' '.join([leaf[0] for leaf in entity]))

    entities_op = [x.lower() for x in entities_op]
    entities_freqdist = pd.DataFrame(FreqDist(entities_op).most_common(20), columns=['Entity', 'Frequency'])
    return entities_freqdist

def nltk_sent(df, col):
    entities_op = []
    for doc in df[col]:
        # Tokenize the article into sentences
        sentences = nltk.sent_tokenize(doc)
        for sentence in sentences:
            # Tokenize each sentence into words
            tokens = nltk.word_tokenize(sentence)
            # Apply named entity recognition on the tokens
            entities = nltk.chunk.ne_chunk(nltk.pos_tag(tokens))
            for entity in entities:
                # Check if the entity is an organization
                if isinstance(entity, nltk.tree.Tree) and entity.label() in ['ORGANIZATION']:
                    # Append the entity to the list
                    entities_op.append(' '.join([leaf[0] for leaf in entity]))

    entities_op = [x.lower() for x in entities_op]
    entities_freqdist = pd.DataFrame(FreqDist(entities_op).most_common(20), columns=['Entity', 'Frequency'])
    return entities_freqdist

In [93]:
#SPACY TOKENISE

def spacy_word(df, col):
    entities_op = []
    for doc in nlp.pipe(
    df[col].tolist(),
    disable=["tok2vec", "tagger", "parser", "attribute _ruler", "lemmatizer"], 
    batch_size=200, 
    n_process=2 ):
        for ent in doc.ents:
            if ent.label_ in ['ORG', 'PRODUCT', 'COMPANY' ]:
                entities_op.append(ent.text.lower())

    entities_freqdist = pd.DataFrame(FreqDist(entities_op).most_common(20), columns=['Entity', 'Frequency'])
    return entities_freqdist



def spacy_sent(df, col):
    sent_nlp = spacy.load("en_core_web_sm")
    sent_nlp.add_pipe('sentencizer')
    entities_op = []
    for doc in sent_nlp.pipe(
    df[col].tolist(),
    disable=["tok2vec", "tagger", "parser", "attribute _ruler", "lemmatizer"], 
    batch_size=200, 
    n_process=2 ):
        for sent in doc.sents:  # sentence segmentation
            sent_entities = []
            for token in sent:  # POS tagging
                if token.ent_type_ in ['ORG', 'PRODUCT', 'COMPANY']:
                    sent_entities.append(token.text.lower())
            entities_op.extend(sent_entities)

    entities_freqdist = pd.DataFrame(FreqDist(entities_op).most_common(20), columns=['Entity', 'Frequency'])
    return entities_freqdist


### NER USING NLTK
##### TWEETS

In [105]:
nltk_word(tweets_df, 'clean_text')

Unnamed: 0,Entity,Frequency
0,land,365
1,land rover,303
2,ebay,114
3,rover,99
4,suv,32
5,bmw,21
6,jaguar land,20
7,land rover discovery,16
8,nvidia,14
9,tdv,13


In [106]:
nltk_sent(tweets_df, 'clean_text')

Unnamed: 0,Entity,Frequency
0,land,365
1,land rover,303
2,ebay,114
3,rover,99
4,suv,32
5,bmw,21
6,jaguar land,20
7,land rover discovery,16
8,nvidia,14
9,tdv,13


### NER USING NLTK
##### NEWS - TITLE

In [107]:
nltk_word(news_df, 'clean_title')

Unnamed: 0,Entity,Frequency
0,star news,166
1,shropshire star,96
2,automotive news,94
3,mercedesbenz,89
4,daily mail online,71
5,bmw,70
6,ford f150,63
7,business live,57
8,ram,52
9,covid19,42


In [108]:
nltk_sent(news_df, 'clean_title')


Unnamed: 0,Entity,Frequency
0,star news,166
1,shropshire star,96
2,automotive news,94
3,mercedesbenz,89
4,daily mail online,71
5,bmw,70
6,ford f150,63
7,business live,57
8,ram,52
9,covid19,42


Tried 50% and 30% of news articles but since NLTK was taking a lot of time (more than 30 minutes without any result) and the results obtained from the title weren't as satisfactory either, performed NER on 10% of the news articles

### NER USING NLTK
##### NEWS - ARTICLE (SAMPLED)

In [109]:
news_df_sampled = news_df.sample(frac=0.1, random_state=42)
nltk_word(news_df_sampled, 'clean_text')


Unnamed: 0,Entity,Frequency
0,mailonline,795
1,very,703
2,covid19,556
3,nyc,552
4,covid,390
5,la,317
6,princess diana,297
7,uscontact,254
8,uk,244
9,edt,243


In [110]:
nltk_sent(news_df_sampled, 'clean_text')


Unnamed: 0,Entity,Frequency
0,mailonline,795
1,very,703
2,covid19,556
3,nyc,552
4,covid,390
5,la,313
6,princess diana,297
7,uscontact,254
8,uk,243
9,edt,243


### NER USING SPACY
##### TWEETS

In [None]:
spacy_word(tweets_df, 'clean_text')

Unnamed: 0,Entity,Frequency
0,land rover,752
1,ebay,271
2,jaguar land rover,253
3,land rover defender,57
4,rover,47
5,ford,40
6,bmw,35
7,land rover discovery,31
8,toyota,27
9,discovery,24


In [None]:
spacy_sent(tweets_df, 'clean_text')

Unnamed: 0,Entity,Frequency
0,rover,1802
1,land,1755
2,jaguar,457
3,ebay,282
4,discovery,119
5,defender,119
6,tata,65
7,the,58
8,bmw,57
9,ford,54


### NER USING SPACY
##### NEWS - TITLE 

In [None]:
spacy_word(news_df, 'clean_title')

Unnamed: 0,Entity,Frequency
0,ford,229
1,hyundai,208
2,toyota,203
3,chevrolet,179
4,star news,158
5,honda,144
6,british columbia carpages.ca,106
7,automotive news,98
8,nissan,95
9,bmw,95


In [None]:
spacy_sent(news_df, 'clean_title')

Unnamed: 0,Entity,Frequency
0,news,824
1,carpages.ca,744
2,daily,572
3,ontario,571
4,online,567
5,mail,464
6,star,420
7,ford,368
8,auto,214
9,hyundai,212


### NER USING SPACY
##### NEWS - ARTICLE

In [None]:
spacy_word(news_df, 'clean_text')

Unnamed: 0,Entity,Frequency
0,mailonline,8131
1,toyota,4978
2,ford,4256
3,hyundai,4211
4,britney spears,4005
5,instagram,3639
6,km,3535
7,honda,3462
8,bmw,3175
9,amazon,3098


In [None]:
spacy_sent(news_df, 'clean_text')

Unnamed: 0,Entity,Frequency
0,the,16683
1,news,9662
2,mailonline,8687
3,land,7030
4,royal,6987
5,house,6850
6,ford,6839
7,rover,6700
8,toyota,5316
9,km,5216


##### Analysis and Comparison of the different NER methods implemented :

#### <b>Tweets </b>
NER using NLTK with and without sentence segmentation did not give us entities that were insightful into what the overarching theme in the tweets were. The top ORG entity identified was <b>land</b> and it was unable to accurately tag land rover. 
However, <b>NER using spaCy without sentence segmentation </b>shows that 7 of the top 20 companies were about <b>Land Rover</b> with <b>Land Rover</b> being identified as our top company.

#### <b>News - Titles </b>
NER using NLTK on News Titles did not give us any helpful results as all the top entities were the companies reporting the news and not the companies in context of the news.
We can see that with <b>NER using spaCy</b> we were able to find the top entities to be <b>Ford, Hyundai, Toyota and Chevrolet</b> and this gives us the clear idea that the news articles are about <b>automobiles.</b>


#### <b>News - Text </b>
When we go ahead and take a look at the entities extracted for the News Articles, we can observe that NLTK failed to identify anything related to automobiles in the top 20 entities even though we have identified from the titles that the tweets are about automobiles.
<b>NER using spaCy</b> identified the top entity in the New articles to be <b>mailonline</b> which we can safely disregard as it is referencing the news publisher Daily Mail's online website, which is the source of the news and it does not contribute to identifying the top company being reported in the news. Here, we can observe that <b>Toyota, Ford and Hyundai</b> seem to be the top company entities.

##### Hence we can conclude that <b>NER using spaCy without sentence segmentation</b> gave us the best results and we were able to identify the top companies in order to understand what was being reported in the news and tweeted widely in the dataset.


<i>1. Identify what is this company name, by looking at the entity distributions across both tweets and news articles</i>

From the entity distributions displayed above, by taking a look at the entities generated using word tokenization at a document level using Spacy, we can identify that the Company being talked about the most are as follows :

##### Tweets : <b>Land Rover</b>
##### News Articles and Titles : <b>Toyota, Ford, Hyundai</b>


<i>2. Identify what other companies are most frequently mentioned along with your primary company
* Analyze what companies are most frequently mentioned within the same document (tweet and news article)
* While analyzing news articles, extract separate entities from titles and texts</i>

##### Most Frequent companies mentioned along with <b>Land Rover</b> in Tweets

In [118]:
# Filter all tweets containing Land Rover in order to find other frequently mentioned companies in the same context
land_rover_tweets = tweets_df[tweets_df['text'].str.contains('land rover', case=False)]

similar_comp_tweets = spacy_word(land_rover_tweets, 'clean_text')

pattern = re.compile('(land(rover| rovers)?|rover)', flags=re.IGNORECASE)
filtered_similar_comp_tweets = similar_comp_tweets[~similar_comp_tweets['Entity'].str.extract(pattern, expand=False).notna().any(axis=1)]

filtered_similar_comp_tweets.reset_index(drop=True)

Unnamed: 0,Entity,Frequency
1,ebay,271
5,ford,40
6,bmw,35
8,toyota,27
9,discovery,24
10,jaguar,23
11,tata,22
12,tata motors,18
13,jlr,17
14,ev,16


We can see that along with Land Rover other topics of discussion are major <b>car companies and automobile related along with mentions of ecommerce sites</b> like ebay and amazon.

##### Most Frequent companies mentioned along with <b>Toyota, Ford and Hyundai</b> in News Titles

In [130]:
# Filter all news titles containing Toyota, Ford or Hyundai in order to find other frequently mentioned companies in the same context
car_titles = news_df[news_df['title'].str.contains('toyota|ford|hyundai', case=False)]

similar_comp_title = spacy_word(car_titles, 'clean_title')

filtered_similar_comp_title = similar_comp_title[~similar_comp_title['Entity'].isin(['toyota', 'ford', 'hyundai'])]
filtered_similar_comp_title.reset_index(drop=True)

Unnamed: 0,Entity,Frequency
0,british columbia carpages.ca,42
1,corolla,38
2,winnipeg manitoba,28
3,tucson,22
4,elantra,21
5,toronto ontario carpages.ca,19
6,brantford ontario carpages.ca,19
7,automotive news,18
8,ford edge,17
9,ford fusion,13


Similarly from the news titles we can see that News titles that talk about Toyota, Ford or Hyundai are mostly talked about in tandem with car models in these companies like <b>Corolla and Camry from Toyota, Edge and Fusion from Ford and Elantra from Hyundai.</b> In addition we can also see that most of these articles talk about or reference the Canadian Carpages.ca site along with mentions of Canadian cities. This makes sense as <b>Carpages.ca is a Canadian online marketplace for buying and selling new and used cars.</b>

##### Most Frequent companies mentioned along with <b>Toyota, Ford and Hyundai</b> in News Article Texts

In [131]:
# Filter all news articles containing Toyota, Ford or Hyundai in order to find other frequently mentioned companies in the same context
car_text= news_df[news_df['text'].str.contains('toyota|ford|hyundai', case=False)]

similar_comp_text= spacy_word(car_text, 'clean_text')

filtered_similar_comp_text = similar_comp_text[~similar_comp_text['Entity'].isin(['toyota', 'ford', 'hyundai'])]
filtered_similar_comp_text.reset_index(drop=True)

Unnamed: 0,Entity,Frequency
0,mailonline,7803
1,britney spears,3928
2,km,3534
3,instagram,3460
4,honda,3423
5,bmw,2961
6,amazon,2806
7,house,2630
8,trump,2343
9,crown,2307


<i>3. Identify most frequent locations of events, by extracting appropriate named entities. Locations may include countries, states, cities, regions, etc.</i>

Using Spacy,

* GPE stands for Geo-Political Entity and typically includes countries, cities, and states or provinces within a country.
* LOC stands for Location and includes non-GPE locations such as bodies of water, mountains, and other natural or man-made geographical features.
* FAC stands for Facility and includes buildings, airports, highways, bridges, etc.

In [132]:
def spacy_word_location(df, col):
    entities_op = []
    for doc in nlp.pipe(
    df[col].tolist(),
    disable=["tok2vec", "tagger", "parser", "attribute _ruler", "lemmatizer"], 
    batch_size=200, 
    n_process=2 ):
        for ent in doc.ents:
            if ent.label_ in ['GPE', 'LOC', 'FAC']:
                entities_op.append(ent.text.lower())

    entities_freqdist = pd.DataFrame(FreqDist(entities_op).most_common(20), columns=['Entity', 'Frequency'])
    return entities_freqdist

In [133]:
spacy_word_location(tweets_df, 'clean_text')

Unnamed: 0,Entity,Frequency
0,uk,90
1,india,43
2,russia,26
3,london,16
4,ukraine,13
5,us,12
6,china,12
7,nvidia drive,8
8,australia,7
9,japan,6


In [134]:
spacy_word_location(news_df, 'clean_title')

Unnamed: 0,Entity,Frequency
0,uk,172
1,us,80
2,india,79
3,north york,72
4,u.s.,56
5,taiwan,53
6,china,51
7,manitoba,49
8,alberta,49
9,australia,34


In [135]:
spacy_word_location(news_df, 'clean_text')

Unnamed: 0,Entity,Frequency
0,us,10644
1,uk,10242
2,la,10142
3,los angeles,7886
4,london,7142
5,nyc,5699
6,new york city,5587
7,hollywood,4640
8,new york,4439
9,australia,4266


We can see the top locations mentioned in the tweets and news articles above. Our spaCy model was able to do a great job extracting locations from our tweets and news articles.