## NLP Class Assignment 5

In [52]:
import pandas as pd
import re
import os
import requests
import nltk as nltk
import nltk.corpus  
from nltk.text import Text
import pandas as pd
import re
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk, TweetTokenizer
from collections import Counter
import spacy 
from spacy import displacy

import sys

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

#### Read news data

In [None]:
news_path = 'https://storage.googleapis.com/msca-bdp-data-open/news/nlp_a_5_news.json'
news_df = pd.read_json(news_path, orient='records', lines=True)

print(f'Sample contains {news_df.shape[0]:,.0f} news articles')
news_df.head(2)

#### Read Tweets data

In [39]:
tweets_path = 'https://storage.googleapis.com/msca-bdp-data-open/tweets/nlp_a_5_tweets.json'
tweets_df = pd.read_json(tweets_path, orient='records', lines=True)
print(f'Sample contains {tweets_df.shape[0]:,.0f} tweets')
tweets_df.head(2)

Sample contains 10,105 tweets


Unnamed: 0,id,lang,date,name,retweeted,text
0,1534565117614084096,en,2022-06-08,Low Orbit Tourist 🌍📷,,"Body &amp; Assembly - Halewood - United Kingdom\n🌍53.3504,-2.8352296,402m\n\nHalewood Body &amp; Assembly is a Jaguar Land Rover factory in Halewood, England, and forms the major part of the Halewood complex which is shared with Ford who manufacture transmissions at the site. [Wikipedia] https://t.co/LPmCnZIaVt"
1,1534565743429394439,en,2022-06-08,CompleteCar.ie,RT,"Land Rover Ireland has announced that the new Range Rover Sport starts at €114,150, now on @completecar:\n\nhttps://t.co/TjGUkL3FYr https://t.co/QdVaEiJkjO"


## Twitter Data

### Clean the data

In [42]:
# Filter out retweets
tweets_df = tweets_df[tweets_df['retweeted'] != 'RT']
tweets_df = tweets_df[tweets_df['lang'] == 'en']

def clean_tweet(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'(?:\n)','|', text) ## Remove new lines
    return text.strip()

tweets_df["text_cleaned"] = tweets_df["text"].apply(clean_tweet)

In [43]:
tweets_df.head(5)

Unnamed: 0,id,lang,date,name,retweeted,text,text_cleaned
0,1534565117614084096,en,2022-06-08,Low Orbit Tourist 🌍📷,,"Body &amp; Assembly - Halewood - United Kingdom\n🌍53.3504,-2.8352296,402m\n\nHalewood Body &amp; Assembly is a Jaguar Land Rover factory in Halewood, England, and forms the major part of the Halewood complex which is shared with Ford who manufacture transmissions at the site. [Wikipedia] https://t.co/LPmCnZIaVt",Body amp Assembly Halewood United Kingdom|53350428352296402m||Halewood Body amp Assembly is a Jaguar Land Rover factory in Halewood England and forms the major part of the Halewood complex which is shared with Ford who manufacture transmissions at the site Wikipedia
2,1529341557580652545,en,2022-05-25,Exmoor Trim,,New Land Rover Range Rover Hits Top Speed With Ease On Autobahn\n\nhttps://t.co/19QOgAIu3v,New Land Rover Range Rover Hits Top Speed With Ease On Autobahn||
3,1542790343535755264,en,2022-07-01,Land Rover UK,,"@BeckyWatts1 Hi, thank you for your post. I am sorry to learn of your retailer experience, however when order slots are available they will be presented to the retailer, and it is at their discretion which vehicles they submit. Please continue to liaise with them. Thanks, Dan - Land Rover UK",Hi thank you for your post I am sorry to learn of your retailer experience however when order slots are available they will be presented to the retailer and it is at their discretion which vehicles they submit Please continue to liaise with them Thanks Dan Land Rover UK
4,1516732103370493954,en,2022-04-20,Sussexes❤️👑🐼🌸,,@cofvefe917 @Mackingday No this was during the Land Rover driving challenge.,No this was during the Land Rover driving challenge
8,1545395541416321026,en,2022-07-08,Olu Femi,,Genuinely not a car person but that Land Rover defender is haunting me,Genuinely not a car person but that Land Rover defender is haunting me


### NLTK without Sentence Segmentation

In [47]:
def extract_entities(text):
    entities = []
    labels = []
    for chunk in ne_chunk(pos_tag(word_tokenize(text)), binary=False):
        if hasattr(chunk, 'label'):
            entities.append(' '.join(c[0] for c in chunk))  # Combine multi-token entities
            labels.append(chunk.label())
    entities_labels = list(set(zip(entities, labels)))  # Unique entities and labels
    return entities_labels

tweets_df['nltk_ner_whole'] = tweets_df['text_cleaned'].apply(extract_entities)

In [48]:
company_mentions = Counter()

for entities_labels in tweets_df['nltk_ner_whole']:
    for entity, label in entities_labels:
        if label == 'ORGANIZATION':  
            company_mentions[entity] += 1

# Get the most common 20 companies
top_20_companies = company_mentions.most_common(20)


In [49]:
# Convert the list to a DataFrame
top_20_nltk_ner_whole = pd.DataFrame(top_20_companies, columns=['Company', 'Mentions'])

# Display the DataFrame
print(top_20_nltk_ner_whole)

                        Company  Mentions
0                    Land Rover       580
1                          Land       370
2                          LAND       132
3                          eBay       107
4                         ROVER        73
5                   Jaguar Land        48
6                         Rover        45
7                           SUV        43
8                           BaT        34
9                         TEKNO        27
10                          BMW        26
11                           UK        23
12                         LIVE        15
13            Jaguar Land Rover        14
14         Land Rover Discovery        13
15                     Mercedes        13
16                         Ford        13
17                     RDynamic        13
18                          JLR        12
19  MHK100800 New Mass Air Flow        11


### NLTK with Sentence Segmentation

In [50]:
def extract_entities(text):
    entities = []
    labels = []
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary = False):
            if hasattr(chunk, 'label'):
                entities.append(' '.join(c[0] for c in chunk)) #Add space as between multi-token entities
                labels.append(chunk.label())
    entities_labels = list(set(zip(entities, labels)))  # Unique entities and labels
    return entities_labels

tweets_df['nltk_ner_sentence'] = tweets_df['text_cleaned'].apply(extract_entities)

In [23]:
company_mentions = Counter()

for entities_labels in tweets_df['nltk_ner_sentence']:
    for entity, label in entities_labels:
        if label == 'ORGANIZATION':  
            company_mentions[entity] += 1

# Get the most common 20 companies
top_20_companies = company_mentions.most_common(20)

In [24]:
# Convert the list to a DataFrame
top_20_nltk_ner_sentence = pd.DataFrame(top_20_companies, columns=['Company', 'Mentions'])

# Display the DataFrame
print(top_20_nltk_ner_sentence)

                        Company  Mentions
0                    Land Rover       570
1                          Land       371
2                          LAND       134
3                          eBay       112
4                         ROVER        73
5                   Jaguar Land        49
6                         Rover        44
7                           SUV        42
8                           BaT        34
9                         TEKNO        27
10                          BMW        25
11                           UK        24
12         Land Rover Discovery        15
13                         LIVE        15
14                     RDynamic        14
15                     Mercedes        13
16                          JLR        13
17                         Ford        13
18            Jaguar Land Rover        13
19  MHK100800 New Mass Air Flow        11


### SpaCy without Sentence Segmentation

In [54]:
nlp = spacy.load("en_core_web_lg")

In [55]:
def extract_entities_with_spacy(text):
    # Process the text with spaCy
    doc = nlp(text)
    
    entities = []
    labels = []
    
    # Extract entities and their labels
    for ent in doc.ents:
        entities.append(ent.text)
        labels.append(ent.label_)
    
    # Create a list of unique entity-label pairs
    entities_labels = list(set(zip(entities, labels)))
    
    return entities_labels

tweets_df['spacy_ner_whole'] = tweets_df['text_cleaned'].apply(extract_entities_with_spacy)

In [56]:
company_mentions = Counter()

for entities_labels in tweets_df['spacy_ner_whole']:
    for entity, label in entities_labels:
        if label == 'ORG':  
            company_mentions[entity] += 1

# Get the most common 20 companies
top_20_companies = company_mentions.most_common(20)

In [57]:
# Convert the list to a DataFrame
top_20_spacy_ner_whole = pd.DataFrame(top_20_companies, columns=['Company', 'Mentions'])

# Display the DataFrame
print(top_20_spacy_ner_whole)

                   Company  Mentions
0               Land Rover       413
1                     eBay       335
2        Jaguar Land Rover       273
3      Land Rover Defender       100
4                   Jaguar        72
5                    Rover        42
6            TEKNOOFFICIAL        41
7                     Ford        39
8                      BMW        38
9           the Land Rover        33
10                   Volvo        29
11             Range Rover        25
12                  Toyota        25
13                    Audi        24
14                 Porsche        24
15  Land Rover Range Rover        24
16                   Tesla        18
17             Tata Motors        17
18    Land Rover Defenders        17
19                Mercedes        16


### SpaCy with Sentence Segmentation

In [58]:
def extract_entities_with_spacy_sentence_segmentation(text):
    doc = nlp(text)
    
    entities = []
    labels = []
    
    # Iterate over sentences
    for sent in doc.sents:
        # Process each sentence for NER
        for ent in sent.ents:
            entities.append(ent.text)
            labels.append(ent.label_)

    # Create a list of unique entity-label pairs
    entities_labels = list(set(zip(entities, labels)))
    
    return entities_labels

tweets_df['spacy_ner_sentence'] = tweets_df['text_cleaned'].apply(extract_entities_with_spacy)

In [59]:
company_mentions = Counter()

for entities_labels in tweets_df['spacy_ner_sentence']:
    for entity, label in entities_labels:
        if label == 'ORG':  
            company_mentions[entity] += 1

# Get the most common 20 companies
top_20_companies = company_mentions.most_common(20)

In [61]:
# Convert the list to a DataFrame
top_20_spacy_ner_sentence = pd.DataFrame(top_20_companies, columns=['Company', 'Mentions'])

# Display the DataFrame
print(top_20_spacy_ner_sentence)

                   Company  Mentions
0               Land Rover       413
1                     eBay       335
2        Jaguar Land Rover       273
3      Land Rover Defender       100
4                   Jaguar        72
5                    Rover        42
6            TEKNOOFFICIAL        41
7                     Ford        39
8                      BMW        38
9           the Land Rover        33
10                   Volvo        29
11             Range Rover        25
12                  Toyota        25
13                    Audi        24
14                 Porsche        24
15  Land Rover Range Rover        24
16                   Tesla        18
17             Tata Motors        17
18    Land Rover Defenders        17
19                Mercedes        16
