## NLP Class Assignment 5

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import pdist, squareform

# Notebook-wide pandas display settings: cap the number of rows shown, show every
# column, and allow long text cells (up to 500 characters) before truncation.
for _option, _value in (
    ('display.max_rows', 100),
    ('display.max_columns', None),
    ('display.max_colwidth', 500),
):
    pd.set_option(_option, _value)

In [None]:
import spacy
# Load spaCy's 'en_core_web_sm' pipeline once; `nlp` is reused by later cells.
# The model package must already be installed in the kernel's environment.
nlp = spacy.load('en_core_web_sm')

In [12]:
# Requires the third-party `pandarallel` package. If it is missing, install it once
# in the kernel's environment (e.g. `%pip install pandarallel`) before running this cell.
import multiprocessing

# Only the `pandarallel` object is needed; importing the module under the same name
# first was immediately overwritten by this import.
from pandarallel import pandarallel

num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

# Leave one core free for the main process, but never ask for fewer than one worker
# (cpu_count() - 1 would be 0 on a single-CPU machine).
pandarallel.initialize(nb_workers=max(1, num_processors - 1), use_memory_fs=False)

Available CPUs: 8
INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


#### Read news data

In [2]:
# Location of the news sample; the file is read as line-delimited JSON records.
news_path = 'https://storage.googleapis.com/msca-bdp-data-open/news/nlp_a_5_news.json'
news_df = pd.read_json(
    news_path,
    lines=True,
    orient='records',
)

# Report how many articles were loaded, then preview the first two rows.
print(f'Sample contains {news_df.shape[0]:,.0f} news articles')
news_df.head(2)

Sample contains 10,012 news articles


Unnamed: 0,url,date,language,title,text
0,http://kokomoperspective.com/obituaries/jon-w-horton/article_b6ba8e1e-cb9c-11eb-9868-fb11b88b9778.html,2021-06-13,en,Jon W. Horton | Obituaries | kokomoperspective.com,Jon W. Horton | Obituaries | kokomoperspective.comYou have permission to edit this article. EditCloseSign Up Log In Dashboard LogoutMy Account Dashboard Profile Saved items LogoutCOVID-19Click here for the latest local news on COVID-19HomeAbout UsContact UsNewsLocalOpinionPoliticsNationalStateAgricultureLifestylesEngagements/Anniversaries/WeddingsAutosEntertainmentHealthHomesOutdoorsSportsNFLNCAAVitalsObituariesAutomotivee-EditionCouponsGalleries74°...
1,https://auto.economictimes.indiatimes.com/news/auto-components/birla-precision-to-ramp-up-capacity-to-tap-emerging-opportunities-in-india/81254902,2021-02-28,en,"Birla Precision to ramp up capacity to tap emerging opportunities in India, Auto News, ET Auto","Birla Precision to ramp up capacity to tap emerging opportunities in India, Auto News, ET Auto We have updated our terms and conditions and privacy policy Click ""Continue"" to accept and continue with ET AutoAccept the updated privacy & cookie policyDear user, ET Auto privacy and cookie policy has been updated to align with the new data regulations in European Union. Please review and accept these changes below to continue using the website.You can see our privacy policy & our cookie ..."


In [22]:
# (rows, columns) of the news frame as currently held in the kernel.
news_df.shape

(10009, 6)

In [23]:
# Number of distinct article texts; a value below the row count means some texts repeat.
news_df['text'].nunique()

9982

In [24]:
# Keep the first occurrence of each article text, then renumber rows from 0.
news_unique = news_df.drop_duplicates(subset=['text'])
news_df = news_unique.reset_index(drop=True)
news_df.shape

(9982, 6)

#### Read tweets data

In [3]:
# Location of the tweets sample; the file is read as line-delimited JSON records.
tweets_path = 'https://storage.googleapis.com/msca-bdp-data-open/tweets/nlp_a_5_tweets.json'
tweets_df = pd.read_json(
    tweets_path,
    lines=True,
    orient='records',
)

# Report how many tweets were loaded, then preview the first two rows.
print(f'Sample contains {tweets_df.shape[0]:,.0f} tweets')
tweets_df.head(2)

Sample contains 10,105 tweets


Unnamed: 0,id,lang,date,name,retweeted,text
0,1534565117614084096,en,2022-06-08,Low Orbit Tourist 🌍📷,,"Body &amp; Assembly - Halewood - United Kingdom\n🌍53.3504,-2.8352296,402m\n\nHalewood Body &amp; Assembly is a Jaguar Land Rover factory in Halewood, England, and forms the major part of the Halewood complex which is shared with Ford who manufacture transmissions at the site. [Wikipedia] https://t.co/LPmCnZIaVt"
1,1534565743429394439,en,2022-06-08,CompleteCar.ie,RT,"Land Rover Ireland has announced that the new Range Rover Sport starts at €114,150, now on @completecar:\n\nhttps://t.co/TjGUkL3FYr https://t.co/QdVaEiJkjO"


In [7]:
# (rows, columns) of the tweets frame as currently held in the kernel.
tweets_df.shape

(10105, 6)

In [4]:
# Tally rows by the value of the `retweeted` column ('RT' marks a retweet in the data below).
tweets_df.retweeted.value_counts()

      5094
RT    5011
Name: retweeted, dtype: int64

In [5]:
# Number of distinct tweet texts; a value below the row count means some texts repeat.
tweets_df['text'].nunique()

6696

In [8]:
# Retweets are excluded from this analysis: keep rows not flagged 'RT', then keep
# the first occurrence of each tweet text and renumber rows from 0.
not_retweet = tweets_df['retweeted'] != 'RT'
tweets_df = (
    tweets_df[not_retweet]
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
)
tweets_df.shape

(4957, 6)

In [16]:
from langdetect import detect

def is_english(text):
    """Return True when langdetect classifies `text` as English.

    Parameters
    ----------
    text : object
        The value to classify; expected to be a string.

    Returns
    -------
    bool
        True if `detect(text)` returns 'en'. False for non-string or blank
        input, or when detection raises an error.
    """
    # Nothing to classify: skip the detector for missing or whitespace-only values.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Best-effort filter: a failed detection counts as "not English".
        # Catching Exception (rather than a bare except) lets KeyboardInterrupt
        # and SystemExit propagate so a long run can still be interrupted.
        return False

    
# Keep only rows whose text is detected as English. `.copy()` makes each result an
# independent frame, so adding columns to it afterwards does not raise pandas'
# SettingWithCopyWarning (the row index is left unchanged).
news_df = news_df[news_df['text'].parallel_apply(is_english)].copy()
tweets_df = tweets_df[tweets_df['text'].parallel_apply(is_english)].copy()

In [18]:
# Define a function to clean the text data
def clean_text(text, type):
    """Normalise raw text into a space-joined string of lemmatised word tokens.

    Steps: strip URLs, @mentions, #hashtags and newlines; lowercase; tokenize;
    keep alphabetic tokens longer than one character; drop English stop words;
    lemmatise the remainder with WordNet.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value yields an empty string.
    type : str
        "tweet" selects nltk's TweetTokenizer; any other value uses
        nltk.word_tokenize. (The name shadows the builtin `type`; it is kept
        so existing callers continue to work.)

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    # Missing values (None / NaN) have nothing to clean.
    if not isinstance(text, str):
        return ''

    # Replace URLs, mentions, newlines and hashtags with a space rather than
    # deleting them, so the words on either side are not fused into one token.
    # The dot after "www" is escaped so it matches a literal '.' only.
    text = re.sub(r'http\S+|www\.\S+|@\S+|\n', ' ', text)
    text = re.sub(r'#\w+', ' ', text)

    text = text.lower()

    if type == "tweet":
        tokens = nltk.tokenize.TweetTokenizer().tokenize(text)
    else:
        tokens = nltk.word_tokenize(text)

    stop_words = set(nltk.corpus.stopwords.words('english'))
    wnl = nltk.WordNetLemmatizer()

    # One pass: isalpha() already rejects numbers and punctuation, so the
    # separate "numeric" and "punctuation" filters collapse into this test.
    kept = [
        wnl.lemmatize(token)
        for token in tokens
        if len(token) > 1 and token.isalpha() and token not in stop_words
    ]

    return ' '.join(kept)


# Build the cleaned-text column for each corpus in parallel. News articles go
# through the generic tokenizer path, tweets through the tweet-specific one.
news_cleaned = news_df['text'].parallel_apply(lambda raw: clean_text(raw, "text"))
news_df['clean_text'] = news_cleaned

tweets_cleaned = tweets_df['text'].parallel_apply(lambda raw: clean_text(raw, "tweet"))
tweets_df['clean_text'] = tweets_cleaned


In [19]:
# (rows, columns) of the tweets frame as currently held in the kernel.
tweets_df.shape

(4541, 7)

In [25]:
# (rows, columns) of the news frame as currently held in the kernel.
news_df.shape

(9982, 6)

In [None]:
# Extract entities from tweets and news articles
def extract_entities(df, text_col='clean_text', nlp_model=None):
    """Collect the text of every organisation entity found in a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents to scan.
    text_col : str, default 'clean_text'
        Column whose string values are sent through the NER pipeline.
        Non-string values (e.g. None / NaN) are skipped.
    nlp_model : object, optional
        Pipeline exposing `.pipe(texts)` that yields documents with `.ents`.
        Defaults to the notebook-level `nlp` object.

    Returns
    -------
    list of str
        Entity texts labelled 'ORG' or 'COMPANY', in document order, with
        repeats preserved so they can be counted afterwards.

    Notes
    -----
    NOTE(review): the default column holds lowercased, stop-word-stripped text.
    NER models generally lean on casing and context, so passing
    `text_col='text'` may give better entities. Confirm against results.
    """
    model = nlp if nlp_model is None else nlp_model

    # Texts go to the pipeline unchanged: no extra preprocessing helper is
    # defined ahead of this cell, so none is called here.
    texts = [value for value in df[text_col] if isinstance(value, str)]

    wanted_labels = {'ORG', 'COMPANY'}
    # `pipe` processes the texts as a stream instead of one call per row.
    return [
        ent.text
        for doc in model.pipe(texts)
        for ent in doc.ents
        if ent.label_ in wanted_labels
    ]

# Gather organisation mentions from each corpus.
tweet_entities = extract_entities(tweets_df)
news_entities = extract_entities(news_df)

# Tally how often each entity string occurs per corpus.
from collections import Counter
tweet_entity_counts = Counter(tweet_entities)
news_entity_counts = Counter(news_entities)

# Combined frequency for entities that appear in BOTH corpora. Indexing a Counter
# with an absent key yields 0, so tweet-only entities are filtered out here.
combined_counts = {
    name: tweet_total + news_entity_counts[name]
    for name, tweet_total in tweet_entity_counts.items()
    if news_entity_counts[name] > 0
}

# Pick the entity with the highest combined count; on ties the one seen first in
# the tweets wins. With no shared entity the result is None with a count of 0.
company_name = max(combined_counts, key=combined_counts.get, default=None)
max_count = combined_counts.get(company_name, 0)

print('Company name:', company_name)