In [None]:
#!pip install GoogleNews
import GoogleNews
from GoogleNews import GoogleNews
import pandas as pd
import datetime
import requests
!pip install newspaper3k
import newspaper # library to extract text

In [4]:
def extract_article_text(url):
    """Download the page at ``url`` and return the article body text.

    Args:
        url: Address of the news article to fetch.

    Returns:
        The parsed article text, or the sentinel string
        'URL cannot be opened or read' when downloading or parsing fails.
    """
    try:
        article = newspaper.Article(url=url)
        article.download()
        article.parse()
        return article.text
    except Exception:
        # Best-effort scraping: any ordinary failure maps to the sentinel.
        # KeyboardInterrupt / SystemExit are deliberately NOT caught so a
        # long-running scrape can still be stopped from the notebook.
        return 'URL cannot be opened or read'

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline, AutoTokenizer, AutoModel
from transformers import AutoModelForTokenClassification, AutoModelForSeq2SeqLM

# NER pipeline shared by all get_org calls; built on first use.
_NER_PIPELINE = None


def _get_ner_pipeline():
  """Return the dslim/bert-base-NER pipeline, creating it only once."""
  global _NER_PIPELINE
  if _NER_PIPELINE is None:
    # Specify bert-base-ner model for both tokenizer and model
    ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
    ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
    _NER_PIPELINE = pipeline('ner', model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple")
  return _NER_PIPELINE


def get_org(doc):
  """Extract the distinct organisation names mentioned in ``doc``.

  Args:
    doc: Text to analyse. ``None`` or an empty string is accepted.

  Returns:
    A list of organisation names (entities whose group is 'ORG'), stripped
    of surrounding whitespace, de-duplicated, in order of first appearance.
    An empty list is returned for empty input.
  """
  # Check the input first so empty documents never trigger a model load.
  if not doc:
    return []

  res = []
  for entity in _get_ner_pipeline()(doc):
    if entity['entity_group'] != 'ORG':
      continue
    org = entity['word'].strip()
    # A lone '.' is a tagging artefact, not an organisation.
    if org != '.' and org not in res:
      res.append(org)
  return res

In [None]:
!pip install vaderSentiment
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def get_sentiment_label(sentiment_scores, positive_threshold=0.52, negative_threshold=-0.48):
    """Map a sentiment score dict to a 'Positive' / 'Negative' / 'Neutral' label.

    Args:
        sentiment_scores: Mapping that contains a 'compound' score.
        positive_threshold: Compound scores at or above this are 'Positive'.
        negative_threshold: Compound scores at or below this are 'Negative'.

    Returns:
        'Positive', 'Negative', or 'Neutral' for anything in between.
    """
    compound_score = sentiment_scores['compound']
    if compound_score >= positive_threshold:
        return 'Positive'
    if compound_score <= negative_threshold:
        return 'Negative'
    return 'Neutral'

def add_sentiment_analysis(df, text_column='article_text', sentiment_column='sentiment'):
    """Score every text in ``df`` and attach the scores plus a label.

    Args:
        df: DataFrame holding the texts; it is modified in place.
        text_column: Name of the column with the text to score.
        sentiment_column: Name of the column that receives the score dicts.

    Returns:
        The same DataFrame, with ``sentiment_column`` and 'sentiment_label' set.
    """
    scorer = SentimentIntensityAnalyzer()

    # Raw polarity scores, one dict per row.
    df[sentiment_column] = df[text_column].apply(scorer.polarity_scores)
    # Human-readable label derived from each score dict.
    df['sentiment_label'] = df[sentiment_column].apply(get_sentiment_label)
    return df

In [None]:
gn = GoogleNews(lang='en', region='US', encode='utf-8')

# input query & duration here

query = 'Tesla Inc' # company name
start_date='01/07/2024' # MM/DD/YYYY
end_date='01/08/2024' # MM/DD/YYYY

# The dates contain '/', which a file system treats as a path separator,
# so replace it to get a usable file name.
output_file_name=f"{query}_{start_date}_{end_date}.xlsx".replace('/', '-')

datetime_list=pd.date_range(start_date,end_date,freq='D').to_list()
date_list=[d.strftime('%m/%d/%Y') for d in datetime_list]
n_date=len(date_list)

fail_count=0

# A single request already covers the whole [start_date, end_date] range.
# Repeating it once per day only re-issued the same query, and issued none
# at all (leaving `results` undefined) when both dates were equal.
gn.set_time_range(start_date,end_date)
gn.get_news(query)
# Limit the number of retrieved news articles to 50
results = gn.results(sort=True)[:50]

df=pd.DataFrame(results)
if df.empty:
    # Fail with a clear message instead of a KeyError on the missing 'link' column.
    raise ValueError(f"No news found for {query!r} between {start_date} and {end_date}")

# Add "http://" to each link
# NOTE(review): assumes the returned links carry no scheme — confirm.
df['link'] = 'http://' + df['link']

# Add a column for extracted text
df['article_text'] = df['link'].apply(extract_article_text)

# Add a column to apply the NER function
df['extracted organizations'] = df['article_text'].apply(get_org)

# Add sentiment analysis with sentiment labels
df = add_sentiment_analysis(df.copy())

In [38]:
# This is to remove duplicate or similar news
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings, weighting each token by the attention mask.

    Args:
        model_output: Model result whose first element holds all token embeddings.
        attention_mask: Mask tensor; it is expanded over the embedding
            dimension so masked-out tokens contribute nothing to the mean.

    Returns:
        The masked sum over dimension 1 divided by the mask total
        (clamped to at least 1e-9 to avoid division by zero).
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_total = torch.sum(embeddings * mask, 1)
    token_count = torch.clamp(mask.sum(1), min=1e-9)
    return masked_total / token_count


# Sentence-embedding checkpoint, loaded once at cell execution and shared
# by the tokenizer and the encoder used in get_vector.
_EMBEDDING_CHECKPOINT = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(_EMBEDDING_CHECKPOINT)
model = AutoModel.from_pretrained(_EMBEDDING_CHECKPOINT)

def get_vector(sentence):
    """Embed one text as an L2-normalised sentence vector.

    Args:
        sentence: Text to embed; a falsy value is embedded as the empty string.

    Returns:
        The first (only) row of the normalised, mean-pooled embeddings.
    """
    batch = [sentence] if sentence else ['']

    # Tokenize, truncating over-long inputs.
    encoded = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

    # Forward pass without gradient tracking.
    with torch.no_grad():
        output = model(**encoded)

    # Pool token embeddings into one vector, then scale it to unit length.
    pooled = mean_pooling(output, encoded['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)[0]

def get_similarity(df, threshold=0.9, window_days=14):
    """Remove articles that near-duplicate an earlier article.

    Rows are visited in their current order. A row counts as a duplicate when
    its 'vector' has cosine similarity of at least ``threshold`` with any
    earlier row whose 'datetime' is fewer than ``window_days`` days away.
    The first row has nothing earlier to compare against, so it is always kept.

    Args:
        df: DataFrame with 'datetime' and 'vector' columns, ordered so that
            earlier news comes first.
        threshold: Similarity at or above which two articles are duplicates.
        window_days: Only articles closer together than this many days are compared.

    Returns:
        A new DataFrame with the non-duplicate rows; the previous index is
        moved into a column by ``reset_index()``. ``df`` itself is not modified.
    """
    # Positional access keeps this correct for any index, not only 0..n-1.
    vectors = [vec.reshape(1, -1) for vec in df['vector']]
    stamps = list(df['datetime'])

    keep = []
    for i in range(len(df)):
        # any() stops at the first sufficiently similar earlier article.
        is_duplicate = any(
            abs(stamps[i] - stamps[j]).days < window_days
            and cosine_similarity(vectors[i], vectors[j])[0][0] >= threshold
            for j in range(i)
        )
        if not is_duplicate:
            keep.append(i)

    return df.iloc[keep].reset_index()

# Keep only rows whose article text was actually retrieved.
df = df.dropna(subset=['article_text'])
df = df[df['article_text'] != 'URL cannot be opened or read']
# Order oldest-first on 'datetime', the same column get_similarity uses for its
# time-window check. NOTE(review): 'date' appears to be a display string rather
# than a sortable timestamp — confirm against the GoogleNews result fields.
# Chained, non-inplace calls avoid mutating a filtered slice
# (SettingWithCopyWarning).
df = df.sort_values(by='datetime', ascending=True).reset_index()
df['vector']=df['article_text'].apply(get_vector)
# deduplicate
df2=get_similarity(df)