# Using NLP for sentiment analysis
See https://huggingface.co/blog/sentiment-analysis-python
Also see https://ieeexplore.ieee.org/document/8848203


## Uing OPT as a model

In [28]:
from transformers import OPTConfig, OPTModel, AutoTokenizer, OPTForSequenceClassification
 
# Initializing a OPT facebook/opt-large style configuration
configuration = OPTConfig()

# Initializing a model (with random weights) from the facebook/opt-large style configuration
model = OPTForSequenceClassification(configuration)

# Accessing the model configuration
configuration = model.config

# Initializing a OPT tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

In [29]:
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

# Sentiment analysis pipeline
analyzer = pipeline("sentiment-analysis", model= model , tokenizer = tokenizer)



In [30]:
import pandas as pd

media_data = pd.read_csv("../test_data/media.csv", index_col = 0, parse_dates = ['pub_date'])

In [31]:
## Apply the sentiment analysis pipeline to the abstract of each article in the media data

# initialize a list to store the sentiment scores
sentiment_scores = []

# loop through each article
for abstract in media_data['abstract']:
    # apply the sentiment analysis pipeline to the abstract
    sentiment_scores.append(analyzer(abstract)[0].get('score'))
    
# add the sentiment scores to the media data
media_data['OPT_sentiment_score-abstract'] = sentiment_scores



### Note:
nlp_sentiment_score returns a sentiment score between 0 and 1 while textblob returns a sentiment score between -1 and 1.

In [32]:
# ## change the name of the column to 'sentiment_score' to 'nlp_sentiment_score'
# media_data.rename(columns = {'nlp_sentiment_score':'OPT_sentiment_score'}, inplace = True)

# ## display random 5 rows of the media data
# media_data.sample(5)

In [33]:
## looking at the articles with the highest and lowest nlp_sentiment_scores
max_score = media_data['OPT_sentiment_score-abstract'].max()
min_score = media_data['OPT_sentiment_score-abstract'].min()

## display the articles with the highest and lowest nlp_sentiment_scores
articles_with_max_score = media_data[media_data['OPT_sentiment_score-abstract'] == max_score]
articles_with_min_score = media_data[media_data['OPT_sentiment_score-abstract'] == min_score]

print("Article with the highest sentiment score was: \n'{}' with score {}, \n and the lowest sentiment score was: \n'{}' with score {}".format(articles_with_max_score['headline.main'].values[0], max_score, articles_with_min_score['headline.main'].values[0], min_score))

Article with the highest sentiment score was: 
'Change These Default Settings and Be Happier With Your Tech' with score 0.8934900164604187, 
 and the lowest sentiment score was: 
'Text Messaging Is Cool. But Where Are Its Boundaries?' with score 0.5016201734542847


In [34]:
## Look at the full abstract of the article with the highest sentiment score
articles_with_max_score['abstract'].values[0]

'Simple steps can make your smartphone, computer and TV work better for you.'

In [35]:
articles_with_min_score['abstract'].values[0]

'Apple and Google have added useful features to texting apps, yet the apps still lack a major component: an effective way to set limits.'

In [36]:
## Let us now apply the OPT model to the lead_paragraph of each article in the media data

# initialize a list to store the sentiment scores
sentiment_scores = []

# loop through each article
for lead_paragraph in media_data['lead_paragraph']:
    # apply the sentiment analysis pipeline to the abstract
    sentiment_scores.append(analyzer(lead_paragraph)[0].get('score'))

# add the sentiment scores to the media data
media_data['OPT_sentiment_score-lead_paragraph'] = sentiment_scores

# display random 5 rows of the media data

media_data.sample(5)


Unnamed: 0,pub_date,abstract,lead_paragraph,snippet,headline.main,text,Polarity,Sentiment,OPT_sentiment_score-abstract,OPT_sentiment_score-lead_paragraph
71,2022-03-23 22:11:21+00:00,The test program is likely to whittle away at ...,Google said on Wednesday that it would allow s...,The test program is likely to whittle away at ...,"Google, facing antitrust scrutiny, says it wil...",The test program is likely to whittle away at ...,-0.721876,0.153309,0.714149,0.759162
57,2022-05-11 21:56:52+00:00,Artificial intelligence is being woven into an...,SAN FRANCISCO — There was a time when Google o...,Artificial intelligence is being woven into an...,Google Offers a More Modest Vision of the Future,Artificial intelligence is being woven into an...,-0.423378,0.553346,0.868511,0.716421
19,2022-09-01 11:54:05+00:00,"Apple, Google and Microsoft are among the tech...","This month, Apple is expected to release the i...","Apple, Google and Microsoft are among the tech...",Big Tech Reconsiders the “Made in China” Way,"Apple, Google and Microsoft are among the tech...",0.964186,0.461493,0.526054,0.695505
15,2022-09-15 09:00:26+00:00,Data mining plus streaming can target politica...,"Over the last few weeks, tens of thousands of ...",Data mining plus streaming can target politica...,This Ad’s for You (Not Your Neighbor),Data mining plus streaming can target politica...,0.581998,0.482124,0.679978,0.557534
50,2022-06-07 14:40:15+00:00,"Soon, iPhone owners will be able to edit text ...","CUPERTINO, Calif. — Around this time every yea...","Soon, iPhone owners will be able to edit text ...",How Updates in iOS 16 and Android 13 Will Chan...,"Soon, iPhone owners will be able to edit text ...",0.292976,0.243523,0.70122,0.708729


In [37]:
## find the articles with the highest and lowest OPT_sentiment_score-lead_paragraph scores
articles_with_max_score_lead_paragraph = media_data[media_data['OPT_sentiment_score-lead_paragraph'] == 
                                                    media_data['OPT_sentiment_score-lead_paragraph'].max()]
articles_with_min_score_lead_paragraph = media_data[media_data['OPT_sentiment_score-lead_paragraph'] == 
                                                    media_data['OPT_sentiment_score-lead_paragraph'].min()]

print(articles_with_max_score_lead_paragraph['headline.main'].values[0])
print(articles_with_min_score_lead_paragraph['headline.main'].values[0])

Recession? Not for Big Tech.
Google Says Trump’s Truth Social Must Scrub Violent Content to Join Play Store


In [38]:
## making a new column called 'all_article-data' by concatenating the abstract, lead_paragraph columns, headline.main, text
media_data['all_article_data'] = 'The abstact is: ' + media_data['abstract']+ '\n' + 'The lead paragraph is: ' + media_data['lead_paragraph'] + '\n' + 'The Headline is: ' + media_data['headline.main'] + '\n' + 'First part of text is: ' + media_data['text']

In [39]:
print(media_data['all_article_data'][0])

## in a string to go to the next line, we use '\n'

The abstact is: Google’s parent company reported earnings that were below analysts’ expectations, bringing in $13.9 billion in profit on $69.1 billion in sales.
The lead paragraph is: Even Alphabet, the parent company of Google and one of the internet’s most entrenched businesses, was not immune to the punishing blows of the global economy and a sharp slowdown in the online advertising that is essential to the company’s profits.
The Headline is: Alphabet’s Profit Drops 27 Percent From a Year Earlier
First part of text is: Google’s parent company reported earnings that were below analysts’ expectations, bringing in $13.9 billion in profit on $69.1 billion in sales. Even Alphabet, the parent company of Google and one of the internet’s most entrenched businesses, was not immune to the punishing blows of the global economy and a sharp slowdown in the online advertising that is essential to the company’s profits. Google’s parent company reported earnings that were below analysts’ expectatio

In [40]:
## Apply the sentiment analysis pipeline to the all_article_data of each article in the media data
sentiment_scores = []

# loop through each article
for all_article_data in media_data['all_article_data']:
    # apply the sentiment analysis pipeline to the abstract
    sentiment_scores.append(analyzer(all_article_data)[0].get('score'))
    
# add the sentiment scores to the media data
media_data['OPT_sentiment_score-all_article_data'] = sentiment_scores


In [41]:
media_data.describe()

Unnamed: 0,Polarity,Sentiment,OPT_sentiment_score-abstract,OPT_sentiment_score-lead_paragraph,OPT_sentiment_score-all_article_data
count,100.0,100.0,100.0,100.0,100.0
mean,-0.032761,0.540686,0.70401,0.699364,0.653988
std,0.581867,0.286348,0.093552,0.098762,0.0919
min,-0.972187,0.007054,0.50162,0.50704,0.500882
25%,-0.569664,0.296075,0.636666,0.617925,0.562124
50%,-0.013981,0.578345,0.713512,0.707012,0.664656
75%,0.467404,0.789954,0.767595,0.782186,0.722852
max,0.966517,0.998986,0.89349,0.873937,0.878638
