In [1]:
import re
import csv
import nltk
import numpy as np
import pandas as pd
from PIL import Image
from dateutil import parser
from bs4 import BeautifulSoup
from datetime import datetime
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from nltk.stem import SnowballStemmer
from wordcloud import WordCloud, STOPWORDS
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
# Fetch the two NLTK resources used below: the VADER lexicon (needed by
# SentimentIntensityAnalyzer) and the `words` corpus.
for _resource in ('vader_lexicon', 'words'):
    nltk.download(_resource)

# Entries of the NLTK `words` corpus as a set, so membership checks are fast.
words = {entry for entry in nltk.corpus.words.words()}

# Single shared VADER analyzer instance used for all scoring.
sentimentAnalyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\eurydice\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\eurydice\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
# Load the three CSV exports that together cover Jan 1 2021 - Mar 31 2022.
df = pd.read_csv('../input/hm-sentiment-jan-1-2021-march-31-2022/hm_jan1_2021_to_aug31_2021.csv')
df2 = pd.read_csv('../input/hm-sentiment-jan-1-2021-march-31-2022/hm_sept1_2021_to_mar25_2022 2 copy.csv')
df3 = pd.read_csv('../input/hm-sentiment-jan-1-2021-march-31-2022/hm_mar26_2022_to_mar31_2022.csv')

# Stack the exports, order them by timestamp, drop exact duplicate rows and keep
# only the columns used downstream.
# pd.concat replaces the chained DataFrame.append calls (append was deprecated in
# pandas 1.4 and removed in 2.0); the resulting frame is the same.
final_df = (
    pd.concat([df, df2, df3], ignore_index=True)
    .sort_values("created_at")
    .drop_duplicates()
    .reset_index()[['tweet_id', 'created_at', "text", "author_id"]]
    # NOTE(review): the last row (after sorting) is discarded, as in the original
    # code; the reason is not visible here - confirm it is intentional.
    [:-1]
)

In [4]:
# Code-point ranges stripped by cleaner(); compiled once at definition time
# instead of on every call.
_EMOJI_PATTERN = re.compile(pattern = "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       "]+", flags = re.UNICODE)


def cleaner(tweet):
    """Normalise a raw tweet string before sentiment scoring.

    Steps, in order:
      1. remove @mentions and http/https/www links;
      2. collapse runs of whitespace;
      3. drop '#' (keeping the hashtag text) and turn '_' into a space;
      4. strip the emoji / pictograph ranges in ``_EMOJI_PATTERN``;
      5. run the text through BeautifulSoup to decode HTML entities;
      6. expand contractions;
      7. keep only tokens that are either non-alphabetic or (lower-cased)
         present in the module-level ``words`` set.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text, tokens joined by single spaces.
    """
    tweet = re.sub("@[A-Za-z0-9_]+","",tweet) #Remove @'s
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = " ".join(tweet.split())
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text

    #remove most emoticons and symbols
    tweet = _EMOJI_PATTERN.sub("", tweet)

    #convert html to text. removes html encoding, e.g. &amp, &quot, etc...
    del_amp = BeautifulSoup(tweet, 'lxml')
    tweet = del_amp.get_text()

    # Expand the irregular contractions first, accepting both the straight (')
    # and the typographic (’) apostrophe. Without this, "won’t" / "can’t" would
    # reach the generic n’t rule below and become "wo not" / "ca not".
    # The third pattern previously read "would't", which never matched "wouldn't".
    tweet = re.sub(r"don['’]t", "do not", tweet)
    tweet = re.sub(r"won['’]t", "will not", tweet)
    tweet = re.sub(r"wouldn['’]t", "would not", tweet)
    tweet = re.sub(r"can['’]t", "can not", tweet)

    # Generic suffix rules; these only cover the typographic apostrophe.
    tweet = re.sub(r"\’d", " would",tweet)
    tweet = re.sub(r"n\’t", " not", tweet)
    tweet = re.sub(r"\’re", " are", tweet)
    tweet = re.sub(r"\’s", " is",tweet)
    tweet = re.sub(r"\’ll", " will", tweet)
    tweet = re.sub(r"\’t", " not", tweet)
    tweet = re.sub(r"\’ve", " have", tweet)
    tweet = re.sub(r"\’m", " am", tweet)

    # Drop alphabetic tokens that are not in the `words` set; punctuation and
    # other non-alphabetic tokens are kept as they are.
    tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet)
         if w.lower() in words or not w.isalpha())
    return tweet

In [5]:
# Four empty lists (nothing in the cells visible here fills them).
tweet_list = []
positive_list = []
neutral_list = []
negative_list = []

# Store the cleaned version of every tweet's text in a new column.
final_df['clean_text'] = final_df['text'].apply(cleaner)

In [7]:
def sentiment_category_compound(sentiment):
    """Label a compound score by its sign.

    Returns 'positive' for values above zero, 'neutral' for exactly zero and
    'negative' for anything else.
    """
    if sentiment > 0:
        return 'positive'
    if sentiment == 0:
        return 'neutral'
    return 'negative'

def sentiment_category(sentiment_values):
    """Label a score dict by whichever of 'neg' / 'neu' / 'pos' is largest.

    Tie handling follows the order of the checks:
      * 'pos' is (one of) the largest -> 'positive', unless 'neg' ties with it,
        in which case the result is 'neutral';
      * otherwise 'neu' is (one of) the largest -> 'neutral';
      * otherwise -> 'negative'.
    """
    neg = sentiment_values['neg']
    neu = sentiment_values['neu']
    pos = sentiment_values['pos']
    top_score = max(neg, neu, pos)

    if top_score == pos:
        # pos and neg sharing the top spot cancel out (e.g. 0.5 / 0.5).
        return "neutral" if top_score == neg else 'positive'
    if top_score == neu:
        return 'neutral'
    return 'negative'

In [None]:
# Fully neutral scores: both labelers print "neutral".
print(
    sentiment_category({'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}),
    sentiment_category_compound(0.0),
    sep="\n",
)

In [None]:

# Clearly positive scores: both labelers print "positive".
print(
    sentiment_category({'neg': 0.0, 'neu': 0.292, 'pos': 0.708, 'compound': 0.9029}),
    sentiment_category_compound(0.9029),
    sep="\n",
)

In [None]:
# The two labelers can disagree: the compound score may be positive or negative
# even when the neutral share is larger than both the positive and the negative
# share. Here the max-score labeler prints "neutral" while the compound labeler
# prints "negative". Both approaches are worth testing.
print(
    sentiment_category({'neg': 0.121, 'neu': 0.804, 'pos': 0.075, 'compound': -0.1469}),
    sentiment_category_compound(-0.1469),
    sep="\n",
)

In [8]:
%%time
# Score every cleaned tweet once with VADER, then derive both labels from the
# same score dicts.
_polarity_scores = [sentimentAnalyzer.polarity_scores(text) for text in final_df['clean_text']]

# list1: label from the largest of neg / neu / pos; list2: label from the compound sign.
list1 = [sentiment_category(scores) for scores in _polarity_scores]
list2 = [sentiment_category_compound(scores['compound']) for scores in _polarity_scores]

final_df['sentiment_category'] = list1
final_df['sentiment_category_compound'] = list2

In [10]:
# Parse the first 10 characters (YYYY-MM-DD) of each timestamp string into a datetime.
final_df['created_at2'] = final_df['created_at'].apply(
    lambda stamp: datetime.strptime(stamp[:10], "%Y-%m-%d")
)

# Per-day row counts of tweets labelled negative by sentiment_category;
# after count(), the tweet_id column holds the number of tweets on that day.
neg = (
    final_df[final_df['sentiment_category'] == 'negative']
    .groupby(['created_at2'], as_index=False)
    .count()[['created_at2', 'tweet_id']]
)

# Same aggregation for tweets labelled positive.
pos = (
    final_df[final_df['sentiment_category'] == 'positive']
    .groupby(['created_at2'], as_index=False)
    .count()[['created_at2', 'tweet_id']]
)

In [12]:
# Per-day counts of tweets labelled negative / positive by the compound score.
# The aggregation must run on the rows filtered by 'sentiment_category_compound'.
# The earlier version grouped the already-aggregated `neg` / `pos` frames
# instead, which threw the compound filter away and produced a count of 1 per day.
neg2 = (
    final_df[final_df['sentiment_category_compound'] == 'negative']
    .groupby(['created_at2'], as_index=False)
    .count()[['created_at2', 'tweet_id']]
)

pos2 = (
    final_df[final_df['sentiment_category_compound'] == 'positive']
    .groupby(['created_at2'], as_index=False)
    .count()[['created_at2', 'tweet_id']]
)

In [19]:
# Show the neg2 frame (columns created_at2 and tweet_id) as this cell's output.
neg2

In [11]:
# Daily tweet counts for the max-score labels: one green line for positive
# tweets and one red line for negative tweets.
# Exactly one trace is added per frame. Looping over the frame's columns (as
# before) drew the same line once per column and used the column name as the
# legend entry.
fig = go.Figure()
for label, daily_counts, color in (("positive", pos, "green"), ("negative", neg, "red")):
    fig.add_trace(go.Scatter(x=daily_counts['created_at2'], y=daily_counts['tweet_id'],
                             name=label,
                             mode='markers+lines',
                             line=dict(shape='linear', color=color),
                             connectgaps=True
                             )
                 )
fig.update_layout(title="Daily positive vs. negative tweets (max-score label)",
                  xaxis_title="Date",
                  yaxis_title="Number of tweets")
# Was `Afig.show()`, which raised a NameError.
fig.show()

April 2021 Events

https://www.bloomberg.com/news/articles/2021-04-04/h-m-s-troubles-in-asia-compound-over-problematic-map-of-region

November 2021 Events


March 2022 Events


In [16]:
# Show the pos2 frame (columns created_at2 and tweet_id) as this cell's output.
pos2

In [15]:
# Daily tweet counts for the compound-score labels: one green line for positive
# tweets and one red line for negative tweets.
# Exactly one trace is added per frame. Looping over the frame's columns (as
# before) drew the same line once per column and used the column name as the
# legend entry.
fig = go.Figure()
for label, daily_counts, color in (("positive", pos2, "green"), ("negative", neg2, "red")):
    fig.add_trace(go.Scatter(x=daily_counts['created_at2'], y=daily_counts['tweet_id'],
                             name=label,
                             mode='markers+lines',
                             line=dict(shape='linear', color=color),
                             connectgaps=True
                             )
                 )
fig.update_layout(title="Daily positive vs. negative tweets (compound-score label)",
                  xaxis_title="Date",
                  yaxis_title="Number of tweets")
fig.show()

In [None]:
# Print the first 50 cleaned tweets for a quick eyeball check.
# head(50) stops after 50 rows, where the manual counter walked the whole column.
for cleaned_text in final_df['clean_text'].head(50):
    print(cleaned_text)