In [1]:
import nltk
import pandas as pd
from sklearn.metrics import classification_report, f1_score, accuracy_score
import numpy as np
import re

In [2]:
import flair
# Load flair's 'en-sentiment' text classifier once at notebook level; the flair
# helper functions defined further down call predict() on this shared instance.
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')

2021-12-11 18:36:46,899 loading file /home/sundesh/.flair/models/sentiment-en-mix-distillbert_4.pt


In [3]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Build the VADER analyzer once so every tweet reuses the same instance.
# The constructor loads the 'vader_lexicon' NLTK resource and raises LookupError
# when it has never been downloaded (e.g. a fresh environment), so fetch it and
# retry instead of failing on Restart & Run All.
try:
    sid = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon', quiet=True)
    sid = SentimentIntensityAnalyzer()

In [4]:
from textblob import TextBlob

In [5]:
# Load the evaluation tweets. Later cells read the 'tweet', 'true_value' and
# 'polarity_api_output' columns of this frame and add new score columns to it.
# NOTE(review): the df.head() preview shows 'Unnamed: 0.1' / 'Unnamed: 0' columns,
# which look like index columns saved by earlier to_csv calls — confirm whether
# they can be dropped at the source.
df = pd.read_csv('./gold_output.csv')

In [6]:
# Run VADER on every tweet; each entry of 'scores' is the dict returned by
# sid.polarity_scores for that tweet.
df['scores'] = df['tweet'].map(sid.polarity_scores)

In [7]:
# Keep only the 'compound' entry of each VADER score dict as a numeric column.
df['compound'] = df['scores'].map(lambda vader_scores: vader_scores['compound'])

In [8]:
# Binarise the VADER compound value: non-negative -> 'POSITIVE', otherwise 'NEGATIVE'
# (a compound of exactly 0 therefore counts as 'POSITIVE').
df['comp_score'] = np.where(df['compound'] >= 0, 'POSITIVE', 'NEGATIVE')

In [9]:
# TextBlob polarity for every tweet, then a sign-based label:
# non-negative polarity -> 'POSITIVE', otherwise 'NEGATIVE'.
df['scores_text_blob'] = df['tweet'].map(lambda tweet_text: TextBlob(tweet_text).sentiment.polarity)
df['comp_score_text_blob'] = np.where(df['scores_text_blob'] >= 0, 'POSITIVE', 'NEGATIVE')

In [10]:
def comp_score_flair(text):
    """Return the sentiment label flair predicts for ``text``.

    Args:
        text: Raw text to classify.

    Returns:
        str: Value of the first label attached to the sentence by
        ``flair_sentiment`` (the stored outputs show 'POSITIVE' / 'NEGATIVE').

    Raises:
        IndexError: If the classifier attaches no label to the sentence.
    """
    sentence = flair.data.Sentence(text)
    flair_sentiment.predict(sentence)
    # Read the label's value attribute instead of splitting str(label): the
    # printed layout of a label is not stable across flair releases, so string
    # parsing can silently return the wrong token.
    return sentence.labels[0].value

def score_flair(text):
    """Return a signed flair confidence for ``text``.

    Args:
        text: Raw text to classify.

    Returns:
        float: The confidence of the first predicted label, rounded to four
        decimals; positive when that label is "POSITIVE", negated otherwise.

    Raises:
        IndexError: If the classifier attaches no label to the sentence.
    """
    sentence = flair.data.Sentence(text)
    flair_sentiment.predict(sentence)
    top_label = sentence.labels[0]
    # Use the label's value/score attributes rather than parsing str(label),
    # whose layout differs between flair releases. Rounding to four decimals
    # keeps the numbers in line with the four-decimal values in the stored output.
    confidence = round(float(top_label.score), 4)
    return confidence if top_label.value == "POSITIVE" else -confidence

# Run the flair model only once per tweet. score_flair returns a positive
# confidence for a 'POSITIVE' prediction and a negated one otherwise, so the
# label can be read off the sign instead of re-running inference for every row.
# Assumes the classifier only emits 'POSITIVE' / 'NEGATIVE' with a non-zero
# confidence, as in the stored outputs.
df['score_flair'] = df['tweet'].apply(score_flair)
df['comp_score_flair'] = np.where(df['score_flair'] > 0, 'POSITIVE', 'NEGATIVE')

In [11]:
# Preview the first five rows (head() defaults to n=5), including the new score columns.
df.head()

Unnamed: 0.1,Unnamed: 0,id,created_at,tweet,sum,true_value,polarity_api_output,is_same,concept_api_output,intensity_api_output,aspect_api_output,scores,compound,comp_score,scores_text_blob,comp_score_text_blob,score_flair,comp_score_flair
0,0,1369812663623180288,2021-03-11 6:19:23,In great news my senior parents are getting th...,2.0,POSITIVE,POSITIVE,1,"['great news', 'senior', 'vaccine', 'thank', '...",0.769,"[('getting', 'POSITIVE'), ('whew', 'POSITIVE')]","{'neg': 0.0, 'neu': 0.575, 'pos': 0.425, 'comp...",0.8625,POSITIVE,0.8,POSITIVE,0.9949,POSITIVE
1,1,1354514776534220804,2021-01-28 1:11:03,Our Co-Founder and CEO <USER> recently receive...,2.0,POSITIVE,NEGATIVE,0,"['founder', 'recent', 'covid', 'vaccine', 'vac...",-0.821,"[('vaccine', 'POSITIVE'), ('must', 'POSITIVE')...","{'neg': 0.0, 'neu': 0.725, 'pos': 0.275, 'comp...",0.923,POSITIVE,0.516667,POSITIVE,0.7248,POSITIVE
2,2,1349045265945264130,2021-01-12 22:57:10,"""This is all we have left. This can't fail."" <...",0.2,POSITIVE,POSITIVE,1,"['fail', 'make', 'vaccine', 'priority', 'plus'...",0.814,"[('mass', 'POSITIVE'), ('ANNOYANCE', 'POSITIVE...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,POSITIVE,-0.25,NEGATIVE,-0.9779,NEGATIVE
3,3,1366793773728980992,2021-03-02 22:23:24,Dear #MAGA Trump NEVER cared about you. He Jim...,-1.2,NEGATIVE,POSITIVE,0,"['cared', 'stop', 'punch', 'plane', 'traitor',...",0.769,"[('drink', 'NEGATIVE'), ('plan', 'POSITIVE')]","{'neg': 0.183, 'neu': 0.719, 'pos': 0.098, 'co...",-0.3013,NEGATIVE,0.0,POSITIVE,-0.999,NEGATIVE
4,4,1371004616897269763,2021-03-14 13:15:47,I am all for risk triaging and in turn getting...,-1.1,NEGATIVE,POSITIVE,0,"['health care', 'risk', 'turn', 'vaccine', 'va...",0.769,"[('getting', 'POSITIVE'), ('more', 'POSITIVE')...","{'neg': 0.042, 'neu': 0.841, 'pos': 0.117, 'co...",0.5789,POSITIVE,0.291667,POSITIVE,-0.921,NEGATIVE


In [12]:
# Pull the reference labels and each model's label column out as NumPy arrays.
y_true, y_pred_vader, y_pred_sentic, y_pred_text_blob, y_pred_flair = (
    df[label_column].to_numpy()
    for label_column in (
        'true_value',
        'comp_score',
        'polarity_api_output',
        'comp_score_text_blob',
        'comp_score_flair',
    )
)

In [13]:
def calc_label(y):
    """Encode string sentiment labels as integers.

    Works on a copy of ``y`` (the input is left untouched), replaces every
    "POSITIVE" entry with 1 and every "NEGATIVE" entry with -1, and returns
    the result cast to int64.
    """
    encoded = y.copy()
    # Same replacement order as before: positives first, then negatives.
    for label_text, label_code in (("POSITIVE", 1), ("NEGATIVE", -1)):
        encoded[encoded == label_text] = label_code
    return encoded.astype('int64')

In [14]:
def calc_ensemble(y1, y2, y3, weights=(0.8, 0.0, 0.0)):
    """Combine three score arrays into a single -1 / +1 prediction.

    Args:
        y1, y2, y3: Array-likes of numeric sentiment scores with matching shapes.
        weights: Three multipliers applied to ``y1``, ``y2`` and ``y3`` before
            summing. The default (0.8, 0.0, 0.0) reproduces the previously
            hard-coded behaviour, in which only ``y1`` influences the result.

    Returns:
        numpy.ndarray of int64: 1 where the weighted sum is >= 0, -1 where it
        is < 0, and 0 where the sum is NaN (neither comparison holds).

    Raises:
        ValueError: If ``weights`` does not contain exactly three values.
    """
    w1, w2, w3 = weights
    # The products allocate new arrays, so the inputs are never modified.
    weighted_sum = (
        w1 * np.asarray(y1, dtype=float)
        + w2 * np.asarray(y2, dtype=float)
        + w3 * np.asarray(y3, dtype=float)
    )
    # Compute the sum once and map its sign to a label; default=0 keeps the
    # original outcome for NaN sums.
    votes = np.select([weighted_sum >= 0, weighted_sum < 0], [1, -1], default=0)
    return votes.astype('int64')

In [15]:
# Score the 'sentic' predictions (column 'polarity_api_output') against the reference labels.
print(
    "F1:", f1_score(y_true=calc_label(y_true), y_pred=calc_label(y_pred_sentic)),
    "Acc:", accuracy_score(y_true=calc_label(y_true), y_pred=calc_label(y_pred_sentic)),
)

F1: 0.7541478129713424 Acc: 0.674


In [16]:
# Score the VADER-derived labels (column 'comp_score') against the reference labels.
print(
    "F1:", f1_score(y_true=calc_label(y_true), y_pred=calc_label(y_pred_vader)),
    "Acc:", accuracy_score(y_true=calc_label(y_true), y_pred=calc_label(y_pred_vader)),
)

F1: 0.8634482758620691 Acc: 0.802


In [17]:
# Score the TextBlob-derived labels (column 'comp_score_text_blob') against the reference labels.
print(
    "F1:", f1_score(y_true=calc_label(y_true), y_pred=calc_label(y_pred_text_blob)),
    "Acc:", accuracy_score(y_true=calc_label(y_true), y_pred=calc_label(y_pred_text_blob)),
)

F1: 0.8054794520547945 Acc: 0.716


In [18]:
# Score the flair labels (column 'comp_score_flair') against the reference labels.
print(
    "F1:", f1_score(y_true=calc_label(y_true), y_pred=calc_label(y_pred_flair)),
    "Acc:", accuracy_score(y_true=calc_label(y_true), y_pred=calc_label(y_pred_flair)),
)

F1: 0.7897435897435898 Acc: 0.754


In [19]:
# F1 of the flair labels on its own, shown as the cell's output value.
f1_score(y_true=calc_label(y_true), y_pred=calc_label(y_pred_flair))

0.7897435897435898

In [20]:
# F1, then accuracy, of the weighted ensemble built from the numeric VADER
# ('compound'), TextBlob ('scores_text_blob') and flair ('score_flair') columns.
print(f1_score(
    y_true=calc_label(y_true),
    y_pred=calc_ensemble(
        y1=df['compound'].to_numpy(),
        y2=df['scores_text_blob'].to_numpy(),
        y3=df['score_flair'].to_numpy(),
    ),
))
print(accuracy_score(
    y_true=calc_label(y_true),
    y_pred=calc_ensemble(
        y1=df['compound'].to_numpy(),
        y2=df['scores_text_blob'].to_numpy(),
        y3=df['score_flair'].to_numpy(),
    ),
))

0.8634482758620691
0.802
