# Language model validation with testing set

In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import re

import emoji
import datetime

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


from textblob import TextBlob

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tuomasp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/tuomasp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/tuomasp/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/tuomasp/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# English stopwords from the NLTK corpus; clean_tweets drops tokens found here
stop_words = stopwords.words('english')

# Punctuation characters that clean_tweets strips out of each token
punctuations = string.punctuation

# Create a function to clean the tweets
def clean_tweets(tweet):
    """Normalise a raw tweet into a list of lower-cased tokens.

    Steps, in order: strip links and @usernames, replace emojis with their
    textual names, tokenize, drop English stopwords, strip punctuation
    characters, and discard link remnants and tokens shorter than two
    characters.

    Args:
        tweet (str): Raw tweet text.

    Returns:
        list[str]: The cleaned tokens, in their original order.
    """
    # Remove links BEFORE tokenizing: word_tokenize breaks "http://t.co/x" into
    # several tokens, so filtering on 'http' afterwards only drops the scheme
    # and leaves the rest of the URL behind as a junk word.
    tweet = re.sub(r'(?:https?://|www\.)\S+', '', tweet)
    # Remove usernames
    tweet = re.sub(r'@[A-Za-z0-9_]+', '', tweet)
    # Replace emojis with their textual names (they are converted, not deleted)
    tweet = emoji.demojize(tweet)

    cleaned_tokens = []
    for token in word_tokenize(tweet):
        token = token.lower()
        # Remove stopwords
        if token in stop_words:
            continue
        # Remove punctuation characters from inside the token
        token = ''.join(char for char in token if char not in punctuations)
        # Drop empty / one-character tokens and anything still containing 'http'
        if len(token) > 1 and 'http' not in token:
            cleaned_tokens.append(token)

    return cleaned_tokens

def lemmatize_tweets(tweet):
    """Lemmatize every token of an already tokenized tweet.

    Args:
        tweet: Iterable of word tokens.

    Returns:
        list: One lemmatized token per input token, in the same order.
    """
    # A fresh WordNetLemmatizer is created on each call; only its bound
    # lemmatize method is needed.
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, tweet))

def get_polarity(tweet):
    """Return the polarity that TextBlob's sentiment analysis assigns to ``tweet``."""
    sentiment = TextBlob(tweet).sentiment
    return sentiment.polarity

def get_subjectivity(tweet):
    """Return the subjectivity that TextBlob's sentiment analysis assigns to ``tweet``."""
    # Build the blob and read the subjectivity field of its sentiment result
    return TextBlob(tweet).sentiment.subjectivity

# Module-level VADER analyzer, created once and reused by
# predict_polarity_vader and predict_sentiment_vader
vader_analyzer = SentimentIntensityAnalyzer()

def sentiment_decider(compound, threshold=0.05):
    """Map a polarity score to a sentiment label.

    Args:
        compound (float): Polarity score to classify.
        threshold (float): Non-negative cut-off. Scores greater than or equal
            to ``threshold`` are "positive", scores less than or equal to
            ``-threshold`` are "negative". Defaults to 0.05, the value that
            was previously hard-coded, so existing one-argument calls behave
            exactly as before.

    Returns:
        str: "positive", "negative" or "neutral". A score that satisfies
        neither comparison (including NaN) is "neutral".
    """
    if compound >= threshold:
        return "positive"

    if compound <= -threshold:
        return "negative"

    return "neutral"


def predict_polarity_vader(text):
    """Return the 'compound' score from the shared VADER analyzer for ``text``."""
    return vader_analyzer.polarity_scores(text)['compound']

def predict_sentiment_vader(text):
    """Return the sentiment label for ``text`` based on VADER's compound score."""
    # Score with VADER, then translate the compound value into a label
    compound_score = predict_polarity_vader(text)
    return sentiment_decider(compound_score)

## Language model validation

In [3]:
# validation dataset preparation
# Maps the dataset's sentiment codes to the labels produced by sentiment_decider
sentiment_dict = {'1': 'negative', '3': 'neutral', '5': 'positive'}
df_validation_apple = pd.read_csv('Apple-Twitter-Sentiment-DFE.csv', encoding = 'unicode_escape')

# Keep only rows annotated with full confidence whose sentiment code is mapped.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the raw data (avoids SettingWithCopyWarning).
rows_to_keep = (
    (df_validation_apple['sentiment:confidence'] == 1)
    & df_validation_apple['sentiment'].isin(list(sentiment_dict))
)
df_validation_apple = df_validation_apple.loc[rows_to_keep, ['sentiment', 'text']].copy()
df_validation_apple['sentiment'] = df_validation_apple['sentiment'].map(sentiment_dict)

# Clean and lemmatize the tweets. The token list is passed on directly rather
# than being serialised with str() and re-parsed with eval(), which is both
# unnecessary and unsafe on tweet-derived text.
df_validation_apple['text_clean'] = df_validation_apple['text'].apply(lambda x: lemmatize_tweets(clean_tweets(x)))
# Strip usernames from the raw text that VADER scores
df_validation_apple['text'] = df_validation_apple['text'].apply(lambda x: re.sub(r'@[A-Za-z0-9_]+', '', x))

# TextBlob polarity evaluation
# NOTE(review): TextBlob receives str() of the token list (brackets, quotes and
# commas included). ' '.join(tokens) may be what was intended, but switching
# would change the reported scores, so the existing behaviour is kept here.
df_validation_apple['tb_polarity'] = df_validation_apple['text_clean'].apply(lambda x: get_polarity(str(x)))
df_validation_apple['tb_sentiment'] = df_validation_apple['tb_polarity'].apply(sentiment_decider)

# VADER polarity evaluation
df_validation_apple['vader_polarity'] = df_validation_apple['text'].apply(predict_polarity_vader)
# Label from the stored compound score instead of scoring every tweet a second time
df_validation_apple['vader_sentiment'] = df_validation_apple['vader_polarity'].apply(sentiment_decider)

# Flag the rows where each model's label matches the annotated sentiment
df_validation_apple['tb_valid_sentiment'] = df_validation_apple['tb_sentiment'] == df_validation_apple['sentiment']
df_validation_apple['vader_valid_sentiment'] = df_validation_apple['vader_sentiment'] == df_validation_apple['sentiment']

# Show the first 5 rows of the prepared validation frame
df_validation_apple.head()

Unnamed: 0,sentiment,text,text_clean,tb_polarity,tb_sentiment,vader_polarity,vader_sentiment,tb_valid_sentiment,vader_valid_sentiment
2,neutral,My cat only chews cords. Such an #AppleSnob.,"[cat, chew, cord, applesnob]",0.0,neutral,0.0,neutral,True,True
10,negative,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,"[wtf, battery, 31, one, second, ago, 29, wtf]",-0.333333,negative,-0.8769,negative,True,True
13,positive,RT : Bought my at the store..pretty good log...,"[rt, bought, store, pretty, good, logo, match,...",0.475,positive,0.4926,positive,True,True
14,negative,Contact sync between Yosemite and iOS8 is ser...,"[contact, sync, yosemite, ios8, seriously, scr...",-0.127778,negative,-0.3415,negative,True,True
19,neutral,Why #AAPL Stock Had a Mini-Flash Crash Today: ...,"[aapl, stock, miniflash, crash, today, money, ...",0.0,neutral,-0.4019,negative,True,False


In [4]:
# statistics over whole dataset
total_sentiments = len(df_validation_apple)
vader_right_sentiments = int((df_validation_apple['vader_valid_sentiment'] == 1).sum())
tb_right_sentiments = int((df_validation_apple['tb_valid_sentiment'] == 1).sum())
vader_perc = vader_right_sentiments / total_sentiments * 100
tb_perc = tb_right_sentiments / total_sentiments * 100


def _count_predictions(rows, prediction_column):
    """Return how many of ``rows`` were predicted positive, neutral and negative, in that order."""
    predictions = rows[prediction_column]
    return tuple(int((predictions == label).sum()) for label in ('positive', 'neutral', 'negative'))


# statistics for confusion matrix (notation: model_TrueLabel_PredictedLabel)
# Split the validation set by its true label once; both models reuse the splits
df_apple_pos = df_validation_apple[df_validation_apple['sentiment'] == 'positive']
df_apple_neu = df_validation_apple[df_validation_apple['sentiment'] == 'neutral']
df_apple_neg = df_validation_apple[df_validation_apple['sentiment'] == 'negative']

# VADER
vader_Pos_Pos, vader_Pos_Neu, vader_Pos_Neg = _count_predictions(df_apple_pos, 'vader_sentiment')
vader_Neu_Pos, vader_Neu_Neu, vader_Neu_Neg = _count_predictions(df_apple_neu, 'vader_sentiment')
vader_Neg_Pos, vader_Neg_Neu, vader_Neg_Neg = _count_predictions(df_apple_neg, 'vader_sentiment')

# TextBlob
tb_Pos_Pos, tb_Pos_Neu, tb_Pos_Neg = _count_predictions(df_apple_pos, 'tb_sentiment')
tb_Neu_Pos, tb_Neu_Neu, tb_Neu_Neg = _count_predictions(df_apple_neu, 'tb_sentiment')
tb_Neg_Pos, tb_Neg_Neu, tb_Neg_Neg = _count_predictions(df_apple_neg, 'tb_sentiment')

# Overall accuracy summary
print(
    "For the total amount of right predictions using polarity thresholds of -0.05 and 0.05, VADER predicted correctly {:d} ({:.2f}%) and TextBlob {:d} ({:.2f}%) out of {:d} total samples.\n".format(
        vader_right_sentiments, vader_perc, tb_right_sentiments, tb_perc, total_sentiments
    )
)

# Per-true-label breakdown for VADER
print("For the true positive labels, VADER predicted {:d} as positive, {:d} as neutral and {:d} as negative.".format(vader_Pos_Pos, vader_Pos_Neu, vader_Pos_Neg))
print("For the true neutral labels, VADER predicted {:d} as positive, {:d} as neutral and {:d} as negative.".format(vader_Neu_Pos, vader_Neu_Neu, vader_Neu_Neg))
print("For the true negative labels, VADER predicted {:d} as positive, {:d} as neutral and {:d} as negative.\n".format(vader_Neg_Pos, vader_Neg_Neu, vader_Neg_Neg))

# Per-true-label breakdown for TextBlob
print("For the true positive labels, TextBlob predicted {:d} as positive, {:d} as neutral and {:d} as negative.".format(tb_Pos_Pos, tb_Pos_Neu, tb_Pos_Neg))
print("For the true neutral labels, TextBlob predicted {:d} as positive, {:d} as neutral and {:d} as negative.".format(tb_Neu_Pos, tb_Neu_Neu, tb_Neu_Neg))
print("For the true negative labels, TextBlob predicted {:d} as positive, {:d} as neutral and {:d} as negative.".format(tb_Neg_Pos, tb_Neg_Neu, tb_Neg_Neg))

For the total amount of right predictions using polarity thresholds of -0.05 and 0.05, VADER predicted correctly 1176 (62.35%) and TextBlob 1113 (59.01%) out of 1886 total samples.

For the true positive labels, VADER predicted 137 as positive, 18 as neutral and 4 as negative.
For the true neutral labels, VADER predicted 285 as positive, 602 as neutral and 137 as negative.
For the true negative labels, VADER predicted 137 as positive, 129 as neutral and 437 as negative.

For the true positive labels, TextBlob predicted 117 as positive, 39 as neutral and 3 as negative.
For the true neutral labels, TextBlob predicted 262 as positive, 682 as neutral and 80 as negative.
For the true negative labels, TextBlob predicted 134 as positive, 255 as neutral and 314 as negative.
