Prototype stopped: bad performance

# Sentiment Analysis Prototyping

Prototype code for rule-based sentiment analysis.

In [1]:
import os, sys, re, string
sys.path.append("..")
from config import credentials
import dropbox

import numpy as np
import pandas as pd

import nltk
nltk.data.path.append("../data/external/nltk_data")
from nltk import word_tokenize

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import matplotlib.pyplot as plt
%matplotlib inline

## Loading

In [2]:
# Build the Dropbox client step by step: team client -> rooted at the team
# namespace -> acting as one specific team member.
team_dbx = dropbox.DropboxTeam(credentials.dropbox_team_access_token)
team_namespace = dropbox.common.PathRoot.namespace_id(
    credentials.dropbox_team_namespace_id
)
team_root = team_dbx.with_path_root(team_namespace)
user_dbx = team_root.as_user(credentials.dropbox_team_member_id)

In [2]:
data_path = "/Data/CSVData"
# This is a remote Dropbox path, not a local one: join with "/" explicitly so
# the result does not depend on the local OS separator (os.path.join would).
test_fpath = "/".join((data_path, "TestData", "forSentAnalysis.csv"))

# Download the labelled test set. The original passed the undefined name
# `fpath`, which raises NameError on a fresh kernel.
_, res = user_dbx.files_download(test_fpath)
test_data = pd.read_csv(res.raw)
test_data.shape

(1056, 7)

In [3]:
sentiws_path = "../data/external/SentiWS_v2.0"
positive_fpath = os.path.join(sentiws_path, "SentiWS_v2.0_Positive.txt")
negative_fpath = os.path.join(sentiws_path, "SentiWS_v2.0_Negative.txt")


def load_sentiws(fpath):
    """Read one SentiWS word list into a DataFrame.

    The file is parsed as tab-separated with three columns: ``word|POS``,
    polarity and a comma-separated inflection list. The first column is split
    on ``|`` into separate ``word`` and ``pos`` columns.

    Returns a DataFrame with columns ``word``, ``polarity``, ``pos``,
    ``inflections``.
    """
    entries = pd.read_csv(fpath, sep="\t", names=["word_pos", "polarity", "inflections"])
    entries[["word", "pos"]] = entries["word_pos"].str.split("|", expand=True)
    return entries[["word", "polarity", "pos", "inflections"]]


def count_sentiws_entries(entries):
    """Count base words plus all listed inflections in a SentiWS DataFrame."""
    # Rows without inflections are dropped first so only real lists are counted.
    inflection_lists = entries["inflections"].dropna().str.split(",")
    return entries["word"].count() + inflection_lists.apply(len).sum()


# POSITIVE words
positive = load_sentiws(positive_fpath)
pos_total = count_sentiws_entries(positive)
print("POS total (incl. inflections):", pos_total)

# NEGATIVE words
negative = load_sentiws(negative_fpath)
neg_total = count_sentiws_entries(negative)
print("NEG total (incl. inflections):", neg_total)

POS total (incl. inflections): 16716
NEG total (incl. inflections): 18217


## Preprocessing

SentiWS lexicon

In [4]:
def make_lexicon(polarity_df, lexicon):
    """Add all words and inflections of ``polarity_df`` to ``lexicon``.

    Parameters
    ----------
    polarity_df : pandas.DataFrame
        Must provide the columns ``word``, ``polarity`` and ``inflections``
        (a comma-separated string, or a missing value when there are none).
    lexicon : dict
        Mapping of lowercased word -> polarity. It is updated in place;
        existing keys are overwritten by entries from ``polarity_df``.

    Returns
    -------
    dict
        The same ``lexicon`` object that was passed in.
    """
    columns = zip(polarity_df["word"], polarity_df["polarity"], polarity_df["inflections"])
    for word, polarity, inflections in columns:
        lexicon[word.lower()] = polarity
        # pd.notna covers None and every NaN object; the previous identity
        # check (`is not np.nan`) only recognised the np.nan singleton and
        # crashed on `.split` for any other missing-value marker.
        if pd.notna(inflections):
            for inflection in inflections.split(","):
                lexicon[inflection.lower()] = polarity
    return lexicon

# Negative entries go in first, so a word that appears in both lists ends up
# with its positive polarity (the later call overwrites the earlier key).
lexicon = make_lexicon(negative, {})
lexicon = make_lexicon(positive, lexicon)

Test cleansing and tokenization

In [5]:
def clean_text(text):
    """ Util: Cleans text string.
    > Lowercase string
    > Replace game scores (e.g. "3:1", "2 - 0") with "GAME_SCORE " placeholder
    > Punctuation removal
    > Replace standalone numbers with "NUM" placeholder

    Returns the cleaned string.
    """
    lowercased = text.lower()
    punctuations = string.punctuation + "„" + "”"
    strip_punct = str.maketrans("", "", punctuations)

    # Punctuation is stripped only from the text *between* scores and the
    # placeholder is inserted afterwards. Stripping after insertion (as
    # before) also removed the "_" and turned the token into "GAMESCORE".
    segments = re.split(r"\d+ ?[-:] ?\d+", lowercased)
    scores_replaced = "GAME_SCORE ".join(
        segment.translate(strip_punct) for segment in segments
    )

    num_replaced = re.sub(r"\b\d+\b", "NUM", scores_replaced)

    return num_replaced

# Normalise every raw text with clean_text and keep the result alongside it.
test_data["cleaned_txt"] = test_data["text"].map(clean_text)

In [6]:
# Split each cleaned text into a list of word tokens.
test_data["tokens"] = test_data["cleaned_txt"].apply(word_tokenize)

Filter samples with pos/neu/neg sentiment only

In [7]:
# Numeric rating code -> human-readable label.
ratings_dict = {
    0: "positive",
    10: "neutral",
    20: "negative",
    30: "offensive",
    -2: "notAssessable",
}

# Codes missing from the mapping are kept and simply stringified.
test_data["rating"] = test_data["Rating"].replace(ratings_dict).astype(str)

# Keep only the three sentiment classes that are evaluated.
is_sentiment_rating = test_data["rating"].isin(["positive", "neutral", "negative"])
test_subset = test_data.loc[is_sentiment_rating].copy()
test_subset.shape

(845, 10)

## Sentiment Assignment

Rule-based approach, using SentiWS v2.0.

### Policy: Summing polarity scores

In [8]:
def assign_sentiment_polar(tokens, lexicon):
    """Return the sum of the polarity values of all tokens found in ``lexicon``.

    Tokens that are not lexicon keys contribute nothing, so the result is 0
    when no token matches (or ``tokens`` is empty).
    """
    total = 0

    for token in tokens:
        # Unknown tokens fall back to 0 and leave the running total unchanged.
        total += lexicon.get(token, 0)

    return total

# Score every token list against the shared lexicon.
test_subset["sentiment_score_polar"] = test_subset["tokens"].apply(
    assign_sentiment_polar, args=(lexicon,)
)

#### Classification
Policy: 0 -> neutral (& kW); < 0 -> negative; > 0 -> positive

In [9]:
# Map the summed score to a label by sign; a score of exactly 0 is neutral.
test_subset["sentiment_polar"] = test_subset["sentiment_score_polar"].apply(
    lambda score: "positive" if score > 0 else ("negative" if score < 0 else "neutral")
)

### Policy: Frequency of pos/neg words

In [10]:
def assign_sentiment_freq(tokens, lexicon):
    """Classify a token list by comparing counts of positive and negative words.

    Each token found in ``lexicon`` counts as positive if its polarity is
    > 0 and as negative if it is < 0. Tokens with a polarity of exactly 0
    (or missing from the lexicon) are ignored; previously a zero polarity
    was counted as negative.

    Returns
    -------
    str
        ``"positive"`` if positive words outnumber negative ones,
        ``"negative"`` in the opposite case, ``"neutral"`` on a tie
        (including no matches at all).
    """
    neg, pos = 0, 0

    for word in tokens:
        polarity = lexicon.get(word)
        if polarity is None:
            continue
        if polarity > 0:
            pos += 1
        elif polarity < 0:
            neg += 1

    if pos > neg:
        return "positive"
    elif neg > pos:
        return "negative"
    else:
        return "neutral"

#### Classification
Policy: same count or 0 -> neutral; more pos -> positive; more neg -> negative

In [11]:
# Label every token list by its positive/negative word counts.
test_subset["sentiment_freq"] = test_subset["tokens"].apply(
    assign_sentiment_freq, args=(lexicon,)
)

## Evaluation

In [12]:
# Class order for the evaluation tables. This no longer rebinds `ratings_dict`:
# the chained assignment replaced the code -> label mapping with this list,
# which broke the rating-mapping cell when it was re-run afterwards.
labels = ["positive", "neutral", "negative"]

#### Confusion Matrix

Policy: Summing polarity scores

In [13]:
cm_polar = confusion_matrix(
    y_true=test_subset["rating"],
    y_pred=test_subset["sentiment_polar"],
    labels=labels,
)

# Rows are the true classes (index named "True"), columns the predicted ones.
cm_df_polar = pd.DataFrame(
    cm_polar,
    index=pd.Index(labels, name="True"),
    columns=labels,
)
cm_df_polar

Unnamed: 0_level_0,positive,neutral,negative
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
positive,85,19,34
neutral,118,28,165
negative,89,31,276


Policy: Frequency of pos/neg words

In [14]:
cm_freq = confusion_matrix(
    y_true=test_subset["rating"],
    y_pred=test_subset["sentiment_freq"],
    labels=labels,
)

# Rows are the true classes (index named "True"), columns the predicted ones.
cm_df_freq = pd.DataFrame(
    cm_freq,
    index=pd.Index(labels, name="True"),
    columns=labels,
)
cm_df_freq

Unnamed: 0_level_0,positive,neutral,negative
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
positive,96,23,19
neutral,144,69,98
negative,123,93,180


#### Classification Report

Policy: Summing polarity scores

In [15]:
# Per-class precision/recall/F1 for the polarity-sum policy.
report_polar = classification_report(
    y_true=test_subset["rating"],
    y_pred=test_subset["sentiment_polar"],
    labels=labels,
)
print(report_polar)

              precision    recall  f1-score   support

    positive       0.29      0.62      0.40       138
     neutral       0.36      0.09      0.14       311
    negative       0.58      0.70      0.63       396

    accuracy                           0.46       845
   macro avg       0.41      0.47      0.39       845
weighted avg       0.45      0.46      0.41       845



Policy: Frequency of pos/neg words

In [16]:
# Per-class precision/recall/F1 for the word-frequency policy.
report_freq = classification_report(
    y_true=test_subset["rating"],
    y_pred=test_subset["sentiment_freq"],
    labels=labels,
)
print(report_freq)

              precision    recall  f1-score   support

    positive       0.26      0.70      0.38       138
     neutral       0.37      0.22      0.28       311
    negative       0.61      0.45      0.52       396

    accuracy                           0.41       845
   macro avg       0.41      0.46      0.39       845
weighted avg       0.46      0.41      0.41       845



# Conclusion

Both implemented rule-based approaches perform poorly. The one that sums the SentiWS polarity scores performs slightly better, especially at classifying negative sentiment. Nonetheless, neither approach should be used in production.