# Sentiment Labeling Exploration

Getting to know sentiment labels. Especially to check consistency of labels.

In [1]:
import os, sys, re, string, pickle, random
sys.path.append("..")
from config import credentials
import dropbox


import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import confusion_matrix, classification_report
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
random_state = 23
random.seed(random_state)

## Loading

In [3]:
team_dbx = dropbox.DropboxTeam(credentials.dropbox_team_access_token)
team_root = team_dbx.with_path_root(dropbox.common.PathRoot.namespace_id(
    credentials.dropbox_team_namespace_id))
user_dbx = team_root.as_user(credentials.dropbox_team_member_id)

data_path = "/Data/CSVData"
fpath = os.path.join(data_path, "TestData", "forSentAnalysis.csv")

_, res = user_dbx.files_download(fpath)
labeled_data = pd.read_csv(res.raw)
labeled_data.shape

(1342, 7)

Rating schema: {0: "positive", 10: "neutral", 20: "negative", 30: "offensive", -2: "notAssessable"}

## Preprocessing

Dataset cleansing

In [4]:
# Remove unnecessary cols and rename them uniformly
labeled_data.drop(["id", "Level", "Topic", "sentiment"], axis=1, inplace=True)
labeled_data.columns = ["source", "text", "rating"]

# Remove not assessable
labeled_data = labeled_data[labeled_data.rating != -2]

# Remove empty texts
labeled_data.text.replace("", np.nan, inplace=True)
labeled_data.dropna(subset=["text"], inplace=True)
labeled_data = labeled_data[~labeled_data.text.str.isspace()]

# Remove duplicated texts
labeled_data.drop_duplicates(subset=["text"], keep="first", inplace=True)

# Remap rating labels
new_rating_schema = {0: 0, 10: 0, 20: 1, 30: 1}
labeled_data.rating = labeled_data.rating.map(new_rating_schema)

labeled_data.shape

(1053, 3)

New rating schema: {0: "non-negative", 1: "negative"}

In [5]:
labeled_data.rating.value_counts()

0    546
1    507
Name: rating, dtype: int64

### Vectorizer

In [6]:
from spacy_tokenize import spacy_tokenizer, plain_tokenizer

count_vect = CountVectorizer(tokenizer=plain_tokenizer)

X = count_vect.fit_transform(labeled_data.text)

In [7]:
tfidf = TfidfVectorizer(tokenizer=plain_tokenizer)
tfidf.fit(labeled_data.text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function plain_tokenizer at 0x7fa3252a0290>,
                use_idf=True, vocabulary=None)

## Evaluation

### Clustering

In [8]:
kmeans = KMeans(n_clusters=2)
kmeans.fit(X)

labeled_data["kmeans"] = kmeans.predict(X)

In [9]:
confusion_matrix(labeled_data.rating, labeled_data.kmeans)

array([[538,   8],
       [496,  11]])

In [10]:
print(classification_report(labeled_data.rating, labeled_data.kmeans))

              precision    recall  f1-score   support

           0       0.52      0.99      0.68       546
           1       0.58      0.02      0.04       507

    accuracy                           0.52      1053
   macro avg       0.55      0.50      0.36      1053
weighted avg       0.55      0.52      0.37      1053



### Topic Modeling

In [11]:
lda = LatentDirichletAllocation(n_components=2, random_state=random_state)

topic_distr = lda.fit_transform(X)

labeled_data["topic"] = topic_distr.argmax(axis=1)

In [12]:
confusion_matrix(labeled_data.rating, labeled_data.topic)

array([[200, 346],
       [229, 278]])

In [13]:
print(classification_report(labeled_data.rating, labeled_data.topic))

              precision    recall  f1-score   support

           0       0.47      0.37      0.41       546
           1       0.45      0.55      0.49       507

    accuracy                           0.45      1053
   macro avg       0.46      0.46      0.45      1053
weighted avg       0.46      0.45      0.45      1053



### Similarity

In [14]:
X_nonnegative = labeled_data[labeled_data.rating == 0]
X_negative = labeled_data[labeled_data.rating == 1]

tfidf_nonnegative = tfidf.transform(X_nonnegative.text)
tfidf_negative = tfidf.transform(X_negative.text)

In [15]:
cos_similarity = np.dot(tfidf_negative, tfidf_nonnegative.T).A
avg_sim = np.diag(cos_similarity).mean()

cos_similarity_neg = np.dot(tfidf_negative, tfidf_negative.T).A
cos_similarity_nonneg = np.dot(tfidf_nonnegative, tfidf_nonnegative.T).A

neg_avg_sim = np.tril(cos_similarity_neg, -1).mean()
nonneg_avg_sim = np.tril(cos_similarity_nonneg, -1).mean()

print("Avg sim of all posts:", avg_sim)
print("Avg sim negative posts:", neg_avg_sim)
print("Avg sim non-negative posts:", nonneg_avg_sim)

Avg sim of all posts: 0.032144036401833546
Avg sim negative posts: 0.018353174984894714
Avg sim non-negative posts: 0.015136336853201173
