In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Load the sentiment annotations into a DataFrame; the CSV path comes from the project config.
from src.config import SENTIMENT_ANNOTATIONS_CSV

df = pd.read_csv(filepath_or_buffer=SENTIMENT_ANNOTATIONS_CSV)

## Poking Around the Data

### Basics

In [None]:
# Preview the first five rows to see what the data looks like.
df.head(n=5)

In [None]:
# How many rows and columns? `shape` is a (row_count, column_count) tuple.
df.shape

### Distributions

In [None]:
# Bar chart of the number of rows contributed by each data source.
(
    df["data_source"]
    .value_counts()
    .plot.bar()
)
plt.show()

In [None]:
# Bar chart of how often each sentiment_output value occurs overall.
(
    df["sentiment_output"]
    .value_counts()
    .plot.bar()
)
plt.show()

In [None]:
# Same sentiment_output counts, now broken down within each data_source
# (one bar per (data_source, sentiment_output) pair).
(
    df.groupby("data_source")["sentiment_output"]
    .value_counts()
    .plot.bar()
)
plt.show()

### Title and Entry Descriptions

In [None]:
# Takeaway from the plots: Reddit is full of pessimists (no surprise there), while
# the app store and gplay are slightly positive and have similar distributions.
# Next: what fraction of the titles is NaN? The mean of the boolean mask is that fraction.
df["title"].isna().mean().round(2)

In [None]:
# About a quarter of the titles are missing, so the title could probably still be
# used to help with sentiment analysis.
# Does the entry column contain any NaNs?
(
    df["entry"]
    .isna()
    .sum()
)

In [None]:
# No missing entries, which is good.
# Summary statistics (mean, standard deviation, min, max, ...) of the entry length in characters.
(
    df["entry"]
    .str.len()
    .describe()
)

In [None]:
# Look at entry length in tokens rather than characters.
# We use spaCy's small English pipeline, which is a good default tokenizer for English.
import spacy

nlp = spacy.load("en_core_web_sm")

# nlp.pipe processes the texts as a batched stream and yields one Doc per input, in
# input order. It produces the same Docs as calling nlp(text) row by row, without the
# per-call overhead.
df["entry_doc"] = list(nlp.pipe(df["entry"]))  # one spaCy Doc per entry
df["entry_token_count"] = df["entry_doc"].map(len)  # len(Doc) is its token count
df["entry_token_count"].describe()  # summary stats of tokens per entry

In [None]:
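# We'll have to watch out for that max value, but otherwise the lengths look short enough
# to pass into a transformer model like BERT, whose maximum input length is usually 512 tokens.
# Note that these are spaCy tokens, not the model's subword tokens, so the real counts will be somewhat higher.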

## Baseline Accuracy

In [None]:
# Accuracy of the baseline sentiment model: the sum of the "correct?" column
# divided by the total number of rows.
df["correct?"].sum() / len(df)

In [None]:
# 0.69 isn't bad, but a per-class breakdown gives more insight into the errors.
from sklearn.metrics import classification_report

# NOTE(review): sentiment_output is passed as the ground truth (y_true) and
# annotated_sentiment as the prediction (y_pred). Confirm that this matches the
# meaning of the two columns; swapping them would swap precision and recall.
print(
    classification_report(
        y_true=df["sentiment_output"],
        y_pred=df["annotated_sentiment"],
    )
)

In [None]:
# The report shows the model struggling with the neutral class, so pull out the rows
# whose sentiment_output is NEUTRAL but whose annotated_sentiment is something else.
df_neutral = df.loc[df["sentiment_output"].eq("NEUTRAL")]
df_neutral_inc = df_neutral.loc[df_neutral["annotated_sentiment"].ne("NEUTRAL")]
df_neutral_inc.head()

In [None]:
# The rows previewed above are all incorrectly labeled as negative.
# Plot how annotated_sentiment is distributed over all of the neutral mismatches.
(
    df_neutral_inc["annotated_sentiment"]
    .value_counts()
    .plot.bar()
)
plt.show()

In [None]:
# Okay, so in the incorrect cases, the model is mostly predicting negative.
# Print the text and annotated_sentiment of the first few neutral mismatches.
# head(5) yields at most five rows, so this also works when fewer than five
# mismatches exist (indexing iloc[0..4] directly would raise IndexError).
for i, (entry, predicted) in enumerate(
    df_neutral_inc[["entry", "annotated_sentiment"]]
    .head(5)
    .itertuples(index=False, name=None),  # plain (entry, annotated_sentiment) tuples
    start=1,
):
    print(f"{i}. {entry}")
    print(f"Predicted: {predicted}")
    print()

In [None]:
# In my opinion, the model is doing a pretty good job of predicting negative sentiment,
# because some of these entries are subjectively negative.
# Anyway, let's benchmark performance against a few other models in the next notebook.