In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%load_ext autotime

time: 302 µs


In [3]:
%cd ..

/Users/rubenbroekx/Documents/Projects/twitter-sentiment-classifier/twitter_sentiment_classifier
time: 1.81 ms


# Datasets

This notebook makes the data training ready. It will split the data into `training`, `validation`, and `testing` datasets as well.

**TODO: Split on similarity here?**

In [4]:
import json
import os
import pandas as pd

from store.loader import fetch_all_tweet_data
from tqdm import tqdm
from collections import Counter

time: 381 ms


In [5]:
# Fetch all tweet data from S3 (skips automatically if this is already the case)
fetch_all_tweet_data()

time: 446 µs


In [6]:
with open(os.path.expanduser('store/data/tweets_annotated.jsonl'), 'r') as f:
    annotations = [json.loads(line) for line in f.readlines()]
print(f"Loaded in {len(annotations)} annotations")

Loaded in 51576 annotations
time: 726 ms


## 1. Similarity sorting

Create a sorting within the tweets based on similarity to other tweets, using the MUSE embeddings of the previous notebook to help define similarity. 

A general assumption when training Deep Learning model, for which our Transformer model is no exception, is that the more unique the data samples are, the more they contribute during the model's training. Hence, the tweets with the most unique sentence embeddings are more likely to be used during training.

**Note: This step may take a while (~5min). Only necessary to run if `tweets_sorted.jsonl` does not yet exists!**

In [7]:
# Number of neighbours with which the sentence compares itself (to define similarity score)
N_NEIGHBOURS = 10
assert N_NEIGHBOURS < 512
assert N_NEIGHBOURS > 5  # At least the size of N_ANNOTATORS

time: 515 µs


In [8]:
import tensorflow_hub as hub
import tensorflow_text
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

time: 3.62 s


In [9]:
# Use only the accepted tweets (accept==True) with agreement (agreement==True) for training
annotations_accepted = [a for a in annotations if a['accept'] and a['agreement']]
print(f"Total of {len(annotations_accepted)} usable tweets")

Total of 47096 usable tweets
time: 20.3 ms


In [10]:
# Encode the tweets using the MUSE embeddings (from previous notebook)
model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

path_to_embedding = Path(os.path.expanduser('store/muse/'))
if not os.path.exists(path_to_embedding): os.makedirs(path_to_embedding)
already_created = {x.stem for x in path_to_embedding.glob("*.npz")}

def encode(sample):
    """Encode the text."""
    tweet_id = str(sample['id'])
    if tweet_id in already_created: 
        return np.load(path_to_embedding / (tweet_id + '.npz'))['arr_0'][0]
    else:
        embeddings_ar = model(sample['text']).numpy()
        np.savez_compressed(path_to_embedding / tweet_id, embeddings_ar, allow_pickle=True)
        already_created.add(tweet_id)
        return embeddings_ar[0]

time: 5.84 s


In [11]:
embeddings = []
for sample in tqdm(annotations_accepted, desc="Encoding"):
    embeddings.append(encode(sample))
assert len(annotations_accepted) == len(embeddings)

Encoding: 100%|██████████| 47096/47096 [00:25<00:00, 1838.21it/s]

time: 25.6 s





In [None]:
# Create a similarity-matrix on the raw embeddings
similarity = cosine_similarity(embeddings)
print("Created similarity matrix of shape:", similarity.shape)

# Put similarity with oneself to zero
np.fill_diagonal(similarity, 0)

# Constraint the similarity matrix to be between zero and one
similarity = np.clip(similarity, a_min=0, a_max=1)

# Sort the similarity-matrix (row-wise)
print("Sorting the matrix, this may take a while...")
similarity_sorted = np.sort(similarity, axis=1, kind='heapsort')  # O(n*log(n))
print(" --> Done")

In [None]:
# For every embedding, calculate its similarity to its N_NEIGHBOURS closest neighbours
similarity_scores = []
for i in tqdm(range(len(embeddings)), desc='Calculating similarity'):
    similarity_scores.append(sum(similarity_sorted[i, -N_NEIGHBOURS:]))
assert len(similarity_scores) == len(embeddings)

In [None]:
# Bucket the similarity scores and visualise them (fitted on N_NEIGHBOURS==10)
counter = Counter()
for score in similarity_scores:
    counter[round(score, 1)] += 1
    
# Bar plot
values, height = zip(*sorted(counter.items()))
plt.figure(figsize=(15,5))
plt.bar(values, height=height, width=0.08, zorder=2)  # Put on top of regions

# Adjust axis
plt.xlim(0, 10)
plt.ylim(0, 3000)

# highlight regions
plt.axvspan(0, 3, color='green', alpha=0.2)
plt.text(1.5, 2750, 'Highly unique', fontsize=12, horizontalalignment='center')

plt.axvspan(3, 6, color='yellow', alpha=0.2)
plt.text(4.5, 2750, 'Regular', fontsize=12, horizontalalignment='center')

plt.axvspan(6, 8, color='orange', alpha=0.2)
plt.text(7, 2750, 'Mostly similar', fontsize=12, horizontalalignment='center')

plt.axvspan(8, 10, color='red', alpha=0.2)
plt.text(9, 2750, 'Near complete duplicates', fontsize=12, horizontalalignment='center')
plt.show()

In [None]:
# Sort the annotations with the most unique (i.e. those with the lowest score) first
annotations_sorted = [a for _, a in sorted(zip(similarity_scores, annotations_accepted), key=lambda x: x[0])]

In [None]:
# Store the sorted data
with open(os.path.expanduser('store/data/tweets_sorted.jsonl'), 'w') as f:
    f.write('\n'.join([json.dumps(a) for a in annotations_sorted]) + '\n')

## 2. Dataset split

Split the dataset into `training`, `validation` and `test` sets.

In [None]:
from typing import List

In [None]:
def give_overview(samples):
    """Give an overview of the samples."""
    print("Dataset overview:")
    print(f" - Total of {len(samples)} samples")
    
    annotators = [s['annotator'] for s in samples]
    print(" - Annotators:")
    for a in sorted(set(annotators)):
        print(f"   - {a}: {annotators.count(a)}")
        
    labels = [s['label'] for s in samples]
    print(" - Labels:")
    for l in sorted(set(labels)):
        print(f"   - {l}: {labels.count(l)}")

In [None]:
# Load in the sorted tweets
with open(os.path.expanduser('store/data/tweets_sorted.jsonl'), 'r') as f:
    annotations = [json.loads(line) for line in f.readlines()]
print(f"Loaded in {len(annotations)} annotations")

In [None]:
# Put in DataFrame
df = pd.DataFrame(
    annotations
)

# Use tweet-ID as index
df.set_index('id', inplace=True)

# Keep only columns relevant for training
#  Note: All tweets_sorted are accepted (accept==True) and do agree (agreement=True)
df = df[['text', 'label', 'annotator', 'flag']]

# Remove duplicate rows
#  Note: Same tweets (flagged) annotated by multiple annotators are kept 
df.drop_duplicates(inplace=True)

# Give overview of the data
print(f"Total size of {len(df)}")
df.head()

### 2.1. Test

This part of the section creates a testing dataset.

For each sentiment - this being `POSITIVE`, `NEUTRAL`, and `NEGATIVE` - we collect `SIZE_SENTIMENT` samples to create the test dataset.

Select only the most reliable annotators (`TRUSTED_ANNOTATORS`) to be used to create the test-set.

In [None]:
# The total size of the test dataset
TEST_SIZE = 3000

# Number of samples of each sentiment included in the test-set
TEST_SIZE_SENTIMENT = TEST_SIZE // 3

# Select the trusted annotators
TRUSTED_ANNOTATORS = ['H']

In [None]:
# Extract all samples annotated by TRUSTED_ANNOTATORS
df_t = df[df.annotator.isin(TRUSTED_ANNOTATORS)]
print(f"Total of {len(df_t)} samples annotated by", TRUSTED_ANNOTATORS)
df_t.head()

In [None]:
# Sample the first TEST_SIZE_SENTIMENT tweets of each sentiment
def sample_sentiment(sentiment) -> List[int]:
    """Sample TEST_SIZE_SENTIMENT annotated tweet IDs for the given sentiment."""
    # Only consider the samples of the correct sentiment
    df_t_s = df_t[df_t.label == sentiment]
    
    # Sample tweets, don't consider order of tweets (ignore sorting bias)
    return df_t_s.sample(n=TEST_SIZE_SENTIMENT).index

In [None]:
# Sample
idx_positive = sample_sentiment('POSITIVE')
idx_neutral = sample_sentiment('NEUTRAL')
idx_negative = sample_sentiment('NEGATIVE')

# Check; no overlap in IDs possible
idx_test =  set(idx_positive) | set(idx_neutral) | set(idx_negative)
assert len(idx_test) == 3*TEST_SIZE_SENTIMENT 

In [None]:
# Collect all the samples used for testing
annotations_test = []
added_idx = set()
for a in annotations:
    if a['id'] in added_idx: continue
    if a['annotator'] != 'H': continue
    if a['id'] not in idx_test: continue
    added_idx.add(a['id'])
    annotations_test.append(a)
    
assert len(annotations_test) == 3*TEST_SIZE_SENTIMENT
assert len([a for a in annotations_test if a['label'] == 'POSITIVE']) == TEST_SIZE_SENTIMENT
assert len([a for a in annotations_test if a['label'] == 'NEUTRAL']) == TEST_SIZE_SENTIMENT
assert len([a for a in annotations_test if a['label'] == 'NEGATIVE']) == TEST_SIZE_SENTIMENT

In [None]:
# Give an overview of the testing dataset
give_overview(annotations_test)

In [None]:
# Store the test-set
with open(os.path.expanduser('store/data/tweets_test.jsonl'), 'w') as f:
    f.write('\n'.join([json.dumps(a) for a in annotations_test]) + '\n')

## 2.2. Validation and training

Create the validation and training set by splitting up the remaining dataset.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Collect the remaining samples first
already_added = set()
def generate_key(sample):
    """Generate sample key."""
    return f"{sample['id']}-{sample['annotator']}-{sample['label']}"

# Check which already added to test-set
for a in annotations_test:
    already_added.add(generate_key(a))
assert len(already_added) == 3*TEST_SIZE_SENTIMENT

keys_remaining = set()
annotations_remaining = []
for a in annotations:
    if generate_key(a) in already_added: continue  # a | b is a time consuming operation
    if generate_key(a) in keys_remaining: continue
    keys_remaining.add(generate_key(a))
    annotations_remaining.append(a)
assert len(annotations_remaining) == len(keys_remaining)
print("Total samples remaining for train and validation:", len(keys_remaining))

In [None]:
# Count labels (used to balance dataset remainder)
n_pos = sum([s['label'] == 'POSITIVE' for s in annotations_remaining])
print("Number of positive:", n_pos)
n_neu = sum([s['label'] == 'NEUTRAL' for s in annotations_remaining])
print("Number of neutral:", n_neu)
n_neg = sum([s['label'] == 'NEGATIVE' for s in annotations_remaining])
print("Number of negative:", n_neg)

In [None]:
# Enforce the neutral count to be below a certain threshold
MAX_NEUTRAL = 13_500

# Lower the neutral annotations by pruning off the lowest-sorted samples
annotations_pruned = []
neutral_count = 0
for a in annotations_remaining:
    if a['label'] == 'NEUTRAL':
        if neutral_count >= MAX_NEUTRAL: 
            keys_remaining.remove(generate_key(a))
            continue
        neutral_count += 1
    annotations_pruned.append(a)
assert len(annotations_pruned) == len(keys_remaining)

# Replace the current annotations with the pruned version
annotations_remaining = annotations_pruned

In [None]:
# Show updated label count
n_pos = sum([s['label'] == 'POSITIVE' for s in annotations_remaining])
print("Number of positive:", n_pos)
n_neu = sum([s['label'] == 'NEUTRAL' for s in annotations_remaining])
print("Number of neutral:", n_neu)
n_neg = sum([s['label'] == 'NEGATIVE' for s in annotations_remaining])
print("Number of negative:", n_neg)

In [70]:
# Split the keys in a training and validation set (use keys since shuffle==True)
#  Use 4000 for training set and remaining for validation
keys_train, keys_val = train_test_split(list(keys_remaining), test_size=0.03)  # Little over 1000 val samples
assert len(set(keys_train) & set(keys_val)) == 0

# Assign correct (sorted) annotations
annotations_train = []
annotations_val = []
for a in tqdm(annotations_remaining, desc="Assigning split..."):
    if generate_key(a) in keys_train:
        annotations_train.append(a)
    elif generate_key(a) in keys_val:
        annotations_val.append(a)
    else:
        raise Exception(f"Invalid key {generate_key(a)}")
assert len(annotations_train) == len(keys_train)
assert len(annotations_val) == len(keys_val)

print("Total training samples:", len(annotations_train))
print("Total validation samples:", len(annotations_val))

Assigning split...: 100%|██████████| 39921/39921 [00:16<00:00, 2479.03it/s]

Total training samples: 38723
Total validation samples: 1198
time: 16.1 s





In [71]:
# Give an overview of the training dataset
give_overview(annotations_train)

Dataset overview:
 - Total of 38723 samples
 - Annotators:
   - A: 8802
   - H: 13174
   - I: 5610
   - O: 4899
   - R: 1528
   - S: 4710
 - Labels:
   - NEGATIVE: 12912
   - NEUTRAL: 13081
   - POSITIVE: 12730
time: 42.6 ms


In [72]:
# Give an overview of the validation dataset
give_overview(annotations_val)

Dataset overview:
 - Total of 1198 samples
 - Annotators:
   - A: 285
   - H: 374
   - I: 172
   - O: 164
   - R: 53
   - S: 150
 - Labels:
   - NEGATIVE: 395
   - NEUTRAL: 419
   - POSITIVE: 384
time: 2.28 ms


In [73]:
# Store the training-set
with open(os.path.expanduser('store/data/tweets_train.jsonl'), 'w') as f:
    f.write('\n'.join([json.dumps(a) for a in annotations_train]) + '\n')
    
# Store the validation-set
with open(os.path.expanduser('store/data/tweets_val.jsonl'), 'w') as f:
    f.write('\n'.join([json.dumps(a) for a in annotations_val]) + '\n')

time: 486 ms
