This file processes the provided data `tweets.csv`. It is downloaded from https://dataverse.harvard.edu/dataset.xhtml?id=3047332.

In [5]:
import pandas as pd

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModel, AutoTokenizer

In [7]:
tweets_df = pd.read_csv("data/tweets.csv")
tweets_df

Unnamed: 0,author,content,country,date_time,id,language,latitude,longitude,number_of_likes,number_of_shares
0,katyperry,Is history repeating itself...?#DONTNORMALIZEH...,,12/01/2017 19:52,8.196330e+17,en,,,7900,3472
1,katyperry,@barackobama Thank you for your incredible gra...,,11/01/2017 08:38,8.191010e+17,en,,,3689,1380
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,,11/01/2017 02:52,8.190140e+17,en,,,10341,2387
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,,11/01/2017 02:44,8.190120e+17,en,,,10774,2458
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,,10/01/2017 05:22,8.186890e+17,en,,,17620,4655
...,...,...,...,...,...,...,...,...,...,...
52537,ddlovato,Life couldn't be better right now. 😊,,06/01/2015 23:10,5.526030e+17,en,,,32799,23796
52538,ddlovato,First Monday back in action. I'd say 21.6 mile...,,06/01/2015 02:17,5.522880e+17,en,,,21709,12511
52539,ddlovato,"Crime shows, buddy, snuggles = the perfect Sun...",,05/01/2015 03:42,5.519470e+17,en,,,25269,15583
52540,ddlovato,❄️ http://t.co/sHCFdPpGPa,,05/01/2015 00:06,5.518920e+17,und,,,15985,10456


In [8]:
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoji
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [9]:
tweets = list(tweets_df['content'][:100])

In [10]:
inputs = tokenizer(tweets, return_tensors="pt", padding=True, truncation=True)
labels = torch.tensor(len(tweets_df['author'][:100])).unsqueeze(0)

In [11]:
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
# input_ids = torch.tensor([tokenizer.encode(tweets[0])])

In [12]:
with torch.no_grad():
    features = bertweet(inputs.input_ids)  # Models outputs are now tuples

In [13]:
features.pooler_output.shape

torch.Size([100, 768])

## Create Dataset and DataLoader

In [20]:
class TweetDataset(Dataset):
    def __init__(self, df, size=100):
        
        self.tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
        self.bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

        self.inputs = tokenizer(list(df['content'][:size]), return_tensors="pt", padding=True, truncation=True)
        self.authors = pd.get_dummies(df['author']).loc[:size].values
        
        # Run BERT forward pass to get embeddings
        with torch.no_grad():
            features = bertweet(self.inputs.input_ids)
            
        self.embeddings = features.pooler_output
        self.labels = torch.tensor(self.authors).float()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        
        x = self.embeddings[idx]
        y = self.labels[idx]
        
        return x, y

In [21]:
ds = TweetDataset(tweets_df, size=10)

emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoji
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [32]:
ds[0][0].shape[0] == 768
ds[0][1].shape[0] == 20

IndexError: tuple index out of range