This was run on Google Colab with a GPU. It shows a more state-of-the-art approach to this problem.

In [5]:
import torch

In [6]:
torch.cuda.is_available()

True

In [7]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving tweets.csv to tweets.csv
User uploaded file "tweets.csv" with length 7823952 bytes


In [8]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.11.1-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.3 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 48.7 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 45.6 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.5 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: Py

In [9]:
import math

import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModel, AutoTokenizer

In [10]:
tweets_df = pd.read_csv("tweets.csv")


In [11]:
tweets_df

Unnamed: 0,author,content,country,date_time,id,language,latitude,longitude,number_of_likes,number_of_shares
0,katyperry,Is history repeating itself...?#DONTNORMALIZEH...,,12/01/2017 19:52,8.196330e+17,en,,,7900,3472
1,katyperry,@barackobama Thank you for your incredible gra...,,11/01/2017 08:38,8.191010e+17,en,,,3689,1380
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,,11/01/2017 02:52,8.190140e+17,en,,,10341,2387
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,,11/01/2017 02:44,8.190120e+17,en,,,10774,2458
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,,10/01/2017 05:22,8.186890e+17,en,,,17620,4655
...,...,...,...,...,...,...,...,...,...,...
52537,ddlovato,Life couldn't be better right now. 😊,,06/01/2015 23:10,5.526030e+17,en,,,32799,23796
52538,ddlovato,First Monday back in action. I'd say 21.6 mile...,,06/01/2015 02:17,5.522880e+17,en,,,21709,12511
52539,ddlovato,"Crime shows, buddy, snuggles = the perfect Sun...",,05/01/2015 03:42,5.519470e+17,en,,,25269,15583
52540,ddlovato,❄️ http://t.co/sHCFdPpGPa,,05/01/2015 00:06,5.518920e+17,und,,,15985,10456


In [12]:
unique_people = tweets_df['author'].unique()
print(unique_people)
NUM_CLASSES = len(unique_people)

# assign each person a number
id_to_person = {i: unique_people[i] for i in range(len(unique_people))}
person_to_id = {v:k for k,v in id_to_person.items()}

# create a column of author ids
tweets_df['author_id'] = tweets_df['author'].apply(lambda x: person_to_id[x])

['katyperry' 'justinbieber' 'taylorswift13' 'BarackObama' 'rihanna'
 'YouTube' 'ladygaga' 'TheEllenShow' 'Twitter' 'jtimberlake'
 'KimKardashian' 'britneyspears' 'Cristiano' 'selenagomez' 'cnnbrk'
 'jimmyfallon' 'ArianaGrande' 'shakira' 'instagram' 'ddlovato']


In [13]:
!pip install simpletransformers
!pip install tensorboardx

Collecting simpletransformers
  Downloading simpletransformers-0.62.0-py3-none-any.whl (230 kB)
[K     |████████████████████████████████| 230 kB 5.4 MB/s 
[?25hCollecting wandb>=0.10.32
  Downloading wandb-0.12.2-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 35.5 MB/s 
Collecting streamlit
  Downloading streamlit-0.89.0-py2.py3-none-any.whl (8.3 MB)
[K     |████████████████████████████████| 8.3 MB 39.9 MB/s 
Collecting datasets
  Downloading datasets-1.12.1-py3-none-any.whl (270 kB)
[K     |████████████████████████████████| 270 kB 24.3 MB/s 
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.9 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 41.6 MB/s 
Collecting configparser>=3.8.1
  Downloading configparser-5.0.2-py3-none-any.whl (19 kB)
Collecting patht

Collecting tensorboardx
  Downloading tensorboardX-2.4-py2.py3-none-any.whl (124 kB)
[K     |████████████████████████████████| 124 kB 5.5 MB/s eta 0:00:01
Installing collected packages: tensorboardx
Successfully installed tensorboardx-2.4


In [14]:
from simpletransformers.classification import ClassificationModel


In [27]:
# We only do 2 epochs. More will result in a better accuracy.
model = ClassificationModel('roberta', 'roberta-base', use_cuda=True, num_labels=NUM_CLASSES, args={
    'train_batch_size': 64,
    'num_train_epochs': 2,
    'max_seq_length': 128,
    'learning_rate': 2e-5,
})

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [22]:

np.random.seed(7)

# first column should be tweet, second column should be label
df_full = tweets_df[['content', 'author_id']].sample(frac=1.0)

n_train = 10000
n_test = 10000

df_train = df_full[:n_train]
df_test = df_full[n_train:n_train+n_test]


In [47]:
model.train_model(df_train)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/157 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm


(157, 1.7726426823124004)

In [48]:
raw_outputs_train = model.eval_model(df_train)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/10000 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1250 [00:00<?, ?it/s]

In [49]:
# train acc
(np.argmax(raw_outputs_train[1], axis=1) == df_train['author_id'].values).mean()

0.6135

In [50]:
raw_outputs_test = model.eval_model(df_test)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/10000 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1250 [00:00<?, ?it/s]

In [51]:
# test acc
(np.argmax(raw_outputs_test[1], axis=1) == df_test['author_id'].values).mean()

0.5797