In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import pandas as pd
import numpy as np

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Tab-separated test tweets; the path is relative to the notebook's directory.
df = pd.read_csv(
    '../data/raw/task1_test_participant.tsv',
    sep='\t',
)

In [4]:
# Peek at five random rows of the raw file.
df.sample(n=5)

Unnamed: 0,tweet_id,user_id,text,created_at
9588,408479886580252672,1115191412,@orourkelindsay @TC_Big_Pappi stop being so bo...,2013-12-05
24330,413119304805015552,46816774,Dumb ass auto correct,2013-12-18
20094,419997640890150913,1025811572,@briineeee aren't you fucking with her phone p...,2014-01-06
10920,804078781669527552,1143892999,I HATE for somebody to think they know my life...,2016-11-30
25508,415838183268433921,286332078,Controlling the wide swings of extreme emotion...,2013-12-25


In [5]:
# Keep only the two columns the dataset reads by position: text first, tweet_id second.
df = df.loc[:, ['text', 'tweet_id']]

In [6]:
import os
# Display the kernel's current working directory.
os.getcwd()

'/home/zqxh49/Development/phd/meds-classifier/notebooks'

In [7]:
# Work from the project root so that the `model` package and the relative
# `saved/...` path used when loading the checkpoint both resolve.
# The root is derived from the current directory instead of one user's absolute
# home path, so the notebook is not tied to a single machine. The basename
# guard makes the cell safe to re-run after the directory has already changed.
import sys

if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir(os.pardir)
if os.getcwd() not in sys.path:
    sys.path.insert(0, os.getcwd())

# Not referenced by name afterwards; presumably required so that torch.load can
# unpickle the saved model object -- TODO confirm.
from model.model import BERTGRUModel

In [8]:
# SECURITY: torch.load unpickles the file, which can execute arbitrary code.
# Only load checkpoints that come from a trusted source.
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU session still loads on a CPU-only machine.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects whole-model pickles like this one -- confirm against the installed version.
model = torch.load('saved/models/train/model.bin', map_location=device)
# Inference mode: disables dropout / batch-norm updates.
model.eval()
model = model.to(device)

In [9]:
from transformers import BertTokenizer
# Tokenizer for the 'bert-base-uncased' checkpoint; MedsDataset calls
# tokenizer.encode(...) on every tweet.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
max_seq_length = 128
class MedsDataset(Dataset):
    """Medication-mention tweets served as fixed-length token-id tensors.

    Every item is padded with 0 or truncated so that exactly
    ``max_seq_length`` ids are returned. Without the truncation a single long
    tweet produces a longer tensor than its neighbours and the batch cannot be
    stacked.
    """

    def __init__(self, data, num_lines=None, tokenizer=None):
        """
        Args:
            data (pandas.DataFrame): Frame read by position: column 0 is the
                tweet text, column 1 is the tweet id.
            num_lines (int, optional): Keep only the first ``num_lines`` rows.
            tokenizer (optional): Object with an
                ``encode(text, add_special_tokens=True)`` method returning a
                list of token ids. When omitted, the notebook-level
                ``tokenizer`` is used.
        """
        self.data = data
        # Captured once so every item of this dataset uses the same length.
        self.max_seq_length = max_seq_length
        self.tokenizer = tokenizer
        if num_lines:
            self.data = self.data[:num_lines]

    def __len__(self):
        """Number of rows (tweets) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """
            Output:
            text: tensor of ``max_seq_length`` token ids, right-padded with 0
            attention_mask: tensor with 1 where the token id is non-zero, else 0
            tweet_id: value of column 1 for row ``idx``
        """
        tweet_id = self.data.iloc[idx, 1]
        text = self.data.iloc[idx, 0]

        # Looked up lazily: the notebook-level tokenizer is only required
        # when no tokenizer was passed to the constructor.
        encoder = self.tokenizer if self.tokenizer is not None else tokenizer
        tokens = encoder.encode(text, add_special_tokens=True)

        limit = self.max_seq_length
        if len(tokens) > limit:
            # Truncate but keep the final id, which is the closing special
            # token when add_special_tokens=True.
            tokens = tokens[:limit - 1] + tokens[-1:]

        padded = np.array(tokens + [0] * (limit - len(tokens)), dtype=np.int64)
        text = torch.tensor(padded)
        attention_mask = torch.tensor(np.where(padded != 0, 1, 0))
        return text, attention_mask, tweet_id

In [11]:
# Wrap the (text, tweet_id) frame so it can be fed to a DataLoader.
text_dataset = MedsDataset(data=df)

In [12]:
# Sanity check: show the first item (token ids, attention mask, tweet id).
text_dataset[0]

(tensor([  101,  2026,  3566,  7906,  9343,  2033,  1996, 10140,  3367,  3336,
          4253,   999,   999,   100,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [13]:
# Batching settings for inference. No shuffle argument is passed to the loader.
batch_size = 16
num_workers = 3
data = DataLoader(dataset=text_dataset, batch_size=batch_size, num_workers=num_workers)

In [24]:
from tqdm import tqdm

# tweet_id -> predicted class index. A repeated tweet_id overwrites the
# earlier entry.
y_pred = {}
with tqdm(unit_scale=0, unit='lines', total=len(text_dataset)) as t:
    # Inference only: no gradients are needed for any batch.
    with torch.no_grad():
        for text, attention_mask, tweet_id in data:
            input_ids, input_mask = text.to(device), attention_mask.to(device)
            output = model(input_ids, input_mask)

            # argmax over dim 1 -- assumes output is (batch, n_classes);
            # TODO confirm against BERTGRUModel.
            predictions = output.argmax(dim=1).cpu().tolist()
            tweet_ids = tweet_id.tolist()

            # No loop variable named `id`: in a notebook it would shadow the
            # builtin for the rest of the kernel session.
            y_pred.update(zip(tweet_ids, predictions))

            # Advance the bar only after the batch has actually been scored.
            t.update(len(tweet_ids))

100%|██████████| 29687/29687 [04:40<00:00, 105.83lines/s]


In [48]:
# One row per (tweet_id, predicted class) pair collected during inference.
prediction_pairs = list(y_pred.items())
df1 = pd.DataFrame(prediction_pairs)

In [60]:
# Name the two columns; 'tweet_id' and 'Class' become the header of the
# prediction file written at the end.
df1.columns = ['tweet_id', 'Class']

In [62]:
# How many tweets were assigned to each predicted class.
df1['Class'].value_counts()

0    29636
1       51
Name: Class, dtype: int64

In [63]:
# Rows whose predicted class is 1.
positive_df = df1.loc[df1['Class'].eq(1)]

In [64]:
# Five random predicted-positive rows.
positive_df.sample(n=5)

Unnamed: 0,tweet_id,Class
25064,807121516722876416,1
7626,694689193801322496,1
10787,745425826221559808,1
14946,783340601550868480,1
25494,805810644322779136,1


In [65]:
# Look up the original text of every tweet predicted as class 1.
is_positive = df['tweet_id'].isin(positive_df['tweet_id'])
df.loc[is_positive]

Unnamed: 0,text,tweet_id
541,"@rowdyinc speedy recovery, but enjoy the paink...",708262486152237060
2020,Can't remember if I took my prenatal vitamin t...,398890313646755840
3013,My fetus is already doing this really cute kic...,391092659948756993
3240,@_GetYoFREAKOn_ no problem bae lmao I'm too mu...,439937558328250368
3358,I hate the fact that I can no longer drink any...,394779348190101504
7626,Why do you feel that? I guarantee you Type 1 d...,694689193801322496
7651,The thing I miss most about pre-pregnancy life...,373196416295780352
9186,Brill day.... F&amp;B's for breaky then the ga...,635515148170579968
9467,@KendalBrielle They been giving me Percocets a...,827257137860530178
9557,"Me: josh, how are you still alive? How do you ...",879831979474251776


In [66]:
# Write the predictions as a tab-separated file without the index column.
df1.to_csv(
    'prediction_task1.tsv',
    sep='\t',
    index=False,
)