In [12]:
import os
import sys

# Add repo
git_dir = os.path.abspath('../')
sys.path.append(os.path.join(git_dir, 'lib', 'GoEmotions-pytorch') )
sys.path.append(os.path.join(git_dir, 'lib', 'utils') )

# Define data path
data_path = os.path.join(git_dir, 'data', 'pm-transcripts')

# Define data output path
data_output_path = os.path.join(git_dir, 'data', 'pm-transcripts-processed')

In [6]:
from transformers import BertTokenizer
from model import BertForMultiLabelClassification
from multilabel_pipeline import MultiLabelPipeline
import re
from tqdm import tqdm
import numpy as np 
import pandas as pd
import pickle

### Build model

In [3]:
tokenizer = BertTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")
model = BertForMultiLabelClassification.from_pretrained("monologg/bert-base-cased-goemotions-original")
model = model.to('cuda:0')

goemotions = MultiLabelPipeline(
    model=model,
    tokenizer=tokenizer,
    threshold=0.3,
    device=0
)

### Data

In [4]:
from xml_cleaner import get_transcript_fname_by_id, parse_transcript

In [13]:
# Make dataframe from index
ts_path = os.path.join(data_path, 'transcripts') # path folder of where pm-transcripts are stored
index_file_path = os.path.join(data_path, 'index.csv')
index_df = pd.read_csv(index_file_path)

# Make output dir
if not os.path.exists(data_output_path):
    os.makedirs(data_output_path)

### Iterate

In [28]:
ts_id

12219

In [35]:
len(ts['sentences'][-4].split())

635

In [40]:
len(tokenizer(ts['sentences'][-4])['input_ids'])

Token indices sequence length is longer than the specified maximum sequence length for this model (866 > 512). Running this sequence through the model will result in indexing errors


868

In [None]:
ts['emotions'] = [goemotions(x)[0] for x in ts['sentences']]

In [41]:
# Choose PM
pm_name = 'Howard, John'

# Get all IDs
ts_ids = list(index_df[index_df['pm']==pm_name]['id'].astype(int))

# Iterate
for i, ts_id in enumerate(tqdm(ts_ids)):
    f_out = os.path.join(data_output_path, str(ts_id)+'.pkl')
    if not os.path.exists(f_out):
        ts = parse_transcript(get_transcript_fname_by_id(ts_path, ts_id))
        if ts is not None:
            ts['emotions'] = []
            for sentence_ind, sentence in enumerate(ts['sentences']):
                tokens = tokenizer(sentence)
                if len(tokens['input_ids'])>500:
                    print('Sentence %d in document %d is too long'%(ts_id, sentence_ind))
                else:
                    ts['emotions'].append(goemotions(sentence)[0])

            with open(f_out, 'wb') as f:
                pickle.dump(ts, f)



  soup = bs(xml_file)
Token indices sequence length is longer than the specified maximum sequence length for this model (866 > 512). Running this sequence through the model will result in indexing errors
 24%|██▍       | 1409/5865 [00:11<00:36, 121.44it/s]

Sentence 12219 in document 71 is too long


 69%|██████▊   | 4022/5865 [1:33:23<28:11,  1.09it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (895 > 512). Running this sequence through the model will result in indexing errors
 69%|██████▊   | 4023/5865 [1:33:38<2:32:33,  4.97s/it]

Sentence 22364 in document 31 is too long


 89%|████████▊ | 5192/5865 [2:14:33<38:23,  3.42s/it]  Token indices sequence length is longer than the specified maximum sequence length for this model (1344 > 512). Running this sequence through the model will result in indexing errors
 89%|████████▊ | 5193/5865 [2:14:54<1:35:58,  8.57s/it]

Sentence 11294 in document 0 is too long


 94%|█████████▍| 5500/5865 [2:25:03<17:14,  2.83s/it]  

Sentence 21168 in document 19 is too long


100%|██████████| 5865/5865 [2:36:58<00:00,  1.61s/it]


### Repeat for all remaining

In [25]:
# Get all IDs
index_df_tmp = index_df.copy().dropna(subset=['id'])

ts_ids = list(index_df_tmp['id'].astype(int))

# Iterate
for i, ts_id in enumerate(tqdm(ts_ids)):
    f_out = os.path.join(data_output_path, str(ts_id)+'.pkl')
    if not os.path.exists(f_out):
        ts = parse_transcript(get_transcript_fname_by_id(ts_path, ts_id))
        if ts is not None:
            ts['emotions'] = [goemotions(x)[0] for x in ts['sentences']]
            with open(f_out, 'wb') as f:
                pickle.dump(ts, f)



  soup = bs(xml_file)
  7%|▋         | 1497/22809 [41:47<17:48:06,  3.01s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (636 > 512). Running this sequence through the model will result in indexing errors
  7%|▋         | 1497/22809 [42:07<9:59:40,  1.69s/it] 


RuntimeError: The size of tensor a (638) must match the size of tensor b (512) at non-singleton dimension 1