In [12]:
import os
import sys

# Repository root: the parent of the directory the notebook is run from
git_dir = os.path.abspath('../')

# Make the modules under lib/GoEmotions-pytorch and lib/utils importable
for lib_subdir in ('GoEmotions-pytorch', 'utils'):
    sys.path.append(os.path.join(git_dir, 'lib', lib_subdir))

# Input (pm-transcripts) and output (pm-transcripts-processed) folders,
# both located under <repo>/data
data_dir = os.path.join(git_dir, 'data')
data_path = os.path.join(data_dir, 'pm-transcripts')
data_output_path = os.path.join(data_dir, 'pm-transcripts-processed')

In [6]:
from transformers import BertTokenizer
from model import BertForMultiLabelClassification
from multilabel_pipeline import MultiLabelPipeline
from pprint import pprint
from ipywidgets import IntProgress
from bs4 import BeautifulSoup as bs
import re
from tqdm import tqdm
import numpy as np 
import pandas as pd
import pkg_resources
from symspellpy import SymSpell, Verbosity
import pickle

### Build model

In [3]:
# Tokenizer and classifier are both loaded from the same pretrained checkpoint
tokenizer = BertTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")
model = BertForMultiLabelClassification.from_pretrained(
    "monologg/bert-base-cased-goemotions-original"
).to('cuda:0')  # move the model to the first CUDA device

# Bundle model + tokenizer into the multi-label pipeline used for inference.
# NOTE(review): threshold is presumably the score cut-off for reporting a label,
# and device=0 presumably selects CUDA device 0 (matching 'cuda:0' above) —
# confirm against multilabel_pipeline.MultiLabelPipeline.
pipeline_kwargs = dict(model=model, tokenizer=tokenizer, threshold=0.3, device=0)
goemotions = MultiLabelPipeline(**pipeline_kwargs)

### Data

In [4]:
from xml_cleaner import get_transcript_fname_by_id, parse_transcript

In [13]:
# Locate the transcript folder and the index CSV inside the raw data directory
ts_path = os.path.join(data_path, 'transcripts')  # folder where pm-transcripts are stored
index_file_path = os.path.join(data_path, 'index.csv')

# Make dataframe from index
index_df = pd.read_csv(index_file_path)

# Make output dir. exist_ok=True replaces the separate exists() check, so the
# cell is safe to re-run and has no check-then-create race.
os.makedirs(data_output_path, exist_ok=True)

### Iterate

In [None]:
# Choose PM (must match the value stored in the index's 'pm' column)
pm_name = 'Howard, John'

# Get all transcript IDs for that PM, as plain Python ints
ts_ids = index_df.loc[index_df['pm'] == pm_name, 'id'].astype(int).tolist()

# Tag every sentence of every transcript and write one pickle per transcript
for ts_id in tqdm(ts_ids):
    ts = parse_transcript(get_transcript_fname_by_id(ts_path, ts_id))
    if ts is None:
        # parse_transcript produced nothing for this ID; skip it
        continue

    # Keep the first element of the pipeline output for each sentence
    # NOTE(review): presumably one result per input string — confirm in MultiLabelPipeline
    ts['emotions'] = [goemotions(sentence)[0] for sentence in ts['sentences']]

    f_out = os.path.join(data_output_path, str(ts_id)+'.pkl')
    with open(f_out, 'wb') as f:
        pickle.dump(ts, f)