NOTE: This notebook was written for, and tested on, Google Colaboratory.

In [None]:
!pip3 install simpletransformers

Installing apex directly with pip causes a conflict under Python 3. The cells
below work around the issue by building apex from source, following
https://stackoverflow.com/questions/57284345/how-to-install-nvidia-apex-on-google-colab

In [None]:
%%writefile setup.sh
# Build and install NVIDIA apex from source with its C++ and CUDA extensions.
# Abort at the first failing command so a failed clone is not followed by a
# confusing pip error.
set -e

# Clone only when the checkout is missing, so the script can be re-run safely.
if [ ! -d apex ]; then
  git clone https://github.com/NVIDIA/apex
fi
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

In [None]:
# Install apex by running the setup.sh script written to disk by the %%writefile cell
!sh setup.sh

In [None]:
from simpletransformers.classification import ClassificationModel
import pandas as pd
import logging
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
import re
from sklearn.metrics import classification_report

In [None]:
# Limit the 'transformers' logger to WARNING and above so its routine messages stay out of the output
transformers_logger = logging.getLogger('transformers')
transformers_logger.setLevel(logging.WARNING)
# Basic logging setup at INFO level for everything else
logging.basicConfig(level=logging.INFO)

In [None]:
def getCleanReview(review):
  """Return the plain text of one raw review.

  Strips HTML markup, removes backslash escape characters, and drops the pair
  of double quotes wrapping the text when both are present.

  Args:
    review: raw review string as read from the TSV file.

  Returns:
    The cleaned review text.
  """
  # Name the parser explicitly: otherwise BeautifulSoup picks whichever parser is
  # installed (and warns about it), so results could differ between environments
  clean_review = BeautifulSoup(review, 'html.parser').get_text()
  # Remove escape character
  no_escape = clean_review.replace('\\', '')
  # The TSV files are read with quoting=3 (csv.QUOTE_NONE), so each review still
  # carries its surrounding quotes. Strip them only when both are present so that
  # unquoted text does not lose its first and last characters.
  if len(no_escape) >= 2 and no_escape[0] == '"' and no_escape[-1] == '"':
    return no_escape[1:-1]
  return no_escape

def labeledDataForBert(data):
  """Build the training frame: one row per review holding [cleaned text, sentiment label].

  Args:
    data: table with a 'review' column and a 'sentiment' column.

  Returns:
    A pandas DataFrame with two positional columns (cleaned text, then label).
  """
  rows = [
      [getCleanReview(text), sentiment]
      for text, sentiment in zip(data['review'], data['sentiment'])
  ]
  return pd.DataFrame(rows)

def unlabeledDataForBert(data):
  """Return the cleaned text of every entry in data['review'] as a list, in order.

  Args:
    data: table with a 'review' column.

  Returns:
    A list of cleaned review strings.
  """
  return [getCleanReview(text) for text in data['review']]

We used train_test_split() from sklearn to evaluate our model in the early stages. However, because RoBERTa models specifically benefit from larger training sets, the final run trains on all of the labeled data and we submit without holding out a local test set.

In [None]:
# Load the labeled reviews; quoting=3 (csv.QUOTE_NONE) leaves quote characters in the fields as-is
data = pd.read_csv(
    'labeledTrainData.tsv',
    delimiter='\t',
    header=0,
    quoting=3,
)

# Clean every review and pair it with its label in the frame used for training
train_data = labeledDataForBert(data)
print(f'Train data has {len(train_data)} entries')

In [None]:
# Training options passed to simpletransformers
model_args = {
    # Skip re-processing the input; decreases train time when training on the same data repeatedly
    'reprocess_input_data': False,
    'overwrite_output_dir': True,
    # NOTE(review): presumably enabled so long reviews are handled in windows rather than
    # truncated -- confirm against the Simple Transformers documentation
    'sliding_window': True,
    'num_train_epochs': 1,
    # Capitalization is not taken into account.
    # NOTE(review): confirm lower-casing is wanted with the roberta-base checkpoint
    'do_lower_case': True,
    # The next two switches decrease the disk space used
    'save_eval_checkpoints': False,
    'save_model_every_epoch': False,
}

# Fine-tune roberta-base on the full labeled set
model = ClassificationModel('roberta', 'roberta-base', args=model_args)
model.train_model(train_data)

In [None]:
# Read the unlabeled test reviews; quoting=3 (csv.QUOTE_NONE) leaves quote characters in place
test_data = pd.read_csv(
    'testData.tsv',
    delimiter='\t',
    header=0,
    quoting=3,
)

# Clean the reviews, then keep only the first value returned by predict()
clean_test_data = unlabeledDataForBert(test_data)
predictions, _ = model.predict(clean_test_data)

In [None]:
# Remove quotation marks from the test data 'id' field. The comprehension uses its own
# loop variable so the built-in `id` is not rebound in the notebook namespace, and
# str() keeps this working if the ids were parsed as non-string values.
clean_ids = [str(raw_id).replace('"', '') for raw_id in test_data['id']]

# One row per test review, indexed by id so that to_csv writes the columns id,sentiment
submission_df = pd.DataFrame(data={'id': clean_ids, 'sentiment': predictions}).set_index('id')
submission_df.to_csv('submission.csv')