In [1]:
pip install pytorch-pretrained-bert transformers

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 2.9MB/s 
[?25hCollecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |████████████████████████████████| 573kB 8.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 13.8MB/s 
[?25hCollecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K  

In [2]:
import torch 

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was chosen.
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [3]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
from pytorch_pretrained_bert import BertTokenizer

# Tokenizer for the cased multilingual checkpoint; do_lower_case=False keeps
# the input casing, consistent with the "cased" model loaded below.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

# Pretrained BERT with a sequence-classification head for 2 labels. Attention
# weights and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

# Move the model to the device selected earlier in the notebook. An
# unconditional model.cuda() would raise on a CPU-only runtime even though the
# device-selection cell explicitly supports the CPU fallback.
model.to(device)

# NOTE(review): the "default" values quoted in the comments below do not look
# like AdamW's own constructor defaults -- presumably they refer to a
# fine-tuning script's argument defaults; confirm before relying on them.
optimizer = AdamW(model.parameters(),
                lr=2e-5, # learning rate, default = 5e-5
                eps=1e-8 # adam_epsilon, default = 1e-8
                )

100%|██████████| 995526/995526 [00:00<00:00, 12283199.88B/s]


HBox(children=(IntProgress(value=0, description='Downloading', max=625, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=714314041, style=ProgressStyle(description_…




In [14]:
# Read the labelled comments from the CSV file into a DataFrame.
print('Loading the training data...')

import pandas as pd

CSV_FILE_PATH = './drive/My Drive/Colab Notebooks/diploma_data/lgbt_homophobia_final.csv'

# header=None makes pandas treat the first line as data; the two column names
# are supplied explicitly instead.
df = pd.read_csv(
    CSV_FILE_PATH,
    sep=',',
    header=None,
    names=['comment', 'label'],
)

# Row count, followed by a random five-row preview as the cell's output.
print('Number of comments: {:,}\n'.format(len(df)))
df.sample(5)

Loading the training data...
Number of comments: 5,906



Unnamed: 0,comment,label
5590,-*-*-*- Will I even be qualified by then?,1
1589,-*-*-*- God doesn't exist and you're living yo...,0
3464,"-*-*-*- I was raised in Argentina, but moved t...",1
1418,Oh dear only in in the east,0
3698,A lot of people are gay and they are frightene...,1


In [15]:
import gc

# Longest token sequence kept, counting the added [CLS] and [SEP] tokens.
# Longer comments are skipped rather than truncated. 512 matches the maximum
# sequence length of BERT-base checkpoints.
MAX_SEQ_LEN = 512

comments = df.comment.values
labels = df.label.values

# Token-ID sequences and their labels, for every comment that is kept.
input_ids = []
input_labels = []

for comment, comment_label in zip(comments, labels):
  # Skip float entries: pandas typically stores an empty CSV cell as float NaN,
  # which the tokenizer cannot process.
  if isinstance(comment, float):
    continue

  # (1) Tokenize the comment.
  # (2) Wrap it in the '[CLS]' ... '[SEP]' special tokens.
  tokenized_comment = ['[CLS]', *tokenizer.tokenize(comment), '[SEP]']

  # Drop comments that exceed the length limit.
  if len(tokenized_comment) > MAX_SEQ_LEN:
    continue

  # (3) Map tokens to their vocabulary IDs and keep the label aligned with them.
  input_ids.append(tokenizer.convert_tokens_to_ids(tokenized_comment))
  input_labels.append(comment_label)

# Fail with a clear message if every row was filtered out, instead of a
# ZeroDivisionError (or max() on an empty sequence) in the statistics below.
if not input_ids:
  raise ValueError('No usable comments were found in the dataset.')

# Summary statistics for the kept comments.
print('Number of comments in dataset: ', len(input_ids))
# Label 0 is what this cell counts as a hate comment (see the message below).
hate_count = sum(1 for label in input_labels if label == 0)
# NOTE: the printed value is a fraction in [0, 1], not a percent.
print('Percentage of hate comments in dataset: {:.3f}'.format(hate_count / len(input_ids)))
print('Max comment length: ', max(len(sen) for sen in input_ids))

# Drop the local references to the raw arrays and run a garbage-collection pass.
# NOTE(review): `df` is still alive and holds the same data, so this likely
# frees little memory -- confirm whether `df` should be released as well.
del comments
del labels
gc.collect()

Number of comments in dataset:  5895
Percentage of hate comments in dataset: 0.253
Max comment length:  446


0