In [1]:
import pandas as pd
import numpy as np
import re
from random import random
import emoji
from tqdm import notebook
def tqdm(x, **kargs):
    return notebook.tqdm(x, leave=False, **kargs)
import matplotlib.pyplot as plt
from collections import Counter, defaultdict

In [2]:
from transformers import RobertaTokenizer, RobertaModel

I0603 00:10:32.220917 140658852493120 file_utils.py:39] PyTorch version 1.5.0 available.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Loading data

In [61]:
df_train = pd.read_json('./source/train_gold.json', lines=True)
categories_type = pd.read_json('./source/categories.json', lines=True)
df_dev = pd.read_json('./source/dev_unlabeled.json', lines=True)
df_test = pd.read_json('./source/test_unlabeled.json', lines=True)
print("Number of text in training data: {}".format(df_train.shape[0]))
print("Number of text in categories: {}".format(categories_type.shape[1]))
print("Number of text in developing data: {}".format(df_dev.shape[0]))
print("Number of text in testing data: {}".format(df_test.shape[0]))

Number of text in training data: 32000
Number of text in categories: 43
Number of text in developing data: 4000
Number of text in testing data: 4000


In [62]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

I0603 02:16:36.777105 140658852493120 tokenization_utils.py:1015] loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json from cache at /home/ino/.cache/torch/transformers/d0c5776499adc1ded22493fae699da0971c1ee4c2587111707a4d177d20257a2.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b
I0603 02:16:36.777707 140658852493120 tokenization_utils.py:1015] loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt from cache at /home/ino/.cache/torch/transformers/b35e7cd126cd4229a746b5d5c29a749e8e84438b14bcdb575950584fe33207e8.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
I0603 02:16:37.731026 140658852493120 configuration_utils.py:285] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json from cache at /home/ino/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.117c81977c5979de8c088352e74ec6e70f5c66096c28b

In [63]:
roberta_vocab = pd.read_json('roberta_vocab/vocab.json', typ='series')

In [64]:
list(roberta_vocab.keys())

['<s>',
 '<pad>',
 '</s>',
 '<unk>',
 '.',
 'Ġthe',
 ',',
 'Ġto',
 'Ġand',
 'Ġof',
 'Ġa',
 'Ġin',
 '-',
 'Ġfor',
 'Ġthat',
 'Ġon',
 'Ġis',
 'âĢ',
 "'s",
 'Ġwith',
 'ĠThe',
 'Ġwas',
 'Ġ"',
 'Ġat',
 'Ġit',
 'Ġas',
 'Ġsaid',
 'Ļ',
 'Ġbe',
 's',
 'Ġby',
 'Ġfrom',
 'Ġare',
 'Ġhave',
 'Ġhas',
 ':',
 'Ġ(',
 'Ġhe',
 'ĠI',
 'Ġhis',
 'Ġwill',
 'Ġan',
 'Ġthis',
 ')',
 'ĠâĢ',
 'Ġnot',
 'Ŀ',
 'Ġyou',
 'ľ',
 'Ġtheir',
 'Ġor',
 'Ġthey',
 'Ġwe',
 'Ġbut',
 'Ġwho',
 'Ġmore',
 'Ġhad',
 'Ġbeen',
 'Ġwere',
 'Ġabout',
 ',"',
 'Ġwhich',
 'Ġup',
 'Ġits',
 'Ġcan',
 'Ġone',
 'Ġout',
 'Ġalso',
 'Ġ$',
 'Ġher',
 'Ġall',
 'Ġafter',
 '."',
 '/',
 'Ġwould',
 "'t",
 'Ġyear',
 'Ġwhen',
 'Ġfirst',
 'Ġshe',
 'Ġtwo',
 'Ġover',
 'Ġpeople',
 'ĠA',
 'Ġour',
 'ĠIt',
 'Ġtime',
 'Ġthan',
 'Ġinto',
 'Ġthere',
 't',
 'ĠHe',
 'Ġnew',
 'ĠâĢĶ',
 'Ġlast',
 'Ġjust',
 'ĠIn',
 'Ġother',
 'Ġso',
 'Ġwhat',
 'I',
 'Ġlike',
 'a',
 'Ġsome',
 'S',
 'Ã«',
 'Ġthem',
 'Ġyears',
 "'",
 'Ġdo',
 'Ġyour',
 'Ġ-',
 'Ġ1',
 '"',
 'Ġif',
 'Ġcould',
 '?',

## Start analyze

In [65]:
def get_vocab(corpus):
    vocabulary = Counter()
    for sentance in corpus:
        for word in sentance.split():
            vocabulary.update([word])
    return vocabulary

In [66]:
def check_coverage(vocabs, roberta_vocab):
    known_words = {}
    unknown_words = {}
    known_count = 0
    unknown_count = 0
    for word in tqdm(vocabs.keys(), desc='Checking: '):
        if word in list(roberta_vocab.keys()):
            known_words[word] = roberta_vocab[word]
            known_count += vocabs[word]
        else:
            unknown_words[word] = vocabs[word]
            unknown_count += vocabs[word]
    print("Found embeddings for {:.3%} ({} / {}) of vocab".format(len(known_words) / len(vocabs), len(known_words), len(vocabs)))
    print("Found embeddings for {:.3%} ({} / {}) of all text".format(known_count / (known_count + unknown_count), known_count, (known_count + unknown_count)))
    return unknown_words

In [67]:
train_text_vocab = get_vocab(df_train['text'].values)
train_reply_vocab = get_vocab(df_train['reply'].values)
print("train text unique vocab count is: {}".format(len(train_text_vocab)))
print("train reply unique vocab count is: {}".format(len(train_reply_vocab)))
unknown_text = check_coverage(train_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(train_reply_vocab, roberta_vocab)

train text unique vocab count is: 68961
train reply unique vocab count is: 25542


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=68961.0, style=ProgressStyle(description…

Found embeddings for 8.177% (5639 / 68961) of vocab
Found embeddings for 65.985% (432177 / 654963) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=25542.0, style=ProgressStyle(description…

Found embeddings for 14.451% (3691 / 25542) of vocab
Found embeddings for 63.198% (68504 / 108395) of all text


In [68]:
dev_text_vocab = get_vocab(df_dev['text'].values)
dev_reply_vocab = get_vocab(df_dev['reply'].values)
print("dev text unique vocab count is: {}".format(len(dev_text_vocab)))
print("dev reply unique vocab count is: {}".format(len(dev_reply_vocab)))
unknown_text = check_coverage(dev_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(dev_reply_vocab, roberta_vocab)

dev text unique vocab count is: 17684
dev reply unique vocab count is: 5360


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=17684.0, style=ProgressStyle(description…

Found embeddings for 17.830% (3153 / 17684) of vocab
Found embeddings for 65.711% (54522 / 82972) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=5360.0, style=ProgressStyle(description_…

Found embeddings for 26.978% (1446 / 5360) of vocab
Found embeddings for 62.345% (8626 / 13836) of all text


### Try to convert to lower case

In [69]:
def get_vocab_lower(corpus):
    vocabulary = Counter()
    for sentance in corpus:
        for word in sentance.lower().split():
            vocabulary.update([word])
    return vocabulary

In [70]:
train_text_vocab = get_vocab_lower(df_train['text'].values)
train_reply_vocab = get_vocab_lower(df_train['reply'].values)
print("train text unique vocab count is: {}".format(len(train_text_vocab)))
print("train reply unique vocab count is: {}".format(len(train_reply_vocab)))
unknown_text_lower = check_coverage(train_text_vocab, roberta_vocab)
print()
unknown_reply_lower = check_coverage(train_reply_vocab, roberta_vocab)

train text unique vocab count is: 60613
train reply unique vocab count is: 22586


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=60613.0, style=ProgressStyle(description…

Found embeddings for 6.301% (3819 / 60613) of vocab
Found embeddings for 65.900% (431618 / 654963) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=22586.0, style=ProgressStyle(description…

Found embeddings for 11.804% (2666 / 22586) of vocab
Found embeddings for 63.508% (68840 / 108395) of all text


In [71]:
dev_text_vocab = get_vocab_lower(df_dev['text'].values)
dev_reply_vocab = get_vocab_lower(df_dev['reply'].values)
print("dev text unique vocab count is: {}".format(len(dev_text_vocab)))
print("dev reply unique vocab count is: {}".format(len(dev_reply_vocab)))
unknown_text = check_coverage(dev_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(dev_reply_vocab, roberta_vocab)

dev text unique vocab count is: 15888
dev reply unique vocab count is: 4818


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=15888.0, style=ProgressStyle(description…

Found embeddings for 14.772% (2347 / 15888) of vocab
Found embeddings for 65.687% (54502 / 82972) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=4818.0, style=ProgressStyle(description_…

Found embeddings for 24.408% (1176 / 4818) of vocab
Found embeddings for 63.075% (8727 / 13836) of all text


#### Add some known in tokenizer but unknown in lower case (zero is weird)

In [27]:
# def add_lower(vocabs, roberta_vocab):
#     count = 0
#     add_tokens = []
#     for word in tqdm(vocabs, desc='Searching: '):
#         if word in list(roberta_vocab.keys()) and word.lower() not in list(roberta_vocab.keys()):
#             add_tokens.append(word.lower())
#             count += 1
#     print(add_tokens)
#     num_add = tokenizer.add_tokens(add_tokens)
#     model.resize_token_embeddings(len(tokenizer))
#     print("Added {} words to embedding".format(count))

In [28]:
# add_lower(train_text_vocab, roberta_vocab)
# add_lower(train_reply_vocab, roberta_vocab)

### Show some unknown tokens!

In [73]:
unknown_text

{'app,': 23,
 'hashtag': 23,
 '#bailouthumansnow': 24,
 'interviewing': 1,
 'incredible': 5,
 'exciting': 8,
 'soil': 1,
 'microbes': 1,
 'grasslands,': 1,
 "i'm": 192,
 'proud': 18,
 "i'll": 31,
 'phd': 4,
 'university': 5,
 'kansas': 1,
 'dr.': 72,
 'jim': 2,
 'bever': 1,
 'fellow!!': 1,
 'gtc': 1,
 'festival': 1,
 'happening': 4,
 'jus': 3,
 'confirmation': 1,
 'bai': 1,
 'lordy,': 1,
 'said,': 6,
 '“i': 7,
 'wonder': 16,
 'animals': 5,
 'cooperate': 1,
 'movie.”': 1,
 'we’re': 23,
 'lion': 1,
 '💀💀': 1,
 'unemployment': 4,
 'sucks': 5,
 'dick': 12,
 'believe': 33,
 'himself': 15,
 'power?': 7,
 'hunker': 4,
 '@primevideo': 4,
 'surprise!': 4,
 'enjoy': 16,
 'support!': 6,
 '“nancy': 11,
 'pelosi,': 20,
 'person.': 17,
 'leader.': 13,
 'america': 53,
 'hates': 18,
 'career': 14,
 'politicians,': 11,
 'yourself.”': 11,
 '@seanhannity': 13,
 'totally': 15,
 'incompetent': 11,
 'left,': 18,
 'pathetic': 11,
 'puppet.': 12,
 'washington': 21,
 'job!': 11,
 '#iwouldswapanythingfor': 2,
 "

## Clean weird punctuations

#### No significantly improve

In [75]:
def clean_weird(text):
    specials = ["’", "‘", "´", "`"]
    text = text.replace("’", "'")
    text = text.replace("‘", "'")
    text = text.replace("´", "'")
    text = text.replace("`", "'")
    return text

In [76]:
df_train['text'] = df_train.text.apply(clean_weird)
df_train['reply'] = df_train.reply.apply(clean_weird)

In [77]:
df_dev['text'] = df_dev.text.apply(clean_weird)
df_dev['reply'] = df_dev.reply.apply(clean_weird)

In [78]:
train_text_vocab = get_vocab(df_train['text'].values)
train_reply_vocab = get_vocab(df_train['reply'].values)
print("train text unique vocab count is: {}".format(len(train_text_vocab)))
print("train reply unique vocab count is: {}".format(len(train_reply_vocab)))
unknown_text = check_coverage(train_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(train_reply_vocab, roberta_vocab)

train text unique vocab count is: 68710
train reply unique vocab count is: 25436


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=68710.0, style=ProgressStyle(description…

Found embeddings for 8.211% (5642 / 68710) of vocab
Found embeddings for 65.987% (432193 / 654963) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=25436.0, style=ProgressStyle(description…

Found embeddings for 14.519% (3693 / 25436) of vocab
Found embeddings for 63.200% (68506 / 108395) of all text


In [79]:
dev_text_vocab = get_vocab_lower(df_dev['text'].values)
dev_reply_vocab = get_vocab_lower(df_dev['reply'].values)
print("dev text unique vocab count is: {}".format(len(dev_text_vocab)))
print("dev reply unique vocab count is: {}".format(len(dev_reply_vocab)))
unknown_text = check_coverage(dev_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(dev_reply_vocab, roberta_vocab)

dev text unique vocab count is: 15821
dev reply unique vocab count is: 4789


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=15821.0, style=ProgressStyle(description…

Found embeddings for 14.835% (2347 / 15821) of vocab
Found embeddings for 65.688% (54503 / 82972) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=4789.0, style=ProgressStyle(description_…

Found embeddings for 24.577% (1177 / 4789) of vocab
Found embeddings for 63.082% (8728 / 13836) of all text


## Transform apostrophes

In [83]:
apostrophes = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

In [84]:
def change_apostrophes(text):
    # Replace apostrophes to original term
    for key in apostrophes.keys():
        text = text.replace(key, apostrophes[key])
    return text

In [85]:
df_train['text'] = df_train.text.apply(change_apostrophes)
df_train['reply'] = df_train.reply.apply(change_apostrophes)

In [86]:
df_dev['text'] = df_dev.text.apply(change_apostrophes)
df_dev['reply'] = df_dev.reply.apply(change_apostrophes)

In [87]:
train_text_vocab = get_vocab(df_train['text'].values)
train_reply_vocab = get_vocab(df_train['reply'].values)
print("train text unique vocab count is: {}".format(len(train_text_vocab)))
print("train reply unique vocab count is: {}".format(len(train_reply_vocab)))
unknown_text = check_coverage(train_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(train_reply_vocab, roberta_vocab)

train text unique vocab count is: 68558
train reply unique vocab count is: 25352


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=68558.0, style=ProgressStyle(description…

Found embeddings for 8.230% (5642 / 68558) of vocab
Found embeddings for 68.765% (460202 / 669242) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=25352.0, style=ProgressStyle(description…

Found embeddings for 14.571% (3694 / 25352) of vocab
Found embeddings for 66.097% (73228 / 110789) of all text


In [88]:
dev_text_vocab = get_vocab_lower(df_dev['text'].values)
dev_reply_vocab = get_vocab_lower(df_dev['reply'].values)
print("dev text unique vocab count is: {}".format(len(dev_text_vocab)))
print("dev reply unique vocab count is: {}".format(len(dev_reply_vocab)))
unknown_text = check_coverage(dev_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(dev_reply_vocab, roberta_vocab)

dev text unique vocab count is: 15783
dev reply unique vocab count is: 4763


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=15783.0, style=ProgressStyle(description…

Found embeddings for 14.883% (2349 / 15783) of vocab
Found embeddings for 68.515% (58102 / 84802) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=4763.0, style=ProgressStyle(description_…

Found embeddings for 24.711% (1177 / 4763) of vocab
Found embeddings for 66.087% (9352 / 14151) of all text


In [89]:
unknown_text

{'app,': 23,
 'hashtag': 23,
 '#bailouthumansnow': 24,
 'interviewing': 1,
 'incredible': 5,
 'exciting': 8,
 'soil': 1,
 'microbes': 1,
 'grasslands,': 1,
 'proud': 18,
 'phd': 4,
 'university': 5,
 'kansas': 1,
 'dr.': 72,
 'jim': 2,
 'bever': 1,
 'fellow!!': 1,
 'gtc': 1,
 'festival': 1,
 'happening': 4,
 'jus': 3,
 'confirmation': 1,
 'bai': 1,
 'lordy,': 1,
 'said,': 6,
 '“i': 9,
 'wonder': 16,
 'animals': 5,
 'cooperate': 1,
 'movie.”': 1,
 "we're": 15,
 'lion': 1,
 '💀💀': 1,
 'unemployment': 4,
 'sucks': 5,
 'dick': 12,
 'believe': 33,
 'himself': 15,
 'power?': 7,
 'hunker': 4,
 '@primevideo': 4,
 'surprise!': 4,
 'enjoy': 16,
 'support!': 6,
 '“nancy': 11,
 'pelosi,': 20,
 'person.': 17,
 'leader.': 13,
 'america': 53,
 'hates': 18,
 'career': 14,
 'politicians,': 11,
 'yourself.”': 11,
 '@seanhannity': 13,
 'totally': 15,
 'incompetent': 11,
 'left,': 18,
 'pathetic': 11,
 'puppet.': 12,
 'washington': 21,
 'job!': 11,
 '#iwouldswapanythingfor': 2,
 "alzheimer's.": 1,
 '😢': 6,

## Check punctuations that roberta unknown 

In [90]:
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
def unknown_punctuation(roberta_vocab):
    unknown = ''
    for char in punct:
        if char not in list(roberta_vocab.keys()):
            unknown += char
            unknown += ' '
    return unknown

In [91]:
print("Roberta unknown: ")
print(unknown_punctuation(roberta_vocab))

Roberta unknown: 
“ ” ’ ∞ θ α • − β ∅ π ‘ ₹ € ™ √ — – 


### Mapping unknown to known punctuations

In [92]:
def change_punc(text):
    punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }
    for p in punct_mapping:
        text = text.replace(p, punct_mapping[p])
    for p in punct:
        text = text.replace(p, ' {} '.format(p))
    return text

In [93]:
df_train['map_punc_text'] = df_train.text.apply(change_punc)
df_train['map_punc_reply'] = df_train.reply.apply(change_punc)

In [94]:
df_dev['map_punc_text'] = df_dev.text.apply(change_punc)
df_dev['map_punc_reply'] = df_dev.reply.apply(change_punc)

In [95]:
train_text_vocab = get_vocab(df_train['map_punc_text'].values)
train_reply_vocab = get_vocab(df_train['map_punc_reply'].values)
print("train text unique vocab count is: {}".format(len(train_text_vocab)))
print("train reply unique vocab count is: {}".format(len(train_reply_vocab)))
print()

unknown_text = check_coverage(train_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(train_reply_vocab, roberta_vocab)

train text unique vocab count is: 46129
train reply unique vocab count is: 18569



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=46129.0, style=ProgressStyle(description…

Found embeddings for 13.464% (6211 / 46129) of vocab
Found embeddings for 80.351% (645864 / 803801) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=18569.0, style=ProgressStyle(description…

Found embeddings for 21.977% (4081 / 18569) of vocab
Found embeddings for 79.704% (110219 / 138285) of all text


In [96]:
dev_text_vocab = get_vocab_lower(df_dev['map_punc_text'].values)
dev_reply_vocab = get_vocab_lower(df_dev['map_punc_reply'].values)
print("dev text unique vocab count is: {}".format(len(dev_text_vocab)))
print("dev reply unique vocab count is: {}".format(len(dev_reply_vocab)))
unknown_text = check_coverage(dev_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(dev_reply_vocab, roberta_vocab)

dev text unique vocab count is: 11441
dev reply unique vocab count is: 3899


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=11441.0, style=ProgressStyle(description…

Found embeddings for 22.918% (2622 / 11441) of vocab
Found embeddings for 80.208% (81777 / 101956) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=3899.0, style=ProgressStyle(description_…

Found embeddings for 34.214% (1334 / 3899) of vocab
Found embeddings for 79.201% (13891 / 17539) of all text


In [97]:
sorted(unknown_text.items(), key=lambda d: d[1], reverse=True) 

[('cannot', 102),
 ('guys', 95),
 ('covid', 88),
 ('quarantine', 79),
 ('anyone', 74),
 ('hug', 72),
 ('america', 69),
 ('tonight', 57),
 ('myself', 56),
 ('coronavirus', 56),
 ('wanna', 54),
 ('fucking', 53),
 ('hear', 51),
 ('hope', 51),
 ('biden', 50),
 ('reaction', 49),
 ('tweet', 45),
 ('feeling', 45),
 ('gonna', 44),
 ('okay', 43),
 ('weeks', 41),
 ('trying', 40),
 ('lockdown', 40),
 ('followers', 39),
 ('birthday', 39),
 ('sad', 38),
 ('china', 38),
 ('tomorrow', 38),
 ('following', 38),
 ('wants', 38),
 ('anymore', 38),
 ('pandemic', 37),
 ('says', 37),
 ('gave', 37),
 ('saying', 37),
 ('pelosi', 36),
 ('already', 36),
 ('nancy', 34),
 ('yourself', 34),
 ('waiting', 34),
 ('doctors', 34),
 ('whole', 34),
 ('believe', 33),
 ('covid19', 33),
 ('virus', 32),
 ('realdonaldtrump', 32),
 ('bitch', 32),
 ('vaccine', 31),
 ('hugs', 30),
 ('choose', 29),
 ('retweet', 29),
 ('amazing', 29),
 ('honest', 28),
 ('wanted', 28),
 ('tired', 27),
 ('gifs', 27),
 ('😔', 27),
 ('asked', 27),
 ('fr

### Transform more words

In [99]:
more_apostrophes = {'cannot': "can not", 'gonna': "go to", 'wanna': "want to", 'coronavirus': "COVID", 'wanted': "want", 'weeks': "week", 'feeling': "feel", 'says': "say", 'yourself': "your self", 'saying': "say", 'says': "say", 'GIF': "gif", 'waiting': "wait", 'Covid': "COVID", 'hugs': "hug", 'gave': "give", 'COVID19': "COVID", 'installing': "install", 'wants': "want", 'knows': "know", 'describes': "describe", 'following': "follow", 'asked': "ask", 'amazing': "amaze", 'finally': "final", 'minutes': "minute", 'died': "die", 'tired': "tire", 'quickly': "quick", 'gotta': "go to", 'deaths': "death", 'means': "mean", 'took': "take", 'feels': "feel", 'fans': "fan", 'numbers': "number", 'lives': "live", 'safely': "safe", 'tried': "try", 'businesses': "business", '2nd': "second", 'decided': "decide", '3rd': "third", 'hates': "hate", 'dont': "do not", 'lonely': "lone", 'totally': "total", 'excited': "excite", 'BREAKING': "break", 'gifs': "gif", 'goes': "go", 'thoughts': "thought", 'campaigning': "campaign", 'immediately': "immediate", 'teammates': "team mate", 'knew': "know", 'politicians': "politician", 'distancing': "distance", 'reopening': "reopen", 'pls': "please", 'AGAIN': "again", 'tears': "tear", 'supposed': "suppose", 'loved': "love", 'ppl': "people", 'drinking': "drink", 'Guidelines': "guide line", 'losing': "lose", 'Conference': "conference", 'officially': "official", 'OPENING': "open", 'buying': "buy", 'Gif': "gif", 'looks': "look", 'bought': "buy", 'likes': "like", 'truely': "true", 'happened': "happen", 'putting': "put", 'families': "family", 'moved': "move", 'Raise': "raise", 'helped': "help", 'vibes': "vibe", 'voting': "vote", 'showed': "show", 'Instagram': "instagram", 'spent': "spend", 'watched': "watch", 'kinda': "kind of", 'Governor': "governor", 'Coronavirus': "COVID", 'lmao': "laugh", 'seems': "seem", 'staying': "stay", 'listening': "listen", 'accounts': "account"}
def change_punc(text):
    for key in more_apostrophes.keys():
        text = text.replace(key, more_apostrophes[key])
    return text

In [100]:
# tokenizer.tokenize('guide lines')

In [101]:
df_train['map_more_punc_text'] = df_train.map_punc_text.apply(change_punc)
df_train['map_more_punc_reply'] = df_train.map_punc_reply.apply(change_punc)

In [102]:
df_dev['map_more_punc_text'] = df_dev.map_punc_text.apply(change_punc)
df_dev['map_more_punc_reply'] = df_dev.map_punc_reply.apply(change_punc)

In [103]:
train_text_vocab = get_vocab(df_train['map_more_punc_text'].values)
train_reply_vocab = get_vocab(df_train['map_more_punc_reply'].values)
print("train text unique vocab count is: {}".format(len(train_text_vocab)))
print("train reply unique vocab count is: {}".format(len(train_reply_vocab)))
print()

unknown_text = check_coverage(train_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(train_reply_vocab, roberta_vocab)

train text unique vocab count is: 46010
train reply unique vocab count is: 18481



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=46010.0, style=ProgressStyle(description…

Found embeddings for 13.488% (6206 / 46010) of vocab
Found embeddings for 81.730% (658910 / 806199) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=18481.0, style=ProgressStyle(description…

Found embeddings for 22.066% (4078 / 18481) of vocab
Found embeddings for 80.666% (111902 / 138723) of all text


In [104]:
dev_text_vocab = get_vocab_lower(df_dev['map_more_punc_text'].values)
dev_reply_vocab = get_vocab_lower(df_dev['map_more_punc_reply'].values)
print("dev text unique vocab count is: {}".format(len(dev_text_vocab)))
print("dev reply unique vocab count is: {}".format(len(dev_reply_vocab)))
unknown_text = check_coverage(dev_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(dev_reply_vocab, roberta_vocab)

dev text unique vocab count is: 11398
dev reply unique vocab count is: 3854


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=11398.0, style=ProgressStyle(description…

Found embeddings for 22.986% (2620 / 11398) of vocab
Found embeddings for 81.460% (83292 / 102249) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=3854.0, style=ProgressStyle(description_…

Found embeddings for 34.587% (1333 / 3854) of vocab
Found embeddings for 80.247% (14125 / 17602) of all text


In [105]:
sorted(unknown_text.items(), key=lambda d: d[1], reverse=True) 

[('covid', 162),
 ('hug', 101),
 ('guys', 95),
 ('quarantine', 79),
 ('anyone', 74),
 ('america', 69),
 ('tonight', 57),
 ('myself', 56),
 ('fucking', 53),
 ('hear', 51),
 ('hope', 51),
 ('biden', 50),
 ('reaction', 49),
 ('tweet', 45),
 ('okay', 43),
 ('trying', 40),
 ('lockdown', 40),
 ('followers', 39),
 ('birthday', 39),
 ('sad', 38),
 ('china', 38),
 ('tomorrow', 38),
 ('anymore', 38),
 ('pandemic', 37),
 ('pelosi', 36),
 ('already', 36),
 ('nancy', 34),
 ('doctors', 34),
 ('whole', 34),
 ('believe', 33),
 ('reopen', 33),
 ('virus', 32),
 ('realdonaldtrump', 32),
 ('bitch', 32),
 ('vaccine', 31),
 ('choose', 29),
 ('retweet', 29),
 ('lose', 29),
 ('honest', 28),
 ('happen', 28),
 ('😔', 27),
 ('describe', 27),
 ('friday', 26),
 ('❤️', 26),
 ('😂', 26),
 ('hashtag', 25),
 ('yesterday', 25),
 ('😭', 25),
 ('bailouthumansnow', 24),
 ('😊', 24),
 ('horny', 24),
 ('spend', 24),
 ('dependable', 24),
 ('economy', 24),
 ('wish', 24),
 ('washington', 23),
 ('dinner', 23),
 ('eastern', 23),
 ('

In [None]:
# still_more_apostrophes = {

In [58]:
df_train.head()

Unnamed: 0,idx,text,reply,categories,mp4,map_punc_text,map_punc_reply,map_more_punc_text,map_more_punc_reply
0,0,we can all agree that any song by Niall Horan.,oui oui,[yes],6dc39e96b11275f064fdaed88273b45e.mp4,we can all agree that any song by Niall Horan .,oui oui,we can all agree that any song by Niall Horan .,oui oui
1,1,Will you be installing #ScottyFromMarketing's ...,,[no],cfff051f05d8d3b7136c7d58ea6ad55f.mp4,Will you be installing # ScottyFromMarketing ...,,Will you be install # ScottyFromMarketing ' ...,
2,2,Growing up my mum would call me a Nigga despit...,And he joins in??? Pour some hot grits on em,[smh],bf39e7bd9ad24354ce3ba6822b0104af.mp4,Growing up my mum would call me a Nigga despit...,And he joins in ? ? ? Pour some hot grits o...,Growing up my mum would call me a Nigga despit...,And he joins in ? ? ? Pour some hot grits o...
3,3,Rest your head on my chest when the world feel...,😂😂😂😂😂,[wink],173a707a04c277354a2f23cf01d6151e.mp4,Rest your head on my chest when the world feel...,😂😂😂😂😂,Rest your head on my chest when the world feel...,😂😂😂😂😂
4,4,Imagine Will Hernandez and Wills both doing a ...,,[yes],aab6d6bfb0c1382269ddba9b71cc8b7a.mp4,Imagine Will Hernandez and Wills both doing a ...,,Imagine Will Hernandez and Wills both doing a ...,


In [106]:
df_dev.head()

Unnamed: 0,idx,text,reply,map_punc_text,map_punc_reply,map_more_punc_text,map_more_punc_reply
0,32000,"Drop your cash app, use hashtag #BailoutHumansNow",$tyratomaro #BailoutHumans,"Drop your cash app , use hashtag # BailoutHu...",$ tyratomaro # BailoutHumans,"Drop your cash app , use hashtag # BailoutHu...",$ tyratomaro # BailoutHumans
1,32001,After interviewing with a few incredible peopl...,CONGRATS!!!!!,After interviewing with a few incredible peopl...,CONGRATS ! ! ! ! !,After interviewing with a few incredible peopl...,CONGRATS ! ! ! ! !
2,32002,I know GTC festival not happening next month b...,,I know GTC festival not happening next month b...,,I know GTC festival not happening next month b...,
3,32003,"Lordy, my daughter just said, “I wonder how th...",,"Lordy , my daughter just said , "" I wonde...",,"Lordy , my daughter just said , "" I wonde...",
4,32004,THE UNEMPLOYMENT CLAIM SYSTEM SUCKS SO MUCH DICK,Watching everyone else get their weekly unempl...,THE UNEMPLOYMENT CLAIM SYSTEM SUCKS SO MUCH DICK,Watching everyone else get their weekly unempl...,THE UNEMPLOYMENT CLAIM SYSTEM SUCKS SO MUCH DICK,Watching everyone else get their weekly unempl...


## Output preprocessed to json

In [107]:
df_preprocessed = df_train[['idx', 'map_more_punc_text', 'map_more_punc_reply', 'categories']].copy()
df_preprocessed.columns = ['idx', 'text', 'reply', 'categories']
df_preprocessed.to_json('./preprocessed/preprocess_train.json', orient='records', lines=True)

In [108]:
df_preprocessed_dev = df_dev[['idx', 'map_more_punc_text', 'map_more_punc_reply']].copy()
df_preprocessed_dev.columns = ['idx', 'text', 'reply']
df_preprocessed_dev.to_json('./preprocessed/preprocess_dev.json', orient='records', lines=True)