In [1]:
import pandas as pd
import numpy as np
import re
from random import random
import emoji
from tqdm import notebook
def tqdm(x, **kargs):
    return notebook.tqdm(x, leave=False, **kargs)
import matplotlib.pyplot as plt
from collections import Counter, defaultdict

In [2]:
from transformers import RobertaTokenizer, RobertaModel

I0604 23:38:24.530949 140680772192064 file_utils.py:39] PyTorch version 1.5.0 available.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Loading data

In [3]:
df_train = pd.read_json('./source/train_gold.json', lines=True)
categories_type = pd.read_json('./source/categories.json', lines=True)
df_dev = pd.read_json('./source/dev_unlabeled.json', lines=True)
df_test = pd.read_json('./source/test_unlabeled.json', lines=True)
print("Number of text in training data: {}".format(df_train.shape[0]))
print("Number of text in categories: {}".format(categories_type.shape[1]))
print("Number of text in developing data: {}".format(df_dev.shape[0]))
print("Number of text in testing data: {}".format(df_test.shape[0]))

Number of text in training data: 32000
Number of text in categories: 43
Number of text in developing data: 4000
Number of text in testing data: 4000


In [4]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

I0604 23:38:28.482975 140680772192064 tokenization_utils.py:1015] loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json from cache at /home/ino/.cache/torch/transformers/d0c5776499adc1ded22493fae699da0971c1ee4c2587111707a4d177d20257a2.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b
I0604 23:38:28.484291 140680772192064 tokenization_utils.py:1015] loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt from cache at /home/ino/.cache/torch/transformers/b35e7cd126cd4229a746b5d5c29a749e8e84438b14bcdb575950584fe33207e8.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
I0604 23:38:29.440285 140680772192064 configuration_utils.py:285] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json from cache at /home/ino/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.117c81977c5979de8c088352e74ec6e70f5c66096c28b

In [5]:
roberta_vocab = pd.read_json('roberta_vocab/vocab.json', typ='series')

In [6]:
list(roberta_vocab.keys())

['<s>',
 '<pad>',
 '</s>',
 '<unk>',
 '.',
 'Ġthe',
 ',',
 'Ġto',
 'Ġand',
 'Ġof',
 'Ġa',
 'Ġin',
 '-',
 'Ġfor',
 'Ġthat',
 'Ġon',
 'Ġis',
 'âĢ',
 "'s",
 'Ġwith',
 'ĠThe',
 'Ġwas',
 'Ġ"',
 'Ġat',
 'Ġit',
 'Ġas',
 'Ġsaid',
 'Ļ',
 'Ġbe',
 's',
 'Ġby',
 'Ġfrom',
 'Ġare',
 'Ġhave',
 'Ġhas',
 ':',
 'Ġ(',
 'Ġhe',
 'ĠI',
 'Ġhis',
 'Ġwill',
 'Ġan',
 'Ġthis',
 ')',
 'ĠâĢ',
 'Ġnot',
 'Ŀ',
 'Ġyou',
 'ľ',
 'Ġtheir',
 'Ġor',
 'Ġthey',
 'Ġwe',
 'Ġbut',
 'Ġwho',
 'Ġmore',
 'Ġhad',
 'Ġbeen',
 'Ġwere',
 'Ġabout',
 ',"',
 'Ġwhich',
 'Ġup',
 'Ġits',
 'Ġcan',
 'Ġone',
 'Ġout',
 'Ġalso',
 'Ġ$',
 'Ġher',
 'Ġall',
 'Ġafter',
 '."',
 '/',
 'Ġwould',
 "'t",
 'Ġyear',
 'Ġwhen',
 'Ġfirst',
 'Ġshe',
 'Ġtwo',
 'Ġover',
 'Ġpeople',
 'ĠA',
 'Ġour',
 'ĠIt',
 'Ġtime',
 'Ġthan',
 'Ġinto',
 'Ġthere',
 't',
 'ĠHe',
 'Ġnew',
 'ĠâĢĶ',
 'Ġlast',
 'Ġjust',
 'ĠIn',
 'Ġother',
 'Ġso',
 'Ġwhat',
 'I',
 'Ġlike',
 'a',
 'Ġsome',
 'S',
 'Ã«',
 'Ġthem',
 'Ġyears',
 "'",
 'Ġdo',
 'Ġyour',
 'Ġ-',
 'Ġ1',
 '"',
 'Ġif',
 'Ġcould',
 '?',

## Start analyze

In [7]:
def get_vocab(corpus):
    vocabulary = Counter()
    for sentance in corpus:
        for word in sentance.split():
            vocabulary.update([word])
    return vocabulary

In [8]:
def check_coverage(vocabs, roberta_vocab):
    known_words = {}
    unknown_words = {}
    known_count = 0
    unknown_count = 0
    for word in tqdm(vocabs.keys(), desc='Checking: '):
        if word in list(roberta_vocab.keys()):
            known_words[word] = roberta_vocab[word]
            known_count += vocabs[word]
        else:
            unknown_words[word] = vocabs[word]
            unknown_count += vocabs[word]
    print("Found embeddings for {:.3%} ({} / {}) of vocab".format(len(known_words) / len(vocabs), len(known_words), len(vocabs)))
    print("Found embeddings for {:.3%} ({} / {}) of all text".format(known_count / (known_count + unknown_count), known_count, (known_count + unknown_count)))
    return unknown_words

In [9]:
train_text_vocab = get_vocab(df_train['text'].values)
train_reply_vocab = get_vocab(df_train['reply'].values)
print("train text unique vocab count is: {}".format(len(train_text_vocab)))
print("train reply unique vocab count is: {}".format(len(train_reply_vocab)))
unknown_text = check_coverage(train_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(train_reply_vocab, roberta_vocab)

train text unique vocab count is: 68961
train reply unique vocab count is: 25542


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=68961.0, style=ProgressStyle(description…

Found embeddings for 8.177% (5639 / 68961) of vocab
Found embeddings for 65.985% (432177 / 654963) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=25542.0, style=ProgressStyle(description…

Found embeddings for 14.451% (3691 / 25542) of vocab
Found embeddings for 63.198% (68504 / 108395) of all text


In [10]:
dev_text_vocab = get_vocab(df_dev['text'].values)
dev_reply_vocab = get_vocab(df_dev['reply'].values)
print("dev text unique vocab count is: {}".format(len(dev_text_vocab)))
print("dev reply unique vocab count is: {}".format(len(dev_reply_vocab)))
unknown_text = check_coverage(dev_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(dev_reply_vocab, roberta_vocab)

dev text unique vocab count is: 17684
dev reply unique vocab count is: 5360


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=17684.0, style=ProgressStyle(description…

Found embeddings for 17.830% (3153 / 17684) of vocab
Found embeddings for 65.711% (54522 / 82972) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=5360.0, style=ProgressStyle(description_…

Found embeddings for 26.978% (1446 / 5360) of vocab
Found embeddings for 62.345% (8626 / 13836) of all text


In [11]:
test_text_vocab = get_vocab(df_test['text'].values)
test_reply_vocab = get_vocab(df_test['reply'].values)
print("test text unique vocab count is: {}".format(len(test_text_vocab)))
print("test reply unique vocab count is: {}".format(len(test_reply_vocab)))
unknown_text = check_coverage(test_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(test_reply_vocab, roberta_vocab)

test text unique vocab count is: 17338
test reply unique vocab count is: 5187


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=17338.0, style=ProgressStyle(description…

Found embeddings for 18.191% (3154 / 17338) of vocab
Found embeddings for 66.134% (54166 / 81903) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=5187.0, style=ProgressStyle(description_…

Found embeddings for 28.070% (1456 / 5187) of vocab
Found embeddings for 63.152% (8360 / 13238) of all text


### Try to convert to lower case

In [12]:
def get_vocab_lower(corpus):
    vocabulary = Counter()
    for sentance in corpus:
        for word in sentance.lower().split():
            vocabulary.update([word])
    return vocabulary

In [13]:
train_text_vocab = get_vocab_lower(df_train['text'].values)
train_reply_vocab = get_vocab_lower(df_train['reply'].values)
print("train text unique vocab count is: {}".format(len(train_text_vocab)))
print("train reply unique vocab count is: {}".format(len(train_reply_vocab)))
unknown_text_lower = check_coverage(train_text_vocab, roberta_vocab)
print()
unknown_reply_lower = check_coverage(train_reply_vocab, roberta_vocab)

train text unique vocab count is: 60613
train reply unique vocab count is: 22586


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=60613.0, style=ProgressStyle(description…

Found embeddings for 6.301% (3819 / 60613) of vocab
Found embeddings for 65.900% (431618 / 654963) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=22586.0, style=ProgressStyle(description…

Found embeddings for 11.804% (2666 / 22586) of vocab
Found embeddings for 63.508% (68840 / 108395) of all text


In [14]:
dev_text_vocab = get_vocab_lower(df_dev['text'].values)
dev_reply_vocab = get_vocab_lower(df_dev['reply'].values)
print("dev text unique vocab count is: {}".format(len(dev_text_vocab)))
print("dev reply unique vocab count is: {}".format(len(dev_reply_vocab)))
unknown_text = check_coverage(dev_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(dev_reply_vocab, roberta_vocab)

dev text unique vocab count is: 15888
dev reply unique vocab count is: 4818


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=15888.0, style=ProgressStyle(description…

Found embeddings for 14.772% (2347 / 15888) of vocab
Found embeddings for 65.687% (54502 / 82972) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=4818.0, style=ProgressStyle(description_…

Found embeddings for 24.408% (1176 / 4818) of vocab
Found embeddings for 63.075% (8727 / 13836) of all text


In [15]:
test_text_vocab = get_vocab_lower(df_test['text'].values)
test_reply_vocab = get_vocab_lower(df_test['reply'].values)
print("test text unique vocab count is: {}".format(len(test_text_vocab)))
print("test reply unique vocab count is: {}".format(len(test_reply_vocab)))
unknown_text = check_coverage(test_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(test_reply_vocab, roberta_vocab)

test text unique vocab count is: 15473
test reply unique vocab count is: 4674


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=15473.0, style=ProgressStyle(description…

Found embeddings for 15.130% (2341 / 15473) of vocab
Found embeddings for 66.148% (54177 / 81903) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=4674.0, style=ProgressStyle(description_…

Found embeddings for 25.246% (1180 / 4674) of vocab
Found embeddings for 63.484% (8404 / 13238) of all text


#### Add some known in tokenizer but unknown in lower case (zero is weird)

In [16]:
# def add_lower(vocabs, roberta_vocab):
#     count = 0
#     add_tokens = []
#     for word in tqdm(vocabs, desc='Searching: '):
#         if word in list(roberta_vocab.keys()) and word.lower() not in list(roberta_vocab.keys()):
#             add_tokens.append(word.lower())
#             count += 1
#     print(add_tokens)
#     num_add = tokenizer.add_tokens(add_tokens)
#     model.resize_token_embeddings(len(tokenizer))
#     print("Added {} words to embedding".format(count))

In [17]:
# add_lower(train_text_vocab, roberta_vocab)
# add_lower(train_reply_vocab, roberta_vocab)

### Show some unknown tokens!

In [18]:
unknown_text

{'@youngdeji_': 1,
 'uzi': 2,
 'carti': 4,
 'monday': 3,
 'gotta': 18,
 'lil': 11,
 'woah': 1,
 'we’re': 27,
 'discussing': 5,
 'trading': 4,
 'picks': 2,
 'safety.': 1,
 'dababy': 4,
 'sounds': 6,
 'niggas': 10,
 'kitchen': 7,
 'denny’s': 1,
 'indians': 3,
 'don’t': 126,
 'sport': 2,
 'cricket.': 1,
 'would’ve': 3,
 'came': 17,
 'out.': 21,
 'zaira': 1,
 'wasim': 1,
 'hardwork': 1,
 'screen.': 1,
 'justify': 2,
 'mentality': 1,
 'sick.': 4,
 'everybody': 8,
 'listening': 8,
 '@madisonbeer': 1,
 'selfish': 1,
 'i’ve': 57,
 '“as': 2,
 'please”': 1,
 'wtf': 8,
 '@lupeloops': 1,
 'weekend': 10,
 '😓': 3,
 "haven't": 7,
 'ops,': 1,
 "ain't": 6,
 'biggest': 9,
 'disappointments....': 1,
 'hardest': 4,
 'work.': 6,
 'cried.': 1,
 'hospitals': 4,
 'settings,': 1,
 'home,': 3,
 'it.': 94,
 'crap': 3,
 'nightmare.': 3,
 'accounts': 8,
 'hacked': 5,
 'spamming': 4,
 'mad.': 4,
 'couldn’t': 10,
 'back.': 14,
 'tried': 14,
 'confronting': 4,
 'hacker': 3,
 'replying': 5,
 '“who': 4,
 'are?!”': 3,
 

## Clean weird punctuations

#### No significantly improve

In [19]:
def clean_weird(text):
    specials = ["’", "‘", "´", "`"]
    text = text.replace("’", "'")
    text = text.replace("‘", "'")
    text = text.replace("´", "'")
    text = text.replace("`", "'")
    return text

In [20]:
df_train['text'] = df_train.text.apply(clean_weird)
df_train['reply'] = df_train.reply.apply(clean_weird)

In [21]:
df_dev['text'] = df_dev.text.apply(clean_weird)
df_dev['reply'] = df_dev.reply.apply(clean_weird)

In [22]:
df_test['text'] = df_test.text.apply(clean_weird)
df_test['reply'] = df_test.reply.apply(clean_weird)

In [23]:
train_text_vocab = get_vocab(df_train['text'].values)
train_reply_vocab = get_vocab(df_train['reply'].values)
print("train text unique vocab count is: {}".format(len(train_text_vocab)))
print("train reply unique vocab count is: {}".format(len(train_reply_vocab)))
unknown_text = check_coverage(train_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(train_reply_vocab, roberta_vocab)

train text unique vocab count is: 68710
train reply unique vocab count is: 25436


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=68710.0, style=ProgressStyle(description…

Found embeddings for 8.211% (5642 / 68710) of vocab
Found embeddings for 65.987% (432193 / 654963) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=25436.0, style=ProgressStyle(description…

Found embeddings for 14.519% (3693 / 25436) of vocab
Found embeddings for 63.200% (68506 / 108395) of all text


In [24]:
dev_text_vocab = get_vocab(df_dev['text'].values)
dev_reply_vocab = get_vocab(df_dev['reply'].values)
print("dev text unique vocab count is: {}".format(len(dev_text_vocab)))
print("dev reply unique vocab count is: {}".format(len(dev_reply_vocab)))
unknown_text = check_coverage(dev_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(dev_reply_vocab, roberta_vocab)

dev text unique vocab count is: 17597
dev reply unique vocab count is: 5322


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=17597.0, style=ProgressStyle(description…

Found embeddings for 17.918% (3153 / 17597) of vocab
Found embeddings for 65.713% (54523 / 82972) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=5322.0, style=ProgressStyle(description_…

Found embeddings for 27.189% (1447 / 5322) of vocab
Found embeddings for 62.352% (8627 / 13836) of all text


In [25]:
test_text_vocab = get_vocab(df_test['text'].values)
test_reply_vocab = get_vocab(df_test['reply'].values)
print("test text unique vocab count is: {}".format(len(test_text_vocab)))
print("test reply unique vocab count is: {}".format(len(test_reply_vocab)))
unknown_text = check_coverage(test_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(test_reply_vocab, roberta_vocab)

test text unique vocab count is: 17246
test reply unique vocab count is: 5152


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=17246.0, style=ProgressStyle(description…

Found embeddings for 18.294% (3155 / 17246) of vocab
Found embeddings for 66.136% (54167 / 81903) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=5152.0, style=ProgressStyle(description_…

Found embeddings for 28.261% (1456 / 5152) of vocab
Found embeddings for 63.152% (8360 / 13238) of all text


## Transform apostrophes

In [26]:
apostrophes = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

In [27]:
def change_apostrophes(text):
    # Replace apostrophes to original term
    for key in apostrophes.keys():
        text = text.replace(key, apostrophes[key])
    return text

In [28]:
df_train['text'] = df_train.text.apply(change_apostrophes)
df_train['reply'] = df_train.reply.apply(change_apostrophes)

In [29]:
df_dev['text'] = df_dev.text.apply(change_apostrophes)
df_dev['reply'] = df_dev.reply.apply(change_apostrophes)

In [30]:
df_test['text'] = df_test.text.apply(change_apostrophes)
df_test['reply'] = df_test.reply.apply(change_apostrophes)

In [31]:
train_text_vocab = get_vocab(df_train['text'].values)
train_reply_vocab = get_vocab(df_train['reply'].values)
print("train text unique vocab count is: {}".format(len(train_text_vocab)))
print("train reply unique vocab count is: {}".format(len(train_reply_vocab)))
unknown_text = check_coverage(train_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(train_reply_vocab, roberta_vocab)

train text unique vocab count is: 68558
train reply unique vocab count is: 25352


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=68558.0, style=ProgressStyle(description…

Found embeddings for 8.230% (5642 / 68558) of vocab
Found embeddings for 68.765% (460202 / 669242) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=25352.0, style=ProgressStyle(description…

Found embeddings for 14.571% (3694 / 25352) of vocab
Found embeddings for 66.097% (73228 / 110789) of all text


In [32]:
dev_text_vocab = get_vocab(df_dev['text'].values)
dev_reply_vocab = get_vocab(df_dev['reply'].values)
print("dev text unique vocab count is: {}".format(len(dev_text_vocab)))
print("dev reply unique vocab count is: {}".format(len(dev_reply_vocab)))
unknown_text = check_coverage(dev_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(dev_reply_vocab, roberta_vocab)

dev text unique vocab count is: 17524
dev reply unique vocab count is: 5273


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=17524.0, style=ProgressStyle(description…

Found embeddings for 18.004% (3155 / 17524) of vocab
Found embeddings for 68.538% (58122 / 84802) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=5273.0, style=ProgressStyle(description_…

Found embeddings for 27.442% (1447 / 5273) of vocab
Found embeddings for 65.373% (9251 / 14151) of all text


In [33]:
test_text_vocab = get_vocab(df_test['text'].values)
test_reply_vocab = get_vocab(df_test['reply'].values)
print("test text unique vocab count is: {}".format(len(test_text_vocab)))
print("test reply unique vocab count is: {}".format(len(test_reply_vocab)))
unknown_text = check_coverage(test_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(test_reply_vocab, roberta_vocab)

test text unique vocab count is: 17171
test reply unique vocab count is: 5108


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=17171.0, style=ProgressStyle(description…

Found embeddings for 18.380% (3156 / 17171) of vocab
Found embeddings for 68.775% (57497 / 83601) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=5108.0, style=ProgressStyle(description_…

Found embeddings for 28.504% (1456 / 5108) of vocab
Found embeddings for 66.051% (8938 / 13532) of all text


In [34]:
unknown_text

{'@Youngdeji_': 1,
 'uzi': 2,
 'carti': 3,
 'gotta': 16,
 'lil': 10,
 'woah': 1,
 'discussing': 5,
 'trading': 4,
 'picks': 2,
 'safety.': 1,
 'dababy': 3,
 'sounds': 5,
 'niggas': 6,
 'kitchen': 7,
 "Denny's": 1,
 'Majority': 2,
 'Indians': 3,
 'sport': 2,
 'cricket.': 1,
 'came': 16,
 'out.': 21,
 'Zaira': 1,
 'Wasim': 1,
 'hardwork': 1,
 'screen.': 1,
 'justify': 2,
 'mentality': 1,
 'sick.': 4,
 'everybody': 5,
 'listening': 8,
 '@madisonbeer': 1,
 'selfish': 1,
 '“as': 2,
 'please”': 1,
 'wtf': 5,
 'Might': 4,
 '@LupeLoops': 1,
 'weekend': 9,
 '😓': 3,
 'ops,': 1,
 'biggest': 9,
 'disappointments....': 1,
 'hardest': 4,
 'work.': 6,
 'cried.': 1,
 'hospitals': 4,
 'settings,': 1,
 'home,': 3,
 'it.': 93,
 'crap': 3,
 'nightmare.': 3,
 'accounts': 8,
 'hacked': 5,
 'spamming': 4,
 'mad.': 4,
 'back.': 13,
 'tried': 13,
 'confronting': 4,
 'hacker': 3,
 'replying': 5,
 '“Who': 3,
 'are?!”': 3,
 'PHYSICALLY': 3,
 'attacked': 4,
 'spam': 3,
 'messages': 6,
 'woke': 15,
 'up.': 25,
 '😰'

## Check punctuations that roberta unknown 

In [35]:
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
def unknown_punctuation(roberta_vocab):
    unknown = ''
    for char in punct:
        if char not in list(roberta_vocab.keys()):
            unknown += char
            unknown += ' '
    return unknown

In [36]:
print("Roberta unknown: ")
print(unknown_punctuation(roberta_vocab))

Roberta unknown: 
“ ” ’ ∞ θ α • − β ∅ π ‘ ₹ € ™ √ — – 


### Mapping unknown to known punctuations

In [37]:
def change_punc(text):
    punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }
    for p in punct_mapping:
        text = text.replace(p, punct_mapping[p])
    for p in punct:
        text = text.replace(p, ' {} '.format(p))
    return text

In [38]:
df_train['map_punc_text'] = df_train.text.apply(change_punc)
df_train['map_punc_reply'] = df_train.reply.apply(change_punc)

In [39]:
df_dev['map_punc_text'] = df_dev.text.apply(change_punc)
df_dev['map_punc_reply'] = df_dev.reply.apply(change_punc)

In [40]:
df_test['map_punc_text'] = df_test.text.apply(change_punc)
df_test['map_punc_reply'] = df_test.reply.apply(change_punc)

In [41]:
train_text_vocab = get_vocab(df_train['map_punc_text'].values)
train_reply_vocab = get_vocab(df_train['map_punc_reply'].values)
print("train text unique vocab count is: {}".format(len(train_text_vocab)))
print("train reply unique vocab count is: {}".format(len(train_reply_vocab)))
print()

unknown_text = check_coverage(train_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(train_reply_vocab, roberta_vocab)

train text unique vocab count is: 46129
train reply unique vocab count is: 18569



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=46129.0, style=ProgressStyle(description…

Found embeddings for 13.464% (6211 / 46129) of vocab
Found embeddings for 80.351% (645864 / 803801) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=18569.0, style=ProgressStyle(description…

Found embeddings for 21.977% (4081 / 18569) of vocab
Found embeddings for 79.704% (110219 / 138285) of all text


In [42]:
dev_text_vocab = get_vocab(df_dev['map_punc_text'].values)
dev_reply_vocab = get_vocab(df_dev['map_punc_reply'].values)
print("dev text unique vocab count is: {}".format(len(dev_text_vocab)))
print("dev reply unique vocab count is: {}".format(len(dev_reply_vocab)))
unknown_text = check_coverage(dev_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(dev_reply_vocab, roberta_vocab)

dev text unique vocab count is: 13357
dev reply unique vocab count is: 4473


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=13357.0, style=ProgressStyle(description…

Found embeddings for 26.698% (3566 / 13357) of vocab
Found embeddings for 80.274% (81844 / 101956) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=4473.0, style=ProgressStyle(description_…

Found embeddings for 36.977% (1654 / 4473) of vocab
Found embeddings for 78.665% (13797 / 17539) of all text


In [43]:
test_text_vocab = get_vocab(df_test['map_punc_text'].values)
test_reply_vocab = get_vocab(df_test['map_punc_reply'].values)
print("test text unique vocab count is: {}".format(len(test_text_vocab)))
print("test reply unique vocab count is: {}".format(len(test_reply_vocab)))
unknown_text = check_coverage(test_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(test_reply_vocab, roberta_vocab)

test text unique vocab count is: 13174
test reply unique vocab count is: 4263


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=13174.0, style=ProgressStyle(description…

Found embeddings for 27.228% (3587 / 13174) of vocab
Found embeddings for 80.222% (80192 / 99962) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=4263.0, style=ProgressStyle(description_…

Found embeddings for 39.268% (1674 / 4263) of vocab
Found embeddings for 79.877% (13548 / 16961) of all text


In [44]:
sorted(unknown_text.items(), key=lambda d: d[1], reverse=True) 

[('hug', 96),
 ('cannot', 90),
 ('quarantine', 68),
 ('tonight', 64),
 ('myself', 58),
 ('gonna', 55),
 ('Biden', 52),
 ('reaction', 48),
 ('tomorrow', 47),
 ('fucking', 43),
 ('coronavirus', 43),
 ('wanna', 43),
 ('Pelosi', 42),
 ('Nancy', 40),
 ('hear', 39),
 ('feeling', 39),
 ('guys', 39),
 ('weeks', 39),
 ('Oz', 39),
 ('anyone', 39),
 ('COVID', 38),
 ('believe', 37),
 ('trying', 36),
 ('followers', 36),
 ('yourself', 36),
 ('tweet', 36),
 ('died', 36),
 ('lockdown', 36),
 ('least', 35),
 ('gave', 35),
 ('sad', 34),
 ('GIF', 34),
 ('hope', 34),
 ('birthday', 33),
 ('anymore', 33),
 ('Covid', 32),
 ('COVID19', 31),
 ('economy', 31),
 ('says', 30),
 ('whole', 30),
 ('wants', 29),
 ('virus', 29),
 ('already', 28),
 ('knows', 28),
 ('wanted', 27),
 ('pandemic', 27),
 ('fans', 27),
 ('hashtag', 26),
 ('beautiful', 26),
 ('dinner', 26),
 ('saying', 25),
 ('wish', 25),
 ('weekend', 24),
 ('hugs', 24),
 ('BailoutHumansNow', 24),
 ('totally', 24),
 ('dependable', 24),
 ('forget', 24),
 ('😂',

### Transform more words

In [45]:
more_apostrophes = {'cannot': "can not", 'gonna': "go to", 'wanna': "want to", 'coronavirus': "COVID", 'wanted': "want", 'weeks': "week", 'feeling': "feel", 'says': "say", 'yourself': "your self", 'saying': "say", 'says': "say", 'GIF': "gif", 'waiting': "wait", 'Covid': "COVID", 'hugs': "hug", 'gave': "give", 'COVID19': "COVID", 'installing': "install", 'wants': "want", 'knows': "know", 'describes': "describe", 'following': "follow", 'asked': "ask", 'amazing': "amaze", 'finally': "final", 'minutes': "minute", 'died': "die", 'tired': "tire", 'quickly': "quick", 'gotta': "go to", 'deaths': "death", 'means': "mean", 'took': "take", 'feels': "feel", 'fans': "fan", 'numbers': "number", 'lives': "live", 'safely': "safe", 'tried': "try", 'businesses': "business", '2nd': "second", 'decided': "decide", '3rd': "third", 'hates': "hate", 'dont': "do not", 'lonely': "lone", 'totally': "total", 'excited': "excite", 'BREAKING': "break", 'gifs': "gif", 'goes': "go", 'thoughts': "thought", 'campaigning': "campaign", 'immediately': "immediate", 'teammates': "team mate", 'knew': "know", 'politicians': "politician", 'distancing': "distance", 'reopening': "reopen", 'pls': "please", 'AGAIN': "again", 'tears': "tear", 'supposed': "suppose", 'loved': "love", 'ppl': "people", 'drinking': "drink", 'Guidelines': "guide line", 'losing': "lose", 'Conference': "conference", 'officially': "official", 'OPENING': "open", 'buying': "buy", 'Gif': "gif", 'looks': "look", 'bought': "buy", 'likes': "like", 'truely': "true", 'happened': "happen", 'putting': "put", 'families': "family", 'moved': "move", 'Raise': "raise", 'helped': "help", 'vibes': "vibe", 'voting': "vote", 'showed': "show", 'Instagram': "instagram", 'spent': "spend", 'watched': "watch", 'kinda': "kind of", 'Governor': "governor", 'Coronavirus': "COVID", 'lmao': "laugh", 'seems': "seem", 'staying': "stay", 'listening': "listen", 'accounts': "account"}
def change_punc(text):
    for key in more_apostrophes.keys():
        text = text.replace(key, more_apostrophes[key])
    return text

In [46]:
# tokenizer.tokenize('guide lines')

In [47]:
df_train['map_more_punc_text'] = df_train.map_punc_text.apply(change_punc)
df_train['map_more_punc_reply'] = df_train.map_punc_reply.apply(change_punc)

In [48]:
df_dev['map_more_punc_text'] = df_dev.map_punc_text.apply(change_punc)
df_dev['map_more_punc_reply'] = df_dev.map_punc_reply.apply(change_punc)

In [49]:
df_test['map_more_punc_text'] = df_test.map_punc_text.apply(change_punc)
df_test['map_more_punc_reply'] = df_test.map_punc_reply.apply(change_punc)

In [50]:
train_text_vocab = get_vocab(df_train['map_more_punc_text'].values)
train_reply_vocab = get_vocab(df_train['map_more_punc_reply'].values)
print("train text unique vocab count is: {}".format(len(train_text_vocab)))
print("train reply unique vocab count is: {}".format(len(train_reply_vocab)))
print()

unknown_text = check_coverage(train_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(train_reply_vocab, roberta_vocab)

train text unique vocab count is: 46010
train reply unique vocab count is: 18481



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=46010.0, style=ProgressStyle(description…

Found embeddings for 13.488% (6206 / 46010) of vocab
Found embeddings for 81.730% (658910 / 806199) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=18481.0, style=ProgressStyle(description…

Found embeddings for 22.066% (4078 / 18481) of vocab
Found embeddings for 80.666% (111902 / 138723) of all text


In [51]:
dev_text_vocab = get_vocab(df_dev['map_more_punc_text'].values)
dev_reply_vocab = get_vocab(df_dev['map_more_punc_reply'].values)
print("dev text unique vocab count is: {}".format(len(dev_text_vocab)))
print("dev reply unique vocab count is: {}".format(len(dev_reply_vocab)))
unknown_text = check_coverage(dev_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(dev_reply_vocab, roberta_vocab)

dev text unique vocab count is: 13257
dev reply unique vocab count is: 4415


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=13257.0, style=ProgressStyle(description…

Found embeddings for 26.869% (3562 / 13257) of vocab
Found embeddings for 81.635% (83471 / 102249) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=4415.0, style=ProgressStyle(description_…

Found embeddings for 37.531% (1657 / 4415) of vocab
Found embeddings for 79.735% (14035 / 17602) of all text


In [52]:
test_text_vocab = get_vocab(df_test['map_more_punc_text'].values)
test_reply_vocab = get_vocab(df_test['map_more_punc_reply'].values)
print("test text unique vocab count is: {}".format(len(test_text_vocab)))
print("test reply unique vocab count is: {}".format(len(test_reply_vocab)))
unknown_text = check_coverage(test_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(test_reply_vocab, roberta_vocab)

test text unique vocab count is: 13076
test reply unique vocab count is: 4209


HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=13076.0, style=ProgressStyle(description…

Found embeddings for 27.432% (3587 / 13076) of vocab
Found embeddings for 81.643% (81863 / 100269) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=4209.0, style=ProgressStyle(description_…

Found embeddings for 39.796% (1675 / 4209) of vocab
Found embeddings for 80.731% (13734 / 17012) of all text


In [53]:
sorted(unknown_text.items(), key=lambda d: d[1], reverse=True) 

[('COVID', 158),
 ('hug', 120),
 ('quarantine', 68),
 ('tonight', 64),
 ('myself', 58),
 ('Biden', 52),
 ('reaction', 48),
 ('tomorrow', 47),
 ('fucking', 43),
 ('Pelosi', 42),
 ('Nancy', 40),
 ('hear', 39),
 ('guys', 39),
 ('Oz', 39),
 ('anyone', 39),
 ('believe', 37),
 ('trying', 36),
 ('followers', 36),
 ('tweet', 36),
 ('lockdown', 36),
 ('least', 35),
 ('sad', 34),
 ('hope', 34),
 ('birthday', 33),
 ('anymore', 33),
 ('economy', 31),
 ('whole', 30),
 ('virus', 29),
 ('already', 28),
 ('pandemic', 27),
 ('hashtag', 26),
 ('beautiful', 26),
 ('dinner', 26),
 ('decide', 25),
 ('wish', 25),
 ('weekend', 24),
 ('BailoutHumansNow', 24),
 ('politician', 24),
 ('dependable', 24),
 ('forget', 24),
 ('😂', 24),
 ('spend', 24),
 ('ScottyFromMarketing', 23),
 ('❤️', 23),
 ('🤔', 23),
 ('honest', 23),
 ('career', 23),
 ('happen', 23),
 ('dick', 22),
 ('explain', 22),
 ('reopen', 21),
 ('amaze', 21),
 ('tire', 21),
 ('Crazy', 21),
 ('sick', 20),
 ('suck', 20),
 ('thankful', 20),
 ('horny', 20),
 

### Try demojize to text and unique same emoji

In [69]:
import emoji

In [107]:
def distinct_emoji_lis(string):
    """Resturns distinct list of emojis from the string"""
    distinct_list = list({c for c in string if c in emoji.unicode_codes.UNICODE_EMOJI})
    return distinct_list

In [108]:
def change_emoji_to_text(text):
    """
    Input: text
    Output: demojize text
    """
    ori_text = text
    distinct_emoji = distinct_emoji_lis(text)
    for each_emoji in distinct_emoji:
        first_appear = ori_text.index(each_emoji)
        new_text = ''
        for tid, token in enumerate(ori_text):
            if token == each_emoji and tid != first_appear:
                new_text += ''
            else:
                new_text += token
        ori_text = new_text
    ori_text = emoji.demojize(ori_text).replace(':', ' ').replace('_', ' ')
    return ori_text

In [109]:
df_train['map_demojize_text'] = df_train.map_more_punc_text.apply(change_emoji_to_text)
df_train['map_demojize_reply'] = df_train.map_more_punc_reply.apply(change_emoji_to_text)

In [110]:
df_dev['map_demojize_text'] = df_dev.map_more_punc_text.apply(change_emoji_to_text)
df_dev['map_demojize_reply'] = df_dev.map_more_punc_reply.apply(change_emoji_to_text)

In [111]:
df_test['map_demojize_text'] = df_test.map_more_punc_text.apply(change_emoji_to_text)
df_test['map_demojize_reply'] = df_test.map_more_punc_reply.apply(change_emoji_to_text)

In [123]:
train_text_vocab = get_vocab(df_train['map_demojize_text'].values)
train_reply_vocab = get_vocab(df_train['map_demojize_reply'].values)
print("train text unique vocab count is: {}".format(len(train_text_vocab)))
print("train reply unique vocab count is: {}".format(len(train_reply_vocab)))
print()

unknown_text = check_coverage(train_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(train_reply_vocab, roberta_vocab)

train text unique vocab count is: 43503
train reply unique vocab count is: 17184



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=43503.0, style=ProgressStyle(description…

Found embeddings for 14.300% (6221 / 43503) of vocab
Found embeddings for 81.902% (674962 / 824110) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=17184.0, style=ProgressStyle(description…

Found embeddings for 23.889% (4105 / 17184) of vocab
Found embeddings for 81.091% (120173 / 148196) of all text


In [113]:
dev_text_vocab = get_vocab(df_dev['map_demojize_text'].values)
dev_reply_vocab = get_vocab(df_dev['map_demojize_reply'].values)
print("dev text unique vocab count is: {}".format(len(dev_text_vocab)))
print("dev reply unique vocab count is: {}".format(len(dev_reply_vocab)))
print()

unknown_text = check_coverage(dev_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(dev_reply_vocab, roberta_vocab)

dev text unique vocab count is: 12887
dev reply unique vocab count is: 4312



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=12887.0, style=ProgressStyle(description…

Found embeddings for 27.826% (3586 / 12887) of vocab
Found embeddings for 81.828% (85413 / 104381) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=4312.0, style=ProgressStyle(description_…

Found embeddings for 39.471% (1702 / 4312) of vocab
Found embeddings for 80.221% (15153 / 18889) of all text


In [114]:
test_text_vocab = get_vocab(df_test['map_demojize_text'].values)
test_reply_vocab = get_vocab(df_test['map_demojize_reply'].values)
print("test text unique vocab count is: {}".format(len(test_text_vocab)))
print("test reply unique vocab count is: {}".format(len(test_reply_vocab)))
print()

unknown_text = check_coverage(test_text_vocab, roberta_vocab)
print()
unknown_reply = check_coverage(test_reply_vocab, roberta_vocab)

test text unique vocab count is: 12695
test reply unique vocab count is: 4086



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=12695.0, style=ProgressStyle(description…

Found embeddings for 28.452% (3612 / 12695) of vocab
Found embeddings for 81.799% (83612 / 102217) of all text



HBox(children=(FloatProgress(value=0.0, description='Checking: ', max=4086.0, style=ProgressStyle(description_…

Found embeddings for 41.997% (1716 / 4086) of vocab
Found embeddings for 81.206% (14682 / 18080) of all text


In [122]:
sorted(unknown_text.items(), key=lambda d: d[1], reverse=True) 

[('COVID', 158),
 ('hug', 121),
 ('smiling', 104),
 ('crying', 83),
 ('quarantine', 68),
 ('tonight', 64),
 ('tears', 63),
 ('loudly', 61),
 ('myself', 58),
 ('Biden', 52),
 ('reaction', 48),
 ('tomorrow', 47),
 ('pleading', 44),
 ('fucking', 43),
 ('Pelosi', 42),
 ('guys', 40),
 ('Nancy', 40),
 ('hear', 39),
 ('Oz', 39),
 ('anyone', 39),
 ('believe', 37),
 ('trying', 36),
 ('followers', 36),
 ('tweet', 36),
 ('lockdown', 36),
 ('least', 35),
 ('hearts', 35),
 ('birthday', 34),
 ('sad', 34),
 ('hope', 34),
 ('anymore', 33),
 ('economy', 31),
 ('whole', 30),
 ('virus', 29),
 ('already', 28),
 ('laughing', 27),
 ('pandemic', 27),
 ('hashtag', 26),
 ('beautiful', 26),
 ('dinner', 26),
 ('pensive', 26),
 ('decide', 25),
 ('wish', 25),
 ('weekend', 24),
 ('BailoutHumansNow', 24),
 ('politician', 24),
 ('dependable', 24),
 ('forget', 24),
 ('spend', 24),
 ('ScottyFromMarketing', 23),
 ('honest', 23),
 ('career', 23),
 ('happen', 23),
 ('dick', 22),
 ('explain', 22),
 ('reopen', 21),
 ('amaze

## Store preprocessed results

In [115]:
df_train.head()

Unnamed: 0,idx,text,reply,categories,mp4,map_punc_text,map_punc_reply,map_more_punc_text,map_more_punc_reply,map_demojize_text,map_demojize_reply
0,0,we can all agree that any song by Niall Horan.,oui oui,[yes],6dc39e96b11275f064fdaed88273b45e.mp4,we can all agree that any song by Niall Horan .,oui oui,we can all agree that any song by Niall Horan .,oui oui,we can all agree that any song by Niall Horan .,oui oui
1,1,Will you be installing #ScottyFromMarketing's ...,,[no],cfff051f05d8d3b7136c7d58ea6ad55f.mp4,Will you be installing # ScottyFromMarketing ...,,Will you be install # ScottyFromMarketing ' ...,,Will you be install # ScottyFromMarketing ' ...,
2,2,Growing up my mum would call me a Nigga despit...,And he joins in??? Pour some hot grits on em,[smh],bf39e7bd9ad24354ce3ba6822b0104af.mp4,Growing up my mum would call me a Nigga despit...,And he joins in ? ? ? Pour some hot grits o...,Growing up my mum would call me a Nigga despit...,And he joins in ? ? ? Pour some hot grits o...,Growing up my mum would call me a Nigga despit...,And he joins in ? ? ? Pour some hot grits o...
3,3,Rest your head on my chest when the world feel...,😂😂😂😂😂,[wink],173a707a04c277354a2f23cf01d6151e.mp4,Rest your head on my chest when the world feel...,😂😂😂😂😂,Rest your head on my chest when the world feel...,😂😂😂😂😂,Rest your head on my chest when the world feel...,face with tears of joy
4,4,Imagine Will Hernandez and Wills both doing a ...,,[yes],aab6d6bfb0c1382269ddba9b71cc8b7a.mp4,Imagine Will Hernandez and Wills both doing a ...,,Imagine Will Hernandez and Wills both doing a ...,,Imagine Will Hernandez and Wills both doing a ...,


In [116]:
df_dev.head()

Unnamed: 0,idx,text,reply,map_punc_text,map_punc_reply,map_more_punc_text,map_more_punc_reply,map_demojize_text,map_demojize_reply
0,32000,"Drop your cash app, use hashtag #BailoutHumansNow",$tyratomaro #BailoutHumans,"Drop your cash app , use hashtag # BailoutHu...",$ tyratomaro # BailoutHumans,"Drop your cash app , use hashtag # BailoutHu...",$ tyratomaro # BailoutHumans,"Drop your cash app , use hashtag # BailoutHu...",$ tyratomaro # BailoutHumans
1,32001,After interviewing with a few incredible peopl...,CONGRATS!!!!!,After interviewing with a few incredible peopl...,CONGRATS ! ! ! ! !,After interviewing with a few incredible peopl...,CONGRATS ! ! ! ! !,After interviewing with a few incredible peopl...,CONGRATS ! ! ! ! !
2,32002,I know GTC festival not happening next month b...,,I know GTC festival not happening next month b...,,I know GTC festival not happening next month b...,,I know GTC festival not happening next month b...,
3,32003,"Lordy, my daughter just said, “I wonder how th...",,"Lordy , my daughter just said , "" I wonde...",,"Lordy , my daughter just said , "" I wonde...",,"Lordy , my daughter just said , "" I wonde...",
4,32004,THE UNEMPLOYMENT CLAIM SYSTEM SUCKS SO MUCH DICK,Watching everyone else get their weekly unempl...,THE UNEMPLOYMENT CLAIM SYSTEM SUCKS SO MUCH DICK,Watching everyone else get their weekly unempl...,THE UNEMPLOYMENT CLAIM SYSTEM SUCKS SO MUCH DICK,Watching everyone else get their weekly unempl...,THE UNEMPLOYMENT CLAIM SYSTEM SUCKS SO MUCH DICK,Watching everyone else get their weekly unempl...


In [117]:
df_test.head()

Unnamed: 0,idx,text,reply,map_punc_text,map_punc_reply,map_more_punc_text,map_more_punc_reply,map_demojize_text,map_demojize_reply
0,36000,@Youngdeji_ I think if uzi and carti dropping ...,,@ Youngdeji - I think if uzi and carti dro...,,@ Youngdeji - I think if uzi and carti dro...,,@ Youngdeji - I think if uzi and carti dro...,
1,36001,For the third year in a row we are discussing ...,,For the third year in a row we are discussing ...,,For the third year in a row we are discussing ...,,For the third year in a row we are discussing ...,
2,36002,dababy album sounds like it was made for nigga...,That's why you bought it.,dababy album sounds like it was made for nigga...,That ' s why you bought it .,dababy album sounds like it was made for nigga...,That ' s why you buy it .,dababy album sounds like it was made for nigga...,That ' s why you buy it .
3,36003,Majority of Indians do not watch any sport oth...,@ZairaWasimmm got a great story because of the...,Majority of Indians do not watch any sport oth...,@ ZairaWasimmm got a great story because of t...,Majority of Indians do not watch any sport oth...,@ ZairaWasimmm got a great story because of t...,Majority of Indians do not watch any sport oth...,@ ZairaWasimmm got a great story because of t...
4,36004,everybody is just now listening to @madisonbee...,,everybody is just now listening to @ madisonb...,,everybody is just now listen to @ madisonbeer...,,everybody is just now listen to @ madisonbeer...,


## Output preprocessed to json

In [124]:
df_preprocessed = df_train[['idx', 'map_more_punc_text', 'map_more_punc_reply', 'categories']].copy()
df_preprocessed.columns = ['idx', 'text', 'reply', 'categories']
df_preprocessed.to_json('./preprocessed/preprocess_train.json', orient='records', lines=True)

In [125]:
df_preprocessed_dev = df_dev[['idx', 'map_more_punc_text', 'map_more_punc_reply']].copy()
df_preprocessed_dev.columns = ['idx', 'text', 'reply']
df_preprocessed_dev.to_json('./preprocessed/preprocess_dev.json', orient='records', lines=True)

In [126]:
df_preprocessed_test = df_test[['idx', 'map_more_punc_text', 'map_more_punc_reply']].copy()
df_preprocessed_test.columns = ['idx', 'text', 'reply']
df_preprocessed_test.to_json('./preprocessed/preprocess_test.json', orient='records', lines=True)

### Don't have time to test, so won't use in testing phase

In [None]:
df_preprocessed = df_train[['idx', 'map_demojize_text', 'map_demojize_reply', 'categories']].copy()
df_preprocessed.columns = ['idx', 'text', 'reply', 'categories']
df_preprocessed.to_json('./preprocessed/preprocess_train.json', orient='records', lines=True)

In [None]:
df_preprocessed_dev = df_dev[['idx', 'map_demojize_text', 'map_demojize_reply']].copy()
df_preprocessed_dev.columns = ['idx', 'text', 'reply']
df_preprocessed_dev.to_json('./preprocessed/preprocess_dev.json', orient='records', lines=True)

In [None]:
df_preprocessed_test = df_test[['idx', 'map_demojize_text', 'map_demojize_reply']].copy()
df_preprocessed_test.columns = ['idx', 'text', 'reply']
df_preprocessed_test.to_json('./preprocessed/preprocess_test.json', orient='records', lines=True)