# Table of Content

 - [Package Loading](#package)
 - [Parameter](#parameter)
 - [Utility](#util)
 - [Data Prepare](#data)
 - [Embedding](#embedding)
 - [Architecture](#architecture)
 - [Model](#model)
 - [Ensembling](#ensembling)
 - [Evaluation](#evaluate)
 - [Submit](#submit)

<a id='package'></a>
## Package Loading

In [1]:
%%time
# Packages Loading
import time
import random
import pandas as pd
import numpy as np
import gc
import re
import torch

from torchtext import data
import spacy
from tqdm import tqdm_notebook, tnrange
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
tqdm.pandas(desc='Progress')
from collections import Counter
from textblob import TextBlob
from nltk import word_tokenize

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torchtext.data import Example
from sklearn.metrics import f1_score
import torch
import string
from unicodedata import category, name, normalize
import torchtext
import os 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
# from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
# from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
# from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
# from keras.layers import *
# from keras.models import *
# from keras.optimizers import Adam
# from keras.models import Model
# from keras.models import Sequential
# from keras import backend as K
# from keras.engine.topology import Layer, InputSpec
# from keras import initializers, regularizers, constraints, optimizers, layers
# cross validation and metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from torch.optim.optimizer import Optimizer
from unidecode import unidecode

Using TensorFlow backend.


CPU times: user 1.97 s, sys: 284 ms, total: 2.25 s
Wall time: 4.45 s


<a id='parameter'></a>
## Parameter

In [2]:
#######################Parameters##################################################################
params =dict(
         embed_size = 300, # how big is each word vector
         max_features = 180000, # how many unique words to use (i.e num rows in embedding vector)
         maxlen = 72, # max number of words in a question to use
         batch_size = 512, # how many samples to process at once
         n_epochs = 5, # how many times to iterate over all samples
         n_splits = 4, # Number of K-fold Splits
         seed = 828
        )

<a id='util'></a>
## Utility

In [3]:
# Determinism
def seed_everything(seed=1029):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(seed=params['seed'])

# Threshold Searching
from sklearn.metrics import precision_score, recall_score
def threshold_search(y_true, y_proba):
    best_threshold = 0
    best_score = 0
    for threshold in np.arange(0.2, 0.701, 0.005):
        score = f1_score(y_true=y_true, y_pred=y_proba > threshold)
        if score > best_score:
            best_threshold = threshold
            best_score = score
    recall = recall_score(y_true=y_true, y_pred=y_proba > best_threshold)
    precision = precision_score(y_true=y_true, y_pred=y_proba > best_threshold)
    search_result = {'threshold': best_threshold, 'f1': best_score, 'recall': recall, 'precision': precision}
    return search_result

# Parallel
import psutil
from multiprocessing import Pool

num_partitions = 20  # number of partitions to split dataframe
num_cores = psutil.cpu_count()  # number of cores on your machine


# Multiprocessing to reduce the preprocessing time
print('number of cores:', num_cores)
def df_parallelize_run(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df



number of cores: 2


> <a id='data'></a>
## Data Preprocessing

The data is quite messy, there are lots of words that are mispelled, and some special symbols, which can not got corresponding embeddings, so before put into model, we need to clean those mispelled words and clean out the special symbols. Also, those mispelled words and special symbols could have some information, e.g., questions have mispelled words or special symbols would be more possible to be an insincere question, so I also marked them during cleaning up process.

In [4]:
def clean_repeat_words(text):
    text = text.replace("img", "ing")

    text = re.sub(r"(I|i)(I|i)+ng", "ing", text)
    text = re.sub(r"(L|l)(L|l)(L|l)+y", "lly", text)
    text = re.sub(r"(A|a)(A|a)(A|a)+", "a", text)
    text = re.sub(r"(C|c)(C|c)(C|c)+", "cc", text)
    text = re.sub(r"(D|d)(D|d)(D|d)+", "dd", text)
    text = re.sub(r"(E|e)(E|e)(E|e)+", "ee", text)
    text = re.sub(r"(F|f)(F|f)(F|f)+", "ff", text)
    text = re.sub(r"(G|g)(G|g)(G|g)+", "gg", text)
    text = re.sub(r"(I|i)(I|i)(I|i)+", "i", text)
    text = re.sub(r"(K|k)(K|k)(K|k)+", "k", text)
    text = re.sub(r"(L|l)(L|l)(L|l)+", "ll", text)
    text = re.sub(r"(M|m)(M|m)(M|m)+", "mm", text)
    text = re.sub(r"(N|n)(N|n)(N|n)+", "nn", text)
    text = re.sub(r"(O|o)(O|o)(O|o)+", "oo", text)
    text = re.sub(r"(P|p)(P|p)(P|p)+", "pp", text)
    text = re.sub(r"(Q|q)(Q|q)+", "q", text)
    text = re.sub(r"(R|r)(R|r)(R|r)+", "rr", text)
    text = re.sub(r"(S|s)(S|s)(S|s)+", "ss", text)
    text = re.sub(r"(T|t)(T|t)(T|t)+", "tt", text)
    text = re.sub(r"(V|v)(V|v)+", "v", text)
    text = re.sub(r"(Y|y)(Y|y)(Y|y)+", "y", text)
    text = re.sub(r"plzz+", "please", text)
    text = re.sub(r"(Z|z)(Z|z)(Z|z)+", "zz", text)
    return text

In [5]:
mispell_dict1 = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite',
                'travelling': 'traveling', 'counselling': 'counseling',
                'theatre': 'theater', 'cancelled': 'canceled', ' labour ': ' labor ',
                'organisation': 'organization', 'wwii': 'world war 2',
                'citicise': 'criticize', ' youtu  ': ' youtube ', ' qoura ': ' quora ',
                'sallary': 'salary', 'whta': 'what', 'narcisist': 'narcissist',
                'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can',
                'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do',
                ' doi ': ' do I ', 'thebest': 'the best', 'howdoes': 'how does',
                'mastrubation': 'masturbation', 'mastrubate': 'masturbate',
                "mastrubating": 'masturbating', 'pennis': 'penis', 'etherium': 'ethereum',
                'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': 'nba game',
                '2k18': 'nba game', 'qouta': 'quota', 'upwork': 'up work', 'loy machedo' :' branding coach',
                'loy machedo' :' branding coach', 'gdpr' : 'general data protection regulation',
                'adityanath' : 'indian politician', 'adhaar' : 'aadhaar',
                'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp',
                'demonitisation': 'demonetization', 'demonitization': 'demonetization',
                'demonetisation': 'demonetization', 'pokémon': 'pokemon', 'quorans': 'quora',
                'brexit': 'britich exit', 'cryptocurrencies': 'crypto currencies', 'redmi': 'chinese cellphone',
                'coinbase': 'coin base', 'oneplus': 'chinese cellphone', 'uceed': 'toefl', 'bhakts': 'bhakt',
                'bhakts' : 'bhakt', 'f***' : 'fuck', 'f**k' : 'fuck', 'f**cked' : 'fucked', 'f*#k' : 'fuck',
                'f**krs*' : 'fuckers', 'f*cking' : 'fucking', 'f**king' : 'fucking',
                'boruto' : 'naruto', 'alshamsi' : 'al shamsi', 'fiancé' : 'fiance',
                'dceu' : 'dc extended universe', 'iiest' : 'iit', 'srmjee' : 'joint entrance exam',
                'litecoin' : 'bitcoin', 'unacademy' : 'un academy', 'sjws' : 'social justice warriors',
                'tensorflow' : 'tensor flow', 'lnmiit' : 'iit', 'kavalireddi' : 'analyst',
                'doklam' : 'border between china and india', 'altcoin' : 'bitcoin', 
                'muoet' : 'manipal university online entrance exam', 'vajiram' : 'indian coaching center',
                'nicmar' : 'national institute of construction management and research', 
                'bnbr' : 'be nice be respectful', " e g ": " eg ", " b g ": " bg ", " u s ": " america ", 
                " 9 11 ": " 911 ", "e - mail": "email", " j k ": " jk "}

In [6]:
mispell_dict2={'jewprofits': 'jew profits', 'qmas': 'quality migrant admission scheme', 'casterating': 'castrating', 'kashmiristan': 'kashmir', 'careongo': 'india first and largest online distributor of medicines', 'setya novanto': 'a former indonesian politician', 'testoultra': 'male sexual enhancement supplement', 'rammayana': 'ramayana', 'badaganadu': 'brahmin community that mainly reside in karnataka', 'bitcjes': 'bitches', 'mastubrate': 'masturbate', 'français': 'france', 'adsresses': 'address', 'flemmings': 'flemming', 'intermate': 'inter mating', 'feminisam': 'feminism', 'cuckholdry': 'cuckold', 'niggor': 'black hip-hop and electronic artist', 'narcsissist': 'narcissist', 'genderfluid': 'gender fluid', ' im ': ' i am ', ' dont ': ' do not ', 'qoura': 'quora', 'ethethnicitesnicites': 'ethnicity', 'namit bathla': 'content writer', 'what sapp': 'whatsapp', 'führer': 'fuhrer', 'covfefe': 'coverage', 'accedentitly': 'accidentally', 'cuckerberg': 'zuckerberg', 'transtrenders': 'incredibly disrespectful to real transgender people', 'frozen tamod': 'pornographic website', 'hindians': 'north indian', 'hindian': 'north indian', 'celibatess': 'celibates', 'trimp': 'trump', 'wanket': 'wanker', 'wouldd': 'would', 'arragent': 'arrogant', 'ra - apist': 'rapist', 'idoot': 'idiot', 'gangstalkers': 'gangs talkers', 'toastsexual': 'toast sexual', 'inapropriately': 'inappropriately', 'dumbassess': 'dumbass', 'germanized': 'become german', 'helisexual': 'sexual', 'regilious': 'religious', 'timetraveller': 'time traveller', 'darkwebcrawler': 'dark webcrawler', 'routez': 'route', 'trumpians': 'trump supporters', 'irreputable': 'reputation', 'serieusly': 'seriously', 'anti cipation': 'anticipation', 'microaggression': 'micro aggression', 'afircans': 'africans', 'microapologize': 'micro apologize', 'vishnus': 'vishnu', 'excritment': 'excitement', 'disagreemen': 'disagreement', 'gujratis': 'gujarati', 'gujaratis': 'gujarati', 'ugggggggllly': 'ugly', 'germanity': 'german', 'soyboys': 'cuck men lacking masculine characteristics', 'н': 'h', 'м': 'm', 'ѕ': 's', 'т': 't', 'в': 'b', 'υ': 'u', 'ι': 'i', 'genetilia': 'genitalia', 'r - apist': 'rapist', 'borokabama': 'barack obama', 'arectifier': 'rectifier', 'pettypotus': 'petty potus', 'magibabble': 'magi babble', 'nothinking': 'thinking', 'centimiters': 'centimeters', 'saffronized': 'india, politics, derogatory', 'saffronize': 'india, politics, derogatory', ' incect ': ' insect ', 'weenus': 'elbow skin', 'pakistainies': 'pakistanis', 'goodspeaks': 'good speaks', 'inpregnated': 'in pregnant', 'rapefilms': 'rape films', 'rapiest': 'rapist', 'hatrednesss': 'hatred', 'heightism': 'height discrimination', 'getmy': 'get my', 'onsocial': 'on social', 'worstplatform': 'worst platform', 'platfrom': 'platform', 'instagate': 'instigate', 'loy machedeo': 'person', ' dsire ': ' desire ', 'iservant': 'servant', 'intelliegent': 'intelligent', 'ww 1': ' ww1 ', 'ww 2': ' ww2 ', 'keralapeoples': 'kerala peoples', 'trumpervotes': 'trumper votes', 'fucktrumpet': 'fuck trumpet', 'likebjaish': 'like bjaish', 'likemy': 'like my', 'howlikely': 'how likely', 'disagreementts': 'disagreements', 'disagreementt': 'disagreement', 'meninist': 'male chauvinism', 'feminists': 'feminism supporters', 'ghumendra': 'bhupendra', 'emellishments': 'embellishments', 'settelemen': 'settlement', 'richmencupid': 'rich men dating website', 'gaudry - schost': '', 'ladymen': 'ladyboy', 'hasserment': 'harassment', 'instrumentalizing': 'instrument', 'darskin': 'dark skin', 'balckwemen': 'balck women', 'recommendor': 'recommender', 'wowmen': 'women', 'expertthink': 'expert think', 'whitesplaining': 'white splaining', 'inquoraing': 'inquiring', 'whilemany': 'while many', 'manyother': 'many other', 'involvedinthe': 'involved in the', 'slavetrade': 'slave trade', 'aswell': 'as well', 'fewshowanyremorse': 'few show any remorse', 'trageting': 'targeting', 'getile': 'gentile', 'gujjus': 'derogatory gujarati', 'judisciously': 'judiciously', 'hue mungus': 'feminist bait', 'hugh mungus': 'feminist bait', 'hindustanis': 'hindu', 'virushka': 'great relationships couple', 'exclusinary': 'exclusionary', 'himdus': 'hindus', 'milo yianopolous': 'a british polemicist', 'hidusim': 'hinduism', 'holocaustable': 'holocaust', 'evangilitacal': 'evangelical', 'busscas': 'buscas', 'holocaustal': 'holocaust', 'incestious': 'incestuous', 'tennesseus': 'tennessee', 'gusdur': 'gus dur', 'rpatah - tan eng hwan': 'silsilah', 'reinfectus': 'reinfect', 'pharisaistic': 'pharisaism', 'nuslims': 'muslims', 'taskus': '', 'musims': 'muslims', 'musevi': 'the independence of mexico', ' racious ': 'discrimination expression of racism', 'muslimophobia': 'muslim phobia', 'justyfied': 'justified', 'holocause': 'holocaust', 'musilim': 'muslim', 'misandrous': 'misandry', 'glrous': 'glorious', 'desemated': 'decimated', 'votebanks': 'vote banks', 'parkistan': 'pakistan', 'eurooe': 'europe', 'animlaistic': 'animalistic', 'asiasoid': 'asian', 'congoid': 'congolese', 'inheritantly': 'inherently', 'asianisation': 'becoming asia', 'russosphere': 'russia sphere of influence', 'exmuslims': 'ex muslims', 'discriminatein': 'discrimination', ' hinus ': ' hindus ', 'nibirus': 'nibiru', 'habius - corpus': 'habeas corpus', 'prentious': 'pretentious', 'sussia': 'ancient jewish village', 'moustachess': 'moustaches', 'russions': 'russians', 'yuguslavia': 'yugoslavia', 'atrocitties': 'atrocities', 'muslimophobe': 'muslim phobic', 'fallicious': 'fallacious', 'recussed': 'recursed', '@ usafmonitor': 'usa monitor', 'lustfly': 'lustful', 'canmuslims': 'can muslims', 'journalust': 'journalist', 'digustingly': 'disgustingly', 'harasing': 'harassing', 'greatuncle': 'great uncle', 'drumpf': 'trump', 'rejectes': 'rejected', 'polyagamous': 'polygamous', 'mushlims': 'muslims', 'accusition': 'accusation', 'geniusses': 'geniuses', 'moustachesomething': 'moustache something', 'heineous': 'heinous', 'sapiosexuals': 'sapiosexual', 'sapiosexual': 'sexually attracted to intelligence', 'pansexuals': 'pansexual', 'autosexual': 'auto sexual', 'sexualslutty': 'sexual slutty', 'hetorosexuality': 'hetoro sexuality', 'chinesese': 'chinese', 'pizza gate': 'debunked conspiracy theory', 'countryless': 'having no country', 'muslimare': 'muslim are', 'iphonex': 'iphone', 'lionese': 'lioness', 'marionettist': 'marionettes', 'demonetize': 'demonetized', 'eneyone': 'anyone', 'karonese': 'karo people indonesia', 'minderheid': 'minder worse', 'mainstreamly': 'mainstream', 'contraproductive': 'contra productive', 'diffenky': 'differently', 'abandined': 'abandoned', 'p0 rnstars': 'pornstars', 'overproud': 'over proud', 'cheekboned': 'cheek boned', 'heriones': 'heroines', 'eventhogh': 'even though', 'americanmedicalassoc': 'american medical assoc', 'feelwhen': 'feel when', 'hhhow': 'how', 'reallysemites': 'really semites', 'gamergaye': 'gamersgate', 'manspreading': 'man spreading', 'thammana': 'tamannaah bhatia', 'dogmans': 'dogmas', 'managementskills': 'management skills', 'mangoliod': 'mongoloid', 'geerymandered': 'gerrymandered', 'mandateing': 'man dateing', 'mailwoman': 'mail woman', 'humancoalition': 'human coalition', 'manipullate': 'manipulate', 'everyo0 ne': 'everyone', 'takeove': 'takeover', 'nonchristians': 'non christians', 'goverenments': 'governments', 'govrment': 'government', 'polygomists': 'polygamists', 'demogorgan': 'demogorgon', 'maralago': 'mar-a-lago', 'antibigots': 'anti bigots', 'gouing': 'going', 'muzaffarbad': 'muzaffarabad', 'suchvstupid': 'such stupid', 'apartheidisrael': 'apartheid israel', 'personaltiles': 'personal titles', 'lawyergirlfriend': 'lawyer girl friend', 'northestern': 'northwestern', 'yeardold': 'years old', 'masskiller': 'mass killer', 'southeners': 'southerners', 'unitedstatesian': 'united states', 'peoplekind': 'people kind', 'peoplelike': 'people like', 'countrypeople': 'country people', 'shitpeople': 'shit people', 'trumpology': 'trump ology', 'trumpites': 'trump supporters', 'trumplies': 'trump lies', 'donaldtrumping': 'donald trumping', 'trumpdating': 'trump dating', 'trumpsters': 'trumpeters', 'ciswomen': 'cis women', 'womenizer': 'womanizer', 'pregnantwomen': 'pregnant women', 'autoliker': 'auto liker', 'smelllike': 'smell like', 'autolikers': 'auto likers', 'religiouslike': 'religious like', 'likemail': 'like mail', 'fislike': 'dislike', 'sneakerlike': 'sneaker like', 'like⬇': 'like', 'likelovequotes': 'like lovequotes', 'likelogo': 'like logo', 'sexlike': 'sex like', 'whatwould': 'what would', 'howwould': 'how would', 'manwould': 'man would', 'exservicemen': 'ex servicemen', 'femenism': 'feminism', 'devopment': 'development', 'doccuments': 'documents', 'supplementplatform': 'supplement platform', 'mendatory': 'mandatory', 'moviments': 'movements', 'kremenchuh': 'kremenchug', 'docuements': 'documents', 'determenism': 'determinism', 'envisionment': 'envision ment', 'tricompartmental': 'tri compartmental', 'addmovement': 'add movement', 'mentionong': 'mentioning', 'whichtreatment': 'which treatment', 'repyament': 'repayment', 'insemenated': 'inseminated', 'inverstment': 'investment', 'managemental': 'manage mental', 'inviromental': 'environmental', 'menstrution': 'menstruation', 'indtrument': 'instrument', 'mentenance': 'maintenance', 'fermentqtion': 'fermentation', 'achivenment': 'achievement', 'mismanagements': 'mis managements', 'requriment': 'requirement', 'denomenator': 'denominator', 'drparment': 'department', 'celemente': 'clemente', 'manajement': 'management', 'govermenent': 'government', 'accomplishmments': 'accomplishments', 'rendementry': 'rendement ry', 'repariments': 'departments', 'menstrute': 'menstruate', 'determenistic': 'deterministic', 'resigment': 'resignment', 'selfpayment': 'self payment', 'imrpovement': 'improvement', 'enivironment': 'environment', 'compartmentley': 'compartment', 'augumented': 'augmented', 'parmenent': 'permanent', 'develepoments': 'developments', 'menstrated': 'menstruated', 'phnomenon': 'phenomenon', 'employmment': 'employment', 'menigioma': 'meningioma', 'recrument': 'recrement', 'promenient': 'provenient', 'gonverment': 'government', 'statemment': 'statement', 'recuirement': 'requirement', 'invetsment': 'investment', 'parilment': 'parchment', 'parmently': 'patiently', 'agreementindia': 'agreement india', 'menifesto': 'manifesto', 'accomplsihments': 'accomplishments', 'disangagement': 'disengagement', 'aevelopment': 'development', 'procument': 'procumbent', 'harashment': 'harassment', 'tiannanmen': 'tiananmen', 'commensalisms': 'commensal isms', 'devlelpment': 'development', 'dimensons': 'dimensions', 'recruitment2017': 'recruitment 2017', 'polishment': 'pol ishment', 'commentsafe': 'comment safe', 'meausrements': 'measurements', 'geomentrical': 'geometrical', 'undervelopment': 'undevelopment', 'mensurational': 'mensuration al', 'fanmenow': 'fan menow', 'permenganate': 'permanganate', 'bussinessmen': 'businessmen', 'supertournaments': 'super tournaments', 'permanmently': 'permanently', 'lamenectomy': 'lamnectomy', 'assignmentcanyon': 'assignment canyon', 'adgestment': 'adjustment', 'mentalized': 'metalized', 'docyments': 'documents', 'requairment': 'requirement', 'batsmencould': 'batsmen could', 'argumentetc': 'argument etc', 'enjoiment': 'enjoyment', 'invement': 'movement', 'accompliushments': 'accomplishments', 'regements': 'regiments', 'departmenthow': 'department how', 'aremenian': 'armenian', 'amenclinics': 'amen clinics', 'nonfermented': 'non fermented', 'instumentation': 'instrumentation', 'mentalitiy': 'mentality', ' govermen ': 'goverment', 'underdevelopement': 'under developement', 'parlimentry': 'parliamentary', 'indemenity': 'indemnity', 'inatrumentation': 'instrumentation', 'menedatory': 'mandatory', 'mentiri': 'entire', 'accomploshments': 'accomplishments', 'instrumention': 'instrumentation', 'afvertisements': 'advertisements', 'entitlments': 'entitlements', 'endrosment': 'endorsement', 'improment': 'impriment', 'archaemenid': 'achaemenid', 'replecement': 'replacement', 'placdment': 'placement', 'femenise': 'feminise', 'envinment': 'environment', 'amenitycompany': 'amenity company', 'increaments': 'increments', 'accomplihsments': 'accomplishments', 'manygovernment': 'many government', 'panishments': 'punishments', 'elinment': 'eloinment', 'mendalin': 'mend alin', 'farmention': 'farm ention', 'preincrement': 'pre increment', 'postincrement': 'post increment', 'achviements': 'achievements', 'menditory': 'mandatory', 'emouluments': 'emoluments', 'stonemen': 'stone men', 'menmium': 'medium', 'entaglement': 'entanglement', 'integumen': 'integument', 'harassument': 'harassment', 'retairment': 'retainment', 'enviorement': 'environment', 'tormentous': 'torment ous', 'confiment': 'confident', 'enchroachment': 'encroachment', 'prelimenary': 'preliminary', 'fudamental': 'fundamental', 'instrumenot': 'instrument', 'icrement': 'increment', 'prodimently': 'prominently', 'meniss': 'menise', 'whoimplemented': 'who implemented', 'representment': 'rep resentment', 'startfragment': 'start fragment', 'endfragment': 'end fragment', ' documentarie ': ' documentaries ', 'requriments': 'requirements', 'constitutionaldevelopment': 'constitutional development', 'parlamentarians': 'parliamentarians', 'rumenova': 'rumen ova', 'argruments': 'arguments', 'findamental': 'fundamental', 'totalinvestment': 'total investment', 'gevernment': 'government', 'recmommend': 'recommend', 'appsmoment': 'apps moment', 'menstruual': 'menstrual', 'immplemented': 'implemented', 'engangement': 'engagement', 'invovement': 'involvement', 'returement': 'retirement', 'simentaneously': 'simultaneously', 'accompishments': 'accomplishments', 'menstraution': 'menstruation', 'experimently': 'experiment', 'abdimen': 'abdomen', 'cemenet': 'cement', 'propelment': 'propel ment', 'unamendable': 'un amendable', 'employmentnews': 'employment news', 'lawforcement': 'law forcement', 'menstuating': 'menstruating', 'fevelopment': 'development', 'reglamented': 'reg lamented', 'imrovment': 'improvement', 'recommening': 'recommending', 'sppliment': 'supplement', 'measument': 'measurement', 'reimbrusement': 'reimbursement', 'nutrament': 'nutriment', 'puniahment': 'punishment', 'subligamentous': 'sub ligamentous', 'comlementry': 'complementary', 'reteirement': 'retirement', 'envioronments': 'environments', 'haraasment': 'harassment', 'usagovernment': 'usa government', 'apartmentfinder': 'apartment finder', 'encironment': 'environment', 'metacompartment': 'meta compartment', 'augumentation': 'argumentation', 'dsymenorrhoea': 'dysmenorrhoea', 'nonabandonment': 'non abandonment', 'annoincement': 'announcement', 'menberships': 'memberships', 'gamenights': 'game nights', 'enliightenment': 'enlightenment', 'supplymentry': 'supplementary', 'parlamentary': 'parliamentary', 'duramen': 'dura men', 'hotelmanagement': 'hotel management', 'deartment': 'department', 'treatmentshelp': 'treatments help', 'attirements': 'attire ments', 'amendmending': 'amend mending', 'pseudomeningocele': 'pseudo meningocele', 'intrasegmental': 'intra segmental', 'treatmenent': 'treatment', 'infridgement': 'infringement', 'infringiment': 'infringement', 'recrecommend': 'rec recommend', 'entartaiment': 'entertainment', 'inplementing': 'implementing', 'indemendent': 'independent', 'tremendeous': 'tremendous', 'commencial': 'commercial', 'scomplishments': 'accomplishments', 'emplement': 'implement', 'dimensiondimensions': 'dimension dimensions', 'depolyment': 'deployment', 'conpartment': 'compartment', 'govnments': 'movements', 'menstrat': 'menstruate', 'accompplishments': 'accomplishments', 'enchacement': 'enchancement', 'developmenent': 'development', 'emmenagogues': 'emmenagogue', 'aggeement': 'agreement', 'elementsbond': 'elements bond', 'remenant': 'remnant', 'manamement': 'management', 'dimensonless': 'dimensionless', 'ointmentsointments': 'ointments ointments', 'achiements': 'achievements', 'recurtment': 'recurrent', 'gouverments': 'governments', 'docoment': 'document', 'programmingassignments': 'programming assignments', 'menifest': 'manifest', 'investmentguru': 'investment guru', 'deployements': 'deployments', 'plaement': 'placement', 'perliament': 'parliament', 'femenists': 'feminists', 'ecumencial': 'ecumenical', 'advamcements': 'advancements', 'refundment': 'refund ment', 'settlementtake': 'settlement take', 'mensrooms': 'mens rooms', 'productmanagement': 'product management', 'armenains': 'armenians', 'betweenmanagement': 'between management', 'difigurement': 'disfigurement', 'armenized': 'armenize', 'hurrasement': 'hurra sement', 'mamgement': 'management', 'momuments': 'monuments', 'eauipments': 'equipments', 'managemenet': 'management', 'treetment': 'treatment', 'webdevelopement': 'web developement', 'supplemenary': 'supplementary', 'encironmental': 'environmental', 'understandment': 'understand ment', 'enrollnment': 'enrollment', 'thinkstrategic': 'think strategic', 'thinkinh': 'thinking', 'softthinks': 'soft thinks', 'underthinking': 'under thinking', 'thinksurvey': 'think survey', 'whitelash': 'white lash', 'whiteheds': 'whiteheads', 'whitetning': 'whitening', 'whitegirls': 'white girls', 'whitewalkers': 'white walkers', 'manycountries': 'many countries', 'accomany': 'accompany', 'fromgermany': 'from germany', 'manychat': 'many chat', 'germanyl': 'germany', 'manyness': 'many ness', 'many4': 'many', 'digitizeindia': 'digitize india', 'indiarush': 'india rush', 'indiareads': 'india reads', 'telegraphindia': 'telegraph india', 'southindia': 'south india', 'airindia': 'air india', 'siliconindia': 'silicon india', 'indianleaders': 'indian leaders', 'fundsindia': 'funds india', 'indianarmy': 'indian army', 'technoindia': 'techno india', 'betterindia': 'better india', 'capesindia': 'capes india', 'rigetti': 'ligetti', 'vegetablr': 'vegetable', 'get90': 'get', 'magetta': 'maretta', 'nagetive': 'native', 'isunforgettable': 'is unforgettable', 'get630': 'get 630', 'gadgetpack': 'gadget pack', 'languagetool': 'language tool', 'bugdget': 'budget', 'africaget': 'africa get', 'abnegetive': 'abnegative', 'orangetheory': 'orange theory', 'getsmuggled': 'get smuggled', 'avegeta': 'ave geta', 'gettubg': 'getting', 'gadgetsnow': 'gadgets now', 'surgetank': 'surge tank', 'gadagets': 'gadgets', 'getallparts': 'get allparts', 'messenget': 'messenger', 'vegetarean': 'vegetarian', 'get1000': 'get 1000', 'getfinancing': 'get financing', 'getdrip': 'get drip', 'adstargets': 'ads targets', 'tgethr': 'together', 'vegetaries': 'vegetables', 'forgetfulnes': 'forgetfulness', 'fisgeting': 'fidgeting', 'budgetair': 'budget air', 'getdepersonalization': 'get depersonalization', 'negetively': 'negatively', 'gettibg': 'getting', 'nauget': 'naught', 'bugetti': 'bugatti', 'plagetum': 'plage tum', 'vegetabale': 'vegetable', 'changetip': 'change tip', 'blackwashing': 'black washing', 'blackpink': 'black pink', 'blackmoney': 'black money', 'blackmarks': 'black marks', 'blackbeauty': 'black beauty', 'unblacklisted': 'un blacklisted', 'blackdotes': 'black dotes', 'blackboxing': 'black boxing', 'blackpaper': 'black paper', 'blackpower': 'black power', 'latinamericans': 'latin americans', 'musigma': 'mu sigma', 'usict': 'ussct', 'indominus': 'in dominus', 'plus5': 'plus', 'russiagate': 'russia gate', 'russophobic': 'russophobiac', 'radijus': 'radius', 'cobustion': 'combustion', 'austrialians': 'australians', 'mylogenous': 'myogenous', 'raddus': 'radius', 'hetrogenous': 'heterogenous', 'greenhouseeffect': 'greenhouse effect', 'aquous': 'aqueous', 'taharrush': 'tahar rush', 'senousa': 'venous', 'cityairbus': 'city airbus', 'sponteneously': 'spontaneously', 'trustless': 't rustless', 'fusanosuke': 'fu sanosuke', 'isthmuses': 'isthmus es', 'lucideus': 'lucidum', 'overjustification': 'over justification', 'bindusar': 'bind usar', 'cousera': 'online education platform', 'musturbation': 'masturbation', 'infustry': 'industry', 'huswifery': 'a poem', 'rombous': 'bombous', 'disengenuously': 'disingenuously', 'sllybus': 'syllabus', 'celcious': 'delicious', 'cellsius': 'celsius', 'lethocerus': 'lethocerus', 'monogmous': 'monogamous', 'ballyrumpus': 'bally rumpus', 'koushika': 'koushika', 'vivipoarous': 'viviparous', 'ludiculous': 'ridiculous', 'sychronous': 'synchronous', 'industiry': 'industry', 'scuduse': 'scud use', 'babymust': 'baby must', 'simultqneously': 'simultaneously', 'exust': 'ex ust', 'notmusing': 'not musing', 'zamusu': 'amuse', 'tusaki': 'tu saki', 'marrakush': 'marrakesh', 'justcheaptickets': 'just cheaptickets', 'ayahusca': 'ayahausca', 'samousa': 'samosa', 'gusenberg': 'gutenberg', 'illustratuons': 'illustrations', 'extemporeneous': 'extemporaneous', 'mathusla': 'mathusala', 'confundus': 'con fundus', 'tusts': 'trusts', 'poisenious': 'poisonous', 'mevius': 'medius', 'inuslating': 'insulating', 'aroused21000': 'aroused 21000', 'wenzeslaus': 'wenceslaus', 'justinkase': 'justin kase', 'purushottampur': 'purushottam pur', 'citruspay': 'citrus pay', 'secutus': 'sects', 'austentic': 'austenitic', 'faceplusplus': 'face plusplus', 'aysnchronous': 'asynchronous', 'teamtreehouse': 'team treehouse', 'uncouncious': 'unconscious', 'priebuss': 'prie buss', 'consciousuness': 'consciousness', 'susubsoil': 'su subsoil', 'trimegistus': 'trismegistus', 'protopeterous': 'protopterous', 'trustworhty': 'trustworthy', 'ushually': 'usually', 'industris': 'industries', 'instantneous': 'instantaneous', 'superplus': 'super plus', 'shrusti': 'shruti', 'hindhus': 'hindus', 'outonomous': 'autonomous', 'reliegious': 'religious', 'kousakis': 'kou sakis', 'reusult': 'result', 'janusgraph': 'janus graph', 'palusami': 'palus ami', 'mussraff': 'muss raff', 'hukous': 'humous', 'photoacoustics': 'photo acoustics', 'kushanas': 'kusha nas', 'justdile': 'justice', 'massahusetts': 'massachusetts', 'uspset': 'upset', 'sustinet': 'sustinent', 'consicious': 'conscious', 'sadhgurus': 'sadh gurus', 'hystericus': 'hysteric us', 'visahouse': 'visa house', 'supersynchronous': 'super synchronous', 'posinous': 'rosinous', 'fernbus': 'fern bus', 'tiltbrush': 'tilt brush', 'glueteus': 'gluteus', 'posionus': 'poisons', 'freus': 'frees', 'zhuchengtyrannus': 'zhucheng tyrannus', 'savonious': 'sanious', 'cusjo': 'cusco', 'congusion': 'confusion', 'dejavus': 'dejavu s', 'uncosious': 'uncopious', 'previius': 'previous', 'counciousness': 'conciousness', 'lustorus': 'lustrous', 'sllyabus': 'syllabus', 'mousquitoes': 'mosquitoes', 'savvius': 'savvies', 'arceius': 'arcesius', 'prejusticed': 'prejudiced', 'requsitioned': 'requisitioned', 'deindustralization': 'deindustrialization', 'muscleblaze': 'muscle blaze', 'consciousx5': 'conscious', 'nitrogenious': 'nitrogenous', 'mauritious': 'mauritius', 'rigrously': 'rigorously', 'yutyrannus': 'yu tyrannus', 'muscualr': 'muscular', 'conscoiusness': 'consciousness', 'causians': 'crusians', 'workfusion': 'work fusion', 'puspak': 'pu spak', 'inspirus': 'inspires', 'illiustrations': 'illustrations', 'nobushi': 'no bushi', 'theuseof': 'thereof', 'suspicius': 'suspicious', 'intuous': 'virtuous', 'gaushalas': 'gaus halas', 'campusthrough': 'campus through', 'seriousity': 'seriosity', 'resustence': 'resistence', 'geminatus': 'geminates', 'disquss': 'discuss', 'nicholus': 'nicholas', 'husnai': 'hussar', 'diiscuss': 'discuss', 'diffussion': 'diffusion', 'phusicist': 'physicist', 'ernomous': 'enormous', 'khushali': 'khushal i', 'heitus': 'leitus', 'cracksbecause': 'cracks because', 'nautlius': 'nautilus', 'trausted': 'trusted', 'dardandus': 'dardanus', 'megatapirus': 'mega tapirus', 'clusture': 'culture', 'vairamuthus': 'vairamuthu s', 'disclousre': 'disclosure', 'industrilaization': 'industrialization', 'musilms': 'muslims', 'australia9': 'australian', 'causinng': 'causing', 'ibdustries': 'industries', 'searious': 'serious', 'coolmuster': 'cool muster', 'sissyphus': 'sisyphus', ' justificatio ': 'justification', 'antihindus': 'anti hindus', 'moduslink': 'modus link', 'zymogenous': 'zymogen ous', 'prospeorus': 'prosperous', 'retrocausality': 'retro causality', 'fusiongps': 'fusion gps', 'mouseflow': 'mouse flow', 'bootyplus': 'booty plus', 'itylus': 'i tylus', 'olnhausen': 'olshausen', 'suspeect': 'suspect', 'entusiasta': 'enthusiast', 'fecetious': 'facetious', 'bussiest': 'fussiest', 'draconius': 'draconis', 'requsite': 'requisite', 'nauseatic': 'nausea tic', 'brusssels': 'brussels', 'repurcussion': 'repercussion', 'jeisus': 'jesus', 'philanderous': 'philander ous', 'muslisms': 'muslims', 'august2017': 'august 2017', 'calccalculus': 'calc calculus', 'unanonymously': 'un anonymously', 'imaprtus': 'impetus', 'carnivorus': 'carnivorous', 'corypheus': 'coryphees', 'austronauts': 'astronauts', 'neucleus': 'nucleus', 'housepoor': 'house poor', 'rescouses': 'responses', 'tagushi': 'tagus hi', 'hyperfocusing': 'hyper focusing', 'nutriteous': 'nutritious', 'chylus': 'chylous', 'preussure': 'pressure', 'outfocus': 'out focus', 'hanfus': 'hannus', 'rustyrose': 'rusty rose', 'vibhushant': 'vibhushan t', 'conciousnes': 'conciousness', 'venus25': 'venus', 'sedataious': 'seditious', 'promuslim': 'pro muslim', 'statusguru': 'status guru', 'yousician': 'musician', 'transgenus': 'trans genus', 'pushbullet': 'push bullet', 'jeesyllabus': 'jee syllabus', 'complusary': 'compulsory', 'holocoust': 'holocaust', 'careerplus': 'career plus', 'lllustrate': 'illustrate', 'musino': 'musion', 'phinneus': 'phineus', 'usedtoo': 'used too', 'justbasic': 'just basic', 'webmusic': 'web music', 'trustkit': 'trust kit', 'industrzgies': 'industries', 'rubustness': 'robustness', 'missuses': 'miss uses', 'bustees': 'bus tees', 'justyfy': 'justify', 'pegusus': 'pegasus', 'industrybuying': 'industry buying', 'advantegeous': 'advantageous', 'kotatsus': 'kotatsu s', 'justcreated': 'just created', 'simultameously': 'simultaneously', 'husoone': 'huso one', 'twiceusing': 'twice using', 'cetusplay': 'cetus play', 'sqamous': 'squamous', 'claustophobic': 'claustrophobic', 'kaushika': 'kaushik a', 'dioestrus': 'di oestrus', 'degenerous': 'de generous', 'neculeus': 'nucleus', 'cutaneously': 'cu taneously', 'alamotyrannus': 'alamo tyrannus', 'ivanious': 'avanious', 'arceous': 'araceous', 'flixbus': 'flix bus', 'caausing': 'causing', 'publious': 'publius', 'juilus': 'julius', 'australianism': 'australian ism', 'vetronus': 'verrons', 'nonspontaneous': 'non spontaneous', 'calcalus': 'calculus', 'commudus': 'commodus', 'rheusus': 'rhesus', 'syallubus': 'syllabus', 'qurush': 'qu rush', 'athiust': 'athirst', 'conclusionless': 'conclusion less', 'usertesting': 'user testing', 'redius': 'radius', 'austrolia': 'australia', 'sllaybus': 'syllabus', 'toponymous': 'top onymous', 'businiss': 'business', 'hyperthalamus': 'hyper thalamus', 'clause55': 'clause', 'cosicous': 'conscious', 'sushena': 'saphena', 'luscinus': 'luscious', 'prussophile': 'russophile', 'jeaslous': 'jealous', 'austrelia': 'australia', 'contiguious': 'contiguous', 'subconsciousnesses': 'sub consciousnesses', ' jusification ': 'justification', 'dilusion': 'delusion', 'anticoncussive': 'anti concussive', 'disngush': 'disgust', 'constiously': 'consciously', 'filabustering': 'filibustering', 'gapbuster': 'gap buster', 'insectivourous': 'insectivorous', 'glocuse': 'louse', 'antritrust': 'antitrust', 'thisaustralian': 'this australian', 'fusiondrive': 'fusion drive', 'nuclus': 'nucleus', 'abussive': 'abusive', 'mustang1': 'mustangs', 'inradius': 'in radius', 'polonious': 'polonius', 'ofkulbhushan': 'of kulbhushan', 'homosporous': 'homos porous', 'circumradius': 'circum radius', 'atlous': 'atrous', 'insustry': 'industry', 'campuswith': 'campus with', 'beacsuse': 'because', 'concuous': 'conscious', 'nonhindus': 'non hindus', 'carnivourous': 'carnivorous', 'tradeplus': 'trade plus', 'jeruselam': 'jerusalem', 'musuclar': 'muscular', 'deangerous': 'dangerous', 'disscused': 'discussed', 'industdial': 'industrial', 'sallatious': 'fallacious', 'rohmbus': 'rhombus', 'golusu': 'gol usu', 'minangkabaus': 'minangkabau s', 'mustansiriyah': 'mustansiriya h', 'anomymously': 'anonymously', 'abonymously': 'anonymously', 'indrustry': 'industry', 'musharrf': 'musharraf', 'workouses': 'workhouses', 'sponataneously': 'spontaneously', 'anmuslim': 'an muslim', 'syallbus': 'syllabus', 'presumptuousnes': 'presumptuousness', 'thaedus': 'thaddus', 'industey': 'industry', 'hkust': 'hust', 'kousseri': 'kousser i', 'mousestats': 'mouses tats', 'simantaneously': 'simultaneously', 'austertana': 'auster tana', 'infussions': 'infusions', 'coclusion': 'conclusion', 'sustainabke': 'sustainable', 'tusami': 'tu sami', 'anonimously': 'anonymously', 'usebase': 'use base', 'balanoglossus': 'balanoglossus', 'unglaus': 'ung laus', 'ignoramouses': 'ignoramuses', 'snuus': 'snugs', 'reusibility': 'reusability', 'straussianism': 'straussian ism', 'simoultaneously': 'simultaneously', 'realbonus': 'real bonus', 'nuchakus': 'nunchakus', 'annonimous': 'anonymous', 'manuscriptology': 'manuscript ology', 'difusse': 'diffuse', 'pliosaurus': 'pliosaur us', 'cushelle': 'cush elle', 'catallus': 'catullus', 'confousing': 'confusing', 'enthusiasmless': 'enthusiasm less', 'tetherusd': 'tethered', 'josephius': 'josephus', 'jusrlt': 'just', 'simutaneusly': 'simultaneously', 'mountaneous': 'mountainous', 'badonicus': 'sardonicus', 'muccus': 'mucous', 'nicus': 'nidus', 'austinlizards': 'austin lizards', 'errounously': 'erroneously', 'australua': 'australia', 'sylaabus': 'syllabus', 'dusyant': 'distant', 'javadiscussion': 'java discussion', 'megabuses': 'mega buses', 'danergous': 'dangerous', 'contestious': 'contentious', 'exause': 'excuse', 'muscluar': 'muscular', 'avacous': 'vacuous', 'ingenhousz': 'ingenious', 'holocausting': 'holocaust ing', 'pakustan': 'pakistan', 'purusharthas': 'purushartha', 'bapus': 'bapu s', 'useul': 'useful', 'pretenious': 'pretentious', 'homogeneus': 'homogeneous', 'bhlushes': 'blushes', 'saggittarius': 'sagittarius', 'sportsusa': 'sports usa', 'kerataconus': 'keratoconus', 'infrctuous': 'infectuous', 'anonoymous': 'anonymous', 'ridicjlously': 'ridiculously', 'worldbusiness': 'world business', 'hollcaust': 'holocaust', 'dusra': 'dura', 'meritious': 'meritorious', 'sauskes': 'causes', 'inudustry': 'industry', 'frustratd': 'frustrate', 'hypotenous': 'hypogenous', 'dushasana': 'dush asana', 'saadus': 'status', 'keratokonus': 'keratoconus', 'jarrus': 'harrus', 'neuseous': 'nauseous', 'simutanously': 'simultaneously', 'diphosphorus': 'di phosphorus', 'sulprus': 'surplus', 'hasidus': 'hasid us', 'suspenive': 'suspensive', 'illlustrator': 'illustrator', 'userflows': 'user flows', 'intrusivethoughts': 'intrusive thoughts', 'countinous': 'continuous', 'gpusli': 'gusli', 'calculus1': 'calculus', 'bushiri': 'bushire', 'torvosaurus': 'torosaurus', 'chestbusters': 'chest busters', 'satannus': 'sat annus', 'falaxious': 'fallacious', 'obnxious': 'obnoxious', 'tranfusions': 'transfusions', 'playmagnus': 'play magnus', 'epicodus': 'episodes', 'hypercubus': 'hypercubes', 'programmebecause': 'programme because', 'indiginious': 'indigenous', 'housban': 'housman', 'iusso': 'kusso', 'annilingus': 'anilingus', 'nennus': 'genius', 'pussboy': 'puss boy', 'hindusthanis': 'hindustanis', 'lndustrial': 'industrial', 'tyrannously': 'tyrannous', 'susanoomon': 'susanoo mon', 'colmbus': 'columbus', 'sussessful': 'successful', 'ousmania': 'ous mania', 'ilustrating': 'illustrating', 'famousbirthdays': 'famous birthdays', 'suspectance': 'suspect ance', 'extroneous': 'extraneous', 'teethbrush': 'teeth brush', 'abcmouse': 'abc mouse', 'doesgauss': 'does gauss', 'insipudus': 'insipidus', 'movielush': 'movie lush', 'rustichello': 'rustic hello', 'firdausiya': 'firdausi ya', 'checkusers': 'check users', 'householdware': 'household ware', 'prosporously': 'prosperously', 'stelouse': 'ste louse', 'obfuscaton': 'obfuscation', 'amorphus': 'amorph us', 'trustworhy': 'trustworthy', 'celsious': 'cesious', 'dangorous': 'dangerous', 'anticancerous': 'anti cancerous', 'cousi ': 'cousin ', 'austroloid': 'australoid', 'fergussion': 'percussion', 'andkyokushin': 'and kyokushin', 'cousan': 'cousin', 'huskystar': 'hu skystar', 'retrovisus': 'retrovirus', 'becausr': 'because', 'jerusalsem': 'jerusalem', 'motorious': 'notorious', 'industrilised': 'industrialised', 'powerballsusa': 'powerballs usa', 'monoceious': 'monoecious', 'batteriesplus': 'batteries plus', 'nonviscuous': 'nonviscous', 'industion': 'induction', 'bussinss': 'bussings', 'userbags': 'user bags', 'jlius': 'julius', 'thausand': 'thousand', 'plustwo': 'plus two', 'defpush': 'def push', 'subconcussive': 'sub concussive', 'muslium': 'muslim', 'industrilization': 'industrialization', 'maurititus': 'mauritius', 'uslme': 'some', 'susgaon': 'surgeon', 'pantherous': 'panther ous', 'antivirius': 'antivirus', 'trustclix': 'trust clix', 'silumtaneously': 'simultaneously', 'icompus': 'corpus', 'atonomous': 'autonomous', 'reveuse': 'reve use', 'legumnous': 'leguminous', 'syllaybus': 'syllabus', 'louspeaker': 'loudspeaker', 'susbtraction': 'substraction', 'virituous': 'virtuous', 'disastrius': 'disastrous', 'jerussalem': 'jerusalem', 'industrailzed': 'industrialized', 'recusion': 'recushion', 'simultenously': 'simultaneously', 'pulphus': 'pulpous', 'harbaceous': 'herbaceous', 'phlegmonous': 'phlegmon ous', 'use38': 'use', 'jusify': 'justify', 'instatanously': 'instantaneously', 'tetramerous': 'tetramer ous', 'usedvin': 'used vin', 'sagittarious': 'sagittarius', 'mausturbate': 'masturbate', 'subcautaneous': 'subcutaneous', 'dangergrous': 'dangerous', 'sylabbus': 'syllabus', 'hetorozygous': 'heterozygous', 'ignasius': 'ignacius', 'businessbor': 'business bor', 'bhushi': 'thushi', 'moussolini': 'mussolini', 'usucaption': 'usu caption', 'customzation': 'customization', 'cretinously': 'cretinous', 'genuiuses': 'geniuses', 'moushmee': 'mousmee', 'neigous': 'nervous', 'infrustructre': 'infrastructure', 'ilusha': 'ilesha', 'suconciously': 'unconciously', 'stusy': 'study', 'mustectomy': 'mastectomy', 'farmhousebistro': 'farmhouse bistro', 'instantanous': 'instantaneous', 'justforex': 'just forex', 'indusyry': 'industry', 'mustabating': 'must abating', 'uninstrusive': 'unintrusive', 'customshoes': 'customs hoes', 'homageneous': 'homogeneous', 'empericus': 'imperious', 'demisexuality': 'demi sexuality', 'transexualism': 'transsexualism', 'sexualises': 'sexualise', 'demisexuals': 'demisexual', 'sexuly': 'sexily', 'pornosexuality': 'porno sexuality', 'sexond': 'second', 'sexxual': 'sexual', 'asexaul': 'asexual', 'sextactic': 'sex tactic', 'sexualityism': 'sexuality ism', 'monosexuality': 'mono sexuality', 'intwrsex': 'intersex', 'hypersexualize': 'hyper sexualize', 'homosexualtiy': 'homosexuality', 'examsexams': 'exams exams', 'sexmates': 'sex mates', 'sexyjobs': 'sexy jobs', 'sexitest': 'sexiest', 'fraysexual': 'fray sexual', 'sexsurrogates': 'sex surrogates', 'sexuallly': 'sexually', 'gamersexual': 'gamer sexual', 'greysexual': 'grey sexual', 'omnisexuality': 'omni sexuality', 'hetereosexual': 'heterosexual', 'productsexamples': 'products examples', 'sexgods': 'sex gods', 'semisexual': 'semi sexual', 'homosexulity': 'homosexuality', 'sexeverytime': 'sex everytime', 'neurosexist': 'neuro sexist', 'worldquant': 'world quant', 'freshersworld': 'freshers world', 'smartworld': 'sm artworld', 'mistworlds': 'mist worlds', 'boothworld': 'booth world', 'ecoworld': 'eco world', 'underworldly': 'under worldly', 'worldrank': 'world rank', 'clearworld': 'clear world', 'rimworld': 'rim world', 'cryptoworld': 'crypto world', 'machineworld': 'machine world', 'worldwideley': 'worldwide ley', 'capuletwant': 'capulet want', 'bhagwanti': 'bhagwant i', 'unwanted72': 'unwanted 72', 'wantrank': 'want rank', 'willhappen': 'will happen', 'thateasily': 'that easily', 'whatevidence': 'what evidence', 'metaphosphates': 'meta phosphates', 'exilarchate': 'exilarch ate', 'aulphate': 'sulphate', 'whateducation': 'what education', 'persulphates': 'per sulphates', 'disulphate': 'di sulphate', 'picosulphate': 'pico sulphate', 'tetraosulphate': 'tetrao sulphate', 'prechinese': 'pre chinese', 'hellochinese': 'hello chinese', 'muchdeveloped': 'much developed', 'stomuch': 'stomach', 'whatmakes': 'what makes', 'lensmaker': 'lens maker', 'eyemake': 'eye make', 'techmakers': 'tech makers', 'cakemaker': 'cake maker', 'makeup411': 'makeup 411', 'objectmake': 'object make', 'crazymaker': 'crazy maker', 'makedonian': 'macedonian', 'makeschool': 'make school', 'anxietymake': 'anxiety make', 'makeshifter': 'make shifter', 'countryball': 'country ball', 'whichcountry': 'which country', 'countryhow': 'country how', 'zenfone': 'zen fone', 'electroneum': 'electro neum', 'demonetisation': 'demonetization', 'onecoin': 'one coin', 'demonetizing': 'demonetized', 'iphone7': 'iphone 7', 'iphone6': 'iphone', 'microneedling': 'micro needling', 'monegasques': 'monegasque s', 'demonetised': 'demonetized', 'everyonediestm': 'everyonedies tm', 'teststerone': 'testosterone', 'donedone': 'done done', 'papermoney': 'paper money', 'sasabone': 'sasa bone', 'blackphone': 'black phone', 'bonechiller': 'bone chiller', 'moneyfront': 'money front', 'workdone': 'work done', 'roxycodone': 'r oxycodone', 'moneycard': 'money card', 'fantocone': 'fantocine', 'eletronegativity': 'electronegativity', 'mellophones': 'mellophone s', 'isotones': 'iso tones', 'donesnt': 'doesnt', 'thereanyone': 'there anyone', 'electronegativty': 'electronegativity', 'commissiioned': 'commissioned', 'earvphone': 'earphone', 'condtioners': 'conditioners', 'demonetistaion': 'demonetization', 'ballonets': 'ballo nets', 'doneclaim': 'done claim', 'alimoney': 'alimony', 'iodopovidone': 'iodo povidone', 'bonesetters': 'bone setters', 'componendo': 'compon endo', 'probationees': 'probationers', 'one300': 'one 300', 'nonelectrolyte': 'non electrolyte', 'ozonedepletion': 'ozone depletion', 'stonehart': 'stone hart', 'vodafone2': 'vodafones', 'chaparone': 'chaperone', 'noonein': 'noo nein', 'frosione': 'erosion', 'pentanone': 'penta none', 'poneglyphs': 'pone glyphs', 'cyclohexenone': 'cyclohexanone', 'marlstone': 'marls tone', 'androneda': 'andromeda', 'iphone8': 'iphone', 'acidtone': 'acid tone', 'noneconomically': 'non economically', 'honeyfund': 'honey fund', 'germanophone': 'germanophobe', 'democratizationed': 'democratization ed', 'haoneymoon': 'honeymoon', 'someonewith': 'some onewith', 'hexanone': 'hexa none', 'bonespur': 'bones pur', 'sisterzoned': 'sister zoned', 'hasanyone': 'has anyone', 'stonepelters': 'stone pelters', 'chronexia': 'chronaxia', 'brotherzone': 'brother zone', 'brotherzoned': 'brother zoned', 'fonecare': 'f onecare', 'nonexsistence': 'nonexistence', 'conents': 'contents', 'phonecases': 'phone cases', 'commissionerates': 'commissioner ates', 'activemoney': 'active money', 'dingtone': 'ding tone', 'wheatestone': 'wheatstone', 'chiropractorone': 'chiropractor one', 'heeadphones': 'headphones', 'maimonedes': 'maimonides', 'onepiecedeals': 'onepiece deals', 'oneblade': 'one blade', 'venetioned': 'venetianed', 'sunnyleone': 'sunny leone', 'prendisone': 'prednisone', 'anglosaxophone': 'anglo saxophone', 'blackphones': 'black phones', 'jionee': 'jinnee', 'chromonema': 'chromo nema', 'iodoketones': 'iodo ketones', 'demonetizations': 'demonetization', 'aomeone': 'someone', 'trillonere': 'trillones', 'abandonee': 'abandon', 'mastercolonel': 'master colonel', 'fronend': 'friend', 'wildstone': 'wilds tone', 'patitioned': 'petitioned', 'lonewolfs': 'lone wolfs', 'spectrastone': 'spectra stone', 'dishonerable': 'dishonorable', 'poisiones': 'poisons', 'condioner': 'conditioner', 'unpermissioned': 'unper missioned', 'friedzone': 'fried zone', 'umumoney': 'umu money', 'anyonestudied': 'anyone studied', 'dictioneries': 'dictionaries', 'nosebone': 'nose bone', 'ofvodafone': 'of vodafone', 'yumstone': 'yum stone', 'oxandrolonesteroid': 'oxandrolone steroid', 'mifeprostone': 'mifepristone', 'pheramones': 'pheromones', 'sinophone': 'sinophobe', 'peloponesian': 'peloponnesian', 'michrophone': 'microphone', 'commissionets': 'commissioners', 'methedone': 'methadone', 'cobditioners': 'conditioners', 'urotone': 'protone', 'smarthpone': 'smartphone', 'conectu': 'connect you', 'beloney': 'boloney', 'comfortzone': 'comfort zone', 'testostersone': 'testosterone', 'camponente': 'component', 'idonesia': 'indonesia', 'dolostones': 'dolostone', 'psiphone': 'psi phone', 'ceftriazone': 'ceftriaxone', 'feelonely': 'feel onely', 'monetation': 'moderation', 'activationenergy': 'activation energy', 'moneydriven': 'money driven', 'staionery': 'stationery', 'zoneflex': 'zone flex', 'moneycash': 'money cash', 'conectiin': 'connection', 'wannaone': 'wanna one', 'pictones': 'pict ones', 'demonentization': 'demonetization', 'phenonenon': 'phenomenon', 'evenafter': 'even after', 'sevenfriday': 'seven friday', 'devendale': 'evendale', 'theeventchronicle': 'the event chronicle', 'seventysomething': 'seventy something', 'sevenpointed': 'seven pointed', 'richfeel': 'rich feel', 'overfeel': 'over feel', 'feelingstupid': 'feeling stupid', 'photofeeler': 'photo feeler', 'feelomgs': 'feelings', 'feelinfs': 'feelings', 'playerunknown': 'player unknown', 'knowlefge': 'knowledge', 'knowledgd': 'knowledge', 'knowledeg': 'knowledge', 'knowble': 'knowle', 'howknow': 'howk now', 'knowledgewoods': 'knowledge woods', 'knownprogramming': 'known programming', 'selfknowledge': 'self knowledge', 'knowldage': 'knowledge', 'knowyouve': 'know youve', 'aknowlege': 'knowledge', 'audetteknown': 'audette known', 'knowlegdeable': 'knowledgeable', 'trueoutside': 'true outside', 'saynthesize': 'synthesize', 'essaytyper': 'essay typer', 'meesaya': 'mee saya', 'rasayanam': 'rasayan am', 'fanessay': 'fan essay', 'momsays': 'moms ays', 'sayying': 'saying', 'saydaw': 'say daw', 'theyreally': 'they really', 'gayifying': 'gayed up with homosexual love', 'gayke': 'gay online retailers', 'lingayatism': 'lingayat', 'macapugay': 'macaulay', 'jewsplain': 'jews plain', 'banggood': 'bang good', 'goodfriends': 'good friends', 'goodfirms': 'good firms', 'dogooder': 'do gooder', 'stillshots': 'stills hots', 'stillsuits': 'still suits', 'panromantic': 'pan romantic', 'paracommando': 'para commando', 'romantize': 'romanize', 'manupulative': 'manipulative', 'manjha': 'mania', 'mankrit': 'mank rit', 'heteroromantic': 'hetero romantic', 'pulmanery': 'pulmonary', 'manpads': 'man pads', 'supermaneuverable': 'super maneuverable', 'mandatkry': 'mandatory', 'armanents': 'armaments', 'manipative': 'mancipative', 'himanity': 'humanity', 'maneuever': 'maneuver', 'kumarmangalam': 'kumar mangalam', 'brahmanwadi': 'brahman wadi', 'exserviceman': 'ex serviceman', 'managewp': 'managed', 'manies': 'many', 'recordermans': 'recorder mans', 'feymann': 'heymann', 'salemmango': 'salem mango', 'manufraturing': 'manufacturing', 'sreeman': 'freeman', 'tamanaa': 'tamanac', 'chlamydomanas': 'chlamydomonas', 'comandant': 'commandant', 'huemanity': 'humanity', 'manaagerial': 'managerial', 'lithromantics': 'lith romantics', 'geramans': 'germans', 'nagamandala': 'naga mandala', 'humanitariarism': 'humanitarianism', 'wattman': 'watt man', 'salesmanago': 'salesman ago', 'washwoman': 'wash woman', 'rammandir': 'ram mandir', 'nomanclature': 'nomenclature', 'haufman': 'kaufman', 'prefomance': 'performance', 'ramanunjan': 'ramanujan', 'freemansonry': 'freemasonry', 'supermaneuverability': 'super maneuverability', 'manstruate': 'menstruate', 'tarumanagara': 'taruma nagara', 'romancetale': 'romance tale', 'heteromantic': 'hete romantic', 'terimanals': 'terminals', 'womansplaining': 'feminist', 'performancelearning': 'performance learning', 'sociomantic': 'sciomantic', 'batmanvoice': 'batman voice', 'performancetesting': 'performance testing', 'manorialism': 'manorial ism', 'newscommando': 'news commando', 'entwicklungsroman': 'entwicklungs roman', 'kunstlerroman': 'kunstler roman', 'bodhidharman': 'bodhidharma', 'howmaney': 'how many', 'manufucturing': 'manufacturing', 'remmaning': 'remaining', 'rangeman': 'range man', 'mythomaniac': 'mythomania', 'katgmandu': 'katmandu', 'superowoman': 'superwoman', 'rahmanland': 'rahman land', 'dormmanu': 'dormant', 'geftman': 'gentman', 'manufacturig': 'manufacturing', 'bramanistic': 'brahmanistic', 'padmanabhanagar': 'padmanabhan agar', 'homoromantic': 'homo romantic', 'femanists': 'feminists', 'demihuman': 'demi human', 'manrega': 'manresa', 'pasmanda': 'pas manda', 'manufacctured': 'manufactured', 'remaninder': 'remainder', 'marimanga': 'mari manga', 'sloatman': 'sloat man', 'manlet': 'man let', 'perfoemance': 'performance', 'mangolian': 'mongolian', 'mangekyu': 'mange kyu', 'mansatory': 'mandatory', 'managemebt': 'management', 'manufctures': 'manufactures', 'bramanical': 'brahmanical', 'manaufacturing': 'manufacturing', 'lakhsman': 'lakhs man', 'sarumans': 'sarum ans', 'mangalasutra': 'mangalsutra', 'germanised': 'german ised', 'managersworking': 'managers working', 'cammando': 'commando', 'mandrillaris': 'mandrill aris', 'emmanvel': 'emmarvel', 'manupalation': 'manipulation', 'welcomeromanian': 'welcome romanian', 'humanfemale': 'human female', 'mankirt': 'mankind', 'haffmann': 'hoffmann', 'demantion': 'detention', 'suparwoman': 'superwoman', 'parasuramans': 'parasuram ans', 'sulmann': 'suilmann', 'shubman': 'subman', 'manspread': 'man spread', 'mandingan': 'mandingan', 'mandalikalu': 'mandalika lu', 'manufraturer': 'manufacturer', 'wedgieman': 'wedgie man', 'manwues': 'manages', 'humanzees': 'human zees', 'steymann': 'stedmann', 'jobberman': 'jobber man', 'maniquins': 'mani quins', 'biromantical': 'bi romantical', 'rovman': 'roman', 'pyromantic': 'pyro mantic', 'tastaman': 'rastaman', 'spoolman': 'spool man', 'subramaniyan': 'subramani yan', 'abhimana': 'hinduism', 'manholding': 'man holding', 'seviceman': 'serviceman', 'womansplained': 'womans plained', 'manniya': 'mania', 'bhraman': 'braman', 'laakman': 'layman', 'mansturbate': 'masturbate', 'sulamaniya': 'sulamani ya', 'demanters': 'decanters', 'postmanare': 'postman are', 'rstman': 'rotman', 'permanentjobs': 'permanent jobs', 'allmang': 'all mang', 'tradecommander': 'trade commander', 'basedstickman': 'based stickman', 'deshabhimani': 'desha bhimani', 'manslamming': 'mans lamming', 'brahmanwad': 'brahman wad', 'fundemantally': 'fundamentally', 'supplemantary': 'supplementary', 'egomanias': 'ego manias', 'manvantar': 'manvantara', 'spymania': 'spy mania', 'mangonada': 'mango nada', 'manthras': 'mantras', 'humanpark': 'human park', 'manhuas': 'mahuas', 'manterrupting': 'interrupting', 'dermatillomaniac': 'dermatillomania', 'performancies': 'performances', 'manipulant': 'manipulate', 'painterman': 'painter man', 'mangalik': 'manglik', 'neurosemantics': 'neuro semantics', 'discrimantion': 'discrimination', 'mongodump': 'mongo dump', 'roadgods': 'road gods', 'oligodendraglioma': 'oligodendroglioma', 'janewright': 'jane wright', ' righten ': ' tighten ', 'brightiest': 'brightest', 'frighter': 'fighter', 'righteouness': 'righteousness', 'triangleright': 'triangle right', 'brightspace': 'brights pace', 'techinacal': 'technical', 'chinawares': 'china wares', 'vancouever': 'vancouver', 'cheverlet': 'cheveret', 'deverstion': 'diversion', 'everbodys': 'everybody', 'dramafever': 'drama fever', 'reverificaton': 'reverification', 'canterlever': 'canter lever', 'keywordseverywhere': 'keywords everywhere', 'neverunlearned': 'never unlearned', 'everyfirst': 'every first', 'neverhteless': 'nevertheless', 'clevercoyote': 'clever coyote', 'irrevershible': 'irreversible', 'achievership': 'achievers hip', 'easedeverything': 'eased everything', 'youbever': 'you bever', 'everperson': 'ever person', 'everydsy': 'everyday', 'whemever': 'whenever', 'everyonr': 'everyone', 'severiity': 'severity', 'narracist': 'narcissist', 'racistly': 'racist', 'takesuch': 'take such', 'mystakenly': 'mistakenly', 'shouldntake': 'shouldnt take', 'kalitake': 'kali take', 'msitake': 'mistake', 'straitstimes': 'straits times', 'timefram': 'timeframe', 'watchtime': 'watch time', 'timetraveling': 'timet raveling', 'peactime': 'peacetime', 'timetabe': 'timetable', 'cooktime': 'cook time', 'blocktime': 'block time', 'timesjobs': 'times jobs', 'timesence': 'times ence', 'touchtime': 'touch time', 'timeloop': 'time loop', 'subcentimeter': 'sub centimeter', 'timejobs': 'time jobs', 'guardtime': 'guard time', 'realtimepolitics': 'realtime politics', 'loadingtimes': 'loading times', 'timesnow': '24-hour english news channel in india', 'timesspark': 'times spark', 'timetravelling': 'timet ravelling', 'antimeter': 'anti meter', 'timewaste': 'time waste', 'cryptochristians': 'crypto christians', 'whatcould': 'what could', 'becomesdouble': 'becomes double', 'deathbecomes': 'death becomes', 'youbecome': 'you become', 'greenseer': 'people who possess the magical ability', 'rseearch': 'research', 'homeseek': 'home seek', 'starseeders': 'star seeders', 'seekingmillionaire': 'seeking millionaire', 'see\u202c': 'see', 'seeies': 'series', 'codeagon': 'code agon', 'royago': 'royal', 'dragonkeeper': 'dragon keeper', 'mcgreggor': 'mcgregor', 'catrgory': 'category', 'dragonknight': 'dragon knight', 'antergos': 'anteros', 'togofogo': 'togo fogo', 'mongorestore': 'mongo restore', 'gorgops': 'gorgons', 'withgoogle': 'with google', 'goundar': 'gondar', 'algorthmic': 'algorithmic', 'goatnuts': 'goat nuts', 'vitilgo': 'vitiligo', 'polygony': 'poly gony', 'digonals': 'diagonals', 'luxemgourg': 'luxembourg', 'ucsandiego': 'uc sandiego', 'ringostat': 'ringo stat', 'takingoff': 'taking off', 'mongoimport': 'mongo import', 'alggorithms': 'algorithms', 'negotiatior': 'negotiation', 'gomovies': 'go movies', 'withgott': 'without', 'categoried': 'categories', 'stocklogos': 'stock logos', 'pedogogical': 'pedological', 'wedugo': 'wedge', 'golddig': 'gold dig', 'goldengroup': 'golden group', 'merrigo': 'merligo', 'googlemapsapi': 'googlemaps api', 'goldmedal': 'gold medal', 'golemized': 'polemized', 'caligornia': 'california', 'unergonomic': 'un ergonomic', 'faegon': 'wagon', 'vertigos': 'vertigo s', 'trigonomatry': 'trigonometry', 'hypogonadic': 'hypogonadia', 'mogolia': 'mongolia', 'governmaent': 'government', 'ergotherapy': 'ergo therapy', 'bogosort': 'bogo sort', 'goalwise': 'goal wise', 'alogorithms': 'algorithms', 'mercadopago': 'mercado pago', 'rivigo': 'technology-enabled logistics company', 'govshutdown': 'gov shutdown', 'gorlfriend': 'girlfriend', 'stategovt': 'state govt', 'chickengonia': 'chicken gonia', 'yegorovich': 'yegorov ich', 'regognitions': 'recognitions', 'gorichen': 'gori chen mountain', 'goegraphies': 'geographies', 'gothras': 'goth ras', 'belagola': 'bela gola', 'snapragon': 'snapdragon', 'oogonial': 'oogonia l', 'amigofoods': 'amigo foods', 'sigorn': 'son of styr', 'algorithimic': 'algorithmic', 'innermongolians': 'inner mongolians', 'arangodb': 'arango db', 'zigolo': 'gigolo', 'regognized': 'recognized', 'moongot': 'moong ot', 'goldquest': 'gold quest', 'catagorey': 'category', 'got7': 'got', 'jetbingo': 'jet bingo', 'dragonchain': 'dragon chain', 'catwgorized': 'categorized', 'gogoro': 'gogo ro', 'tobagoans': 'tobago ans', 'digonal': 'diagonal', 'algoritmic': 'algorismic', 'dragonflag': 'dragon flag', 'indigoflight': 'indigo flight', 'governening': 'governing', 'ergosphere': 'ergo sphere', 'pingo5': 'pingo', 'montogo': 'montego', 'jigolo': 'gigolo', 'phythagoras': 'pythagoras', 'forgottenfaster': 'forgotten faster', 'stargold': 'a hindi movie channel', 'googolplexain': 'googolplexian', 'corpgov': 'corperate government', 'govtribe': 'provides real-time federal contracting market intel', 'dragonglass': 'dragon glass', 'gorakpur': 'gorakhpur', 'mangopay': 'mango pay', 'chigoe': 'sub-tropical climates', 'bingobox': 'an investment company', '走go': 'go', 'followingorder': 'following order', 'pangolinminer': 'pangolin miner', 'negosiation': 'negotiation', 'lexigographers': 'lexicographers', 'algorithom': 'algorithm', 'unforgottable': 'unforgettable', 'wellsfargoemail': 'wellsfargo email', 'daigonal': 'diagonal', 'pangoro': 'cantankerous pokemon', 'negotiotions': 'negotiations', 'swissgolden': 'swiss golden', 'google4': 'google', 'agoraki': 'ago raki', 'garthago': 'carthago', 'stegosauri': 'stegosaurus', 'ergophobia': 'ergo phobia', 'bigolive': 'big olive', 'bittergoat': 'bitter goat', 'naggots': 'faggots', 'googology': 'online encyclopedia', 'algortihms': 'algorithms', 'bengolis': 'bengalis', 'fingols': 'finnish people are supposedly descended from mongols', 'savethechildren': 'save thechildren', 'stopings': 'stoping', 'stopsits': 'stop sits', 'stopsigns': 'stop signs', 'galastop': 'galas top', 'pokestops': 'pokes tops', 'forcestop': 'forces top', 'hopstop': 'hops top', 'stoppingexercises': 'stopping exercises', 'coinstop': 'coins top', 'stoppef': 'stopped', 'workaway': 'work away', 'snazzyway': 'snazzy way', 'rewardingways': 'rewarding ways', 'cloudways': 'cloud ways', 'brainsway': 'brains way', 'nesraway': 'nearaway', 'alwayshired': 'always hired', 'expessway': 'expressway', 'syncway': 'sync way', 'leewayhertz': 'blockchain company', 'towayrds': 'towards', 'swayable': 'sway able', 'telloway': 'tello way', 'palsmodium': 'plasmodium', 'gobackmodi': 'goback modi', 'comodies': 'corodies', 'islamphobic': 'islam phobic', 'islamphobia': 'islam phobia', 'citiesbetter': 'cities better', 'betterv3': 'better', 'betterdtu': 'better dtu', 'babadook': 'a horror drama film', 'ahemadabad': 'ahmadabad', 'faidabad': 'faizabad', 'amedabad': 'ahmedabad', 'kabadii': 'kabaddi', 'badmothing': 'badmouthing', 'badminaton': 'badminton', 'badtameezdil': 'badtameez dil', 'badeffects': 'bad effects', '∠bad': 'bad', 'embaded': 'embased', 'isdhanbad': 'is dhanbad', 'badgermoles': 'enormous, blind mammal', 'allhabad': 'allahabad', 'ghazibad': 'ghazi bad', 'htderabad': 'hyderabad', 'auragabad': 'aurangabad', 'ahmedbad': 'ahmedabad', 'ahmdabad': 'ahmadabad', 'alahabad': 'allahabad', 'hydeabad': 'hyderabad', 'gyroglove': 'wearable technology', 'foodlovee': 'food lovee', 'slovenised': 'slovenia', 'handgloves': 'hand gloves', 'lovestep': 'love step', 'lovejihad': 'love jihad', 'rolloverbox': 'rollover box', 'stupidedt': 'stupidest', 'toostupid': 'too stupid', 'pakistanisbeautiful': 'pakistanis beautiful', 'ispakistan': 'is pakistan', 'inpersonations': 'impersonations', 'medicalperson': 'medical person', 'interpersonation': 'inter personation', 'workperson': 'work person', 'personlich': 'person lich', 'persoenlich': 'person lich', 'middleperson': 'middle person', 'personslized': 'personalized', 'personifaction': 'personification', 'welcomemarriage': 'welcome marriage', 'come2': 'come to', 'upcomedians': 'up comedians', 'overvcome': 'overcome', 'talecome': 'tale come', 'cometitive': 'competitive', 'arencome': 'aren come', 'achecomes': 'ache comes', '」come': 'come', 'comepleted': 'completed', 'overcomeanxieties': 'overcome anxieties', 'demigirl': 'demi girl', 'gridgirl': 'female models of the race', 'halfgirlfriend': 'half girlfriend', 'girlriend': 'girlfriend', 'fitgirl': 'fit girl', 'girlfrnd': 'girlfriend', 'awrong': 'aw rong', 'northcap': 'north cap', 'productionsupport': 'production support', 'designbold': 'online photo editor design studio', 'skyhold': 'sky hold', 'shuoldnt': 'shouldnt', 'anarold': 'android', 'yaerold': 'year old', 'soldiders': 'soldiers', 'indrold': 'android', 'blindfoldedly': 'blindfolded', 'overcold': 'over cold', 'goldmont': 'microarchitecture in intel', 'boldspot': 'bolds pot', 'rankholders': 'rank holders', 'cooldrink': 'cool drink', 'beltholders': 'belt holders', 'goldendict': 'open-source dictionary program', 'softskill': 'softs kill', 'cooldige': 'the 30th president of the united states', 'newkiller': 'new killer', 'skillselect': 'skills elect', 'nonskilled': 'non skilled', 'killyou': 'kill you', 'skillport': 'army e-learning program', 'unkilled': 'un killed', 'killikng': 'killing', 'killograms': 'kilograms', 'worldkillers': 'world killers', 'reskilled': 'skilled', 'killedshivaji': 'killed shivaji', 'honorkillings': 'honor killings', 'skillclasses': 'skill classes', 'microskills': 'micros kills', 'ratkill': 'rat kill', 'pleasegive': 'please give', 'flashgive': 'flash give', 'southerntelescope': 'southern telescope', 'westsouth': 'west south', 'southafricans': 'south africans', 'joboutlooks': 'job outlooks', 'joboutlook': 'job outlook', 'outlook365': 'outlook 365', 'neulife': 'neu life', 'qualifeid': 'qualified', 'nullifed': 'nullified', 'lifeaffect': 'life affect', 'lifestly': 'lifestyle', 'aristocracylifestyle': 'aristocracy lifestyle', 'antilife': 'anti life', 'afterafterlife': 'after afterlife', 'lifestylye': 'lifestyle', 'prelife': 'pre life', 'lifeute': 'life ute', 'liferature': 'literature', 'securedlife': 'secured life', 'doublelife': 'double life', 'antireligion': 'anti religion', 'coreligionist': 'co religionist', 'petrostates': 'petro states', 'otherstates': 'others tates', 'spacewithout': 'space without', 'withoutyou': 'without you', 'withoutregistered': 'without registered', 'weightwithout': 'weight without', 'withoutcheck': 'without check', 'milkwithout': 'milk without', 'highschoold': 'high school', 'memoney': 'money', 'moneyof': 'mony of', 'oneplus': 'chinese smartphone manufacturer', 'beerus': 'the god of destruction', 'takeoverr': 'takeover', 'demonetizedd': 'demonetized', 'polyhouse': 'polytunnel', 'elitmus': 'indian company that helps companies in hiring employees', 'becone': 'become', 'nestaway': 'nest away', 'takeoverrs': 'takeovers', 'istop': 'i stop', 'austira': 'australia', 'germeny': 'germany', 'mansoon': 'man soon', 'worldmax': 'wholesaler of drum parts', 'ammusement': 'amusement', 'manyare': 'many are', 'supplymentary': 'supply mentary', 'timesup': 'times up', 'homologus': 'homologous', 'uimovement': 'ui movement', 'spause': 'spouse', 'aesexual': 'asexual', 'iovercome': 'i overcome', 'developmeny': 'development', 'hindusm': 'hinduism', 'sexpat': 'sex tourism', 'sunstop': 'sun stop', 'polyhouses': 'polytunnel', 'usefl': 'useful', 'fundamantal': 'fundamental', 'environmentai': 'environmental', 'redmi': 'xiaomi mobile', 'loy machedo': ' motivational speaker ', 'boruto': 'naruto next generations', 'upwork': 'up work', 'unacademy': 'educational technology company', 'hackerrank': 'hacker rank', 'chromecast': 'chrome cast', 'microservices': 'micro services', 'undertale': 'video game', 'undergraduation': 'under graduation', 'chapterwise': 'chapter wise', 'twinflame': 'twin flame', 'hotstar': 'hot star', 'blockchains': 'blockchain', 'darkweb': 'dark web', 'nearbuy': 'nearby', ' padmaavat ': ' padmavati ', ' padmavat ': ' padmavati ', ' padmaavati ': ' padmavati ', ' internshala ': ' internship and online training platform in india ', 'dream11': ' fantasy sports platform in india ', 'conciousnesss': 'consciousnesses', 'cointry': 'country', ' coinvest ': ' invest ', '23 andme': 'privately held personal genomics and biotechnology company in california', 'trumpism': 'philosophy and politics espoused by donald trump', 'trumpian': 'viewpoints of president donald trump', 'trumpists': 'admirer of donald trump', 'coincidents': 'coincidence', 'coinsized': 'coin sized', 'coincedences': 'coincidences', 'cointries': 'countries', 'coinsidered': 'considered', 'coinfirm': 'confirm', 'humilates': 'humiliates', 'vicevice': 'vice vice', 'politicak': 'political', 'sumaterans': 'sumatrans', 'kamikazis': 'kamikazes', 'unmoraled': 'unmoral', 'eduacated': 'educated', 'moraled': 'morale', 'amharc': 'amarc', 'where burkhas': 'wear burqas', 'baloochistan': 'balochistan', 'durgahs': 'durgans', 'illigitmate': 'illegitimate', 'hillum': 'helium', 'treatens': 'threatens', 'mutiliating': 'mutilating', 'speakingly': 'speaking', 'pretex': 'pretext', 'menstruateion': 'menstruation', 'genocidizing': 'genociding', 'maratis': 'maratism', 'parkistinian': 'pakistani', 'speicial': 'special', 'refernece': 'reference', 'provocates': 'provokes', 'faminazis': 'feminazis', 'repugicans': 'republicans', 'tonogenesis': 'tone', 'winor': 'win', 'redicules': 'ridiculous', 'beluchistan': 'balochistan', 'volime': 'volume', 'namaj': 'namaz', 'congressi': 'congress', 'ashifa': 'asifa', 'queffing': 'queefing', 'montheistic': 'nontheistic', 'rajsthan': 'rajasthan', 'rajsthanis': 'rajasthanis', 'specrum': 'spectrum', 'brophytes': 'bryophytes', 'adhaar': 'adhara', 'slogun': 'slogan', 'harassd': 'harassed', 'transness': 'trans gender', 'insdians': 'indians', 'trampaphobia': 'trump aphobia', 'attrected': 'attracted', 'yahtzees': 'yahtzee', 'thiests': 'atheists', 'thrir': 'their', 'extraterestrial': 'extraterrestrial', 'silghtest': 'slightest', 'primarty': 'primary', 'brlieve': 'believe', 'fondels': 'fondles', 'loundly': 'loudly', 'bootythongs': 'booty thongs', 'understamding': 'understanding', 'degenarate': 'degenerate', 'narsistic': 'narcistic', 'innerskin': 'inner skin', 'spectulated': 'speculated', 'hippocratical': 'hippocratical', 'itstead': 'instead', 'parralels': 'parallels', 'sloppers': 'slippers','terroristan': 'terrorist pakistan', 'fatf': 'western summit conference', 'bimaru': 'bimaru bihar, madhya pradesh, rajasthan, uttar pradesh', 'hinduphobic': 'hindu phobic', 'hinduphobia': 'hindu phobic', 'babchenko': 'arkady arkadyevich babchenko faked death', 'boshniaks': 'bosniaks', 'dravidanadu': 'dravida nadu', 'mysoginists': 'misogynists', 'mgtows': 'men going their own way', 'mongloid': 'mongoloid', 'unsincere': 'insincere', 'meninism': 'male feminism', 'jewplicate': 'jewish replicate', 'jewplicates': 'jewish replicate', 'andhbhakts': 'and bhakt', 'unoin': 'union', 'daesh': 'islamic state of iraq and the levant', 'burnol': 'movement about modi', 'kalergi': 'coudenhove-kalergi', 'bhakts': 'bhakt', 'tambrahms': 'tamil brahmin', 'pahul': 'amrit sanskar', 'sjw': 'social justice warrior', 'sjws': 'social justice warrior', ' incel': ' involuntary celibates', ' incels': ' involuntary celibates', 'emiratis': 'emiratis', 'weatern': 'western', 'westernise': 'westernize', 'pizzagate': 'debunked conspiracy theory', 'naïve': 'naive', 'skripal': 'russian military officer', 'skripals': 'russian military officer', 'remainers': 'british remainer', 'novichok': 'soviet union agents', 'gauri lankesh': 'famous indian journalist', 'castroists': 'castro supporters', 'bremainer': 'british remainer', 'antibrahmin': 'anti brahminism', 'hypsm': ' harvard, yale, princeton, stanford, mit', 'hyps': ' harvard, yale, princeton, stanford', 'kompromat': 'compromising material', 'tharki': 'pervert', 'mastuburate': 'masturbate', 'zoë': 'zoe', 'indans': 'indian', ' xender': ' gender', 'naxali ': 'naxalite ', 'naxalities': 'naxalites', 'bathla': 'namit bathla', 'mewani': 'indian politician jignesh mevani', 'wjy': 'why', 'fadnavis': 'indian politician devendra fadnavis', 'awadesh': 'indian engineer awdhesh singh', 'awdhesh': 'indian engineer awdhesh singh', 'khalistanis': 'sikh separatist movement', 'madheshi': 'madheshi', 'bnbr': 'be nice, be respectful', 'jair bolsonaro': 'brazilian president politician', 'xxxtentacion': 'tentacion', 'slavoj zizek': 'slovenian philosopher', 'borderliners': 'borderlines', 'brexit': 'british exit', 'brexiter': 'british exit supporter', 'brexiters': 'british exit supporters', 'brexiteer': 'british exit supporter', 'brexiteers': 'british exit supporters', 'brexiting': 'british exit', 'brexitosis': 'british exit disorder', 'jallikattu': 'jallikattu', 'fortnite': 'fortnite', 'swachh': 'swachh bharat mission campaign ', 'quorans': 'quora users', 'qoura': 'quora', 'quoras': 'quora', 'quroa': 'quora', 'quora': 'quora', 'stupead': 'stupid', 'narcissit': 'narcissist', 'trigger nometry': 'trigonometry', 'trigglypuff': 'student criticism of conservatives', 'peoplelook': 'people look', 'paedophelia': 'paedophilia', 'uogi': 'yogi', 'adityanath': 'adityanath', 'yogi adityanath': 'indian monk and hindu nationalist politician', 'awdhesh singh': 'commissioner of india', 'doklam': 'tibet', 'drumpf ': 'donald trump fool ', 'drumpfs': 'donald trump fools', 'strzok': 'hillary clinton scandal', 'rohingya': 'rohingya ', ' wumao ': ' cheap chinese stuff ', 'wumaos': 'cheap chinese stuff', 'sanghis': 'sanghi', 'tamilans': 'tamils', 'biharis': 'biharis', 'rejuvalex': 'hair growth formula medicine', 'fekuchand': 'pm narendra modi in india', 'feku': 'pm narendra modi in india ', 'chaiwala': 'tea seller in india', 'deplorables': 'deplorable', 'muhajirs': 'muslim immigrant', 'gujratis': 'gujarati', 'chutiya': 'tibet people ', 'chutiyas': 'tibet people ', 'thighing': 'masterbate between the legs of a female infant', '卐': 'nazi germany', 'pribumi': 'native indonesian', 'gurmehar': 'gurmehar kaur indian student activist', 'khazari': 'khazars', 'demonetization': 'demonetization', 'demonetisation': 'demonetization', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'antinationals': 'antinational', 'cryptocurrencies': 'cryptocurrency', 'hindians': 'north indian', 'hindian': 'north indian', 'vaxxer': 'vocal nationalist ', 'remoaner': 'remainer ', 'bremoaner': 'british remainer ', 'jewism': 'judaism', 'eroupian': 'european', "j & k dy cm h ' ble kavinderji": '', 'wmaf': 'white male married asian female', 'amwf': 'asian male married white female', 'moeslim': 'muslim', 'cishet': 'cisgender and heterosexual person', 'eurocentrics': 'eurocentrism', 'eurocentric': 'eurocentrism', 'afrocentrics': 'africa centrism', 'afrocentric': 'africa centrism', 'jewdar': 'jew dar', 'marathis': 'marathi', 'gynophobic': 'gyno phobic', 'trumpanzees': 'trump chimpanzee fool', 'crimean': 'crimea people ', 'atrracted': 'attract', 'myeshia': 'widow of green beret killed in niger', 'demcoratic': 'democratic', 'raaping': 'raping', 'feminazism': 'feminism nazi', 'langague': 'language', 'sathyaraj': 'actor', 'hongkongese': 'hongkong people', 'kashmirians': 'kashmirian', 'chodu': 'fucker', 'penish': 'penis', 'chitpavan konkanastha': 'hindu maharashtrian brahmin community', 'madridiots': 'real madrid idiot supporters', 'ambedkarite': 'dalit buddhist movement ', 'releasethememo': 'cry for the right and trump supporters', 'harrase': 'harass', 'barracoon': 'black slave', 'castrater': 'castration', 'rapistan': 'pakistan rapist', 'turkified': 'turkification', 'dumbassistan': 'dumb ass pakistan', 'facetards': 'facebook retards', 'rapefugees': 'rapist refugee', 'khortha': 'language in the indian state of jharkhand', 'magahi': 'language in the northeastern indian', 'bajjika': 'language spoken in eastern india', 'superficious': 'superficial', 'sense8': 'american science fiction drama web television series', 'saipul jamil': 'indonesia artist', 'bhakht': 'bhakti', 'smartia': 'dumb nation', 'absorve': 'absolve', 'citicise': 'criticize', 'youtu ': 'youtube ', 'whta': 'what', 'esspecial': 'especial', 'doi': 'do i', 'thebest': 'the best', 'howdoes': 'how does', 'etherium': 'ethereum', 'qiblas': 'qibla', 'hello4 2 cab': 'online cab booking', 'bodyshame': 'body shaming', 'bodyshoppers': 'body shopping', 'bodycams': 'body cams', 'cananybody': 'can any body', 'deadbody': 'dead body', 'deaddict': 'de addict', 'northindian': 'north indian ', 'northkorea': 'north korea', 'koreaboo': 'korea boo ', 'brexshit': 'british exit bullshit', 'shitpost': 'shit post', 'shitslam': 'shit islam', 'shitlords': 'shit lords', 'fck': 'fuck', 'clickbait': 'click bait ', 'mailbait': 'mail bait', 'healhtcare': 'healthcare', 'trollbots': 'troll bots', 'trollled': 'trolled', 'trollimg': 'trolling', 'cybertrolling': 'cyber trolling', 'sickular': 'india sick secular ', 'idiotism': 'idiotism', 'niggerism': 'nigger', 'niggeriah': 'nigger', ' s.p ': ' ', 'u.s.p': '', 'u.s.a.': 'usa', 'u.s.a': 'usa', 'u.s.': 'usa', ' u.s ': ' usa ', 'fu.k': 'fuck', 'u.k.': 'uk', ' u.k ': ' uk ', ' don t ': ' do not ', 'bacteries': 'batteries', ' yr old ': ' years old ', 'ph.d': 'phd', 'cau.sing': 'causing', 'kim jong-un': 'the president of north korea', 'savegely': 'savagely', 'ra apist': 'rapist', '2fifth': 'twenty fifth', '2third': 'twenty third', '2nineth': 'twenty nineth', '2fourth': 'twenty fourth', '#metoo': 'metoo', 'trumpcare': 'trump health care system', '4fifth': 'forty fifth', 'remainers': 'remainder', 'terroristan': 'terrorist', 'antibrahmin': 'anti brahmin', 'fuckboys': 'fuckboy', 'fuckboy': 'fuckboy', 'fuckgirls': 'fuck girls', 'fuckgirl': 'fuck girl', 'trumpsters': 'trump supporters', '4sixth': 'forty sixth', 'culturr': 'culture', 'weatern': 'western', '4fourth': 'forty fourth', 'emiratis': 'emirates', 'trumpers': 'trumpster', 'indans': 'indians', 'mastuburate': 'masturbate', 'f**k': 'fuck', ' u r ': ' you are ', ' u ': ' you ', '操你妈': 'fuck your mother', 'e.g.': 'for example', 'i.e.': 'in other words', '...': '.', 'et.al': 'elsewhere', 'anti-semitic': 'anti-semitic', 'f***': 'fuck', 'f**': 'fuc', 'a****': 'assho', 'a**': 'ass', 'h***': 'hole', 's***': 'shit', 's**': 'shi', 'sh**': 'shit', 'p****': 'pussy', 'p*ssy': 'pussy', 'p***': 'porn', 'p*rn': 'porn', 'st*up*id': 'stupid', 'd***': 'dick', 'di**': 'dick', 'h*ck': 'hack', 'b*tch': 'bitch', 'bi*ch': 'bitch', 'bit*h': 'bitch', 'bitc*': 'bitch', 'b****': 'bitch', 'b***': 'bitc', 'b**': 'bit', 'b*ll': 'bull'}

In [7]:
misspell_mapping = { **mispell_dict1, **mispell_dict2}

In [8]:
symbols = ['"', ':', ')', '(', '-', '!', '|', ';', "'", '&', '/', '[', ']',
          '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@',
          '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '™','tm', '›',
          '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”', '●', 'â',
          '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '▓',
          '‹', '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆',
          'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '│',
          '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹',
          '≤', '‡', '√', '\u200b', '…', '\ufeff']

In [9]:
def clean_misspell(text, mapping=misspell_mapping):
    mispell=0
    for error in mapping:
        if error in text:
            text = text.replace(error, mapping[error])
            mispell=1
    repeat_words = ['iing', 'llly', 'aaa', 'ccc', 'ddd', 'eee', 'fff', 'ggg', 'iii', 'kkk', 'lll', 'mmm', 'nnn', 'ooo', 'ppp', 'qqq',
 'rrr', 'sss', 'ttt', 'vv', 'yyy', 'plzz', 'zzz']

    for repeat in repeat_words:
        if repeat in text:
            mispell=1
            text = clean_repeat_words(text)
    return text, mispell

def check_symbol(text, mapping=symbols):
    if_symbol=0
    for symbol in mapping:
        if symbol in text:
            if_symbol=1
    return if_symbol

Two features related to capital characters are also added:
 - The number of capital characters in the sentence.
 - If the sentence has a word that made of all Captital characters.

In [10]:
def capitalize_num(text):
    cap_num = sum(1 for c in text.split() if c==c.capitalize())
    return cap_num

def cap_word(text):
    all_cap=0
    for t in text.split():
        if t.isupper():
            all_cap=1
    return all_cap

In [11]:
#######################Data Clean##################################################################
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", 
#                  "€": "e","£": "e",
                 "™": " tm ", "√": " sqrt ", "×": "x", "²": "2",
                 "—": "-", "–": "-", "’": "'",
                 "_": "-", "`": "'", '“': '"',
                 '”': '"', '“': '"',  '∞':
                 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha',
                 '•': '.', 'à': 'a', '−': '-', 'β': 'beta',
                 '∅': '', '³': '3', 'π': 'pi',"—": "-", "–": "-", "_": "-", '”': '"', "″": '"', '“': '"', '•': '.', '−': '-',
                 "’": "'", "‘": "'", "´": "'", "`": "'", '\u200b': ' ', '\xa0': ' ','،':'','„':'',
                 '…': ' ... ', '\ufeff': '', "’":"'", "‘":"'", "´":"'", "`":"'"}


contraction_mapping = {" ain't": " is not", " aren't": " are not"," can't": " cannot",
                       " cause": " because", " could've": "could have", "couldn't": "could not",
                       "didn't": "did not",  "doesn't": "does not", "don't": "do not",
                       "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                       "he'd": "he would","he'll": "he will", "he's": "he is",
                       "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
                       "how's": "how is",  "I'd": "I would", "I'd've": "I would have",
                       "I'll": "I will", "I'll've": "I will have","I'm": "I am",
                       "I've": "I have", "i'd": "i would", "i'd've": "i would have",
                       "i'll": "i will",  "i'll've": "i will have","i'm": "i am",
                       "i've": "i have", "isn't": "is not", "it'd": "it would",
                       "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                       "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not",
                       "might've": "might have","mightn't": "might not","mightn't've": "might not have",
                       "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
                       "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                       "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                       "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would",
                       "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
                       "she's": "she is", "should've": "should have", "shouldn't": "should not",
                       "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                       "this's": "this is","that'd": "that would", "that'd've": "that would have",
                       "that's": "that is", "there'd": "there would", "there'd've": "there would have",
                       "there's": "there is", "here's": "here is","they'd": "they would",
                       "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have",
                       "they're": "they are", "they've": "they have", "to've": "to have",
                       "wasn't": "was not", "we'd": "we would", "we'd've": "we would have",
                       "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                       "we've": "we have", "weren't": "were not", "what'll": "what will",
                       "what'll've": "what will have", "what're": "what are",  "what's": "what is",
                       "what've": "what have", "when's": "when is", "when've": "when have",
                       "where'd": "where did", "where's": "where is", "where've": "where have",
                       "who'll": "who will", "who'll've": "who will have", "who's": "who is",
                       "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have",
                       "won't": "will not", "won't've": "will not have", "would've": "would have",
                       "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                       "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
                       "y'all've": "you all have","you'd": "you would", "you'd've": "you would have",
                       "you'll": "you will", "you'll've": "you will have", "you're": "you are",
                       "you've": "you have" }

mapping = {**contraction_mapping,**punct_mapping}
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']',
          '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
          '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™','tm', '›',
          '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”', '–', '●', 'â',
          '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓',
          '—', '‹', '─', '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆',
          'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│',
          '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹',
          '≤', '‡', '√', '\u200b', '…', '\ufeff']



# Glove has 2.19 million words, including captical words
# Para has 1.7 million, not including capital words
# Wiki has 1 million

#########################################Utility Functions#########################################
# Substitute contraction
def clean_contractions(text, mapping=mapping):
    for error in mapping:
        if error in text:
            text = text.replace(error, mapping[error])
#     text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text

# Use punctuation to split 
def clean_text(x):
    x = str(x)
    for punct in puncts:
        if punct in x:
            x = x.replace(punct, f' {punct} ')
    return x

# Turn Numbers to #
def clean_number(text):
    # split characters and number
    if bool(re.search(r'(\d+)([a-zA-Z])', text)):
        text = re.sub(r'(\d+)([a-zA-Z])', '\g<1> \g<2>', text)
        
    if bool(re.search(r'(\d+) (th|st|nd|rd) ', text)):
        text = re.sub(r'(\d+) (th|st|nd|rd) ', '\g<1>\g<2> ', text)
    
    if bool(re.search(r'(\d+),(\d+)', text)):
        text = re.sub(r'(\d+),(\d+)', '\g<1>\g<2>', text)
    
    return text



# Add features that could be used in the concatenation layer|
def add_features(df):
    
    df['question_text'] = df['question_text'].progress_apply(lambda x:str(x))
    df['total_length'] = df['question_text'].progress_apply(len)
    df['capitals'] = df['question_text'].progress_apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = df.progress_apply(lambda row: float(row['capitals'])/float(row['total_length']),
                                axis=1)
    df['num_words'] = df.question_text.str.count('\S+')
    df['num_unique_words'] = df['question_text'].progress_apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']
    df['symbols'] = df['question_text'].progress_apply(check_symbol)
    df['cap_word'] = df['question_text'].progress_apply(cap_word)
    df['cap_num'] = df['question_text'].progress_apply(capitalize_num)
    df['numbers'] = df['question_text'].progress_apply(lambda comment: sum(1 for c in comment if c.isdigit()))
    
#     df['symbols'] = df['question_text'].progress_apply(lambda x: 1 if )
#     df['quotes'] = df['question_text'].apply(lambda comment: 1 if)

    return df

train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

<a id='clean'></a>
## Data Clean

In [12]:
%%time
#########################Data Prepare##################################################################


def data_process(train_df, test_df):

    
################################### Add Features ###################################################
    train = add_features(train_df)
    test = add_features(test_df)

    features = train[['total_length', 'caps_vs_length', 'words_vs_unique', 
                      'num_words', 'cap_num', 'numbers']].fillna(0)
    test_features = test[['total_length', 'caps_vs_length', 'words_vs_unique', 'num_words', 
                          'cap_num', 'numbers']].fillna(0)

    ss = StandardScaler()
    ss.fit(np.vstack((features, test_features)))
    features = ss.transform(features)
    test_features = ss.transform(test_features)

    
    train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_contractions(x.lower()))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_contractions(x.lower()))
    
    train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_repeat_words(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_repeat_words(x))
    
    train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_text(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_text(x))
    
    pool =Pool(processes=2)
    train_mis = pool.map(clean_misspell,train_df["question_text"])
    test_mis = pool.map(clean_misspell,test_df["question_text"])
    pool.close()
    pool.join()
#     train_mis = train_df["question_text"].apply(lambda x :clean_misspell(x))
#     test_mis = test_df["question_text"].apply(lambda x :clean_misspell(x))
    
    train_df["question_text"], train_df["mispell"] = [t[0] for t in train_mis], [t[1] for t in train_mis]
    test_df["question_text"], test_df["mispell"] = [t[0] for t in test_mis], [t[1] for t in test_mis]
    
    train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: clean_number(x))
    test_df["question_text"] = test_df["question_text"].progress_apply(lambda x: clean_number(x))


    features = np.concatenate((features, train[['symbols', 'cap_word', 'mispell']].fillna(0)), axis=1)
    test_features = np.concatenate((test_features, test[['symbols', 'cap_word', 'mispell']].fillna(0)), axis=1)
    # Clean numbers
#     train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: clean_numbers(x))
#     test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_numbers(x))
    
    ## fill up the missing values
    train_X = train_df["question_text"].fillna("something").values
    test_X = test_df["question_text"].fillna("something").values
    
    tokenizer = Tokenizer(num_words=params['max_features'], filters='!"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(list(train_X)+list(test_X))
    
    # Tokenize the sentences
    train_X = tokenizer.texts_to_sequences(train_X)
    test_X = tokenizer.texts_to_sequences(test_X)
    train_y = train_df['target'].values
    
    ## Pad the sentences 
    train_X = pad_sequences(train_X, maxlen=params['maxlen'])
    test_X = pad_sequences(test_X, maxlen=params['maxlen'])
    
    return train_X, test_X, train_y, features, test_features, tokenizer.word_index



train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
# df = pd.concat([train_df ,test_df],sort=True)

train_X, test_X, train_y, features, test_features, word_index = data_process(train_df, test_df)

HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=375806, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Progress', max=375806, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Progress', max=375806, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Progress', max=375806, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Progress', max=375806, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Progress', max=375806, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Progress', max=375806, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Progress', max=375806, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Progress', max=375806, style=ProgressStyle(description_width=…






HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=375806, style=ProgressStyle(description_width=…


CPU times: user 5min 51s, sys: 3.57 s, total: 5min 55s
Wall time: 11min 8s


### Save the data for losing connect

In [13]:
## Save Dataset to Disk

np.save("train_X", train_X)
np.save("test_X", test_X)
np.save("train_y", train_y)
np.save("word_index", word_index)
np.save("features", features)
np.save("test_features", test_features)
## Clean Memory

del train_df, test_df
gc.collect()

14

In [14]:
## Load Data from Disk

train_X = np.load("train_X.npy")
test_X = np.load("test_X.npy")
train_y = np.load("train_y.npy")
word_index = np.load("word_index.npy").item()
features = np.load("features.npy")
test_features = np.load("test_features.npy")

<a id='embedding'></a>
## Embedding

In [15]:
%%time

# Define the function for loading different embeddings

def load_glove(word_index):
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')[:300]
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE) if o.split(" ")[0] in word_index)
    
#     all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = -0.005838499,0.48782197
#     embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(params['max_features'], len(word_index))
#     embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    embedding_matrix = np.random.normal(emb_mean, 0, (nb_words, params['embed_size']))
    for word, i in tqdm(word_index.items()):
        if i >= params['max_features']: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
            
    return embedding_matrix 
# seed_everything()
# glove_embeddings = load_glove(word_index)
# gc.collect

def load_para(word_index):
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100 and o.split(" ")[0] in word_index)

#     all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = -0.0053247833,0.49346462
#     embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(params['max_features'], len(word_index))
#     embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    embedding_matrix = np.random.normal(emb_mean, 0, (nb_words, params['embed_size']))
    for word, i in tqdm(word_index.items()):
        if i >= params['max_features']: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    
    return embedding_matrix

def load_fasttext(word_index):    
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE) if len(o)>100 and o.split(" ")[0] in word_index)

#     all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = -0.0033469985, 0.109855495
#     embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(params['max_features'], len(word_index))
#     embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    embedding_matrix = np.random.normal(emb_mean, 0, (nb_words, params['embed_size']))
    for word, i in tqdm(word_index.items()):
        if i >= params['max_features']: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector

    return embedding_matrix

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 10.3 µs


In [16]:
%%time
seed_everything()
glove_embeddings = load_glove(word_index)
fast_embeddings = load_fasttext(word_index)
para_embeddings = load_para(word_index)
gc.collect()

HBox(children=(IntProgress(value=0, max=214615), HTML(value='')))




HBox(children=(IntProgress(value=0, max=214615), HTML(value='')))




HBox(children=(IntProgress(value=0, max=214615), HTML(value='')))


CPU times: user 1min 46s, sys: 6.05 s, total: 1min 52s
Wall time: 1min 52s


In [17]:
mean_embedding_matrix = np.mean([glove_embeddings, para_embeddings, fast_embeddings], axis=0)
concat_all_embedding_matrix = np.concatenate((glove_embeddings, para_embeddings, fast_embeddings), axis=1)
concat_gp_embedding_matrix = np.concatenate((glove_embeddings, para_embeddings), axis=1)
concat_gf_embedding_matrix = np.concatenate((glove_embeddings, fast_embeddings), axis=1)
concat_pf_embedding_matrix = np.concatenate((para_embeddings, fast_embeddings), axis=1)


# vocab = build_vocab(df['question_text'])
# add_lower(embedding_matrix, vocab)
del glove_embeddings, para_embeddings, fast_embeddings
gc.collect()

0

<a id='architecture'></a>
## Architecture

### LSTM+GRU+Feature Concatenating

Since during my experiments, different embeddings have different optimal architectures, so I define several different architectures to train those embeddings. The prediction also benefit from model diversity in this way.

In [18]:
# Define attention block
class Attention(nn.Module):
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        
        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0
        
        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)
        
        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))
        
    def forward(self, x, mask=None):
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        eij = torch.mm(
            x.contiguous().view(-1, feature_dim), 
            self.weight
        ).view(-1, step_dim)
        
        if self.bias:
            eij = eij + self.b
            
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        
        if mask is not None:
            a = a * mask

        a = a / torch.sum(a, 1, keepdim=True) + 1e-10

        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)

In [19]:
# Define the first NN 
class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        
        hidden_size = 128
        
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        
        self.embedding_dropout = nn.Dropout2d(0.01)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, batch_first=True)
        self.gru = nn.GRU(hidden_size*2, hidden_size, bidirectional=True, batch_first=True)
        
        ih = (param.data for name, param in self.gru.named_parameters() if 'weight_ih' in name)
        hh = (param.data for name, param in self.gru.named_parameters() if 'weight_hh' in name)
        b = (param.data for name, param in self.gru.named_parameters() if 'bias' in name)
        for k in ih:
            nn.init.xavier_uniform_(k)
        for k in hh:
            nn.init.orthogonal_(k)
        for k in b:
            nn.init.constant_(k, 0)
        self.lstm_attention = Attention(hidden_size*2, maxlen)
        self.gru_attention = Attention(hidden_size*2, maxlen)
        
        self.linear = nn.Linear(1024+features.shape[1], 16)
        self.relu = nn.ELU()
        self.dropout = nn.Dropout(0.01)
        
        self.out = nn.Linear(16, 1)
        
    def forward(self, x, y):
        h_embedding = self.embedding(x)
        h_embedding = torch.squeeze(self.embedding_dropout(torch.unsqueeze(h_embedding, 1)))
        
        h_lstm, _ = self.lstm(h_embedding)
        h_gru, _ = self.gru(h_lstm)
        
        h_lstm_atten = self.lstm_attention(h_lstm)
        h_gru_atten = self.gru_attention(h_gru)
        
        avg_pool = torch.mean(h_gru, 1)
        max_pool, _ = torch.max(h_gru, 1)
        
        conc = torch.cat((h_lstm_atten, h_gru_atten, avg_pool, max_pool,y), 1)
        conc = self.relu(self.linear(conc))
        conc = self.dropout(conc)
        out = self.out(conc)
        
        return out

In [20]:
# Define the second NN
class NeuralNet2(nn.Module):
    def __init__(self):
        super(NeuralNet2, self).__init__()
        
        hidden_size = 128
        
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        
        self.embedding_dropout = nn.Dropout2d(0.01)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, batch_first=True)
        self.gru = nn.GRU(hidden_size*2, hidden_size, bidirectional=True, batch_first=True)
        
        ih = (param.data for name, param in self.gru.named_parameters() if 'weight_ih' in name)
        hh = (param.data for name, param in self.gru.named_parameters() if 'weight_hh' in name)
        b = (param.data for name, param in self.gru.named_parameters() if 'bias' in name)
        for k in ih:
            nn.init.xavier_uniform_(k)
        for k in hh:
            nn.init.orthogonal_(k)
        for k in b:
            nn.init.constant_(k, 0)
        self.lstm_attention = Attention(hidden_size*2, maxlen)
        self.gru_attention = Attention(hidden_size*2, maxlen)
        
        self.linear = nn.Linear(1024+features.shape[1], 16)
        self.relu = nn.ELU()
        self.dropout = nn.Dropout(0.1)
        
        self.out = nn.Linear(16, 1)
        
    def forward(self, x, y):
        h_embedding = self.embedding(x)
        h_embedding = torch.squeeze(self.embedding_dropout(torch.unsqueeze(h_embedding, 1)))
        
        h_lstm, _ = self.lstm(h_embedding)
        h_gru, _ = self.gru(h_lstm)
        
        h_lstm_atten = self.lstm_attention(h_lstm)
        h_gru_atten = self.gru_attention(h_gru)
        
        avg_pool = torch.mean(h_gru, 1)
        max_pool, _ = torch.max(h_gru, 1)
        
        conc = torch.cat((h_lstm_atten, h_gru_atten, avg_pool, max_pool,y), 1)
        conc = self.relu(self.linear(conc))
        conc = self.dropout(conc)
        out = self.out(conc)
        
        return out

<a id='model'></a>
## Model

### Data Split

In [21]:
## Split data
seed_everything(seed=params['seed'])
# ## split to train and val
# splits = list(StratifiedKFold(n_splits=4, shuffle=True, random_state=params['seed']).split(train_X, train_y))

# # get the fold one for testing for quick check
# train_idx, valid_idx = splits[0]
# X_train, X_valid = train_X[train_idx], train_X[valid_idx]
# y_train, y_valid = train_y[train_idx], train_y[valid_idx]
# feature_train, feature_valid = features[train_idx], features[valid_idx]

### Train Function

In [22]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [23]:
x_test_cuda = torch.tensor(test_X, dtype=torch.long).cuda()
test_feature = torch.tensor(test_features, dtype=torch.float32).cuda()
test = torch.utils.data.TensorDataset(x_test_cuda, test_feature)
prediction_batch_size = 4096
test_loader = torch.utils.data.DataLoader(test, batch_size=prediction_batch_size, shuffle=False)

train_X = torch.tensor(train_X, dtype=torch.long).cuda()
features = torch.tensor(features, dtype=torch.float32).cuda()
train_y = torch.tensor(train_y[:,np.newaxis], dtype=torch.float32).cuda()

test_preds = np.zeros((len(test_X), 4))

Training based on different embeddings.

In [24]:
def train_model_gp(model, x_train, feature_train, y_train):
    
    optimizer = torch.optim.Adam(model.parameters(),lr=0.001) #Optimizer Setting
    
#     scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[3,4], gamma=0.1)
#     scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2000, gamma=0.3) #Scheduler Setting
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[3000,4000,4500,5000,5500,6000,6500,7000,7500], gamma=0.2)
   
    train = torch.utils.data.TensorDataset(x_train, feature_train, y_train)  #Tranform data to tensor dataset
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True) #Create data loader
  
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction='mean',pos_weight = torch.tensor(2)).cuda() #Loss function setting
    
    for epoch in range(n_epochs):  #Epoch Iteration
#         scheduler.step()
        start_time = time.time()
        model.train() #Set model to train mode
        avg_loss = 0.
        
        for x_batch, feature_batch, y_batch in train_loader:
            scheduler.step()
            y_pred = model(x_batch, feature_batch)
            # scheduler.batch_step()      
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()

    test_preds = np.zeros((len(test_loader.dataset)))
    start_time = time.time()
    for i, (x_batch, feature_batch) in enumerate(test_loader):
        y_pred = model(x_batch, feature_batch).detach()

        test_preds[i * prediction_batch_size:(i+1) * prediction_batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
    print('prediction time spend:')
    print(time.time()-start_time)
    return test_preds#, test_preds_local

In [25]:
def train_model_gf(model, x_train, feature_train, y_train):
    
    optimizer = torch.optim.Adam(model.parameters(),lr=0.001) #Optimizer Setting
    
#     scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[3,4], gamma=0.1)
#     scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2000, gamma=0.3) #Scheduler Setting
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[4000, 5000, 5200, 5500,6000,6500,7000,7500], gamma=0.2)
   
    train = torch.utils.data.TensorDataset(x_train, feature_train, y_train)  #Tranform data to tensor dataset
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True) #Create data loader
  
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction='mean',pos_weight = torch.tensor(2)).cuda() #Loss function setting
    
    for epoch in range(n_epochs):  #Epoch Iteration
#         scheduler.step()
        start_time = time.time()
        model.train() #Set model to train mode
        avg_loss = 0.
        
        for x_batch, feature_batch, y_batch in train_loader:
            scheduler.step()
            y_pred = model(x_batch, feature_batch)
            # scheduler.batch_step()      
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()

    test_preds = np.zeros((len(test_loader.dataset)))
    start_time = time.time()
    for i, (x_batch, feature_batch) in enumerate(test_loader):
        y_pred = model(x_batch, feature_batch).detach()

        test_preds[i * prediction_batch_size:(i+1) * prediction_batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
    print('prediction time spend:')
    print(time.time()-start_time)
    return test_preds#, test_preds_local

In [26]:
def train_model_pf(model, x_train, feature_train, y_train):
    
    optimizer = torch.optim.Adam(model.parameters(),lr=0.001) #Optimizer Setting
    
#     scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[3,4], gamma=0.1)
#     scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2000, gamma=0.3) #Scheduler Setting
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[4500, 5000, 5200, 5500,6000,6500,7000,7500], gamma=0.2)
   
    train = torch.utils.data.TensorDataset(x_train, feature_train, y_train)  #Tranform data to tensor dataset
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True) #Create data loader
  
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction='mean',pos_weight = torch.tensor(2)).cuda() #Loss function setting
    
    for epoch in range(n_epochs):  #Epoch Iteration
#         scheduler.step()
        start_time = time.time()
        model.train() #Set model to train mode
        avg_loss = 0.
        
        for x_batch, feature_batch, y_batch in train_loader:
            scheduler.step()
            y_pred = model(x_batch, feature_batch)
            # scheduler.batch_step()      
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()

    test_preds = np.zeros((len(test_loader.dataset)))
    start_time = time.time()
    for i, (x_batch, feature_batch) in enumerate(test_loader):
        y_pred = model(x_batch, feature_batch).detach()

        test_preds[i * prediction_batch_size:(i+1) * prediction_batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
    print('prediction time spend:')
    print(time.time()-start_time)
    return test_preds#, test_preds_local

## Predictions

In [27]:
%%time
n_epochs = 3
batch_size = 768
seed = params['seed']
max_features = params['max_features']
embedding_matrix = concat_gp_embedding_matrix
embed_size = embedding_matrix.shape[1]
maxlen = params['maxlen']

model = NeuralNet()
model.cuda()

test_preds_gp = train_model_gp(model, train_X, features, train_y)

prediction time spend:
14.749350786209106
CPU times: user 2min 48s, sys: 4min 25s, total: 7min 14s
Wall time: 7min 15s


In [28]:
%%time
n_epochs = 3
batch_size = 768
seed = params['seed']
max_features = params['max_features']
embedding_matrix = concat_gf_embedding_matrix
embed_size = embedding_matrix.shape[1]
maxlen = params['maxlen']

model = NeuralNet()
model.cuda()

test_preds_gf = train_model_gf(model, train_X, features, train_y)

prediction time spend:
14.870900392532349
CPU times: user 2min 49s, sys: 4min 27s, total: 7min 16s
Wall time: 7min 16s


In [29]:
%%time
n_epochs = 3
batch_size = 768
seed = params['seed']
max_features = params['max_features']
embedding_matrix = concat_all_embedding_matrix
embed_size = embedding_matrix.shape[1]
maxlen = params['maxlen']

model = NeuralNet()
model.cuda()

test_preds_all = train_model_gf(model, train_X, features, train_y)

prediction time spend:
17.373772144317627
CPU times: user 2min 58s, sys: 5min 22s, total: 8min 20s
Wall time: 8min 20s


In [30]:
%%time
n_epochs = 3
batch_size = 768
seed = params['seed']
max_features = params['max_features']
embedding_matrix = concat_pf_embedding_matrix
embed_size = embedding_matrix.shape[1]
maxlen = params['maxlen']

model = NeuralNet2()
model.cuda()

test_preds_pf = train_model_pf(model, train_X, features, train_y)

prediction time spend:
14.842677116394043
CPU times: user 2min 49s, sys: 4min 24s, total: 7min 13s
Wall time: 7min 14s


## Blending

In [31]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="white")

prediction = pd.DataFrame({'gp':test_preds_gp,
                           'gf':test_preds_gf,
                           'pf':test_preds_pf,
                           'a':test_preds_all})

corr = prediction.corr()
corr

Unnamed: 0,gp,gf,pf,a
gp,1.0,0.960229,0.958812,0.961483
gf,0.960229,1.0,0.952854,0.957355
pf,0.958812,0.952854,1.0,0.956949
a,0.961483,0.957355,0.956949,1.0


In [32]:
sub = pd.read_csv('../input/sample_submission.csv')
final_preds = 0.25*test_preds_gp+0.25*test_preds_gf+0.25*test_preds_pf+0.25*test_preds_all
sub['prediction'] = final_preds > 0.53
sub.to_csv("submission.csv", index=False)