We are not using spaCy here, so we need to do the text preprocessing ourselves.

Summarised from: https://www.kaggle.com/theoviel/improve-your-score-with-some-text-preprocessing

In [None]:
import pandas as pd
import numpy as np
import operator 
import re

# Cleaning Questions

In [None]:
# Load the training set. `df` and `train` are bound to the same DataFrame object
# (no copy is made), so columns added through `df` also appear on `train`.
df = train = pd.read_csv("../input/train.csv")

# Add a lower-cased version of each question; the original 'question_text'
# column is left unchanged.
df['lowered_question'] = df['question_text'].map(lambda question: question.lower())

In [None]:
# Lookup table from an English contraction to its expanded form. It is passed to
# clean_contractions, which swaps whole space-separated tokens that match a key.
# NOTE(review): this notebook feeds lower-cased text into clean_contractions, so the
# capitalised keys ("I'd", "I'm", ...) are presumably never hit here — confirm
# before pruning them.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

def clean_contractions(text, mapping):
    """Expand contractions in ``text`` using the lookup table ``mapping``.

    Apostrophe look-alikes (’ ‘ ´ `) are first normalised to a plain ``'``.
    The text is then split on single spaces; every token that is a key of
    ``mapping`` is replaced by its mapped value and every other token is kept
    unchanged. The tokens are re-joined with single spaces.

    Matching is exact and whole-token, so a contraction with punctuation
    attached (e.g. ``"don't,"``) is left as it is.

    Args:
        text: the string to process.
        mapping: dict from contraction to expansion.

    Returns:
        The string with matching tokens expanded.
    """
    for quote_variant in ("’", "‘", "´", "`"):
        text = text.replace(quote_variant, "'")
    tokens = text.split(" ")
    expanded = [mapping.get(token, token) for token in tokens]
    return ' '.join(expanded)

# Expand contractions in the lower-cased questions and store the result in a new
# 'treated_question' column.
df['treated_question'] = df['lowered_question'].apply(clean_contractions, args=(contraction_mapping,))

In [24]:
# Characters that clean_special_chars surrounds with spaces so that they become
# separate tokens. A character that appears more than once in this string is
# padded once per occurrence. The backslash before × is written as an explicit
# escaped backslash (the bare form is an invalid escape sequence and triggers a
# warning); the resulting string value is unchanged.
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
# Replacements that clean_special_chars applies before the padding step:
# typographic quotes and dashes become ASCII equivalents, a few symbols are
# spelled out, and some are dropped (mapped to the empty string).
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }

def clean_special_chars(text, punct, mapping):
    """Normalise special characters in ``text`` and isolate punctuation.

    Steps, in order:
      1. every key of ``mapping`` found in the text is replaced by its value;
      2. every character of ``punct`` is surrounded by a space on each side;
      3. a few leftover specials are rewritten (zero-width space -> space,
         ellipsis -> " ... ", byte-order mark and two Devanagari words removed);
      4. any character still outside the ASCII range is removed.

    Args:
        text: the string to clean.
        punct: iterable of single characters to pad with spaces.
        mapping: dict of substring -> replacement applied first.

    Returns:
        The cleaned, ASCII-only string.
    """
    for p in mapping:
        text = text.replace(p, mapping[p])

    for p in punct:
        text = text.replace(p, f' {p} ')

    # Specials that need a dedicated replacement. They must be handled before the
    # non-ASCII strip below, otherwise they would simply be deleted.
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}
    for s, replacement in specials.items():
        text = text.replace(s, replacement)

    # One pass is enough to drop everything that is still non-ASCII.
    text = re.sub(r'[^\x00-\x7f]', r'', text)

    return text

# Normalise special characters and pad punctuation. The column is overwritten in
# place, so re-running this cell cleans already-cleaned text again.
df['treated_question'] = df['treated_question'].apply(clean_special_chars, args=(punct, punct_mapping))

In [4]:
# Replacement table used by correct_spelling: British spellings, common
# misspellings and run-together words mapped to a normalised form. Keys are
# matched as plain substrings (note the deliberate trailing space in 'youtu ').
# NOTE(review): the text processed in this notebook has been lower-cased, so keys
# containing capitals ('Qoura', 'Whta', 'doI', 'theBest', 'Etherium') presumably
# never match — confirm before changing them.
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

def correct_spelling(x, dic):
    """Apply every ``wrong -> right`` pair of ``dic`` to the string ``x``.

    Each pair is applied with ``str.replace``, i.e. as a plain substring
    substitution, in the dictionary's iteration order. A key therefore also
    matches inside longer words, and later pairs operate on the output of
    earlier ones.

    Args:
        x: the string to correct.
        dic: dict mapping a substring to its replacement.

    Returns:
        The string after all replacements.
    """
    corrected = x
    for wrong, right in dic.items():
        corrected = corrected.replace(wrong, right)
    return corrected

# Apply the spelling replacements. The column is overwritten in place, so
# re-running this cell applies the replacements to already-corrected text.
df['treated_question'] = df['treated_question'].apply(correct_spelling, args=(mispell_dict,))

# Check coverage

In [6]:
def load_embed(file):
    """Load a space-delimited word-embedding text file into a dict.

    Every line is split on single spaces: the first field is used as the key
    and the remaining fields are converted to a float32 numpy array. If a word
    occurs on several lines, the last occurrence wins (dict semantics).

    Args:
        file: path of the embedding file.

    Returns:
        dict mapping word -> numpy float32 vector.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # The file is opened in a `with` block so the handle is closed
    # deterministically once the dict has been built.
    if file == '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec':
        # For this file, lines of 100 characters or fewer are skipped
        # (presumably to drop its short header line — confirm). It is opened
        # with the platform default encoding, as before.
        with open(file) as handle:
            embeddings_index = dict(get_coefs(*o.split(" ")) for o in handle if len(o) > 100)
    else:
        with open(file, encoding='latin') as handle:
            embeddings_index = dict(get_coefs(*o.split(" ")) for o in handle)

    return embeddings_index

# Paths of the three embedding files whose vocabulary coverage is compared below.
glove = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
paragram =  '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
wiki_news = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'

# Load each file into its own word -> vector dict via load_embed; a progress
# message is printed before each load.
print("Extracting GloVe embedding")
embed_glove = load_embed(glove)
print("Extracting Paragram embedding")
embed_paragram = load_embed(paragram)
print("Extracting FastText embedding")
embed_fasttext = load_embed(wiki_news)

Extracting GloVe embedding
Extracting Paragram embedding
Extracting FastText embedding


In [7]:
def build_vocab(texts):
    """Count how often each whitespace-separated token occurs in ``texts``.

    Args:
        texts: pandas Series of strings; each string is tokenised with
            ``str.split()`` (any run of whitespace is a separator).

    Returns:
        dict mapping token -> number of occurrences across all strings.
    """
    counts = {}
    for tokens in texts.apply(lambda text: text.split()).values:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

In [8]:
def check_coverage(vocab, embeddings_index):
    """Report how much of ``vocab`` is covered by ``embeddings_index``.

    Prints two percentages: the share of distinct words that have an embedding,
    and the share of all word occurrences (weighted by count) that do. Both are
    reported as 0 when there is nothing to count.

    Args:
        vocab: dict mapping word -> occurrence count.
        embeddings_index: mapping indexed by word; a ``KeyError`` on lookup
            marks the word as unknown. Any other exception propagates.

    Returns:
        List of ``(word, count)`` tuples for the words without an embedding,
        ordered by count, highest first.
    """
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word, count in vocab.items():
        try:
            known_words[word] = embeddings_index[word]
        except KeyError:
            # Only a missing key means "no embedding"; other errors are real
            # failures and must not be counted as out-of-vocabulary words.
            unknown_words[word] = count
            nb_unknown_words += count
        else:
            nb_known_words += count

    # Guard both ratios so an empty vocabulary does not raise ZeroDivisionError.
    nb_total_words = nb_known_words + nb_unknown_words
    vocab_ratio = len(known_words) / len(vocab) if vocab else 0.0
    text_ratio = nb_known_words / nb_total_words if nb_total_words else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.2%} of all text'.format(text_ratio))
    # Ascending stable sort followed by a reversal: keeps the original ordering
    # of ties (which differs from sorted(..., reverse=True)).
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

In [35]:
# Token counts over the fully cleaned questions.
vocab = build_vocab(df['treated_question'])
# For each embedding: print a label, then let check_coverage print its two
# coverage percentages and return the uncovered words sorted by frequency.
print("Glove : ")
oov_glove = check_coverage(vocab, embed_glove)
print("Paragram : ")
oov_paragram = check_coverage(vocab, embed_paragram)
print("FastText : ")
oov_fasttext = check_coverage(vocab, embed_fasttext)

Glove : 
Found embeddings for 64.22% of vocab
Found embeddings for  99.40% of all text
Paragram : 
Found embeddings for 75.07% of vocab
Found embeddings for  99.65% of all text
FastText : 
Found embeddings for 48.79% of vocab
Found embeddings for  98.82% of all text


# Sampling

In [32]:
# Rows with target == 1, sorted ascending by their cleaned question text.
insincere_questions = (
    df.loc[df['target'] == 1]
    .sort_values('treated_question', ascending=True)
)

In [33]:
# First ten rows in sorted order.
insincere_questions.head(10)

Unnamed: 0,qid,question_text,target,lowered_question,treated_question
840731,a4c44963530035288b93,I want to blow things up with TNT now what?,1,i want to blow things up with tnt now what?,i want to blow things up with tnt now what ?
419699,524197a26dc8a61ae15b,محمد صلاح صاحب أفضل هدف في نصف نهائي الشامبيون...,1,محمد صلاح صاحب أفضل هدف في نصف نهائي الشامبيون...,?
1264281,f7c403b45ab6dc4bc98e,आपकी कंपनी ने गरीब भारतीयों के लिए क्या चमत्का...,1,आपकी कंपनी ने गरीब भारतीयों के लिए क्या चमत्का...,?
420816,527aac2ce6f12f789fe5,"""",1,"""",""""
397785,4ded3ece84a565118aac,“><img src=x onerror=prompt(1)>,1,“><img src=x onerror=prompt(1)>,""" > < img src = x onerror = p..."
1172612,e5c966735cb11638407a,"""After my election I'll have more flexibility....",1,"""after my election i'll have more flexibility....",""" after my election i will have more fl..."
923525,b4fbce1d18c840701d63,"""All countries support Indian Army to occupy C...",1,"""all countries support indian army to occupy c...",""" all countries support indian army to ..."
723217,8d984587f8458d600285,"""All countries support Indian Army to occupy C...",1,"""all countries support indian army to occupy c...",""" all countries support indian army to ..."
173991,22062c33ea9847f33516,"""All men are created equal"", did the Founding ...",1,"""all men are created equal"", did the founding ...",""" all men are created equal "" ,..."
992417,c2770c52bec6db0d2f91,“Are you on Twitter?” she asks Modi. Was this ...,1,“are you on twitter?” she asks modi. was this ...,""" are you on twitter ? "" she ..."


In [34]:
# Last ten rows in sorted order.
insincere_questions.tail(10)

Unnamed: 0,qid,question_text,target,lowered_question,treated_question
1053646,ce76924b7bb628b1bdc8,Your organisation support groups who advocate ...,1,your organisation support groups who advocate ...,your organization support groups who advocate ...
1299203,fea3668db44eec60d2ee,"Your site sucks, I answered a question perfect...",1,"your site sucks, i answered a question perfect...","your site sucks , i answered a question per..."
154093,1e239d0fbc96e6706198,Your website borders on insanity. It is bias b...,1,your website borders on insanity. it is bias b...,your website borders on insanity . it is bi...
1128601,dd2d33d070aee6f72696,Your website is extremely biased against Trump...,1,your website is extremely biased against trump...,your website is extremely biased against trump...
221213,2b48491e3924f2d04d1d,Yur mem gey?,1,yur mem gey?,yur mem gey ?
1292048,fd3c0b80148e66ab69cd,Zaira has tried to get cheap publicity at the ...,1,zaira has tried to get cheap publicity at the ...,zaira has tried to get cheap publicity at the ...
47023,093453aea9649188eab0,"Zakir Naik is such a moron, Why people are lis...",1,"zakir naik is such a moron, why people are lis...","zakir naik is such a moron , why people are..."
1273684,f99d2e46ee1de4f2ef52,Zionism the political movement to establish a ...,1,zionism the political movement to establish a ...,zionism the political movement to establish a ...
1120370,db8d654f9ba120674bd3,"Zoloft isn’t working, and my psychiatrist want...",1,"zoloft isn’t working, and my psychiatrist want...","zoloft is not working , and my psychiatrist..."
455194,592d9f9f0bcdca6e83fa,ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ...,1,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz...,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz...
