# Neural Network word processing toolbox


Originally used in the project [Jigsaw Unintended Bias in Toxicity Classification](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification)



## Load embeddings

In [0]:
# Assign coefficient to word
def get_coefs(word, *arr):
    """
    Pair a word with its coefficient vector.

    @args:
    - word = the token
    - arr = the coefficients, passed as separate positional arguments

    @returns tuple (word, float32 numpy array built from arr)
    """
    vector = np.asarray(arr, dtype='float32')
    return (word, vector)

In [0]:
# Load embeddings from a file path and assign coefficients
def load_embeddings(path):
    """
    Load word embeddings from a text file into a dictionary.

    Every line is stripped, split on single spaces and handed to get_coefs:
    the first token becomes the key and the remaining tokens the vector.

    @args:
    - path = path of the embeddings file (decoded as UTF-8 text)

    @returns dictionary of word / float32 coefficient array
    """
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform locale, which can raise or silently corrupt non-ASCII words.
    # NOTE(review): every line is parsed, so a "<count> <dim>" header line, if
    # the file has one, ends up as an ordinary entry - confirm this is intended.
    with open(path, encoding='utf-8') as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in f)

In [0]:
# Path of the embeddings file to load.
# NOTE(review): judging by the file name this is presumably a 300-dimensional
# crawl-based embedding set - confirm against the actual data source.
filepath_crawl = '/tmp/crawl-300d-2M.vec'
# Dictionary of word / coefficient array built by load_embeddings from that file.
embeddings_crawl = load_embeddings(filepath_crawl)

## Check vocabulary and word coverage





In [0]:
def build_vocab(texts):
    """
    Build vocabulary from a collection of strings
    Counts the number of occurrences of each word in the entire collection.
    Words are obtained by splitting each string on whitespace.

    @args:
    - texts = iterable of strings (pandas Series, list, tuple, ...)

    @returns dictionary of word / occurrences
    """
    # Local import keeps this cell self-contained.
    from collections import Counter

    counts = Counter()
    # Iterating directly (instead of texts.apply) accepts any iterable of
    # strings, not only a pandas Series.
    for text in texts:
        counts.update(text.split())
    # Return a plain dict so a missing word still raises KeyError for callers.
    return dict(counts)

In [0]:
def check_coverage(vocab, embeddings_index):
    """
    Looks up each word of a vocabulary in a word embeddings index and prints
    the share of the vocabulary and of the whole text that has an embedding.

    @args:
    - vocab: vocabulary (dictionary of word / occurrences)
    - embeddings_index: mapping of word / embedding; a lookup of a missing
      word is expected to raise KeyError

    @returns: list of (word, occurrences) tuples for the unrecognized words,
    sorted by decreasing number of occurrences
    """
    unknown_words = {}
    nb_known_vocab = 0      # distinct words that have an embedding
    nb_known_words = 0      # occurrences covered by an embedding
    nb_unknown_words = 0    # occurrences without an embedding
    for word, count in vocab.items():
        try:
            embeddings_index[word]
        except KeyError:
            # Only a missing key means "unknown"; any other error propagates
            # instead of being silently counted as an unknown word.
            unknown_words[word] = count
            nb_unknown_words += count
        else:
            nb_known_vocab += 1
            nb_known_words += count

    # Guard both ratios so an empty vocabulary reports 0% instead of raising
    # ZeroDivisionError.
    total_words = nb_known_words + nb_unknown_words
    vocab_share = nb_known_vocab / len(vocab) if vocab else 0.0
    text_share = nb_known_words / total_words if total_words else 0.0

    print('Found embeddings for {:.3%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.3%} of all text'.format(text_share))

    # Ascending stable sort then reversal (not reverse=True) so that words
    # with equal counts keep the same relative order as before.
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

In [0]:
# Check how much of the vocabulary the crawl embeddings cover and keep the
# uncovered (out-of-vocabulary) words, most frequent first.
# NOTE(review): `vocab` is not defined in this section - presumably the result
# of build_vocab on the training text; confirm against the preceding cells.
oov_crawl = check_coverage(vocab, embeddings_crawl)

## Clean contractions

In [0]:
# Map the missing contractions
def clean_contractions(text, mapping):
    """
    Normalize apostrophe variants and expand contractions in a string.

    @args:
    - text = string to clean
    - mapping = dictionary of contraction / replacement

    @returns the string with every apostrophe-like character replaced by "'"
    and every single-space-separated token found in mapping replaced by its
    mapped value
    """
    specials = ["’", "‘", "´", "`"]
    # One translation pass turns every apostrophe variant into a plain quote.
    apostrophe_table = str.maketrans(dict.fromkeys(specials, "'"))
    normalized = text.translate(apostrophe_table)
    # Splitting on a single space (not on whitespace) preserves the original
    # spacing when the tokens are joined back together.
    tokens = normalized.split(" ")
    expanded = [mapping.get(token, token) for token in tokens]
    return ' '.join(expanded)

In [0]:
# Clean contractions: rewrite every value of train['comment_text'] with
# clean_contractions, using CONTRACTION_LOOKUP_EN as the replacement mapping.
# NOTE(review): `train` and `CONTRACTION_LOOKUP_EN` are defined outside this
# section - presumably a pandas DataFrame and a dict of contraction /
# expansion; confirm against the cells that create them.
train['comment_text'] = train['comment_text'].apply(lambda x: clean_contractions(x, CONTRACTION_LOOKUP_EN))