In [1]:
# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *
import pandas as pd
from utils import tokenization
from collections import Counter
import numpy as np

## Read in Data and Tokenize
We use the basic tokenizer for this part, i.e. the word/punctuation split that is applied before WordPiece tokenization.

In [2]:
# Training data found in a previously generated pickle file (dataframe)
train_data = 'Data/train/train.pkl'
train_df = pd.read_pickle(train_data)

# The Basic Tokenizer only separates text into words and punctuation (not wordpieces)
tokenizer = tokenization.BasicTokenizer()

# One row per training text; the 'words' column holds that text's token list
words_df = pd.DataFrame()
words_df['words'] = train_df['text'].map(tokenizer.tokenize)

## Check the tokenization

In [36]:
# Spot-check ten tokenized rows. A fixed random_state makes the same rows come
# back on every Restart & Run All, so the check is reproducible.
# NOTE(review): the saved output of this cell lists `tf` and `idf` columns, which
# are only added to words_df further down the notebook, so that output was
# produced by an out-of-order run. On a fresh top-to-bottom run only `words`
# exists at this point.
words_df.sample(10, random_state=0)

Unnamed: 0,words,tf,idf
3616,"[he, snatched, it, one, -, handed, from, the, ...","[3, 1, 2, 1, 1, 1, 2, 8, 1, 3, 1, 2, 1, 3, 2, ...","[1.3998152, 6.4415517, 1.609062, 2.3551874, 2...."
8155,"[some, believe, baelor, was, deranged, by, all...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 1, 5, 1, 1, 1, ...","[3.081176, 4.8825974, 6.020819, 1.4650301, 9.9..."
4685,"[come, ., ., ., closer, ., his, men, brought, ...","[1, 17, 17, 17, 1, 17, 4, 1, 1, 1, 1, 17, 1, 1...","[3.2699287, 1.0, 1.0, 1.0, 5.401003, 1.0, 1.46..."
2857,"[she, spun, away, and, said, to, him, ,, no, f...","[2, 1, 1, 7, 2, 1, 1, 10, 2, 1, 1, 2, 5, 2, 1,...","[1.963341, 6.1273026, 3.496303, 1.093492, 2.07..."
7971,"[poisons, and, potions, were, for, the, aftern...","[2, 8, 1, 1, 3, 4, 1, 5, 4, 2, 2, 8, 2, 8, 2, ...","[8.536497, 1.093492, 7.5714164, 2.2945173, 1.8..."
13483,"[no, ,, cressen, thought, ., nor, will, they, ...","[2, 7, 1, 1, 15, 1, 1, 2, 1, 15, 1, 2, 1, 7, 1...","[2.1970203, 1.0010004, 6.472804, 3.1298862, 1...."
8542,"[weasel, ,, weese, purred, ,, next, time, i, s...","[1, 7, 1, 1, 7, 1, 1, 2, 2, 1, 1, 1, 1, 7, 2, ...","[6.8547387, 1.0010004, 6.9023666, 8.536497, 1...."
10869,"[the, smoke, was, stinging, his, eyes, ., tyri...","[5, 1, 2, 1, 3, 1, 6, 1, 1, 3, 1, 3, 1, 1, 1, ...","[1.014097, 5.4064527, 1.4650301, 7.725567, 1.4..."
12295,"[robb, and, bran, and, rickon, ., they, ’, re,...","[1, 2, 1, 2, 1, 11, 1, 4, 1, 1, 11, 5, 2, 2, 1...","[4.0967917, 1.093492, 4.050674, 1.093492, 5.87..."
4991,"[you, never, killed, a, woman, before, ,, did,...","[5, 2, 1, 3, 1, 1, 8, 1, 5, 2, 1, 2, 1, 1, 1, ...","[1.7426108, 3.1181772, 4.573306, 1.1875076, 3...."


## Create Function for Generating Word Counts
These are term frequencies - counts of each word per sequence.

In [5]:
# Function which will be used to generate a list of counts by word for each sequence
def term_freq(seq):
    """Return the within-sequence count of the token at every position.

    Args:
        seq: iterable of hashable tokens (one tokenized sequence).

    Returns:
        int32 numpy array with one entry per token in ``seq``; entry ``i`` is
        the number of times ``seq[i]`` occurs anywhere in ``seq``.
    """
    occurrences = Counter(seq)
    return np.array([occurrences[token] for token in seq], dtype='int32')

## Create a TF-IDF Vectorizer
We want the inverse document frequencies for each word in the corpus.

In [6]:
# tfidf vectorizer from sklearn with the Basic tokenizer
vectorizer = TfidfVectorizer(tokenizer=tokenizer.tokenize)
# Only the fitted vocabulary and IDF weights are used below, so fit without
# also building the tf-idf matrix that fit_transform would return.
vectorizer.fit(train_df.text)

# Create a dictionary of words to idf values.
# get_feature_names() was removed in scikit-learn 1.2, so the feature names are
# recovered from vocabulary_ (term -> column index) and ordered by column index
# so that they line up with idf_.
feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
idf_by_feature = dict(zip(feature_names, vectorizer.idf_))

## Function that will be used to add IDFs for each word in each sequence of our corpus

In [7]:
def idf(seq):
    """Look up the inverse document frequency of every token in a sequence.

    Values are read from the module-level ``idf_by_feature`` dictionary.

    Args:
        seq: iterable of tokens (one tokenized sequence).

    Returns:
        float32 numpy array with one IDF value per token in ``seq``.

    Raises:
        KeyError: if a token is not a key of ``idf_by_feature``.
    """
    return np.array([idf_by_feature[token] for token in seq], dtype='float32')

## Add tf and idf to the dataframe

In [8]:
# For every row, store arrays aligned with its token list: per-token counts
# within the sequence ('tf') and per-token inverse document frequencies ('idf').
words_df['tf'] = words_df['words'].map(term_freq)
words_df['idf'] = words_df['words'].map(idf)

## Generate Word Sampling and Word Replacement Probability Distributions (PDFs)

In [38]:
# Flatten every tokenized sequence into one list covering the whole training corpus
word_list = []
for seq in words_df.words:
    word_list.extend(seq)

# Corpus-wide number of occurrences of each distinct word
wf_by_feature = Counter(word_list)

# Score each word by multiplying its corpus frequency by its idf
word_scores = {
    token: count * idf_by_feature[token]
    for token, count in wf_by_feature.items()
}

# Largest score; every word is weighted by how far its score falls below it
max_score = max(word_scores.values())

# z_prime normalizes the weights into a pdf (i.e., word probs need to sum to 1)
z_prime = np.sum(max_score - np.array(list(word_scores.values())))

word_probs = {
    token: (max_score - score) / z_prime
    for token, score in word_scores.items()
}

# Sanity check: the probabilities should sum to 1
print("Sum of word sampling probabilities: {:5.4f}".format(sum(word_probs.values())))


Sum of word sampling probabilities: 1.0000


In [25]:
# Start an empty frame that will hold one row per word.
# NOTE(review): this rebinds `word_probs`, which the preceding cell bound to a
# dict of word -> sampling probability; that dict is no longer reachable under
# this name once this cell has run.
word_probs = pd.DataFrame()

In [21]:
# One row per distinct word in the corpus (materialized as a list so pandas
# receives a plain sequence rather than a dict view).
word_probs['word'] = list(word_scores)

# `word_probs` now names this DataFrame rather than the dict of probabilities,
# so each probability is recomputed from the word's score using the same
# formula as the dict: (max_score - score) / z_prime.
word_probs['sampling_prob'] = (max_score - word_probs['word'].map(word_scores)) / z_prime

Unnamed: 0,0
0,she
1,would
2,have
3,been
4,my
5,daughter
6,","
7,if
8,the
9,mad
