### This script loads all the candidate tweets into memory and finds the maximum and mean sentence length;
### it then loads the training data, looks up the word2vec embedding of each unique word, and saves the resulting lookup table to a file.

In [1]:
import gensim
import numpy as np
import pickle
import csv



## Load all the candidate data and find the maximum and mean length

In [2]:
# Load the candidate tweets.
TWEETS_PATH = 'dataset/quick_data.csv'
with open(TWEETS_PATH, newline='') as csvfile:
    # Stream the rows instead of materialising the whole CSV as a list first.
    csvreader = csv.reader(csvfile, delimiter=',')
    next(csvreader, None)  # skip the header row (no error if the file is empty)
    # The third column (row[2]) is the original text;
    # the second column (row[1]) is the modified text.
    TWEETS_LIST = [row[2] for row in csvreader]
print('Number of tweets: ',len(TWEETS_LIST))

# Find the max and mean length, measured in space-separated tokens.
# Each tweet is split exactly once.
current_max_length = 0
sum_length = 0
for tweet in TWEETS_LIST:
    tweet_length = len(tweet.split(' '))
    if current_max_length < tweet_length:
        current_max_length = tweet_length
    sum_length += tweet_length
print('Max length: ',current_max_length)
# Guard the mean against an empty data set (would otherwise divide by zero).
mean_length = int(sum_length/len(TWEETS_LIST)) if TWEETS_LIST else 0
print('Mean length: ', mean_length)

Number of tweets:  1492215
Max length:  84
Mean length:  10


## Load training dataset

In [13]:
# Load training dataset

# Alternative training set: 'dataset/train_set.csv'
TRAIN_SET_NAME = 'train_set_with_predicted'
TRAIN_DATA_PATH = 'dataset/'+TRAIN_SET_NAME+'.csv'

with open(TRAIN_DATA_PATH, newline='') as csvfile:
    # Stream the rows instead of materialising the whole CSV as a list first.
    csvreader = csv.reader(csvfile, delimiter=',')
    next(csvreader, None)  # skip the header row (no error if the file is empty)
    # The third column (row[2]) is the original text;
    # the second column (row[1]) is the modified text.
    TWEETS_LIST = [row[2] for row in csvreader]
print('Number of tweets in training dataset: ',len(TWEETS_LIST))

# Get all the unique words in training data, in first-seen order.
# Membership is tested against a set so each check is O(1); testing against
# the UNIQUE_WORDS list itself made the whole pass quadratic.
UNIQUE_WORDS = []
seen_words = set()
for tweet in TWEETS_LIST:
    for word in tweet.split(' '):
        if word not in seen_words:
            seen_words.add(word)
            UNIQUE_WORDS.append(word)
print('Number of unique words: ',len(UNIQUE_WORDS))

Number of tweets in training dataset:  12660
Number of unique words:  17025


## Load Twitter Word2Vec model (dimension: k=400)

In [4]:
# Binary word2vec file to load.
# Alternative (GoogleNews word2vec model):
#   '/Users/matthew/Downloads/GoogleNews-vectors-negative300.bin'
MODEL_PATH = 'dataset/word2vec_twitter_model.bin'

# `limit` caps how many word vectors are read from the file (3,000,000 here).
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    MODEL_PATH,
    binary=True,
    limit=3000000,
)

## Generate lookup table

In [14]:
# Build the word -> embedding lookup table for every unique training word.
LOOKUP_DICT = dict()
OOV_WORDS = [] # Words out of vocabulary (OOV)

for word in UNIQUE_WORDS:
    try:
        vector = w2v_model[word]
    except KeyError:
        # Only a missing key means "out of vocabulary"; any other error
        # (including KeyboardInterrupt) must propagate instead of being
        # silently counted as OOV.
        OOV_WORDS.append(word)
    else:
        # np.array copies the data, so the table owns its own vectors.
        LOOKUP_DICT[word] = np.array(vector) # add to lookup table

print('OOV words: ',len(OOV_WORDS))
# Guard the ratio against an empty vocabulary (would otherwise divide by zero).
oov_ratio = 100*len(OOV_WORDS)/len(UNIQUE_WORDS) if UNIQUE_WORDS else 0.0
print('OOV ratio (%): ',oov_ratio)

OOV words:  1539
OOV ratio (%):  9.039647577092511


## Save lookup table to file

In [15]:
# Pickle the lookup table to 'dataset/<TRAIN_SET_NAME>.lookup' using the
# highest pickle protocol available in this interpreter.
with open('dataset/' + TRAIN_SET_NAME + '.lookup', mode='wb') as f:
    pickle.dump(LOOKUP_DICT, f, pickle.HIGHEST_PROTOCOL)
print('Saved')

Saved


## Test loading the lookup table

In [16]:
# Round-trip check: reload the pickled table, print its size, and print the
# element-wise difference between the reloaded and in-memory vectors for
# 'you' (all zeros means the vector survived the round trip unchanged).
with open('dataset/' + TRAIN_SET_NAME + '.lookup', 'rb') as openfile:
    try:
        LOADED_TABLE = pickle.load(openfile)
    except EOFError:
        # An empty or truncated file ends up here.
        print('ERROR!')
    else:
        print(len(LOADED_TABLE))
        print(LOADED_TABLE['you'] - LOOKUP_DICT['you'])

15486
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.