In [1]:
import numpy as np
import cPickle
from collections import defaultdict
import sys, re
import pandas as pd

# Label tables for the three splits. pd.read_csv replaces DataFrame.from_csv, which is
# deprecated and no longer exists in pandas >= 1.0. index_col=False is kept so that every
# CSV column stays a data column and rows get a default 0..n-1 integer index.
train_ = pd.read_csv('all_train1.csv', index_col=False)
valid_ = pd.read_csv('valid.csv', index_col=False)
test_  = pd.read_csv('test.csv', index_col=False)

In [2]:
def _load_revs(path, vocab, clean_string, labels=None):
    """
    Read one review per line from `path` and return a list of record dicts.

    Each record holds "text" (the cleaned or lower-cased line) and "num_words"
    (its whitespace-separated token count); when `labels` is given, record i
    also holds "y" = labels[i]. `vocab` is updated in place: every distinct
    word of a line adds 1 to vocab[word].
    """
    revs = []
    with open(path, "rb") as f:
        for idx, line in enumerate(f):
            raw = line.strip()
            orig_rev = clean_str(raw) if clean_string else raw.lower()
            tokens = orig_rev.split()
            # set(): a word is counted once per line, so vocab holds document frequencies.
            for word in set(tokens):
                vocab[word] += 1
            datum = {}
            if labels is not None:
                # Looked up by line number, so labels must have an entry for every line.
                datum["y"] = labels[idx]
            datum["text"] = orig_rev
            datum["num_words"] = len(tokens)
            revs.append(datum)
    return revs


def build_data_cv(data_folder, cv=10, clean_string=True):
    """
    Loads the train / valid / test text files and builds the vocabulary.

    data_folder  : sequence whose first three items are the train, valid and
                   test file paths (one review per line).
    cv           : unused; kept so existing callers keep working. No fold
                   assignment is performed here.
    clean_string : if True each line goes through clean_str(), otherwise it is
                   only lower-cased.

    Train and valid records take their "y" from the module-level frames
    train_['label'] / valid_['label'], indexed by line number; test records
    have no "y".

    Returns (train_revs, valid_revs, test_revs, vocab) where vocab is a
    defaultdict(float) mapping word -> number of lines containing it, counted
    over all three files.
    """
    train_file = data_folder[0]
    valid_file = data_folder[1]
    test_file  = data_folder[2]
    vocab = defaultdict(float)

    train_revs = _load_revs(train_file, vocab, clean_string, labels=train_['label'])
    valid_revs = _load_revs(valid_file, vocab, clean_string, labels=valid_['label'])
    test_revs  = _load_revs(test_file, vocab, clean_string)

    return train_revs, valid_revs, test_revs, vocab
    
def get_W(word_vecs, k=300):
    """
    Build the embedding matrix for word_vecs.

    Returns (W, word_idx_map): W is a float32 array with len(word_vecs)+1 rows
    of width k, where row 0 is all zeros and every other row is one word's
    vector; word_idx_map maps each word to its row number (starting at 1, in
    the iteration order of word_vecs).
    """
    # np.zeros already leaves row 0 as the all-zero vector.
    W = np.zeros(shape=(len(word_vecs) + 1, k), dtype='float32')
    word_idx_map = {}
    for row, word in enumerate(word_vecs, 1):
        W[row] = word_vecs[word]
        word_idx_map[word] = row
    return W, word_idx_map

def load_bin_vec(fname, vocab):
    """
    Loads word vectors from a binary word2vec file (e.g. the 300-d Google/Mikolov vectors).

    The file starts with a text header line "<vocab_size> <vector_size>"; every entry that
    follows is the word, one space, then vector_size raw float32 values. Only words that
    are in `vocab` are kept.

    Returns a dict mapping word -> 1-D float32 numpy array (a writable copy).
    Raises EOFError if the file ends before all entries announced by the header were read.
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        entries_read = 0
        while entries_read < vocab_size:
            chars = []
            while True:
                ch = f.read(1)
                if not ch:
                    # Without this check a truncated file would make this loop spin forever.
                    raise EOFError("%s ended after %d of %d entries"
                                   % (fname, entries_read, vocab_size))
                if ch == b' ':
                    break
                # Skips the newline some writers put after the previous vector.
                if ch != b'\n':
                    chars.append(ch)
            word = b''.join(chars)
            if not isinstance(word, str):
                # Python 3 reads bytes; decode so the lookup against str vocab keys can match.
                word = word.decode('utf-8', 'replace')
            raw = f.read(binary_len)
            if len(raw) != binary_len:
                raise EOFError("%s ended inside the vector of entry %d"
                               % (fname, entries_read))
            if word in vocab:
                # frombuffer returns a read-only view; copy() keeps the result writable.
                word_vecs[word] = np.frombuffer(raw, dtype='float32').copy()
            entries_read += 1
    return word_vecs

def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """
    Give every vocab word that has no vector yet a random one (in place).

    A word qualifies when it is missing from word_vecs and vocab[word] is at
    least min_df. Its vector has k entries drawn uniformly from [-0.25, 0.25);
    0.25 is chosen so the unknown vectors have (approximately) the same
    variance as pre-trained ones. Nothing is returned; word_vecs is mutated.
    """
    missing = [word for word in vocab
               if word not in word_vecs and vocab[word] >= min_df]
    for word in missing:
        word_vecs[word] = np.random.uniform(-0.25, 0.25, k)

def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.

    Characters outside letters, digits and ( ) , ! ? ' ` become spaces,
    contraction suffixes and punctuation are split off as separate tokens,
    and whitespace runs are collapsed. The result is stripped and lower-cased
    unless TREC is True, in which case the original case is kept.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    # NOTE: the replacements for "(", ")" and "?" carry a backslash, so those
    # tokens come out with a leading backslash in the cleaned text.
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " \'s"),
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " \( "),
        (r"\)", " \) "),
        (r"\?", " \? "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    string = string.strip()
    if TREC:
        return string
    return string.lower()

def clean_str_sst(string):
    """
    Tokenization/string cleaning for the SST dataset.

    Replaces every character outside letters, digits and ( ) , ! ? ' ` with a
    space, collapses whitespace runs, then strips and lower-cases the result.
    """
    allowed_only = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    single_spaced = re.sub(r"\s{2,}", " ", allowed_only)
    return single_spaced.strip().lower()

def convert_keys_to_string(dictionary):
    """
    Recursively converts dictionary keys to strings.

    Dict values that are themselves dicts are converted the same way; any
    non-dict argument is returned unchanged. The result is always a plain
    dict, even when a dict subclass is passed in.
    """
    if isinstance(dictionary, dict):
        return {str(key): convert_keys_to_string(value)
                for key, value in dictionary.items()}
    return dictionary

In [3]:
data_folder = ["all_train1.txt","valid_all.txt","test_.txt"]    
print "Loading data...",        
train_revs, valid_revs, test_revs, vocab = build_data_cv(data_folder, cv=10, clean_string=True)
max_l_train = np.max(pd.DataFrame(train_revs)["num_words"])
max_l_valid = np.max(pd.DataFrame(valid_revs)["num_words"])
max_l_test  = np.max(pd.DataFrame(test_revs)["num_words"])
print "Data Loaded!"
print "Number of train sentences: " + str(len(train_revs))
print "Number of valid sentences: " + str(len(valid_revs))
print "Number of test sentences: "  + str(len(test_revs))
print "Vocab size: " + str(len(vocab))
print "Max train sentence length: " + str(max_l_train)
print "Max valid sentence length: " + str(max_l_valid)
print "Max test sentence length: "  + str(max_l_test)
vocab = convert_keys_to_string(vocab)

Loading data... Data Loaded!
Number of train sentences: 60639
Number of valid sentences: 872
Number of test sentences: 1821
Vocab size: 17199
Max train sentence length: 52
Max valid sentence length: 47
Max test sentence length: 53


In [4]:
import gensim

gensim_file50 ='50ensemble_vectors_epoch50.txt'
model50  = gensim.models.Word2Vec.load_word2vec_format(gensim_file50 , binary=False) 

gensim_file100='50ensemble_vectors_epoch100.txt'
model100 = gensim.models.Word2Vec.load_word2vec_format(gensim_file100, binary=False) 

gensim_file150='50ensemble_vectors_epoch150.txt'
model150 = gensim.models.Word2Vec.load_word2vec_format(gensim_file150, binary=False) 

gensim_file200='50ensemble_vectors_epoch200.txt'
model200 = gensim.models.Word2Vec.load_word2vec_format(gensim_file200, binary=False) 

gensim_file250='50ensemble_vectors_epoch250.txt'
model250 = gensim.models.Word2Vec.load_word2vec_format(gensim_file250, binary=False) 

gensim_file300='50ensemble_vectors_epoch300.txt'
model300 = gensim.models.Word2Vec.load_word2vec_format(gensim_file300, binary=False) 

ae50 = {}
for item in vocab.keys():
    try:
        ae50[item] = model50[item] 
    except KeyError:
        continue
print 'Autoencoder50 model loaded.'

ae100 = {}
for item in vocab.keys():
    try:
        ae100[item] = model100[item] 
    except KeyError:
        continue
print 'Autoencoder100 model loaded.'

ae150 = {}
for item in vocab.keys():
    try:
        ae150[item] = model150[item] 
    except KeyError:
        continue
print 'Autoencoder150 model loaded.'

ae200 = {}
for item in vocab.keys():
    try:
        ae200[item] = model200[item] 
    except KeyError:
        continue
print 'Autoencoder200 model loaded.'

ae250 = {}
for item in vocab.keys():
    try:
        ae250[item] = model250[item] 
    except KeyError:
        continue
print 'Autoencoder250 model loaded.'

ae300 = {}
for item in vocab.keys():
    try:
        ae300[item] = model300[item] 
    except KeyError:
        continue
print 'Autoencoder300 model loaded.'

Autoencoder50 model loaded.
Autoencoder100 model loaded.
Autoencoder150 model loaded.
Autoencoder200 model loaded.
Autoencoder250 model loaded.
Autoencoder300 model loaded.


Using gpu device 0: GeForce GT 755M (CNMeM is enabled with initial size: 80.0% of memory, CuDNN not available)


In [5]:
add_unknown_words(ae50, vocab)
W1, word_idx_map = get_W(ae50)

add_unknown_words(ae100, vocab)
W2, _ = get_W(ae100)

add_unknown_words(ae150, vocab)
W3, _ = get_W(ae150)

add_unknown_words(ae200, vocab)
W4, _ = get_W(ae200)

add_unknown_words(ae250, vocab)
W5, _ = get_W(ae250)

add_unknown_words(ae300, vocab)
W6, _ = get_W(ae300)

cPickle.dump([train_revs, valid_revs, test_revs, W1, W2, W3, W4, W5, W6, word_idx_map, vocab], open("mr.p50", "wb"))
print "Dataset created with Autoencoder Ensemble word vectors"

Dataset created with Autoencoder Ensemble word vectors
