In [1]:
import pickle
import gensim
import numpy as np
from stop_words import get_stop_words

In [2]:
# Path to the pretrained GoogleNews word2vec vectors (loaded below in binary mode).
word2vecFile = "../../../../data/word2vec/GoogleNews-vectors-negative300.bin"
# Labelled query CSV used as the training split.
csv_file_name_train = "../../../../data/train_queries.csv"
# NOTE: despite the "_test" suffix, this variable points at the validation queries file.
csv_file_name_test = "../../../../data/valid_queries.csv"

In [3]:
# can take some time ...
# Load the pretrained embeddings. `model` is used throughout this notebook for
# vocabulary membership tests (`word in model`) and vector lookup (`model[word]`).
# NOTE(review): newer gensim releases reportedly expose this loader on
# KeyedVectors rather than Word2Vec — confirm against the installed version.
model = gensim.models.word2vec.Word2Vec.load_word2vec_format(word2vecFile, binary=True)
# English stop-word list; used to filter tokens produced by string segmentation.
stop_words = get_stop_words('en')

In [9]:
def load_training_data(csv_file_name, undersample=False):
    """Load labelled queries from a CSV file produced by the Java feature generator.

    Every non-blank line must look like ``<int label>,<query>[,...]``; only the
    first two comma-separated fields are used. The query is tokenised on single
    spaces. A single-token query that is not in the word2vec vocabulary is
    passed to ``separateString``; when that yields a segmentation, the query is
    replaced by the segmented words that are longer than two characters and are
    not stop words (which may leave an empty token list).

    Args:
        csv_file_name: Path of the CSV file to read.
        undersample: Accepted for backward compatibility; currently ignored.

    Returns:
        Tuple ``(X, y)``: ``X`` is a 1-D object array holding one token list per
        query and ``y`` is an array of the integer labels.

    Raises:
        ValueError: If a non-blank line has no query field or a non-integer label.
    """
    X_raw = []
    y_raw = []
    with open(csv_file_name, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            # Strip only the line terminator: slicing off the last character
            # would eat real data when the final line has no trailing newline.
            line = line.rstrip("\r\n")
            if not line:
                continue
            parts = line.split(",")
            if len(parts) < 2:
                raise ValueError(
                    "{0}: line {1} has no query field: {2!r}".format(
                        csv_file_name, line_number, line))
            # Parse the whole row before appending so X and y stay aligned
            # even if parsing fails part-way through.
            label = int(parts[0])
            raw_query = parts[1]
            query = raw_query.split(" ")
            if len(query) == 1 and raw_query not in model:
                separated = separateString(raw_query)
                if separated is not None:
                    query = [word for word in separated.split(" ")
                             if len(word) > 2 and word not in stop_words]
            y_raw.append(label)
            X_raw.append(query)

    # Fill an object array element by element: np.array() on token lists of
    # differing lengths is rejected by recent NumPy, and on equal-length lists
    # it would silently produce a 2-D array instead of one list per query.
    X = np.empty(len(X_raw), dtype=object)
    for position, tokens in enumerate(X_raw):
        X[position] = tokens
    y = np.array(y_raw)

    print("Feature shape: {0}".format(X.shape))
    print("Label shape: {0}".format(y.shape))
    return X, y

In [4]:
def isWordInVocab(word):
    """Return True when `word` is contained in the global `model`, else False."""
    return word in model
    
def isStopword(word):
    """Return True when `word` appears in the global `stop_words`, else False."""
    return word in stop_words

In [5]:
import re

def separateString(string):
    """Try to split a run-together string into vocabulary words.

    The input is lower-cased and stripped, and every character that is not a
    digit or one of ``a``-``z`` is removed; the cleaned string is then handed
    to ``seperateStringRec``.

    Args:
        string: Raw string to segment (e.g. a single-token query).

    Returns:
        Whatever ``seperateStringRec`` returns for the cleaned string: the
        segments joined by single spaces, or None when no segmentation is found.
    """
    # The previous pattern "[^\d[a-z]]*" was a set containing "[" followed by a
    # literal "]*", so "[" survived cleaning. This set is the intended one.
    clean = re.sub(r"[^\da-z]+", "", string.lower().strip())
    return seperateStringRec(clean, clean, [], clean)
    
def seperateStringRec(inp, clean, words,lastLarge=None):
    """Recursively segment `inp`, trying its longest prefix first.

    Prefixes of `inp` are tried from longest to shortest. A prefix made only
    of digits, or one contained in the global `model`, is appended to `words`
    and the rest of `inp` is processed recursively. When only a one-character
    prefix is left, the last character of the previously accepted word is
    moved in front of `lastLarge` and segmentation restarts from there.

    Args:
        inp: The part of the string still to be segmented.
        clean: The full cleaned string; a result is only returned when the
            accumulated words concatenate back to exactly this value.
        words: List of segments accepted so far. It is mutated in place.
        lastLarge: Remainder left after the most recent vocabulary match;
            defaults to `inp` when omitted.

    Returns:
        The segments joined by single spaces, or None when no segmentation
        reproducing `clean` is found.
    """
    if lastLarge is None:
        # Omitting the argument used to fail on string concatenation below.
        lastLarge = inp
    # Index each character in the input string
    for ind in range(len(inp)):
        # Build a segment: the prefix of inp with its last `ind` characters dropped
        built = partition(inp, ind)
        # If only one letter remains, steal a letter from the previous match
        if len(built) == 1:
            # Nothing to steal from: either no word has been accepted yet
            # (previously an IndexError) or the last word is already used up.
            if not words or len(words[-1]) == 0:
                return None
            # NOTE(review): lastLarge is not refreshed after a steal, so two
            # consecutive steals drop the first stolen letter; the final
            # equality check against `clean` then makes the search fail.
            built = words[-1][-1] + lastLarge
            words[-1] = words[-1][0:-1]
            return seperateStringRec(built,clean,words,lastLarge)
        # A prefix consisting solely of digits is accepted as its own segment.
        if re.match(r'\d+$', built):
            words.append(built)
            # A digit run may complete the string; without this check the
            # recursion on the empty remainder fell through to None.
            if ''.join(words) == clean:
                return ' '.join(words)
            # built is a prefix of inp, so slicing removes exactly that prefix.
            # NOTE(review): lastLarge is deliberately left as before here,
            # matching the previous behaviour — confirm that is intended.
            return seperateStringRec(inp[len(built):],clean,words,lastLarge)
        if built in model:
            words.append(built)
            # Check if list of separations joined together is equal to original
            if ''.join(words) == clean:
                return ' '.join(words)
            # Loop back through to separate more
            lastLarge = inp[len(built):]
            return seperateStringRec(lastLarge,clean,words,lastLarge)
    # No prefix could be accepted (or inp was empty).
    return None

def partition(value, index):
    """Return the leading slice of `value` that stops at position len(value) - index.

    For 0 <= index <= len(value) this is `value` without its last `index` items.
    """
    stop = len(value) - index
    return value[:stop]



In [27]:
# Load the training and validation splits (token lists plus integer labels).
X_train, y_train = load_training_data(csv_file_name_train, undersample=True)
X_valid, y_valid = load_training_data(csv_file_name_test, undersample=True)

# Class balance of the training labels: label 1 is counted as positive,
# everything else as negative.
train_pos = (y_train == 1).sum()
train_neg = y_train.shape[0] - train_pos
print("We have {0} positive labels in training set.".format(train_pos))
print("We have {0} negative labels in training set.".format(train_neg))

Feature shape: (497,)
Label shape: (497,)
Feature shape: (249,)
Label shape: (249,)
We have 51 positive labels in training set.
We have 446 negative labels in training set.


In [28]:
# Baseline heuristic: call a query "nonempty" when at least one of its tokens
# is in the word2vec vocabulary, and "empty" otherwise. The counter names
# treat label 1 as "empty" and any other label as "nonempty".
classify_empty_as_nonempty = classify_nonempty_as_nonempty = 0
for i in range(len(y_train)):
    if any(isWordInVocab(word) for word in X_train[i]):
        if y_train[i] == 1:
            classify_empty_as_nonempty += 1
        else:
            classify_nonempty_as_nonempty += 1
print("this metric would: ")
# Label-1 queries with no in-vocabulary token: correctly called empty.
print("classify {0} empty tweets as empty".format(train_pos-classify_empty_as_nonempty))
# Other queries with no in-vocabulary token: these are nonempty examples that
# the heuristic wrongly calls empty (the message used to say "empty" here).
print("misclassify {0} nonempty tweets as empty".format(train_neg-classify_nonempty_as_nonempty))
    

this metric would: 
classify 5 empty tweets as empty
misclassify 3 empty tweets as empty


In [29]:
# Write the training queries, as tokenised/segmented above, back to CSV:
# one "<label>,<space-joined tokens>" record per line.
# The context manager guarantees the file is flushed and closed.
with open('../../../../data/altered_queries_train.csv', 'w') as train_file:
    for i in range(len(y_train)):
        query_string = " ".join(X_train[i])
        if query_string == "":
            # Placeholder so the query field is never empty.
            # NOTE(review): presumably chosen as an out-of-vocabulary token — confirm.
            query_string = "sadljk"
        # write() emits exactly one newline per record; print() with a trailing
        # "\n" in the text produced a blank line after every row.
        train_file.write("{0},{1}\n".format(y_train[i], query_string))

In [31]:
# Write the validation queries, as tokenised/segmented above, back to CSV:
# one "<label>,<space-joined tokens>" record per line.
# The context manager guarantees the file is flushed and closed.
with open('../../../../data/altered_queries_valid.csv', 'w') as valid_file:
    for i in range(len(y_valid)):
        query_string = " ".join(X_valid[i])
        if query_string == "":
            # Placeholder so the query field is never empty.
            # NOTE(review): presumably chosen as an out-of-vocabulary token — confirm.
            query_string = "sadljk"
        # write() emits exactly one newline per record; print() with a trailing
        # "\n" in the text produced a blank line after every row.
        valid_file.write("{0},{1}\n".format(y_valid[i], query_string))

In [32]:
# Direct vector lookup for a token; as the recorded output shows, this raises
# KeyError because 'saylor' is not in the loaded vocabulary.
model['saylor']

KeyError: 'saylor'