In [3]:
import pickle
import numpy as np
import nltk
from stop_words import get_stop_words
from nltk.stem import WordNetLemmatizer
from nltk.corpus import brown
from nltk.corpus import reuters
import sys
import re

In [13]:
# Input CSV with the training queries (relative path).
csv_file_name_train = "../../../../data/lululu_queries.csv"
# A separate test/validation input is currently disabled; note it would point at the same file.
#csv_file_name_test = "../../../../data/lululu_queries.csv"

In [5]:
# Build one lower-cased vocabulary per NLTK corpus (this can take some time).
oo_wordlist = {token.lower() for token in nltk.corpus.words.words()}
brown_wordlist = {token.lower() for token in brown.words()}
reuters_wordlist = {token.lower() for token in reuters.words()}

# Combined vocabulary: the union of the three corpora.
wordlist = oo_wordlist | brown_wordlist | reuters_wordlist
# English stop words and the lemmatizer used when filtering query tokens.
stop_words = get_stop_words('en')
wnl = WordNetLemmatizer()

In [14]:
def separateString(string):
    """Split a run-together string into dictionary words.

    The input is lower-cased, stripped, and reduced to the characters
    a-z and 0-9 before it is handed to ``seperateStringRec``.

    Args:
        string: Raw text, e.g. ``"swapsadvancedadapters"``.

    Returns:
        The result of ``seperateStringRec`` for the normalised text: the
        segments joined by single spaces, or ``None`` when no segmentation
        is found.
    """
    # The previous pattern "[^\d[a-z]]*" was parsed by `re` as the class
    # [^\d[a-z] followed by "]*", so a literal "[" survived normalisation and
    # made segmentation fail. A plain negated class removes every character
    # that is not a digit or a lowercase ASCII letter.
    normalised = re.sub(r"[^\da-z]", "", string.lower().strip())
    # A fresh list is passed because the recursion mutates it in place.
    return seperateStringRec(normalised, normalised, [], normalised)
    
def seperateStringRec(inp, clean, words,lastLarge=None):
    """Recursively segment ``inp`` into entries of the global ``wordlist``.

    Prefixes of ``inp`` are tried longest-first. A prefix that is found in
    ``wordlist`` (or that consists only of digits) is appended to ``words``
    and the function recurses on the remainder.

    Args:
        inp: The part of the string that still has to be segmented.
        clean: The complete normalised string; segmentation succeeds once the
            collected segments concatenate to exactly this value.
        words: Segments collected so far. The list is mutated in place.
        lastLarge: Remainder recorded after the most recent dictionary match
            (``separateString`` passes the full string initially).

    Returns:
        The segments joined by single spaces, or ``None`` when backtracking
        has no previous segment left to take a letter from. When ``inp`` is
        empty the loop body never runs and ``None`` is returned implicitly.
    """
    # Try prefixes of `inp`, longest first; `ind` is the number of trailing characters dropped
    for ind in range(len(inp)):
        # Candidate segment: the prefix of `inp` selected by partition()
        built = partition(inp, ind)
        # Only one character is left as candidate: backtrack by moving the last
        # letter of the previous segment in front of `lastLarge` and retrying
        if len(built) == 1:
            if (len(words) == 0 or len(words[-1]) == 0):
                # Nothing to take a letter from, so no segmentation was found
                return None
            built = words[-1][-1] + lastLarge
            words[-1] = words[-1][0:-1]
            return seperateStringRec(built,clean,words,lastLarge)
        # NOTE(review): re.match anchors at the start, so this is true only when the
        # whole segment consists of digits, not merely when it ends with one.
        if re.match('\d+$', built):
            # NOTE(review): for an all-digit segment this substitution has nothing to remove.
            built = re.sub("([^\d])\d", "", built)
            words.append(built)
            # NOTE(review): unlike the dictionary branch below, this branch recurses
            # without first checking whether the segmentation is already complete.
            return seperateStringRec(inp.replace(built, '', 1),clean,words,lastLarge)
        # The segment is not purely numeric
        else:
            if built in wordlist:
                words.append(built)
                # Finished once the collected segments reproduce the full string
                if ''.join(words) == clean:
                    return ' '.join(words)
                else:
                    # Remember the remainder and continue segmenting it
                    lastLarge = inp.replace(built, '', 1)
                    return seperateStringRec(inp.replace(built, '', 1),clean,words,lastLarge)

def partition(value, index):
    """Return the slice of ``value`` from 0 up to ``len(value) - index``.

    Args:
        value: Any sliceable sequence (a string in this notebook).
        index: Number subtracted from the length to obtain the slice end.

    Returns:
        ``value[0:len(value) - index]``.
    """
    stop = len(value) - index
    return value[:stop]

In [10]:
def preprareForSplit(word):
    """Reduce a raw query to its alphabetic chunks before segmentation.

    The text is lower-cased and stripped, every character outside a-z is
    treated as a separator, and chunks of a single letter are discarded.

    Args:
        word: Raw query text.

    Returns:
        The remaining chunks (each at least two letters long) joined by single
        spaces; an empty string when nothing remains.
    """
    # The previous pattern "[^[a-z]]*" was parsed as the class [^[a-z] followed
    # by "]*", which left a literal "[" inside the chunks. A plain negated
    # class turns every non-letter into a separator.
    chunks = re.sub(r"[^a-z]", " ", word.lower().strip()).split()
    return " ".join(chunk for chunk in chunks if len(chunk) > 1)


def clean(word):
    """Lower-case and strip ``word`` and blank out non-alphanumeric characters.

    Args:
        word: Raw query text.

    Returns:
        The text in which every character other than a-z and 0-9 has been
        replaced by one space. Runs of such characters therefore yield runs
        of spaces.
    """
    # The previous pattern "[^\d[a-z]]*" was parsed by `re` as the class
    # [^\d[a-z] followed by "]*", so a literal "[" was never replaced.
    return re.sub(r"[^\da-z]", " ", word.lower().strip())

def keepWord(word):
    """Decide whether a query token should be kept.

    Args:
        word: A single token.

    Returns:
        ``False`` for stop words. Otherwise ``True`` when the token's lemma is
        in ``wordlist``, or when the token is longer than three characters and
        ``separateString`` finds a segmentation for it; ``False`` in all other
        cases.
    """
    if word in stop_words:
        return False
    if wnl.lemmatize(word) in wordlist:
        return True
    # Unknown tokens are only kept when they decompose into known words.
    return len(word) > 3 and separateString(word) is not None

def load_training_data(csv_file_name, undersample=False):
    """Load labelled queries from a CSV file and tokenise each query.

    Every non-blank line is expected to look like ``label,query[,...]``; only
    the first two comma-separated fields are used. (The original note on this
    function says the file is produced by the Java feature generator.)

    A query that cleans to a single token which is not in ``wordlist`` is run
    through ``preprareForSplit`` and ``separateString``; when a segmentation is
    found, its words are filtered with ``keepWord``. Every other query keeps
    the tokens obtained by splitting the cleaned text on single spaces.

    Args:
        csv_file_name: Path of the CSV file to read.
        undersample: Accepted for backward compatibility; currently unused.

    Returns:
        A tuple ``(X_raw, y_raw)``: a 1-D object array holding one token list
        per line, and an integer array with the labels.

    Raises:
        ValueError: If the first field of a line is not an integer.
        IndexError: If a non-blank line contains no comma.
    """
    X_raw = []
    y_raw = []
    with open(csv_file_name, 'r') as f:
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would eat real data when the final line has no newline.
            line = line.rstrip("\r\n")
            if not line.strip():
                # Blank lines carry no label; skip them instead of failing in int().
                continue
            parts = line.split(",")
            # label
            y_raw.append(int(parts[0]))
            # query string
            clean_string = clean(parts[1])
            query = clean_string.split(" ")
            if len(query) == 1 and clean_string not in wordlist:
                prepare = preprareForSplit(parts[1])
                if len(prepare) > 2:
                    # Segment once and reuse the result; the recursion is expensive.
                    separated = separateString(prepare)
                    if separated is not None:
                        query = [word for word in separated.split(" ") if keepWord(word)]
            X_raw.append(query)

    # The token lists have different lengths. np.array() on such ragged input
    # raises on NumPy >= 1.24, so fill an object array explicitly.
    X_arr = np.empty(len(X_raw), dtype=object)
    for row, tokens in enumerate(X_raw):
        X_arr[row] = tokens
    y_arr = np.array(y_raw, dtype=int)

    print("Feature shape: {0}".format(X_arr.shape))
    print("Label shape: {0}".format(y_arr.shape))
    return X_arr, y_arr

In [15]:
# Manual walk-through of the cleaning, splitting and filtering steps for one
# run-together query.
string = "s10e6swapsadvancedadapters"
clean_string = clean(string)
query_temp = clean_string.split(" ")
if len(query_temp) == 1 and clean_string not in wordlist:
    prepare = preprareForSplit(string)
    if len(prepare) > 2:
        # Segment once and reuse the result instead of repeating the recursion.
        separated = separateString(prepare)
        if separated is not None:
            query_temp = separated.split(" ")
print(query_temp)
query = [word for word in query_temp if keepWord(word)]
print(query)

['swaps', 'advanced', 'adapters']
['swaps', 'advanced', 'adapters']


In [16]:
def isWordInVocab(word):
    """Return ``True`` when ``word`` is contained in the global ``wordlist``."""
    return word in wordlist
    
def isStopword(word):
    """Return ``True`` when ``word`` is contained in the global ``stop_words``."""
    return word in stop_words

In [17]:
# Load the training queries and report how the labels are distributed.
X_train, y_train = load_training_data(csv_file_name_train, undersample=True)
#X_valid, y_valid = load_training_data(csv_file_name_test,undersample=True)


# Label 1 counts as positive; everything else is negative.
train_pos = (y_train == 1).sum()
train_neg = len(y_train) - train_pos
print("We have {0} positive labels in training set.".format(train_pos))
print("We have {0} negative labels in training set.".format(train_neg))

Feature shape: (246,)
Label shape: (246,)
We have 26 positive labels in training set.
We have 220 negative labels in training set.


In [19]:
# Baseline metric: predict "empty" exactly when no token survives preprocessing.
# NOTE(review): judging by the counter names, label 1 marks an "empty" tweet - confirm.
classify_empty_as_nonempty = classify_nonempty_as_nonempty = 0
for i in range(len(y_train)):
    query_string = " ".join(X_train[i])
    # Compare by value: `is not ""` tests object identity, which is
    # implementation-dependent for strings and a SyntaxWarning on Python 3.8+.
    if query_string != "":
        if y_train[i] == 1:
            classify_empty_as_nonempty += 1
        else:
            classify_nonempty_as_nonempty += 1
print("this metric would: ")
print("classify {0} empty tweets as empty".format(train_pos-classify_empty_as_nonempty))
# This count covers label-0 rows whose query came out empty, i.e. non-empty
# tweets that the metric would wrongly call empty.
print("misclassify {0} nonempty tweets as empty".format(train_neg-classify_nonempty_as_nonempty))
    

this metric would: 
classify 0 empty tweets as empty
misclassify 0 empty tweets as empty


In [21]:
# Write the processed queries back out as "label,query" lines; "-" stands in
# for a query with no remaining tokens. The context manager guarantees the
# file is flushed and closed (the bare open() never closed the handle).
with open('../../../../data/altered_queries_lululu_dict.csv', 'w+') as train_file:
    for i in range(len(y_train)):
        query_string = " ".join(X_train[i])
        if query_string == "":
            query_string = "-"
        print("{0},{1}".format(y_train[i],query_string), file=train_file)
# Index of the last row written.
print(i)


245


In [9]:
# NOTE(review): an earlier version of load_training_data kept as an inert string
# literal; nothing in it is executed. It refers to a name `model` that is not
# defined in the visible cells.
"""
def load_training_data(csv_file_name, undersample=False):
    X_raw = []
    y_raw = []
    with open(csv_file_name, 'r') as f:
        for line_number, line in enumerate(f.readlines()):
            parts = line[:-1].split(",")
            # label
            y_raw.append(int(parts[0]))
            # query string
            query = parts[1].split(" ")
            if len(query) == 1 and parts[1] not in model:
                if (separateString(parts[1]) is not None):
                    query_temp = separateString(parts[1]).split(" ")
                    query = [word for word in query_temp if (len(word) > 2 and word not in stop_words)]
            X_raw.append(query)
        X_raw = np.array(X_raw)
        y_raw = np.array(y_raw)

        print("Feature shape: {0}".format(X_raw.shape))
        print("Label shape: {0}".format(y_raw.shape))
        return X_raw, y_raw
"""

'\ndef load_training_data(csv_file_name, undersample=False):\n    X_raw = []\n    y_raw = []\n    with open(csv_file_name, \'r\') as f:\n        for line_number, line in enumerate(f.readlines()):\n            parts = line[:-1].split(",")\n            # label\n            y_raw.append(int(parts[0]))\n            # query string\n            query = parts[1].split(" ")\n            if len(query) == 1 and parts[1] not in model:\n                if (separateString(parts[1]) is not None):\n                    query_temp = separateString(parts[1]).split(" ")\n                    query = [word for word in query_temp if (len(word) > 2 and word not in stop_words)]\n            X_raw.append(query)\n        X_raw = np.array(X_raw)\n        y_raw = np.array(y_raw)\n\n        print("Feature shape: {0}".format(X_raw.shape))\n        print("Label shape: {0}".format(y_raw.shape))\n        return X_raw, y_raw\n'

In [None]:
def separateString(string):
    """Split a run-together string into dictionary words.

    The input is lower-cased, stripped, and reduced to the characters
    a-z and 0-9 before it is handed to ``seperateStringRec``.

    Args:
        string: Raw text, e.g. ``"swapsadvancedadapters"``.

    Returns:
        The result of ``seperateStringRec`` for the normalised text: the
        segments joined by single spaces, or ``None`` when no segmentation
        is found.
    """
    # The previous pattern "[^\d[a-z]]*" was parsed by `re` as the class
    # [^\d[a-z] followed by "]*", so a literal "[" survived normalisation and
    # made segmentation fail. A plain negated class removes every character
    # that is not a digit or a lowercase ASCII letter.
    normalised = re.sub(r"[^\da-z]", "", string.lower().strip())
    # A fresh list is passed because the recursion mutates it in place.
    return seperateStringRec(normalised, normalised, [], normalised)
    
def seperateStringRec(inp, clean, words,lastLarge=None):
    """Recursively segment ``inp`` into entries of the global ``wordlist``.

    Prefixes of ``inp`` are tried longest-first. A prefix that is found in
    ``wordlist`` (or that consists only of digits) is appended to ``words``
    and the function recurses on the remainder.

    Args:
        inp: The part of the string that still has to be segmented.
        clean: The complete normalised string; segmentation succeeds once the
            collected segments concatenate to exactly this value.
        words: Segments collected so far. The list is mutated in place.
        lastLarge: Remainder recorded after the most recent dictionary match
            (``separateString`` passes the full string initially).

    Returns:
        The segments joined by single spaces, or ``None`` when backtracking
        has no previous segment left to take a letter from. When ``inp`` is
        empty the loop body never runs and ``None`` is returned implicitly.
    """
    # Try prefixes of `inp`, longest first; `ind` is the number of trailing characters dropped
    for ind in range(len(inp)):
        # Candidate segment: the prefix of `inp` selected by partition()
        built = partition(inp, ind)
        # Only one character is left as candidate: backtrack by moving the last
        # letter of the previous segment in front of `lastLarge` and retrying
        if len(built) == 1:
            if (len(words) == 0 or len(words[-1]) == 0):
                # Nothing to take a letter from, so no segmentation was found
                return None
            built = words[-1][-1] + lastLarge
            words[-1] = words[-1][0:-1]
            return seperateStringRec(built,clean,words,lastLarge)
        # NOTE(review): re.match anchors at the start, so this is true only when the
        # whole segment consists of digits, not merely when it ends with one.
        if re.match('\d+$', built):
            # NOTE(review): for an all-digit segment this substitution has nothing to remove.
            built = re.sub("([^\d])\d", "", built)
            words.append(built)
            # NOTE(review): unlike the dictionary branch below, this branch recurses
            # without first checking whether the segmentation is already complete.
            return seperateStringRec(inp.replace(built, '', 1),clean,words,lastLarge)
        # The segment is not purely numeric
        else:
            if built in wordlist:
                words.append(built)
                # Finished once the collected segments reproduce the full string
                if ''.join(words) == clean:
                    return ' '.join(words)
                else:
                    # Remember the remainder and continue segmenting it
                    lastLarge = inp.replace(built, '', 1)
                    return seperateStringRec(inp.replace(built, '', 1),clean,words,lastLarge)

def partition(value, index):
    """Return the slice of ``value`` from 0 up to ``len(value) - index``.

    Args:
        value: Any sliceable sequence (a string in this notebook).
        index: Number subtracted from the length to obtain the slice end.

    Returns:
        ``value[0:len(value) - index]``.
    """
    stop = len(value) - index
    return value[:stop]

def preprareForSplit(word):
    """Reduce a raw query to its alphabetic chunks before segmentation.

    The text is lower-cased and stripped, every character outside a-z is
    treated as a separator, and chunks of a single letter are discarded.

    Args:
        word: Raw query text.

    Returns:
        The remaining chunks (each at least two letters long) joined by single
        spaces; an empty string when nothing remains.
    """
    # The previous pattern "[^[a-z]]*" was parsed as the class [^[a-z] followed
    # by "]*", which left a literal "[" inside the chunks. A plain negated
    # class turns every non-letter into a separator.
    chunks = re.sub(r"[^a-z]", " ", word.lower().strip()).split()
    return " ".join(chunk for chunk in chunks if len(chunk) > 1)


def clean(word):
    """Lower-case and strip ``word`` and blank out non-alphanumeric characters.

    Args:
        word: Raw query text.

    Returns:
        The text in which every character other than a-z and 0-9 has been
        replaced by one space. Runs of such characters therefore yield runs
        of spaces.
    """
    # The previous pattern "[^\d[a-z]]*" was parsed by `re` as the class
    # [^\d[a-z] followed by "]*", so a literal "[" was never replaced.
    return re.sub(r"[^\da-z]", " ", word.lower().strip())

def keepWord(word):
    """Decide whether a query token should be kept.

    Args:
        word: A single token.

    Returns:
        ``False`` for stop words. Otherwise ``True`` when the token's lemma is
        in ``wordlist``, or when the token is longer than three characters and
        ``separateString`` finds a segmentation for it; ``False`` in all other
        cases.
    """
    if word in stop_words:
        return False
    if wnl.lemmatize(word) in wordlist:
        return True
    # Unknown tokens are only kept when they decompose into known words.
    return len(word) > 3 and separateString(word) is not None

def load_training_data(csv_file_name, undersample=False):
    """Load labelled queries from a CSV file and tokenise each query.

    Every non-blank line is expected to look like ``label,query[,...]``; only
    the first two comma-separated fields are used. (The original note on this
    function says the file is produced by the Java feature generator.)

    The cleaned query is split on single spaces. A query that yields a single
    token which is not in ``wordlist`` is additionally run through
    ``preprareForSplit`` and ``separateString``, and a successful segmentation
    replaces the token list. In every case the tokens are then filtered with
    ``keepWord``.

    Args:
        csv_file_name: Path of the CSV file to read.
        undersample: Accepted for backward compatibility; currently unused.

    Returns:
        A tuple ``(X_raw, y_raw)``: a 1-D object array holding one token list
        per line, and an integer array with the labels.

    Raises:
        ValueError: If the first field of a line is not an integer.
        IndexError: If a non-blank line contains no comma.
    """
    X_raw = []
    y_raw = []
    with open(csv_file_name, 'r') as f:
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would eat real data when the final line has no newline.
            line = line.rstrip("\r\n")
            if not line.strip():
                # Blank lines carry no label; skip them instead of failing in int().
                continue
            parts = line.split(",")
            # label
            y_raw.append(int(parts[0]))
            # query string
            clean_string = clean(parts[1])
            query_temp = clean_string.split(" ")
            if len(query_temp) == 1 and clean_string not in wordlist:
                prepare = preprareForSplit(parts[1])
                if len(prepare) > 2:
                    # Segment once and reuse the result; the recursion is expensive.
                    separated = separateString(prepare)
                    if separated is not None:
                        query_temp = separated.split(" ")
            X_raw.append([word for word in query_temp if keepWord(word)])

    # The token lists have different lengths. np.array() on such ragged input
    # raises on NumPy >= 1.24, so fill an object array explicitly.
    X_arr = np.empty(len(X_raw), dtype=object)
    for row, tokens in enumerate(X_raw):
        X_arr[row] = tokens
    y_arr = np.array(y_raw, dtype=int)

    print("Feature shape: {0}".format(X_arr.shape))
    print("Label shape: {0}".format(y_arr.shape))
    return X_arr, y_arr

In [372]:
# Scratch check: how the lemmatizer treats a brand name (the recorded output shows it unchanged).
wnl.lemmatize("toshiba")

'toshiba'

In [373]:
# Scratch check: is the brand name part of the Reuters-derived vocabulary?
"toshiba" in reuters_wordlist

True

In [374]:
# Scratch check: filter a few tokens with keepWord, then run the survivors through clean.
query_temp = ["aloha", "airlines", "pbase" ]
query = [clean(word) for word in query_temp if keepWord(word)]
print(query)

['aloha', 'airlines', 'pbase']
