In [1]:
import nltk
import pandas as pd
import tensorflow as tf
import numpy as np
import sklearn
import langid
import matplotlib.pyplot as plt
import fasttext

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer, SpanishStemmer

In [3]:
#Load documents in fasttext format
raw_train = "C:\\Users\\Tegh\\fasttext\\amazon_reviews\\train.ft.txt"
raw_test = "C:\\Users\\Tegh\\fasttext\\amazon_reviews\\test.ft.txt"

In [4]:
langid.set_languages(['en','es'])  # ISO 639-1 codes
EngStemmer = EnglishStemmer()
EngStops = set(stopwords.words('english'))

EspStemmer = SpanishStemmer()
EspStops = set(stopwords.words('spanish'))

In [16]:
from tqdm import tqdm
import mmap

def get_num_lines(file_path):
    fp = open(file_path, "r+")
    buf = mmap.mmap(fp.fileno(), 0)
    lines = 0
    while buf.readline():
        lines += 1
    return lines

In [None]:
#Expect precision and recall of 0.916 if all is in order for full train
#Throwout reviews with encoding errors

count = 0
f_train = open(raw_train, 'r')
out_train = open('C:\\Users\\Tegh\\fasttext\\amazon_reviews\\out_train.txt', 'w')

for i in tqdm(range(get_num_lines(raw_train))):
    try:
        line = f_train.readline()
        lang = langid.classify(line)[0]
        if not line:
            break
        line = line.split(" ", 1)
        line[1] = word_tokenize(line[1])
        if lang == 'en':
            line[1] = " ".join([EngStemmer.stem(w) for w in line[1] if (w.isalpha() and w not in EngStops)])
        else:
            line[1] = " ".join([EspStemmer.stem(w) for w in line[1] if (w.isalpha() and w not in EspStops)])
        out_train.write(line[0] + " " + line[1] + "\n")
    except Exception as e:
        count += 1
        continue
f_train.close()
out_train.close()
print("Exceptions in train set: " + str(count))

  2%|██▍                                                                                                                  | 76574/3600000 [04:37<2:03:22, 475.97it/s]

In [21]:
count = 0
f_test = open(raw_test, 'r')
out_test = open('C:\\Users\\Tegh\\fasttext\\amazon_reviews\\out_test.txt', 'w')

for line in tqdm(f_test, total=get_num_lines(raw_test)):
    try:
        lang = langid.classify(line)[0]
        if not line:
            break
        line = line.split(" ", 1)
        line[1] = word_tokenize(line[1])
        if lang == 'en':
            line[1] = " ".join([EngStemmer.stem(w) for w in line[1] if (w.isalpha() and w not in EngStops)])
        else:
            line[1] = " ".join([EspStemmer.stem(w) for w in line[1] if (w.isalpha() and w not in EspStops)])
        out_test.write(line[0] + " " + line[1] + "\n")
    except Exception as e:
        count += 1
        continue
f_test.close()
out_test.close()
print("Exceptions in test set: " + str(count))

  0%|                                                                                                                           | 17/400000 [00:00<25:48, 258.30it/s]


UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 282: character maps to <undefined>

In [24]:
def randSample(docs, pct_acq, pct_del = 0):
    n = int((pct_acq + pct_del) * len(docs))
    indices = np.random.choice(len(docs), n, replace=False)
    return [docs[i] for i in indices]

def dropout(X, Y, pct_acq, pct_del):
    n = int(pct_acq / (pct_acq + pct_del) * len(X))
    indices = np.random.choice(len(X), n, replace=False)
    return [X[i] for i in indices]

['__label__2'
 list(['stune', 'even', 'this', 'sound', 'track', 'beauti', 'it', 'paint', 'seneri', 'mind', 'well', 'i', 'would', 'recomend', 'even', 'peopl', 'hate', 'vid', 'game', 'music', 'i', 'play', 'game', 'chrono', 'cross', 'game', 'i', 'ever', 'play', 'best', 'music', 'it', 'back', 'away', 'crude', 'keyboard', 'take', 'fresher', 'step', 'grate', 'guitar', 'soul', 'orchestra', 'it', 'would', 'impress', 'anyon', 'care', 'listen'])]


In [None]:
def entropy(y_probs):
    return -1.0 * np.sum(y_probs * np.log(y_probs + np.finfo(float).eps)) / np.log(y_probs.size)
    
def least_confidence(y_probs):
    return y_probs.size * (1 - np.nanmax(y_probs)) / (y_probs.size - 1)

In [None]:
def fasttext(iters, pct_acq, metric, pct_del = 0):
    accuracy = list()
    X = randSample(x_train, pct_acq, pct_del)
    if pct_del > 0:
        X = dropout(X, pct_acq, pct_del)
    model = train_supervised('train.txt', autotuneValidationFile='valid.txt')
    y_probs = model.predict(doc for doc in train)
    for itr in range(iters):
        if metric == 'LC':
            uncertainty = pd.DataFrame([least_confidence(y) for y in y_probs]).sort_values(by = 0, ascending = False, axis = 0)
        elif metric == 'entropy':
            uncertainty = pd.DataFrame([entropy(y) for y in y_probs]).sort_values(by = 0, ascending = False, axis = 0)
        n = int((pct_acq + pct_del) * len(train))
        subX = [train[i] for i in uncertainty.iloc[:n].index.tolist()]
        if pct_del > 0:
            subX = dropout(subX, pct_acq, pct_del)
        #Append lines to txt file
        X.extend(subX)
        model = train_supervised('train.txt', autotuneValidationFile='valid.txt')
        y_probs = model.predict(doc for doc in train)
    return model