In [1]:
import nltk
import pandas as pd
import tensorflow as tf
import numpy as np
import sklearn
import langid

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df = pd.read_csv("C:\\Users\\Tegh\\Documents\\TweetData\\full-corpus.csv")
df = df.sample(frac=1)
corpus = np.array(df['TweetText'])
labels = np.array(df['Topic'])

In [4]:
langid.set_languages(['en','es','fr','pt','nl','it','de'])  # ISO 639-1 codes
codeDict = {'en' : 'english','es' : 'spanish','fr' : 'french','pt' : 'portuguese',
            'nl' : 'dutch','it' : 'italian','de' : 'german'}

langArr = np.array([codeDict[langid.classify(doc)[0]] for doc in corpus])

print('English Tweets = ' + str((langArr == 'english').sum()))
print('Spanish Tweets = ' + str((langArr == 'spanish').sum()))
print('French Tweets = ' + str((langArr == 'french').sum()))

English Tweets = 3660
Spanish Tweets = 672
French Tweets = 205


In [5]:
#Tokenize words
corpus = [word_tokenize(doc) for doc in corpus]

In [6]:
#Stemming, removing stop words, numbers, punctuation
for lang in codeDict.values():
    stops = set(stopwords.words(lang))
    stemmer = SnowballStemmer(lang)
    for index in np.where(langArr == lang)[0]:
        corpus[index] = " ".join([stemmer.stem(w) for w in corpus[index] if (w.isalpha() and w not in stops)])

In [7]:
#Build Tf-idf sparse matrix
vectorizer = TfidfVectorizer(min_df=2,max_df=0.5,ngram_range=(1,2), max_features = 1000)
tfidf_matrix = vectorizer.fit_transform(corpus)
vocab = np.array(vectorizer.get_feature_names())

In [8]:
def randSample(corpus, labels, pct_acq, pct_del):
    n = int((pct_acq + pct_del) * len(corpus))
    indices = np.random.choice(len(corpus), n, replace=False)
    X = [corpus[i] for i in indices]
    Y = [labels[i] for i in indices]
    return X, Y

In [9]:
def entropy(y_probs):
     return -1.0 * np.sum(y_probs * np.log(y_probs + np.finfo(float).eps)) / np.log(y_probs.size)
    
def least_confidence(y_probs):
     return y_probs.size * (1 - np.nanmax(y_probs)) / (y_probs.size - 1)

In [15]:
Iterations = 10
pct_acq = 0.02
pct_del = 0.0
metric = 'entropy'

X, Y = randSample(corpus, labels, pct_acq, pct_del)
vectorizer = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1,2), max_features = 1000)
tfidf_matrix = vectorizer.fit_transform(X)
model = MultinomialNB()
model.fit(tfidf_matrix, Y)
y_probs = model.predict_proba(vectorizer.transform(corpus))
del vectorizer
del model
for itr in range(10):
    if metric == 'LC':
        uncertainty = pd.DataFrame([least_confidence(y) for y in y_probs]).sort_values(by = 0, ascending = False, axis = 0)
    elif metric == 'entropy':
        uncertainty = pd.DataFrame([entropy(y) for y in y_probs]).sort_values(by = 0, ascending = False, axis = 0)
    n = int((pct_acq + pct_del) * len(corpus))
    X.extend([corpus[i] for i in uncertainty.iloc[:n].index.tolist()])
    Y.extend([labels[i] for i in uncertainty.iloc[:n].index.tolist()])
    vectorizer = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1,2), max_features = 1000)
    tfidf_matrix = vectorizer.fit_transform(X)
    model = MultinomialNB()
    model.fit(tfidf_matrix, Y)
    print(model.score(vectorizer.transform(corpus), labels))

0.7643262272638373
0.625073342460395
0.5757872090749071
0.560923137101506
0.549970663015842
0.4901232153334637
0.488362996283982
0.48406023860747116
0.48327791902992373
0.48288675924115
