In [2]:
from core.tasks import process_deep_entries_data
from core.helpers.common import rm_punc_not_nums, rm_stop_words_txt, translate_to_english_txt, compose
from core.classifiers.feature_selector import DocumentFeatureSelector, BigramFeatureSelector
from core.classifiers.NaiveBayes_classifier import NaiveBayesClassifier
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
import random
import langid
   

In [3]:
# --- Configuration for this cell -------------------------------------------
csv_file_path = '_playground/sample_data/nlp_out.csv'
# Fixed seed so the shuffle (and therefore the train/test split and every
# number reported below) is reproducible under Restart Kernel -> Run All.
RANDOM_SEED = 42
# Fraction of the cleaned, English-only examples held out as test data.
TEST_FRACTION = 0.4

print('PROCESSING DEEP ENTRIES DATA')
# The result is iterated below as (example, label) pairs.
raw_entries = process_deep_entries_data(csv_file_path)
print('DONE')

print('REMOVING PUNC AND STOP WORDS')
stemmer = PorterStemmer()
# Text-cleaning pipeline built from the project helpers.
# NOTE(review): the order in which `compose` applies its arguments is not
# visible from this notebook -- confirm against core.helpers.common.
rm_punc_and_stop = compose(
    rm_punc_not_nums,
    rm_stop_words_txt,
    # stemmer.stem  # stemming is currently disabled; uncomment to enable it
)

# Keep only the examples that langid classifies as English, and store the
# cleaned text together with its label. Language detection runs on the raw
# string, before cleaning.
data = []
for ex, l in raw_entries:
    text = str(ex)  # convert once; used for both detection and cleaning
    if langid.classify(text)[0] == 'en':
        data.append((rm_punc_and_stop(text), l))
print('DONE')

print('SHUFFLING DATA')
random.seed(RANDOM_SEED)  # seed right before shuffling so the split is deterministic
random.shuffle(data)
print('DONE')

data_len = len(data)
test_len = int(data_len * TEST_FRACTION)

print('TAKING OUT TEST/TRAIN DATA')
# The first `test_len` shuffled examples form the test set; the rest is used
# for training. The two slices are disjoint and together cover `data`.
train_data = data[test_len:]
print("length of training data", len(train_data))
test_data = data[:test_len]
assert len(train_data) + len(test_data) == data_len
print('DONE')

print('COUNTING TAG FREQUENCIES in TRAIN DATA')
# d maps each label to the number of training examples carrying it.
d = {}
for _, label in train_data:
    d[label] = d.get(label, 0) + 1
print(d)
print('DONE')


PROCESSING DEEP ENTRIES DATA
DONE
REMOVING PUNC AND STOP WORDS
DONE
SHUFFLING DATA
DONE
TAKING OUT TEST/TRAIN DATA
length of training data 14175
DONE
COUNTING TAG FREQUENCIES in TRAIN DATA
{'Protection': 2517, 'WASH': 1156, 'Health': 2679, 'Education': 608, 'Cross': 544, 'Agriculture': 478, 'Logistic': 296, 'Shelter': 1309, 'Food': 2473, 'Nutrition': 646, 'Livelihood': 875, 'NFI': 594}
DONE


In [5]:
# Number of features the selector is asked to keep (the `top` argument).
TOP_FEATURES = 2000

print('CREATING FEATURE SELECTOR')
# Build the selector from the TRAINING split only. Using the full `data`
# corpus would let held-out test examples influence feature selection
# (data leakage) and make the evaluation below optimistic.
selector = DocumentFeatureSelector.new(corpus=train_data, top=TOP_FEATURES) # use top 2000 words
print('DONE')

# Alternative selector based on bigrams instead of single words:
# print('CREATING BIGRAM FEATURE SELECTOR')
# selector = BigramFeatureSelector.new(corpus=train_data, top=TOP_FEATURES)
# print('DONE')

print('CREATING CLASSIFIER')
# NOTE(review): `train_data` has already been passed through
# `rm_punc_and_stop` in the data-preparation cell; if the classifier applies
# the same function to its training input again, the text is cleaned twice --
# confirm against NaiveBayesClassifier.new.
classifier = NaiveBayesClassifier.new(selector, rm_punc_and_stop, train_data)
print('DONE')

#print('ACCURACY', classifier.get_accuracy(test_data))

print('CONFUSION MATRIX')
# Evaluate on the held-out test split only.
print(classifier.get_confusion_matrix(test_data))

CREATING FEATURE SELECTOR
DONE
CREATING CLASSIFIER
DONE
CONFUSION MATRIX
            |    A                                                        |
            |    g                        L                   P           |
            |    r         E              i              N    r           |
            |    i         d              v    L         u    o           |
            |    c         u              e    o         t    t    S      |
            |    u         c         H    l    g         r    e    h      |
            |    l    C    a         e    i    i         i    c    e      |
            |    t    r    t    F    a    h    s         t    t    l    W |
            |    u    o    i    o    l    o    t    N    i    i    t    A |
            |    r    s    o    o    t    o    i    F    o    o    e    S |
            |    e    s    n    d    h    d    c    I    n    n    r    H |
------------+-------------------------------------------------------------+
Agriculture |  