In [2]:
from core.tasks import process_deep_entries_data
from core.helpers.common import (
    rm_punc_not_nums, rm_punc_not_nums_list,
    rm_stop_words_txt, rm_stop_words_txt_list,
    translate_to_english_txt,
    compose
)
from core.feature_selectors import UnigramFeatureSelector, BigramFeatureSelector
from core.classifiers.NaiveBayes_classifier import NaiveBayesClassifier
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
import random
from nltk.corpus import names, movie_reviews
import langid   

In [5]:
csv_file_path = '_playground/sample_data/nlp_out.csv'

print('PROCESSING DEEP ENTRIES DATA')
data = process_deep_entries_data(csv_file_path)[:15000]
print('DONE')

print('REMOVING PUNC AND STOP WORDS')
stemmer = PorterStemmer()
rm_punc_and_stop = compose(
    rm_punc_not_nums_list,
    rm_stop_words_txt_list,
    lambda x: list(map(str.lower, x))
    #stemmer.stem # comment this if we don't need stemming
)
#rm_punc_and_stop = lambda x: x
data = [(rm_punc_and_stop(str(ex).split()), l) for (ex, l) in data if langid.classify(str(ex))[0] == 'en']
print('DONE')

#data = [(list(movie_reviews.words(fileid)), category)
#       for category in movie_reviews.categories()
#      for fileid in movie_reviews.fileids(category)
#]
#print(data[0])
tags_data = {}
for ex, l in data:
    tags_data[l] = tags_data.get(l, '') + " "+ str(ex)
    
all_tokenized_documents = list(map(lambda x:x.split(), [v for k, v in tags_data.items()]))

print('SHUFFLING DATA')
random.shuffle(data)
print('DONE')

data_len = len(data)
test_len = int(data_len * 0.25)

print('TAKING OUT TEST/TRAIN DATA')
train_data = data[test_len:]
print("length of training data", len(train_data))
test_data = data[:test_len]
print('DONE')

print('COUNTING TAG FREQUENCIES in TRAIN DATA')
d = {}
for ex, l in train_data:
    d[l] = d.get(l, 0) + 1
print(d)
print('DONE')


PROCESSING DEEP ENTRIES DATA
DONE
REMOVING PUNC AND STOP WORDS
DONE
SHUFFLING DATA
DONE
TAKING OUT TEST/TRAIN DATA
length of training data 9326
DONE
COUNTING TAG FREQUENCIES in TRAIN DATA
{'WASH': 716, 'Livelihood': 682, 'Education': 410, 'NFI': 454, 'Protection': 1793, 'Food': 1546, 'Shelter': 926, 'Cross': 407, 'Health': 1405, 'Logistic': 268, 'Agriculture': 350, 'Nutrition': 369}
DONE


In [6]:
print('CREATING FEATURE SELECTOR')
from core.tf_idf import relevant_terms
#most_relevant_terms = list(relevant_terms(all_tokenized_documents))
#selector = UnigramFeatureSelector.new(freq_words=most_relevant_terms)
selector = UnigramFeatureSelector.new(corpus=data, top=2000) # use top 2000 words
print('DONE')

# print('CREATING BIGRAM FEATURE SELECTOR')
# selector = BigramFeatureSelector.new(corpus=data, top=2000)
# selector = DocumentFeatureSelector.new(corpus=data, top=2000)
# print('DONE')

print('CREATING CLASSIFIER')
classifier = NaiveBayesClassifier.new(selector, rm_punc_and_stop, train_data)
print('DONE')

print('CALCULATING ACCURACY')
print(classifier.get_accuracy(test_data))

#print('CONFUSION MATRIX')
#print(classifier.get_confusion_matrix(test_data))

CREATING FEATURE SELECTOR
DONE
CREATING CLASSIFIER
DONE
CALCULATING ACCURACY
0.5791505791505791


In [7]:
print(classifier.get_confusion_matrix(test_data))

            |   A                                             |
            |   g                   L               P         |
            |   r       E           i           N   r         |
            |   i       d           v   L       u   o         |
            |   c       u           e   o       t   t   S     |
            |   u       c       H   l   g       r   e   h     |
            |   l   C   a       e   i   i       i   c   e     |
            |   t   r   t   F   a   h   s       t   t   l   W |
            |   u   o   i   o   l   o   t   N   i   i   t   A |
            |   r   s   o   o   t   o   i   F   o   o   e   S |
            |   e   s   n   d   h   d   c   I   n   n   r   H |
------------+-------------------------------------------------+
Agriculture | <57>  .   3   9   .   5   4   .   2   2   7   4 |
      Cross |   7 <28>  4  10   2   2   7   9   5  19  20  10 |
  Education |   .   3 <98>  4   4   5   4   1   .  12   5   5 |
       Food |  39  14   6<246> 11  24  2