In [None]:
from core.tasks import process_deep_entries_data
from core.helpers.common import rm_punc_not_nums, rm_stop_words_txt, translate_to_english_txt, compose
from core.feature_selectors import UnigramFeatureSelector, BigramFeatureSelector
from core.classifiers.NaiveBayes_classifier import NaiveBayesClassifier
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
import random
import langid
   

In [11]:
csv_file_path = '_playground/sample_data/nlp_out.csv'

print('PROCESSING DEEP ENTRIES DATA')
data = process_deep_entries_data(csv_file_path)[:5000]
print('DONE')

print('REMOVING PUNC AND STOP WORDS')
stemmer = PorterStemmer()
rm_punc_and_stop = compose(
    rm_punc_not_nums,
    rm_stop_words_txt,
    #stemmer.stem # comment this if we don't need stemming
)
data = [(rm_punc_and_stop(str(ex)), l) for (ex, l) in data if langid.classify(str(ex))[0] == 'en']
print('DONE')

tags_data = {}
for ex, l in data:
    tags_data[l] = tags_data.get(l, '') + " "+ str(ex)
    
all_tokenized_documents = list(map(lambda x:x.split(), [v for k, v in tags_data.items()]))

print('SHUFFLING DATA')
random.shuffle(data)
print('DONE')

data_len = len(data)
test_len = int(data_len * 0.4)

print('TAKING OUT TEST/TRAIN DATA')
train_data = data[test_len:]
print("length of training data", len(train_data))
test_data = data[:test_len]
print('DONE')

print('COUNTING TAG FREQUENCIES in TRAIN DATA')
d = {}
for ex, l in train_data:
    d[l] = d.get(l, 0) + 1
print(d)
print('DONE')


PROCESSING DEEP ENTRIES DATA
DONE
REMOVING PUNC AND STOP WORDS
DONE
SHUFFLING DATA
DONE
TAKING OUT TEST/TRAIN DATA
length of training data 2637
DONE
COUNTING TAG FREQUENCIES in TRAIN DATA
{'Protection': 527, 'Logistic': 109, 'WASH': 188, 'Education': 111, 'Cross': 178, 'Agriculture': 116, 'Nutrition': 113, 'Health': 361, 'Livelihood': 218, 'NFI': 107, 'Shelter': 215, 'Food': 394}
DONE


In [12]:
print('CREATING FEATURE SELECTOR')
from core.tf_idf import relevant_terms
most_relevant_terms = list(relevant_terms(all_tokenized_documents))
selector = UnigramFeatureSelector.new(freq_words=most_relevant_terms)
#selector = UnigramFeatureSelector.new(corpus=data, top=2000) # use top 2000 words
print('DONE')

# print('CREATING BIGRAM FEATURE SELECTOR')
# selector = BigramFeatureSelector.new(corpus=data, top=2000)
# selector = DocumentFeatureSelector.new(corpus=data, top=2000)
# print('DONE')

print('CREATING CLASSIFIER')
classifier = NaiveBayesClassifier.new(selector, rm_punc_and_stop, train_data)
print('DONE')

print('CALCULATING ACCURACY')
print(classifier.get_accuracy(test_data))

print('CONFUSION MATRIX')
print(classifier.get_confusion_matrix(test_data))

CREATING FEATURE SELECTOR
DONE
CREATING CLASSIFIER
DONE
CALCULATING ACCURACY
0.18725099601593626
CONFUSION MATRIX


KeyboardInterrupt: 