/
11-nltk_categorize_pvs.py
66 lines (54 loc) · 2.88 KB
/
11-nltk_categorize_pvs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 5 15:13:44 2015
@author: alex
"""
'''
The code below uses NLTK to see how well it can predict the category of a given
resolution using a Naive Bayes Classifier.
This code uses the meeting records (inputs) corpus.
'''
import random
import string
from itertools import chain

import nltk
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.probability import FreqDist
# working dir: UN/
mydir = 'corpus/meeting_records_final_categorized'
NUM_FEATURES = 100    # number of most frequent words used as features
TRAIN_PERCENT = 90    # percentage of documents used for training
SPLIT_SEED = 2015     # fixed seed so the train/test split is reproducible

_PUNCTUATION = frozenset(string.punctuation)


def is_content_token(token, stopword_set):
    """Return True if *token* should be kept as a document word.

    A token is dropped when its lowercase form is in *stopword_set* or when
    it consists only of ASCII punctuation characters.  Testing every
    character (instead of ``token in string.punctuation``, which is a
    substring test) also removes tokens such as ``--`` or ``...``; the empty
    string is dropped as well.
    """
    lowered = token.lower()
    if lowered in stopword_set:
        return False
    return not all(ch in _PUNCTUATION for ch in lowered)


def top_words(freq_dist, n):
    """Return the *n* most frequent words of *freq_dist*, most frequent first.

    *freq_dist* must provide ``most_common(n)`` yielding ``(word, count)``
    pairs (``collections.Counter`` does; NLTK 3's ``FreqDist`` is expected to
    as well).  Slicing ``keys()`` is not a substitute: the keys of such a
    counter are not ordered by frequency.
    """
    return [word for word, _count in freq_dist.most_common(n)]


def document_features(tokens, vocabulary):
    """Return a ``{word: bool}`` dict telling which *vocabulary* words occur in *tokens*."""
    present = set(tokens)  # O(1) membership instead of scanning the token list
    return {word: (word in present) for word in vocabulary}


def shuffled_split(items, train_percent, seed):
    """Shuffle a copy of *items* and split it into ``(train, test)`` lists.

    *train_percent* is an integer percentage; the training part holds
    ``len(items) * train_percent // 100`` elements.  The shuffle is driven by
    a private ``random.Random(seed)``, so the same seed gives the same split
    and neither *items* nor the global random state is modified.
    """
    pool = list(items)
    random.Random(seed).shuffle(pool)
    cut = len(pool) * train_percent // 100
    return pool[:cut], pool[cut:]


if __name__ == '__main__':
    mr = CategorizedPlaintextCorpusReader(
        mydir, r'(?!\.).*\.txt',
        cat_pattern=r'(intervention|soft_action)/.*', encoding='utf-8')
    stop = set(stopwords.words('english'))

    # One (tokens, category) pair per file; the category is the top-level
    # directory name of the file id.
    documents = [
        ([w for w in mr.words(i) if is_content_token(w, stop)],
         i.split('/')[0])
        for i in mr.fileids()
    ]
    if not documents:
        raise SystemExit('no .txt documents found under %r' % mydir)

    word_features = top_words(
        FreqDist(chain.from_iterable(tokens for tokens, _tag in documents)),
        NUM_FEATURES)

    # The file ids are grouped by category directory (NLTK is expected to
    # list them in sorted order), so slicing without shuffling would leave
    # the held-out tail made of a single category.
    train_docs, test_docs = shuffled_split(documents, TRAIN_PERCENT, SPLIT_SEED)
    numtrain = len(train_docs)
    train_set = [(document_features(tokens, word_features), tag)
                 for tokens, tag in train_docs]
    test_set = [(document_features(tokens, word_features), tag)
                for tokens, tag in test_docs]

    classifier = nbc.train(train_set)
    # The earlier version of this script (unshuffled split, features taken as
    # keys()[:100]) printed .87 here, which the author found suspicious.
    print(nltk.classify.accuracy(classifier, test_set))
    classifier.show_most_informative_features(20)
# Output of classifier.show_most_informative_features(20), recorded with the
# feature list taken as word_features.keys()[:100]:
'''
Most Informative Features
immunities = True interv : soft_a = 4.4 : 1.0
designing = True soft_a : interv = 4.2 : 1.0
Western = True soft_a : interv = 3.5 : 1.0
Croat = True interv : soft_a = 3.2 : 1.0
Larsen = True soft_a : interv = 3.1 : 1.0
273 = True interv : soft_a = 2.8 : 1.0
affiliated = True interv : soft_a = 2.5 : 1.0
hanging = True soft_a : interv = 2.5 : 1.0
localized = True soft_a : interv = 2.5 : 1.0
distortions = True soft_a : interv = 2.5 : 1.0
sustaining = True soft_a : interv = 2.2 : 1.0
Initially = True interv : soft_a = 2.0 : 1.0
Pronk = True interv : soft_a = 2.0 : 1.0
shocks = True interv : soft_a = 2.0 : 1.0
regularize = True interv : soft_a = 2.0 : 1.0
steadfastness = True soft_a : interv = 1.9 : 1.0
UNITA = True soft_a : interv = 1.9 : 1.0
ÄúWith = True interv : soft_a = 1.7 : 1.0
rebel = True interv : soft_a = 1.6 : 1.0
succumb = True soft_a : interv = 1.5 : 1.0
'''