forked from stroypet/EmotionTweetClassifier_3412260
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_semeval_classifier.py
149 lines (121 loc) · 5.2 KB
/
run_semeval_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#########################################################################
############## Semeval - Sentiment Analysis in Twitter #################
#########################################################################
####
#### Authors: Pedro Paulo Balage Filho e Lucas Avanço
#### Version: 2.0
#### Date: 26/03/14
####
#### This python script provides a template to run the hybrid sentiment classifier for Semeval 2014 Task 9
#### Information about Semeval format can be found at:
#### http://alt.qcri.org/semeval2014/task9/
####
# Python 3 compatibility
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import with_statement
from __future__ import unicode_literals
# Import the libraries created for this task
from SemevalTwitter import SemevalTwitter
from TwitterHybridClassifier import TwitterHybridClassifier
from nltk import ConfusionMatrix
from pprint import pprint
import codecs
# Print the confusion matrix for classification
def confusion_matrix(gold, guess):
    """Print the accuracy and the confusion matrix of a classification run.

    Args:
        gold: sequence of reference labels.
        guess: sequence of predicted labels, aligned position-by-position
            with ``gold``.

    Returns:
        None. The accuracy line and the matrix are written to standard
        output. When ``gold`` is empty a placeholder line is printed instead.
    """
    total = len(gold)
    if total == 0:
        # Nothing to score: the accuracy ratio below would divide by zero.
        # This happens whenever a method classified no instance at all.
        print('Accuracy: n/a (no instances)')
        return

    correct = sum(
        1 for expected, predicted in zip(gold, guess) if predicted == expected
    )
    accuracy = float(correct) / float(total)
    print('Accuracy: {:.2%}'.format(accuracy))

    # Confusion matrix built by NLTK from the two label sequences
    cm = ConfusionMatrix(gold, guess)
    print(cm)
# Dataset locations used in the original experiment. Due to SemEval
# constraints the data cannot be shipped with the code; the authors offer to
# send the datasets privately on request by e-mail.
train_path = 'Data/Semeval/TrainSet/twitter-train-gold-B-2014.tsv'
dev_path = 'Data/Semeval/DevSet/twitter-dev-gold-B.tsv'

# Test set selection: exactly one assignment is active, the others are kept
# as ready-to-use alternatives.
# NOTE(review): the active path is absolute and machine-specific.
#test_path = 'Data/Semeval/TestSet/2014/SemEval2014-task9-test-B-input.txt'
#test_path = '/Users/stroypet/PycharmProjects/EmotionTweetClassifier_3412260/Data/Semeval/TestSet/2014/JOINED_GOLD_TEST_DATA.txt'
test_path = '/Users/stroypet/PycharmProjects/EmotionTweetClassifier_3412260/TweetVisualization/tweets.txt'
#test_path = '/Users/stroypet/PycharmProjects/EmotionTweetClassifier_3412260/ML-PRUNED-NEUTRALS.txt'
# TestSet from 2013
# test_path = 'Data/Semeval/TestSet/2013/twitter-test-GOLD-B.tsv'

# Load the three files through SemevalTwitter and keep a handle on each split
print('Reading Datasets and Pre-processing...')
semeval = SemevalTwitter(train_path, dev_path, test_path)
trainset, devset, testset = semeval.trainset, semeval.devset, semeval.testset

# The classifier is built from (tweet_message, label) pairs: every tweet of
# the train split followed by every tweet of the dev split.
print('Training...')
tweets = [
    (tweet['MESSAGE'], tweet['SENTIMENT'])
    for dataset in (trainset, devset)
    for tweet in dataset
]
classifier = TwitterHybridClassifier(tweets)
# Apply the classifier for all tweets in the testset
print('Testing...')

# Count how many instances were classified by each method.
# RB: Rule-based, LB: Lexicon-based, ML: Machine Learning classifier (in this order)
count = {'RB': 0, 'LB': 0, 'ML': 0}

# Gold labels and predictions for the whole test set, plus one pair of lists
# per method so each method can be scored on the tweets it decided.
guess = list()
gold = list()
guessrb = list()
goldrb = list()
guesslb = list()
goldlb = list()
guessml = list()
goldml = list()

# Prediction lines are collected in a list and joined once after the loop;
# `output` ends up holding the full predictions string.
output_lines = []
output = ''

# Load test set tweets.
# NOTE: the original author's remarks say the dev set was substituted here
# (and in the loop below) so that the confusion matrices would work:
#tweets = [tweet['MESSAGE'] for tweet in devset]
tweets = [tweet['MESSAGE'] for tweet in testset]

# Classify each instance in the testset with the TwitterHybridClassifier loaded before
predictions = classifier.classify_batch(tweets)

# Build the semeval prediction lines and the evaluation variables
# if a testset is provided.
# NOTE(review): indentation was lost in the scraped source; the evaluation
# below is kept inside this guard so an empty test set prints no matrices.
if len(testset) > 0:
    for index, tweet in enumerate(testset):
        # Indexing (rather than zip) keeps a missing prediction a loud error
        prediction, method = predictions[index]
        count[method] += 1
        output_lines.append(
            tweet['SID'] + '\t' + tweet['UID'] + '\t' + prediction + '\t' + tweet['MESSAGE'] + '\n'
        )

        if method == 'RB':
            guessrb.append(prediction)
            goldrb.append(tweet['SENTIMENT'])
        elif method == 'LB':
            guesslb.append(prediction)
            goldlb.append(tweet['SENTIMENT'])
        elif method == 'ML':
            guessml.append(prediction)
            goldml.append(tweet['SENTIMENT'])

        guess.append(prediction)
        gold.append(tweet['SENTIMENT'])

    output = ''.join(output_lines)

    print("Hybrid Classifier:")
    confusion_matrix(gold, guess)

    # A method may have classified no tweet at all; scoring an empty list
    # would divide by zero, so such methods are reported instead of scored.
    for title, method_gold, method_guess in (
        ("Rule (hash) Classifier:", goldrb, guessrb),
        ("Lexicon Classifier:", goldlb, guesslb),
        ("Machine learning Classifier:", goldml, guessml),
    ):
        print(title)
        if method_gold:
            confusion_matrix(method_gold, method_guess)
        else:
            print('No instances were classified by this method.')
# TODO (original author): read in the testset gold data here; the label
# column (line.split[2]) of the gold input can be overwritten with the
# testset predictions and the file renamed.

# Write Semeval output file.
# The context manager closes the handle deterministically, so the
# predictions are flushed to disk even if a later step fails.
output_file = 'task9-NILC_USP-B-twitter-constrained.output'
with codecs.open(output_file, 'w', 'utf8') as output_handle:
    output_handle.write(output)

# Print some statistics
print('Statistics - Number of instances processed by each method')
print('Rule Based: ', count['RB'])
print('Lexicon Based: ', count['LB'])
print('Machine Learning: ', count['ML'])
print('\nSemeval output file in: ', output_file)

# Print individual scores for (message, gold label) pairs of the test set.
# NOTE(review): the original trailing comment said "devset used here", but
# the code reads testset.
tweets = [(tweet['MESSAGE'], tweet['SENTIMENT']) for tweet in testset]
classifier.output_individual_scores(tweets)