forked from stroypet/EmotionTweetClassifier_3412260
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SemevalTwitter.py
129 lines (115 loc) · 5.15 KB
/
SemevalTwitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#########################################################################
############## Semeval - Sentiment Analysis in Twitter #################
#########################################################################
####
#### Authors: Pedro Paulo Balage Filho e Lucas Avanço
#### Version: 2.0
#### Date: 26/03/14
####
# Python 3 compatibility
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import with_statement
from __future__ import unicode_literals
import codecs
import os
#### Class to provide a data reader for Semeval 2014 Task 9 format.
#### The reader was designed for subtask A (sentiment for the twitter message) only.
#### Information about Semeval format can be found at:
#### http://alt.qcri.org/semeval2014/task9/
####
class SemevalTwitter(object):
# Constructor.
def __init__(self,train_path,dev_path,test_path):
# class variables
self.train_path = train_path
self.dev_path = dev_path
self.test_path = test_path
self.trainset = list()
self.devset = list()
self.testset = list()
self.reader()
# read semeval dataset format
# modifies the train, dev and testset class variables
def reader(self):
# Read the trainset
tweets = []
if not os.path.exists(self.train_path):
print('Trainset file not found. However, they are not necessary if provided the training model (file: model_pythonx.pkl)')
else:
fp = codecs.open(self.train_path, 'r', encoding='utf8')
fp = codecs.open(self.train_path, 'r', encoding='utf8')
lines = fp.readlines()
for line in lines:
#for line in fp:
line = line.strip()
tweet_line = line.split('\t')
if len(tweet_line) != 4:
print('Error to read TrainSet. Must have 4 args. Line: ', line)
tweet = {}
tweet['SID'] = tweet_line[0]
tweet['UID'] = tweet_line[1]
sentiment = tweet_line[2][1:-1]
# classes objective and neutral merged as proposed in the task
if sentiment in ['objective','objective-OR-neutral']:
sentiment = 'neutral'
tweet['SENTIMENT'] = sentiment
tweet['MESSAGE'] = tweet_line[3].strip()
if tweet['MESSAGE'] != 'Not Available':
tweets.append(tweet)
self.trainset = tweets
# Read the devset
tweets = []
if not os.path.exists(self.dev_path):
print('Devset file not found. However, they are not necessary if provided the training model (file: model_pythonx.pkl)')
else:
fp = codecs.open(self.dev_path, 'r', encoding='utf8')
lines = fp.readlines()
for line in lines:
#for line in fp:
# corrects the \u and escape characters in the message (only applies
# for the dev and testset provided by the organization
line = line.strip()
line = line.replace('\\\\', '').replace('\\"', '"').replace("\\'", "'").replace('\\u2019', '\'').replace('\\u002c', ',')
tweet_line = line.split('\t')
if len(tweet_line) != 4:
print('Error to read TrialSet. Must have 4 args. Line: ', line)
tweet = {}
tweet['SID'] = tweet_line[0]
tweet['UID'] = tweet_line[1]
sentiment = tweet_line[2]
# classes objective and neutral merged as proposed in the task
if sentiment in ['objective','objective-OR-neutral']:
sentiment = 'neutral'
tweet['SENTIMENT'] = sentiment
tweet['MESSAGE'] = tweet_line[3].strip()
tweets.append(tweet)
self.devset = tweets
# Read the testset
tweets = []
if not os.path.exists(self.test_path):
print('Testset file not found. You should provide this file if you want to replicate Semeval results.')
else:
fp = codecs.open(self.test_path, 'r', encoding='utf8')
lines = fp.readlines()
for line in lines:
#for line in fp:
# corrects the \u and scape caracters in the message (only applies
# for the dev and testset provided by the organization
line = line.strip()
line = line.replace('\\\\', '').replace('\\"', '"').replace("\\'", "'").replace('\\u2019', '\'').replace('\\u002c', ',')
tweet_line = line.split('\t')
if len(tweet_line) != 4:
print('Error to read TestSet. Must have 4 args. Line: ', line)
tweet = {}
tweet['SID'] = tweet_line[0]
tweet['UID'] = tweet_line[1]
sentiment = tweet_line[2]
tweet['SENTIMENT'] = sentiment
tweet['MESSAGE'] = tweet_line[3]
tweets.append(tweet)
self.testset = tweets