"""
author: Vered Shwartz
Functions to get tweets from the Twitter API and expand the dataset
Twitter code is adapted from: https://gist.github.com/emaadmanzoor
"""
import re
import os
import math
import time
import codecs
import twitter
def get_tweets(tweet_ids, consumer_key, consumer_secret, access_token, access_token_secret, nlp):
    """
    Expands tweets from Twitter.

    :param tweet_ids: the list of tweet IDs to expand
    :param consumer_key: Twitter API consumer key
    :param consumer_secret: Twitter API consumer secret
    :param access_token: Twitter API access token
    :param access_token_secret: Twitter API access token secret
    :param nlp: tokenizer whose tokens expose `.lower_` (presumably spaCy -- TODO confirm)
    :return: a dictionary of tweet ID to tweet text
    """
    # Save tweets in a temporary file, in case the script stops working and re-starts
    tweets = {}
    if os.path.exists('tweet_temp'):
        with codecs.open('tweet_temp', 'r', 'utf-8') as f_in:
            lines = [tuple(line.strip().split('\t')) for line in f_in]
        tweets = {tweet_id: tweet for (tweet_id, tweet) in lines}

    api = twitter.Api(consumer_key=consumer_key, consumer_secret=consumer_secret,
                      access_token_key=access_token, access_token_secret=access_token_secret)

    sleeptime, resettime = reset_sleep_time(api)

    with codecs.open('tweet_temp', 'a', 'utf-8') as f_out:
        for tweet_id in tweet_ids:
            # We didn't download this tweet yet
            if tweet_id not in tweets:
                try:
                    curr_tweet = api.GetStatus(tweet_id, include_entities=False)
                    tweets[tweet_id] = clean_tweet(' '.join([t.lower_ for t in nlp(curr_tweet.text)]))
                except twitter.TwitterError as err:
                    error = str(err)
                    # If the rate limit exceeded, this script should be stopped and resumed the next day
                    if 'Rate limit exceeded' in error:
                        raise
                    # Other error - the tweet is not available :(
                    print('Error reading tweet id: %s : %s' % (tweet_id, error))
                    tweets[tweet_id] = 'TWEET IS NOT AVAILABLE'
                # Persist immediately so a crash/restart doesn't lose progress.
                # f_out.write works in both Python 2 and 3, unlike `print >> f_out`.
                f_out.write('\t'.join((tweet_id, tweets[tweet_id])) + '\n')
                # Throttle to stay under the API rate limit
                time.sleep(sleeptime)
                if time.time() >= resettime:
                    sleeptime, resettime = reset_sleep_time(api)
    return tweets
def reset_sleep_time(api):
    """
    Compute how long to sleep between API requests (from: https://gist.github.com/emaadmanzoor)

    :param api: twitter API object (must support GetRateLimitStatus)
    :return: (sleep_time, reset_time) -- seconds to sleep between hits, and the
             epoch time at which the rate-limit window resets
    """
    rate_limit_stats = api.GetRateLimitStatus()
    show_status_limits = rate_limit_stats['resources']['statuses']['/statuses/show/:id']
    hits_remaining = show_status_limits['remaining']
    reset_time = show_status_limits['reset']
    if hits_remaining > 0:
        # Spread the remaining hits evenly over the time left in the window.
        # Clamp at 0: if the reset time has already passed, the old formula
        # produced a negative value, and time.sleep raises on negative input.
        sleep_time = max(0, int(math.ceil((int(reset_time) - time.time()) / float(hits_remaining))))
    else:
        # No hits left in this window -- poll conservatively until it resets
        sleep_time = 5
    print('Sleeping %s seconds between API hits until %s' % (sleep_time, reset_time))
    return sleep_time, reset_time
def load_resource(resource_file):
    """
    Load a tab-separated resource from a file.

    :param resource_file: the resource file, one tab-separated record per line
    :return: the resource, as a list of tuples of fields
    """
    records = []
    with codecs.open(resource_file, 'r', 'utf-8') as resource_fh:
        for raw_line in resource_fh:
            records.append(tuple(raw_line.strip().split('\t')))
    return records
def get_tweet_ids(resource):
    """
    Returns all the tweet IDs in the resource.

    :param resource: the resource -- a list of 10-field records where the two
                     tweet IDs are the 1st and 6th fields
    :return: a set of all the tweet IDs in the resource
    """
    # Single pass instead of two full comprehensions that each unpack
    # every 10-tuple just to pull out one field.
    tweet_ids = set()
    for record in resource:
        tweet_ids.add(record[0])  # tweet_id1
        tweet_ids.add(record[5])  # tweet_id2
    return tweet_ids
def expand_resource(resource, sent_by_tweet_id):
    """
    Add the tweets to the resource.

    :param resource: the original resource (without tweets)
    :param sent_by_tweet_id: dictionary of tweet by tweet ID
    :return: the expanded resource (with tweets)
    """
    expanded = []
    for record in resource:
        # Each record holds two 5-field halves; the tweet ID is the
        # first field of each half.
        left, right = record[:5], record[5:]
        # Insert each tweet's text right after its tweet ID
        expanded.append((left[0], sent_by_tweet_id[left[0]]) + left[1:] +
                        (right[0], sent_by_tweet_id[right[0]]) + right[1:])
    return expanded
def save_to_file(dataset, dataset_file):
    """
    Receives a dataset (list of tuples) and a file name and saves the dataset
    in a tab-separated file, one record per line.

    :param dataset: list of tuples of strings
    :param dataset_file: output file name
    :return: None
    """
    with codecs.open(dataset_file, 'w', 'utf-8') as f_out:
        # f_out.write works in both Python 2 and 3, unlike the
        # Python 2-only `print >> f_out` statement.
        for item in dataset:
            f_out.write('\t'.join(item) + '\n')
def clean_tweet(tweet):
    """
    Receives a tweet and removes the hashtags and urls.

    Hashtags at the end of the tweet (or followed only by other hashtags /
    URLs) are dropped; hashtags in the middle of the text are converted to
    regular words by splitting their camel case, e.g.
    "#SyrianRefugees should not be allowed..." -> "syrian refugees should not be allowed..."

    :param tweet: the original tweet
    :return: the cleaned tweet
    """
    text = tweet.lower()
    # Strip a leading retweet marker, e.g. "rt @user : "
    text = re.sub(r'^rt [^\s]+\s?: ', '', text)

    kept = []
    # Walk the tokens right-to-left so we know whether a hashtag is
    # trailing (nothing kept after it yet) or mid-sentence.
    for token in reversed(text.split()):
        if token.startswith('#'):
            if kept:
                # Mid-sentence hashtag - convert it to regular text
                kept.append(camel_case_split(token))
            # else: trailing hashtag (possibly followed only by other
            # hashtags or urls) - drop it
        elif 't.co' in token or 'http' in token:
            # URL - remove
            continue
        elif token.startswith('@'):
            # Mention - remove the @ and use it as a name
            kept.append(token[1:].lower())
        else:
            # normal word
            kept.append(token.lower())
    return ' '.join(reversed(kept))
def camel_case_split(hashtag):
    """
    Convert camel case to regular lower-cased text (for hashtags),
    e.g. "#SyrianRefugees" -> "syrian refugees".

    :param hashtag: the hashtag, including the leading '#'
    :return: the space-separated, lower-cased words
    """
    # Each match consumes up to the next camel-case boundary
    # (lower->Upper, or Upper followed by Upper+lower) or the end.
    boundary_pattern = r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    words = []
    for match in re.finditer(boundary_pattern, hashtag[1:]):
        words.append(match.group(0).lower())
    return ' '.join(words)