generated from wanghaisheng/keywords-topic-tweets-scraper-monitor
-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
twitter_scraper.py
60 lines (47 loc) · 1.96 KB
/
twitter_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import datetime
import os
# import twint
import pathlib
def sns_scrape(keyword,DATE_START,JSON_FILENAME):
    """Scrape tweets for one hashtag with the ``snscrape`` CLI into a JSONL file.

    Runs ``snscrape --jsonl --progress --since DATE_START twitter-hashtag
    keyword`` and stores its standard output in ``<JSON_FILENAME>.json``
    (the file is created or truncated on every call).

    Args:
        keyword: Hashtag / query text handed to the ``twitter-hashtag`` scraper.
        DATE_START: Value for snscrape's ``--since`` option (the caller in this
            file passes a ``YYYY-MM-DD`` string).
        JSON_FILENAME: Output path without extension; ``.json`` is appended.

    Returns:
        None. Failures are reported on stdout and never raised, so a caller
        looping over several keywords keeps going (the previous ``os.system``
        call ignored failures as well).
    """
    # Local import keeps this function self-contained without touching the
    # file-level import block.
    import subprocess

    output_path = f'{JSON_FILENAME}.json'
    # An argument list (no shell) means quotes, ';' or '$(...)' inside the
    # keyword, and spaces in the output path, are passed through literally
    # instead of being interpreted by a shell.
    command = [
        'snscrape', '--jsonl', '--progress',
        '--since', str(DATE_START),
        'twitter-hashtag', str(keyword),
    ]
    # The earlier author's note suggests an end date can be added by appending
    # " until:YYYY-MM-DD" to the query text - not verified here.
    with open(output_path, 'wb') as sink:
        try:
            result = subprocess.run(command, stdout=sink, check=False)
        except FileNotFoundError:
            print('snscrape executable not found; nothing scraped for', keyword)
            return
    if result.returncode != 0:
        print('snscrape exited with status', result.returncode, 'for', keyword)
def scrape_twint():
    """Run a twint search for "#depression" and store the results as CSV.

    The search window starts at midnight of the previous day; results are
    written to ``test3.csv`` with location, image and full-user data enabled
    and twint's debug mode switched on.

    NOTE(review): ``import twint`` is commented out at the top of this file,
    so unless ``twint`` is bound some other way, calling this function fails
    with ``NameError`` on its first statement. Nothing in this file calls it.
    """
    config = twint.Config()

    # Midnight today, then one day back -> start of yesterday.
    midnight_today = datetime.datetime.today().replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    config.Since = str(midnight_today - datetime.timedelta(days=1))

    config.Search = "#depression"
    config.Location = True
    config.Images = True
    config.User_full = True

    # Output: CSV file, with debug output enabled.
    config.Store_csv = True
    config.Output = "test3.csv"
    config.Debug = True

    # Options the earlier author left disabled: Until, Username, Limit and
    # Custom["user"] (a column selection).
    twint.run.Search(config)
def parse_keywords(raw):
    """Split a comma-separated hashtag string into a clean list of keywords.

    Whitespace around each entry is stripped and empty entries (stray or
    trailing commas, blank input) are dropped.

    Args:
        raw: The raw comma-separated string, e.g. ``"python, ai,"``.

    Returns:
        A list of non-empty keyword strings in their original order.
    """
    return [entry.strip() for entry in raw.split(',') if entry.strip()]


if __name__ == "__main__":
    # Both settings come from environment variables; treat an unset variable
    # like an empty one so a readable message is produced instead of an
    # AttributeError from calling .strip() on None.
    hashtags = (os.getenv('hashtags') or '').strip()
    outdir = (os.getenv('outdir') or '').strip()
    print('input hashtags ', hashtags)

    keywords = parse_keywords(hashtags)
    print('keywords queues ', keywords)
    if not keywords:
        raise SystemExit("no hashtags given: set the 'hashtags' environment variable")

    # Every '/' is removed from outdir (unchanged from the earlier behaviour),
    # so nested or absolute paths collapse into a single path component.
    # NOTE(review): confirm this is intended rather than "strip a trailing slash".
    outdir_name = outdir.replace('/', '')
    if not outdir_name:
        # An empty name used to resolve to the filesystem root ("/").
        raise SystemExit("no output directory given: set the 'outdir' environment variable")

    today = datetime.datetime.today().date()
    DATE_START = str(today - datetime.timedelta(days=1))
    DATA_PATH = pathlib.Path(outdir_name)
    DATA_PATH.mkdir(parents=True, exist_ok=True)

    # sns_scrape appends ".json" to the name it is given, so the combined
    # result for the day lives at <outdir>/<today>.json as before.
    JSON_FILENAME = DATA_PATH / str(today)
    combined_path = pathlib.Path(f'{JSON_FILENAME}.json')

    # Each keyword is scraped into its own part file and then appended to the
    # combined file. Previously every keyword was redirected into the same
    # file, so only the last keyword's tweets survived. The output is JSONL
    # (one JSON document per line), so plain concatenation stays valid.
    with open(combined_path, 'wb') as combined:
        for index, keyword in enumerate(keywords):
            part_stem = DATA_PATH / f'{today}.part{index}'
            part_path = pathlib.Path(f'{part_stem}.json')
            sns_scrape(keyword, DATE_START, part_stem)
            if part_path.exists():
                with open(part_path, 'rb') as part:
                    combined.writelines(part)
                part_path.unlink()
# reference
# https://betterprogramming.pub/how-to-scrape-tweets-with-snscrape-90124ed006af
# https://github.com/hansheng0512/tweets-scrapping-using-python
# https://github.community/t/can-github-actions-directly-edit-files-in-a-repository/17884/7