In [1]:
import ast
import datetime
import json
import os
import re
import time

import pandas as pd
import requests

In [None]:
def create_url(QUERY, MAX_RESULTS):
    """Build the request URL for the Twitter API v2 recent-search endpoint.

    Args:
        QUERY: Search query text. It is inserted verbatim after ``query=``;
            no URL-encoding is applied here.
        MAX_RESULTS: A complete query-string fragment such as
            ``"max_results=100"``, appended verbatim.

    Returns:
        The full request URL as a string.
    """
    # Original author's note on the intended query conditions: contains the
    # given word, excludes retweets, and excludes tweets from users that
    # appear to be bots.
    template = "{endpoint}?query={query}&{fields}&{limit}"
    return template.format(
        endpoint="https://api.twitter.com/2/tweets/search/recent",
        query=QUERY,
        fields="tweet.fields=author_id,id,text,created_at",
        limit=MAX_RESULTS,
    )


In [None]:
def create_headers(bearer_token):
    """Return the HTTP headers carrying the bearer-token authorization.

    Args:
        bearer_token: Token placed after the ``Bearer `` prefix.

    Returns:
        A dict with a single ``Authorization`` entry.
    """
    authorization = "Bearer {}".format(bearer_token)
    return dict(Authorization=authorization)

In [None]:
def connect_to_endpoint(url, headers, timeout=30):
    """Send a GET request and return the decoded JSON body.

    Args:
        url: Full request URL.
        headers: HTTP headers to send with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The response body parsed by ``response.json()``.

    Raises:
        Exception: With ``(status_code, response_text)`` as arguments when
            the server answers with any status other than 200.
    """
    response = requests.request("GET", url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()


In [None]:
def get_tweet(BEARER_TOKEN, MAX_RESULTS, QUERY):
    """Run one search request and return the response as plain Python data.

    Args:
        BEARER_TOKEN: Token passed to ``create_headers``.
        MAX_RESULTS: ``max_results=N`` fragment passed to ``create_url``.
        QUERY: Query text passed to ``create_url``.

    Returns:
        The decoded JSON response, with the keys of every object in sorted
        order.

    Raises:
        Exception: Propagated from ``connect_to_endpoint`` on a non-200
            response.
    """
    url = create_url(QUERY, MAX_RESULTS)
    headers = create_headers(BEARER_TOKEN)
    json_response = connect_to_endpoint(url, headers)
    # Normalise through a JSON round trip so keys come back sorted. Parsing
    # with json.loads (rather than evaluating the dumped text as a Python
    # literal) handles true/false/null and decodes surrogate-pair escapes
    # (e.g. emoji) into single characters.
    return json.loads(json.dumps(json_response, sort_keys=True))

In [None]:
def utc_to_jst(timestamp_utc):
    """Convert a UTC timestamp string to Japan Standard Time (UTC+9).

    Args:
        timestamp_utc: Timestamp formatted as ``YYYY-MM-DD HH:MM:SS.ffffff``
            (fractional seconds required), interpreted as UTC.

    Returns:
        The same instant in JST formatted as ``YYYY-MM-DD HH:MM:SS``.

    Raises:
        ValueError: If the input does not match the expected format.
    """
    jst = datetime.timezone(datetime.timedelta(hours=9))
    # Appending an explicit zero offset lets %z produce an aware UTC datetime.
    aware_utc = datetime.datetime.strptime(
        timestamp_utc + "+0000", "%Y-%m-%d %H:%M:%S.%f%z"
    )
    return aware_utc.astimezone(jst).strftime('%Y-%m-%d %H:%M:%S')


In [None]:
def shape_data(data):
    """Clean tweet records in place and convert their timestamps to JST.

    Args:
        data: List of dicts, each with a ``'text'`` and a ``'created_at'``
            entry. ``'created_at'`` is assumed to look like
            ``YYYY-MM-DDTHH:MM:SS.fffZ`` -- TODO confirm against the API.

    Returns:
        The same list object, with every record's ``'text'`` and
        ``'created_at'`` overwritten. Hashtags are left in the text.
    """
    for tweet in data:
        text = tweet['text']
        # Remove t.co short URLs that follow a half- or full-width space.
        text = re.sub(r'[ 　]https://t\.co/[a-zA-Z0-9]+', '', text)
        # Remove @user names that are followed by a half- or full-width space.
        text = re.sub(r'[ 　]?@[a-zA-Z0-9_]+[ 　]', '', text)
        # Strip emoji: any character cp932 cannot represent is dropped.
        text = text.encode('cp932', errors='ignore').decode('cp932')
        # Remove full-width spaces, tabs and newlines.
        text = re.sub(r"[\u3000\t\n]", "", text)
        tweet['text'] = text
        # Turn the 'T' separator into a space and drop the final character
        # before converting from UTC to JST.
        tweet['created_at'] = utc_to_jst(tweet['created_at'].replace('T', ' ')[:-1])
    return data

In [None]:
# Bearer token for the Twitter API. It is read from the TWITTER_BEARER_TOKEN
# environment variable so the secret does not have to be stored in the
# notebook; the placeholder is only a fallback and must be replaced before
# real requests can succeed.
BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN", "[BEARER_TOKEN]")
MAX_RESULTS = "max_results=100"  # A number between 10 and 100.

# Search words; each one is paired by position with an entry of
# QUERY_CONDITIONS.
TARGET_WORDS = [
    "トヨタ",
    "日産"
]
# Query operators appended to the corresponding search word.
QUERY_CONDITIONS = [
    " -is:retweet -(from:HOGE OR from:FUGA)",
    " -is:retweet -(from:FOO OR from:BAR)"
]
# The two lists are combined with zip(), which silently truncates to the
# shorter one, so fail fast if they get out of sync.
assert len(TARGET_WORDS) == len(QUERY_CONDITIONS), (
    "TARGET_WORDS and QUERY_CONDITIONS must have the same length"
)

# Accumulators filled while fetching: collected tweets, total tweet count,
# and the number of requests sent since the last rate-limit pause.
df = pd.DataFrame()
iterator, request_iterator = 0, 0


In [None]:
# Call the API for every (target word, query condition) pair, following
# pagination until no next page is reported.
collected_frames = []
try:
    for target_word, query_ in zip(TARGET_WORDS, QUERY_CONDITIONS):
        # None marks the first page for this word. Tracking the token
        # explicitly means a stale response left over from an earlier run
        # cannot leak into the first request.
        next_token = None
        while True:
            # Pause before exceeding 180 requests, and only when another
            # request is actually about to be sent.
            if request_iterator >= 180:
                print('180リクエストを超えるため、15分間停止します...')
                time.sleep(15.01*60)  # 15 minutes plus a margin of just under one second
                request_iterator = 0

            # From the second page on, the token is appended to the query
            # text, which create_url inserts verbatim into the URL.
            if next_token is None:
                query = query_
            else:
                query = query_ + '&next_token=' + next_token
            QUERY = '{}{}'.format(target_word, query)

            response = get_tweet(BEARER_TOKEN, MAX_RESULTS, QUERY)
            request_iterator += 1
            meta = response.get('meta', {})

            # Guard against a response without a 'data' entry (presumably a
            # page with zero results) instead of failing with KeyError.
            tweets = response.get('data', [])
            if tweets:
                temp_df = pd.DataFrame(shape_data(tweets))
                temp_df[target_word] = True  # flag column named after the search word
                collected_frames.append(temp_df)
            iterator += meta.get('result_count', len(tweets))

            next_token = meta.get('next_token')
            if next_token is None:  # no further page for this word
                break
finally:
    # Concatenate once rather than on every page; doing it in `finally`
    # keeps the pages fetched so far even if a request fails midway.
    if collected_frames:
        df = pd.concat([df, *collected_frames])


In [None]:
# Report how many tweets were fetched, then renumber the rows and persist
# the collected frame as a pickle file.
print('{}件取得しました。'.format(iterator))
df.reset_index(inplace=True, drop=True)
df.to_pickle('./raw_tweetlog.pkl')