# Twitterスクレイピング

まずはログイン処理を行う。

In [1]:
import os
import json
from twitter.scraper import Scraper

# Location of the saved login cookies (a JSON file, not a pickle).
# NOTE: os.path.dirname("__file__") is applied to the *string literal*
# "__file__", which has no directory component and therefore yields ''.
# The resulting path is '../tmp/cookies.json' relative to the current working
# directory; it is not an absolute path.
cookies_path = os.path.join(os.path.dirname("__file__"), '..', 'tmp', 'cookies.json')

# Load previously saved cookies if the file exists. Loading stays best-effort:
# a missing file is the normal "no saved session yet" case and is ignored
# silently, while an unreadable or malformed file is reported instead of being
# swallowed by a bare `except`.
cookies = None
try:
    with open(cookies_path, 'r', encoding='utf-8') as f:
        cookies = json.load(f)
except FileNotFoundError:
    pass
except (OSError, ValueError) as exc:
    # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
    print(f"Could not load cookies from {cookies_path}: {exc!r}")

# Build the scraper only when cookies were loaded. `scraper` is always bound
# (None when there are no cookies) so later cells can test it rather than
# failing with a NameError.
scraper = Scraper(cookies=cookies) if cookies else None


In [2]:
from automation import ThrottleSearch

# Search client built from the cookies loaded in the first cell.
search = ThrottleSearch(cookies=cookies)

# Author's note (translated): the upper limit is 50 x 20 requests per
# 15 minutes.
# Every query uses the 'Latest' category together with the
# `min_faves:1000 lang:ja` search operators.
search_queries = [
    {'category': 'Latest', 'query': 'フリーランス min_faves:1000 lang:ja'},
    {'category': 'Latest', 'query': 'エンジニア min_faves:1000 lang:ja'},
    {'category': 'Latest', 'query': 'filter:follows min_faves:1000 lang:ja'},
]

# `res` is consumed by the next cell as one iterable of entries per query.
res = search.run(queries=search_queries, limit=200, retries=3)

In [3]:
# Keys followed, in order, from a search entry down to its tweet payload.
_RESULT_PATH = ('content', 'itemContent', 'tweet_results', 'result')


def _tweet_to_row(entry):
    """Flatten one search entry into a flat dict suitable for a CSV row.

    Args:
        entry: One element of a per-query result list. It is expected to be a
            dict carrying the tweet payload under
            content -> itemContent -> tweet_results -> result, plus a 'query'
            key naming the search that produced it.

    Returns:
        A dict of selected tweet and author fields, or None when the entry has
        no payload at that path or its '__typename' is not 'Tweet'.

    Raises:
        KeyError: If an entry that *is* a 'Tweet' lacks one of the fields read
            below; that is left loud on purpose because it would mean the
            response schema changed.
    """
    result = entry
    for key in _RESULT_PATH:
        if not isinstance(result, dict):
            return None
        result = result.get(key)

    # Skip entries without a payload instead of raising KeyError and losing
    # every row collected so far.
    # NOTE(review): presumably such entries are unavailable/placeholder
    # tweets -- confirm against real responses.
    if not isinstance(result, dict) or result.get('__typename') != 'Tweet':
        return None

    tweet_legacy = result['legacy']
    user_legacy = result['core']['user_results']['result']['legacy']

    # Key order is kept as-is because it determines the CSV column order.
    return {
        # search query that produced this entry
        'query': entry['query'],
        # tweet id
        'tweet_id': result['rest_id'],
        # author's screen name
        'user_name': user_legacy['screen_name'],
        # author's profile description
        'user_description': user_legacy['description'],
        # follower count
        'user_followers_count': user_legacy['followers_count'],
        # following count
        'user_friends_count': user_legacy['friends_count'],
        # full tweet text
        'text': tweet_legacy['full_text'],
        # hashtag texts
        'hashtags': [hashtag['text'] for hashtag in tweet_legacy['entities']['hashtags']],
        # posted date
        'created_at': tweet_legacy['created_at'],
        # favorite count
        'favorite_count': tweet_legacy['favorite_count'],
        # retweet count
        'retweet_count': tweet_legacy['retweet_count'],
        # reply count
        'reply_count': tweet_legacy['reply_count'],
    }


# Flatten every entry of every query's result list, keeping only real tweets.
tweets = []

for query_result in res:
    for tweet in query_result:
        row = _tweet_to_row(tweet)
        if row is not None:
            tweets.append(row)

In [4]:
# dump tweets to csv
import pandas as pd
# Destination CSV. os.path.dirname("__file__") is applied to the string
# literal "__file__" and yields '', so this resolves to '../tweets.csv'
# relative to the current working directory.
output_path = os.path.join(os.path.dirname("__file__"), '..', 'tweets.csv')

# One row per collected tweet dict; columns follow the dict key order.
df = pd.DataFrame(data=tweets)

# Write without the DataFrame index column.
df.to_csv(path_or_buf=output_path, index=False)