In [None]:
import datetime
import os
import re
import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import schedule
import tweepy
from nltk.stem.porter import PorterStemmer as PS
from pylab import rcParams
from twarc import Twarc2, expansions
from wordcloud import WordCloud

rcParams['figure.figsize'] = 12, 8

In [None]:
# Load the API credentials from a local CSV file; the next cell reads named
# columns from its first row.
config = pd.read_csv("./config_academic.csv")

In [None]:
# Twitter API credentials, taken from the row labelled 0 of the config table.
twitterApiKey = config.at[0, 'twitterApiKey']
twitterApiSecret = config.at[0, 'twitterApiSecret']
twitterApiAccessToken = config.at[0, 'twitterApiAccessToken']
twitterApiAccessTokenSecret = config.at[0, 'twitterApiAccessTokenSecret']
twitterAPIBearerToken = config.at[0, 'twitterApiBearerToken']

In [None]:
# Authenticate.
# OAuth 1.0a user-context handler built from the consumer key/secret and the
# access token/secret loaded from the config file.
auth = tweepy.OAuthHandler(twitterApiKey, twitterApiSecret)
auth.set_access_token(twitterApiAccessToken, twitterApiAccessTokenSecret)

# Twitter API v1.1 wrapper. The search cells further down call
# `api.search_full_archive` / `api.search`, so `api` has to exist.
api = tweepy.API(auth, wait_on_rate_limit=True)

# Twitter API v2 client. The first positional parameter of tweepy.Client is
# the bearer token, not an auth handler, so every credential is passed by
# keyword.
client = tweepy.Client(
    bearer_token=twitterAPIBearerToken,
    consumer_key=twitterApiKey,
    consumer_secret=twitterApiSecret,
    access_token=twitterApiAccessToken,
    access_token_secret=twitterApiAccessTokenSecret,
    wait_on_rate_limit=True,
)

In [None]:
# Fetch tweets matching a keyword via the premium full-archive search.
search_word = 'btc'
env = 'sentienv'        # dev environment label of the premium search endpoint
since = '202105010000'  # 12-digit timestamps; presumably YYYYMMDDhhmm in UTC — confirm
until = '202105020000'
numTweets = 500000      # upper bound on the number of tweets to iterate over

# tweepy 4 (required by tweepy.Client, used earlier in this notebook) names the
# environment parameter `label`; the old `environment_name` keyword raises a
# TypeError there. The cursor is lazy: no request is sent until it is iterated.
tweets = tweepy.Cursor(
    api.search_full_archive,
    label=env,
    query=search_word,
    fromDate=since,
    toDate=until,
).items(numTweets)


In [None]:
# Rows of tweet data: one list of user and engagement fields per tweet.
t_data = []
for tweet in tweets:
    t_data.append([
        tweet.user.name,
        '@' + tweet.user.screen_name,
        tweet.user.friends_count,
        tweet.user.followers_count,
        tweet.text.replace('\n', ''),  # keep each tweet on a single line
        tweet.favorite_count,
        tweet.retweet_count,
        tweet.created_at,
    ])

# Report how many tweets were collected.
print("{}件収集\n".format(len(t_data)))

In [None]:
# Tabulate the collected rows; column order matches the order of the fields
# appended to each row of t_data.
df = pd.DataFrame(
    t_data,
    columns=[
        'Name',
        'User Name',
        'Following',
        'Followers',
        'Tweet',
        'Favorite',
        'RT',
        'created at',
    ],
)
df.head(10)

In [None]:
# Preview the last ten collected rows.
df.tail(10)

In [None]:
# Inspect the raw objects returned by the full-archive search for a few results.
# tweepy 4 names the environment parameter `label` (formerly `environment_name`).
for tweet in tweepy.Cursor(api.search_full_archive, label=env, query='緊急事態宣言').items(10):
    print(tweet)


In [None]:
# Write the collected tweets to test.csv in the working directory.
df.to_csv("test.csv")

In [None]:
# Lemmatization: download the English stanza model and run the default
# pipeline once on a sample sentence.
import stanza
stanza.download('en') # download English model
nlp = stanza.Pipeline('en') # initialize English neural pipeline
doc = nlp("Barack Obama was born in Hawaii.") # run annotation over a sentence

In [None]:
import stanza

# Pipeline limited to the processors listed here, ending with the lemmatizer.
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
doc = nlp('Her bags are better than his.')

# Print one "word / lemma" line per word across all sentences.
lines = []
for sent in doc.sentences:
    for word in sent.words:
        lines.append(f'word: {word.text+" "}\tlemma: {word.lemma}')
print('\n'.join(lines))

In [None]:
# Lemmatization: build the English pipeline that lemma() reads as the
# module-level name `nlp`.
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')

def lemma(txt):
    """Return the lemmas of all words in ``txt`` as one string.

    The text is run through the module-level ``nlp`` pipeline. Every word's
    lemma is followed by a single space, so a non-empty result ends with a
    trailing space and input with no words yields ``''``.
    """
    parsed = nlp(txt)
    return ''.join(
        token.lemma + ' '
        for sentence in parsed.sentences
        for token in sentence.words
    )

In [None]:
# Add a lemmatized copy of every tweet, overwrite test.csv with the extended
# table, and preview the first rows.
df['exTweet'] = df['Tweet'].map(lemma)
df.to_csv("test.csv")
df.head(10)

In [None]:
import time
import schedule
import datetime

def job():
    """Collect up to 10 English 'btc' tweets for one past day and save them as CSV.

    The search window runs from five days before today to four days before
    today. Results are written to ``./tweet-of-btc/<from>-<to>.csv`` (the
    directory is created when missing) and the number of collected tweets is
    printed. Relies on the module-level ``api`` object for the search call.
    """
    # Search query and filter operators.
    s = 'btc min_faves:5 -filter:retweets'

    # Dates five and four days before today.
    dt = datetime.date.today()
    dt5 = dt - datetime.timedelta(days=5)
    dt4 = dt - datetime.timedelta(days=4)

    # tweepy 4 renamed API.search to API.search_tweets; use whichever exists.
    search = getattr(api, 'search_tweets', None)
    if search is None:
        search = api.search

    # NOTE(review): `since` and `exclude_replies` do not appear to be documented
    # parameters of the standard search endpoint — confirm they take effect.
    tweets = tweepy.Cursor(
        search,
        q=s,
        exclude_replies=True,
        tweet_mode='extended',
        lang='en',
        since=dt5,
        until=dt4,
    ).items(10)

    # One row per tweet: user fields, single-line text, engagement counts, timestamp.
    t_data = [
        [
            tweet.user.name,
            '@' + tweet.user.screen_name,
            tweet.user.friends_count,
            tweet.user.followers_count,
            tweet.full_text.replace('\n', ''),
            tweet.favorite_count,
            tweet.retweet_count,
            tweet.created_at,
        ]
        for tweet in tweets
    ]

    # Write the unprocessed tweets to disk.
    df = pd.DataFrame(
        data=t_data,
        columns=['Name', 'User Name', 'Following', 'Followers',
                 'Tweet', 'Favorite', 'RT', 'created at'],
    )

    # to_csv does not create missing directories, so make sure the target exists.
    out_dir = './tweet-of-btc'
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(f'{out_dir}/{dt5}-{dt4}.csv')

    # Report the number of collected tweets.
    print('{} ~ {}：{}件\n'.format(dt5, dt4, len(t_data)))

In [None]:
def main():
    """Register job() to run every minute and poll the scheduler forever.

    This function never returns: the loop below has no exit condition, so it
    runs until the process (or notebook cell) is interrupted.
    """
    schedule.every(1).minutes.do(job)

    # Check once per second whether a scheduled run is due.
    while True:
        schedule.run_pending()
        time.sleep(1)
        
# Start the scheduler loop when this code runs as the main module.
if __name__ == "__main__":
    main()