In [30]:
import json
import tweepy


In [31]:
def load_token(path):
    """Read the four Twitter API credentials from a JSON file.

    Args:
        path: Path of a JSON file containing the keys "API Key",
            "API Secret Key", "Access Token" and "Access Token Secret".

    Returns:
        A 4-tuple ``(api_key, api_secret, access_token, access_token_secret)``
        in exactly that order.

    Raises:
        KeyError: If one of the four keys is missing from the file.
    """
    with open(path, "r") as handle:
        credentials = json.load(handle)

    # Order matters: callers unpack the result positionally.
    wanted_fields = (
        "API Key",
        "API Secret Key",
        "Access Token",
        "Access Token Secret",
    )
    return tuple(credentials[field] for field in wanted_fields)

In [32]:
# Location of the JSON file holding the Twitter API credentials (relative to
# the notebook's working directory).
path = "../../corpus/CorpusMakerAI.json"
# Unpack the four credential values into module-level names; they are used
# further down to build the tweepy auth handler.
AKey, ASec, AcToken, AcTokenSec = load_token(path)

In [33]:
class Tweet:
    """Plain snapshot of the fields of a status object that this notebook uses.

    Copies the reply target id, text and creation time from ``status`` and the
    screen name, display name and id from ``status.user``.
    """

    def __init__(self, status):
        # Fields taken directly from the status.
        self.in_reply_to_status_id = status.in_reply_to_status_id
        self.text = status.text
        self.created_at = status.created_at

        # Fields taken from the status' author.
        author = status.user
        self.screen_name = author.screen_name
        self.username = author.name
        self.user_id = author.id

In [34]:
def _in(arg1, arg2, mode="or"):
    """Test whether ``arg1`` (or its elements) match ``arg2``.

    A single candidate matches when:
      * it is a ``str`` and is contained in ``arg2`` (``candidate in arg2``), or
      * it is not a ``str`` and equals ``arg2`` (``candidate == arg2``).

    Args:
        arg1: A single candidate, or a ``list`` of candidates.
        arg2: The value candidates are tested against.
        mode: Only used when ``arg1`` is a list. ``"and"`` requires every
            element to match; any other value (default ``"or"``) requires at
            least one element to match.

    Returns:
        bool: ``True`` on a match, ``False`` otherwise. For an empty list the
        result is ``True`` in "and" mode and ``False`` in "or" mode.

    Note:
        Previously "and" mode silently skipped non-string elements, so e.g.
        ``_in([1, 2], 3, mode="and")`` returned ``True``. Non-string elements
        are now compared by equality in both modes.
    """
    def _matches(candidate):
        # Strings are tested for containment, everything else for equality.
        if isinstance(candidate, str):
            return candidate in arg2
        return candidate == arg2

    if not isinstance(arg1, list):
        return bool(_matches(arg1))

    if mode == "and":
        return all(_matches(element) for element in arg1)
    # Any mode other than "and" behaves as "or".
    return any(_matches(element) for element in arg1)

In [111]:
import spacy
import re
from datetime import timedelta
import os

class StreamListener(tweepy.StreamListener):
    """Stream listener that collects (tweet, reply) pairs into a JSON corpus.

    Incoming statuses that are replies and pass ``is_invalid_tweet`` are
    buffered. Once ``LOOKUP_BATCH_SIZE`` replies are buffered, the tweets they
    reply to are fetched with ``api.statuses_lookup``; every fetched tweet that
    passes the same filters and was written by a different user than the reply
    is stored together with the reply in ``self.conversations["convs"]``.

    Nothing is written to disk until ``save_conv`` is called.
    """

    # Number of buffered replies that triggers one statuses_lookup call.
    # Kept well below the 100-id limit of the lookup endpoint (see Twitter
    # API reference for GET statuses/lookup).
    LOOKUP_BATCH_SIZE = 50
    # Tweets whose text (mentions removed) is longer than this are rejected.
    MAX_TEXT_LENGTH = 30
    # Tweets that tokenize into fewer tokens than this are rejected.
    MIN_TOKEN_COUNT = 7

    def __init__(self, api, corpus_path, name):
        """Set up the tokenizer, the lookup buffers and the corpus.

        Args:
            api: Object providing ``statuses_lookup(ids)``; in this notebook a
                ``tweepy.API`` instance.
            corpus_path: Directory that holds the corpus file.
            name: File name of the corpus inside ``corpus_path``. If the file
                exists it is loaded, otherwise an empty corpus is started.
        """
        super(StreamListener, self).__init__()
        self.nlp = spacy.load('ja_ginza')
        # A tweet must contain at least one token whose tag includes one of
        # these strings (see _is_contain_POS).
        self.filter_pos = ["名詞", "動詞", "形状詞", "形容詞"]

        self.api = api
        # Ids of the tweets being replied to, pending lookup.
        self.lookup_ids = []
        # Maps a pending parent tweet id to the buffered reply (a Tweet).
        self.reply_dict = dict()

        self.corpus_path = corpus_path
        self.name = name

        # os.path.join works with or without a trailing separator on
        # corpus_path, and isfile() does not raise when the directory is
        # missing (os.listdir did).
        corpus_file = os.path.join(corpus_path, name)
        if os.path.isfile(corpus_file):
            # The corpus holds non-ASCII text (written with
            # ensure_ascii=False), so the encoding is pinned instead of
            # relying on the locale default.
            with open(corpus_file, "r", encoding="utf-8") as f:
                self.conversations = json.load(f)
            print("success load:", name)
        else:
            self.conversations = {"convs": []}

        # Number of statuses received so far.
        self.counts = 0

    def on_status(self, status):
        """Buffer a qualifying reply and, when the buffer is full, pair replies with their parents.

        Args:
            status: The incoming status object from the stream.
        """
        self.counts += 1

        # Not a reply -> skip.
        if self.is_status_tweet(status):
            return

        # Rejected by the content filters -> skip.
        if self.is_invalid_tweet(status):
            return

        self.lookup_ids.append(status.in_reply_to_status_id)
        self.reply_dict[status.in_reply_to_status_id] = Tweet(status)
        print(".", end='', flush=True)

        if len(self.lookup_ids) < self.LOOKUP_BATCH_SIZE:
            return

        print("\nCalling statuses_lookup API...")
        try:
            # Fetch the tweets that the buffered replies respond to.
            parents = self.api.statuses_lookup(self.lookup_ids)

            for parent in parents:
                # The parent itself must be a reply and pass the same filters.
                if self.is_status_tweet(parent):
                    continue

                if self.is_invalid_tweet(parent):
                    continue

                reply = self.reply_dict.get(parent.id)
                if reply is None:
                    # Lookup returned a tweet we did not ask for; ignore it.
                    continue

                # Skip users replying to themselves.
                if parent.user.id == reply.user_id:
                    continue

                self.add_conversation(parent, reply)
                self.print_conversation(parent, reply)
        finally:
            # Always start a fresh batch, even when the lookup failed.
            # Otherwise the pending list would keep growing on every later
            # call and eventually exceed what one lookup request accepts.
            self.lookup_ids = []
            self.reply_dict = {}

    def is_status_tweet(self, status):
        """Return True when ``status`` is a plain tweet, i.e. not a reply."""
        return status.in_reply_to_status_id is None

    def is_invalid_tweet(self, status):
        """Return True when ``status`` should not be used for the corpus.

        A status is rejected when any of the following holds:
          * its language is not "ja",
          * the author's screen name contains "bot",
          * the text contains an http(s) URL,
          * the text contains a hashtag,
          * it mentions more than one user (as whitespace-separated tokens),
          * its text without mentions is longer than ``MAX_TEXT_LENGTH``,
          * ``_is_contain_POS`` rejects the text.
        """
        # Japanese only.
        if status.lang != "ja":
            return True

        if "bot" in status.user.screen_name:
            return True

        if re.search(r"https?://", status.text):
            return True

        # Hashtags.
        if re.search(r"#(\w+)", status.text):
            return True

        # Replying to several users at once?
        tweet = re.sub(r"@([A-Za-z0-9_]+)", "<unk>", status.text)
        if tweet.split().count("<unk>") > 1:
            return True

        # Too long?
        if len(tweet.replace("<unk>", "")) > self.MAX_TEXT_LENGTH:
            return True

        # Must contain a content word and enough tokens; both checks live in
        # _is_contain_POS. Note it receives the raw text, mentions included.
        if not self._is_contain_POS(status.text):
            return True

        return False

    def _is_contain_POS(self, text):
        """Return True when ``text`` is long enough and contains a content word.

        The text is tokenized with ``self.nlp`` (NER disabled). It is rejected
        when it has fewer than ``MIN_TOKEN_COUNT`` tokens. Otherwise it is
        accepted as soon as one token's tag contains an entry of
        ``self.filter_pos`` and does not contain "助動詞".
        """
        doc = self.nlp(text, disable=['ner'])
        if len(doc) < self.MIN_TOKEN_COUNT:
            return False
        for token in doc:
            if "助動詞" not in token.tag_ and _in(self.filter_pos, token.tag_):
                return True
        return False

    def cleanup_text(self, status):
        """Return the text of ``status`` normalized for the corpus.

        Removes "@name " mentions (only when followed by a space), collapses
        whitespace runs into single spaces, strips the ends and un-escapes the
        ``&gt;``, ``&lt;`` and ``&amp;`` entities.
        """
        text = re.sub(r"@([A-Za-z0-9_]+) ", "", status.text)
        # Raw string: "\s" in a normal string literal is an invalid escape.
        text = re.sub(r"\s+", ' ', text).strip()
        # "&amp;" is replaced last so "&amp;gt;" becomes "&gt;", not ">".
        return text.replace("&gt;", ">").replace("&lt;", "<").replace("&amp;", "&")

    def print_conversation(self, reply1, reply2):
        """Print one conversation pair.

        Args:
            reply1: The parent status (reads ``user.screen_name``,
                ``created_at`` and ``text``).
            reply2: The buffered reply, a ``Tweet`` (reads ``screen_name``,
                ``created_at`` and ``text``).
        """
        # Timestamps are shifted by +9 hours for display
        # (presumably UTC -> JST; confirm against the API's time zone).
        jst_offset = timedelta(hours=9)
        print('------------ 会話 ------------')
        print("reply1:@{}({}): {}".format(
            reply1.user.screen_name,
            reply1.created_at + jst_offset,
            reply1.text)
        )
        print("reply2:@{}({}): {}".format(
            reply2.screen_name,
            reply2.created_at + jst_offset,
            reply2.text)
        )
        print('-'*30)

    def add_conversation(self, reply1, reply2):
        """Append the cleaned texts of a (parent, reply) pair to the corpus.

        Both arguments only need a ``text`` attribute.
        """
        data = {
            "rep1": self.cleanup_text(reply1),
            "rep2": self.cleanup_text(reply2),
        }
        self.conversations["convs"].append(data)

    def save_conv(self):
        """Write ``self.conversations`` to the corpus file as UTF-8 JSON."""
        if self.corpus_path:
            # __init__ tolerates a missing directory, so create it here.
            os.makedirs(self.corpus_path, exist_ok=True)
        corpus_file = os.path.join(self.corpus_path, self.name)
        # ensure_ascii=False emits raw Japanese/emoji characters, which the
        # locale default encoding may not be able to represent.
        with open(corpus_file, "w", encoding="utf-8") as f:
            json.dump(self.conversations, f, ensure_ascii=False, indent=4)


In [112]:
# Build the tweepy auth handler from the credentials loaded earlier:
# consumer key/secret first, then the access token pair.
auth = tweepy.OAuthHandler(AKey, ASec)
auth.set_access_token(AcToken, AcTokenSec)
# API client handed to StreamListener, which calls statuses_lookup on it.
api = tweepy.API(auth)

In [113]:
# Directory and file name of the conversation corpus that the listener loads
# on construction and writes in save_conv().
out_path = "../../corpus/twitter/"
name = "conv.json"
listener = StreamListener(api, out_path, name)
# Stream object that delivers statuses to the listener.
streaming = tweepy.Stream(auth, listener)

# Quick manual check of the content-word filter on a sample sentence.
listener._is_contain_POS("観光地の食べ物は高いですもんね")

success load: conv.json


True

In [114]:
import time

# Pause between reconnect attempts, in seconds. The pause doubles after each
# consecutive failure (up to the maximum) so that a persistent error does not
# turn into a tight reconnect loop against the streaming endpoint.
RECONNECT_DELAY_MIN = 5
RECONNECT_DELAY_MAX = 320

reconnect_delay = 0  # no pause before the very first connection
connected_at = time.monotonic()
while True:
    try:
        # Sleeping inside the try block lets an interrupt during the pause
        # take the same clean exit path as an interrupt while streaming.
        if reconnect_delay:
            time.sleep(reconnect_delay)
        connected_at = time.monotonic()
        streaming.sample()
        # sample() returned without raising: reconnect after the minimum pause.
        reconnect_delay = RECONNECT_DELAY_MIN
    except KeyboardInterrupt:
        # Manual stop (Ctrl-C / "Interrupt Kernel"): close the stream and leave.
        streaming.disconnect()
        break
    except Exception as e:
        streaming.disconnect()
        print(e)
        if time.monotonic() - connected_at > RECONNECT_DELAY_MAX:
            # The stream had been up for a while, so treat this as a fresh
            # failure rather than continuing to escalate the pause.
            reconnect_delay = RECONNECT_DELAY_MIN
        else:
            reconnect_delay = min(
                max(reconnect_delay * 2, RECONNECT_DELAY_MIN),
                RECONNECT_DELAY_MAX,
            )

..................................................
Calling statuses_lookup API...
------------ 会話 ------------
reply1:@majokko_sary(2021-08-28 22:21:13): @conveniteacher ゲラッゲラッゲラッしか分からないから、勉強してきます！！笑
reply2:@conveniteacher(2021-08-28 22:22:21): @majokko_sary 古すぎたかあー😁
------------------------------
------------ 会話 ------------
reply1:@yadoramu0312(2021-08-28 17:22:37): @m8BoFMJKzQV7wHP 勇者組む予定だったんですね😓
reply2:@m8BoFMJKzQV7wHP(2021-08-28 22:22:31): @yadoramu0312 全部組みます(꒪˙꒳˙꒪ )👍
------------------------------
------------ 会話 ------------
reply1:@48HyuN(2021-08-28 21:12:07): @N4659S キッズルーム広いなぁ。
今度いってみよーっと。
reply2:@N4659S(2021-08-28 22:22:36): @48HyuN 広いけど何もないんよ🤣
でも快適！❤️
------------------------------
------------ 会話 ------------
reply1:@__msy97(2021-08-28 22:11:37): @ini__kimu さか というのお好きなように🙆🏻⸝⋆
reply2:@ini__kimu(2021-08-28 22:22:39): @__msy97 さかちゃんて呼びますね🎶
------------------------------
------------ 会話 ------------
reply1:@dokurin08686346(2021-08-28 22:21:32): @ghcFf47Rfqg9O8o 明日もお仕事なのね🥺
おやす

In [110]:
# Inspect the conversations held by the listener.
listener.conversations

{'convs': []}

In [115]:

# Write the listener's conversations to the corpus file.
listener.save_conv()

In [97]:
# Manual dump of the collected conversations to out_path + "conv.json".
# The encoding is pinned to UTF-8 because ensure_ascii=False writes raw
# Japanese/emoji characters, which the locale default encoding may not be able
# to represent.
with open(out_path+"conv.json", "w", encoding="utf-8") as f:
    json.dump(listener.conversations, f, ensure_ascii=False, indent=4)

In [98]:
# Manual check of the content-word filter on a short sentence with an emoji.
listener._is_contain_POS("ありがとうございます😎")

True

In [99]:
# Stand-alone pipeline for experiments, loaded from the same 'ja_ginza' model
# name that StreamListener uses.
nlp =  spacy.load('ja_ginza')


In [105]:
# Print each token of a sample string next to its tag, to see what tag the
# tokenizer assigns to it.
doc = nlp("ホニャラホニャ")
for token in doc:
    print(token, token.tag_)

ホニャラホニャ 名詞-普通名詞-一般
