In [1]:
import os
import pickle
import pandas as pd
import re
import datetime
from sklearn.preprocessing import LabelEncoder

from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 5000)

from dateutil import tz
UTC = tz.gettz("UTC")

In [2]:
def load(path):
    with open(path, 'rb') as f:
        data = pickle.load(f)
    return data

def dump(value, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(value, f)

In [3]:
scraping_data_path = '../data/scraping_data/'
dataset_path = '../data/dataset/'
fij_tweet_data = load(scraping_data_path+'fij_news_data/tweet_data.pkl')
infact_tweet_data = load(scraping_data_path+'infact_news_data/tweet_data.pkl')
nikkei_tweet_data = load(scraping_data_path+'nikkei_news_data/tweet_data.pkl')
fij_fake_tweet_text_dict = load(scraping_data_path+'fij_news_data/fake_twitter_text_dict.pkl')
infact_fake_tweet_text_dict = load(scraping_data_path+'infact_news_data/fake_twitter_text_dict.pkl')

In [4]:
def data_cancat(data_path, file_name):
    
    data_dirs = [f for f in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, f))]
    data_list = []
    for data_dir in data_dirs:
        if os.path.isfile(os.path.join(data_path+data_dir, file_name)):
            df = load(os.path.join(data_path+data_dir, file_name))
            data_list.append(df)
    
    return pd.concat(data_list, axis=0)

In [5]:
fij_tweet_data = fij_tweet_data[~fij_tweet_data.duplicated('id')]
infact_tweet_data = infact_tweet_data[~infact_tweet_data.duplicated('id')]
nikkei_tweet_data = nikkei_tweet_data[~nikkei_tweet_data.duplicated('id')]

In [6]:
# fij_tweet_data['label'] = 1
# infact_tweet_data['label'] = 1
# nikkei_tweet_data['label'] = 0

# def original_kw(value, tweet_text_dict):
#     return [k for k, v in tweet_text_dict.items() if value in v][0]

# fij_tweet_data['event'] = fij_tweet_data['kw'].apply(original_kw, tweet_text_dict = fij_fake_tweet_text_dict)
# infact_tweet_data['event'] = infact_tweet_data['kw'].apply(original_kw, tweet_text_dict = infact_fake_tweet_text_dict)
# nikkei_tweet_data['event'] = nikkei_tweet_data['kw']

tweet_data_list = [fij_tweet_data, infact_tweet_data, nikkei_tweet_data]
tweet_kw_list = ['日本', 'go', '2回接種済みだった', '機能停止した', '除染作業', 'と言い出した。', '日本政府は', 'PCR', '大阪の場合は', '病院に入院しているらしい', '各党の第一声', '日本初の死亡者']

df_tweet_data = pd.concat(tweet_data_list, axis=0)
print(len(df_tweet_data))
df_tweet_data = df_tweet_data[~df_tweet_data['kw'].isin(tweet_kw_list)]
print(len(df_tweet_data))
df_tweet_data = df_tweet_data[~df_tweet_data.duplicated('id')]
print(len(df_tweet_data))

df_tweet_data['created_at'] = pd.to_datetime(df_tweet_data['created_at'])
df_tweet_data = df_tweet_data[df_tweet_data['created_at']>datetime.datetime(2020, 1, 1, 0, 0, 0, tzinfo=UTC)]

print(len(df_tweet_data))

print(df_tweet_data['event'].nunique())

le = LabelEncoder()
le.fit(df_tweet_data['event'])

df_tweet_data['event_id'] = le.transform(df_tweet_data['event'])

event_cnt = df_tweet_data.groupby('event_id', as_index=False)['event'].count().rename(columns={'event':'event_cnt'})
df_tweet_data = df_tweet_data[~df_tweet_data['event_id'].isin(event_cnt['event_cnt']<4)].copy()

print(len(df_tweet_data))

print(df_tweet_data['event'].nunique())

def timestamps_mk(df_tweet_data):
    for key in df_tweet_data['event_id'].unique().tolist():
        base_time = df_tweet_data[df_tweet_data['event_id']==key].sort_values(['id'])[:1]['created_at'].tolist()[0]
        df_tweet_data.loc[df_tweet_data['event_id']==key, 'base_time'] = base_time
    
    return df_tweet_data['created_at'] - df_tweet_data['base_time']

df_tweet_data['timestamps'] = timestamps_mk(df_tweet_data)
df_tweet_data['timestamps'] = df_tweet_data['timestamps'].map(lambda x: x.total_seconds())

def text_extraction(text):
    text = re.sub(r'@\w*', '', text)
    text = re.sub(r'https:.*', '', text)
    
    return text

df_tweet_data['text'] = df_tweet_data['text'].map(text_extraction)

149785
144966
143262
143015
1912
142518
1910




In [7]:
def mk_dataset(df_dataset):
    
    datset_dict = {}
    
    for key in df_tweet_data['event_id'].unique().tolist():
        messages_dict = {}
        messages_dict['id'] = df_tweet_data[df_tweet_data['event_id']==key].sort_values(['id'])['id'].tolist()
        messages_dict['uid'] = df_tweet_data[df_tweet_data['event_id']==key].sort_values(['id'])['author_id'].tolist()
        messages_dict['timestamps'] = df_tweet_data[df_tweet_data['event_id']==key].sort_values(['id'])['timestamps'].tolist()
        messages_dict['text'] = df_tweet_data[df_tweet_data['event_id']==key].sort_values(['id'])['text'].tolist()
        messages_dict['label'] = df_tweet_data[df_tweet_data['event_id']==key]['label'].tolist()[0]
        datset_dict[str(key)] = messages_dict
        
    return datset_dict
    
    
def word_segmentation(text:str, replace_dict:dict, exclude_task_list:list, a):
    
    token_list = []
    for token in a.analyze(text):
        token = token.translate(str.maketrans(replace_dict)) # 邪魔な文字を除く
        if token not in exclude_task_list and not token.isdecimal(): # 意味ないワードを除く
            token_list.append(token)
            token_list = [s for s in token_list if not s.startswith('#')]
            
    token_str = ' '.join(token_list)
    
    return token_str

In [8]:
datset_dict = mk_dataset(df_tweet_data)

replace_dict = {'[':'', ']':'', '/':'', '+':'', '(':'', ')':'', '等':'', ',':'', '.':'', '<':'', '>':'', '-':'', '?':'', ':':'', '|':''}
exclude_task_list = ['new', 'rt', '籏智広太', 'インファクト', 'ファクトチェック', 'factcheck', 'infact', 'こび', 'ナビ']

# analyzerモジュールで形態素分析
char_filters = [UnicodeNormalizeCharFilter()]

tokenizer = Tokenizer()

token_filters = [CompoundNounFilter(),
                 POSStopFilter(['記号']),
                 LowerCaseFilter(),
                 ExtractAttributeFilter('surface')]

a = Analyzer(char_filters=char_filters, tokenizer=tokenizer ,token_filters=token_filters)

for key in datset_dict.keys():
    text_word_segmentation_list = []
    for text in datset_dict[key]['text']:
        text_word_segmentation_list.append(word_segmentation(text, replace_dict, exclude_task_list, a))
    datset_dict[key]['text'] = text_word_segmentation_list

In [9]:
dump(datset_dict, dataset_path+'datset_dict.pkl')
dump(df_tweet_data, dataset_path+'df_tweet_data.pkl')