In [1]:
import calendar
import os
import random
from typing import List

import twscrape

In [2]:
def set_up_directories(keyword_eng: str, year: str):
    """Create ``<keyword_eng>_<year>/`` with one sub-directory per month.

    The month sub-directories are named ``01`` .. ``12``. Directories that
    already exist are left untouched, so the function can be called again
    without deleting the tree first.

    Args:
        keyword_eng: Prefix of the top-level directory name.
        year: Suffix of the top-level directory name, e.g. ``"2022"``.
    """
    base_dir = keyword_eng + "_" + year
    for month in range(1, 13):
        # ":02d" zero-pads single-digit months ("01" .. "09")
        os.makedirs(os.path.join(base_dir, f"{month:02d}"), exist_ok=True)

In [5]:
# Create the zainichi_2022/ directory tree used by the keyword scrape below.
set_up_directories(keyword_eng="zainichi", year="2022")

In [4]:
# Optional clean-up: recursively delete the zainichi_2022/ tree (IPython shell escape).
# Destructive -- everything stored under that directory is removed as well.
!rm -rf zainichi_2022/

In [3]:
# scrape randomly sampled tweets for 10 days in given month
def scrape_month_sampled_tweets(year: str, months: List[str], days_in_month: int, keyword_jp: str, keyword_eng: str, dry_run: bool = True):
    """Build one ``twscrape search`` command for each of 10 random days per month.

    For every month in ``months``, 10 distinct days are drawn without
    replacement and 10 hours of day are drawn with replacement. Each
    (day, hour) pair yields a command whose output is redirected to
    ``<dir>/<month>/<year>-<month>-<day>.txt``, where ``<dir>`` is ``year``
    when ``keyword_eng`` is empty and ``<keyword_eng>_<year>`` otherwise.

    Args:
        year: Year string, e.g. ``"2022"``.
        months: Zero-padded month strings, e.g. ``["04", "06"]``.
        days_in_month: Number of days the sampled day is drawn from.
        keyword_jp: Search keyword placed at the start of the query (may be empty).
        keyword_eng: Directory-name prefix (may be empty).
        dry_run: When True (the default) the commands are only printed.
            Pass False to also run each command through ``os.system``.

    Raises:
        ValueError: From ``random.sample`` when ``days_in_month`` is below 10.
    """
    range_days = list(range(1, days_in_month + 1))
    range_times = list(range(0, 24))
    dir_path = year if keyword_eng == "" else keyword_eng + "_" + year

    # sample 10 random days and times of day for each month
    for m in months:
        month_path = dir_path + "/" + m
        days = sorted(random.sample(range_days, k=10))   # random days of month (no replacement)
        times = random.choices(range_times, k=10)   # random hours of day (replacement)

        # one command per picked (day, hour) pair
        for day_number, hour_number in zip(days, times):
            day = f"{day_number:02d}"
            time = f"{hour_number:02d}"
            date = f"{year}-{m}-{day}"
            day_path = f"{month_path}/{date}.txt"
            # The separator before ">" must be an ASCII space. The previous
            # full-width space (U+3000) is not a shell word separator, so it
            # ended up attached to the search query itself.
            command = f'twscrape search "{keyword_jp} since:{date}_{time}:00:00_UTC lang:ja" > {day_path} --limit=4500'
            print(command)
            if not dry_run:
                os.system(command)

In [6]:
# scrape random sample of tweets (general content or keyword-search) from given year
def scrape_year_sampled_tweets(year: str, keyword_jp="", keyword_eng=""):
    """Run the monthly sampler for all twelve months, grouped by month length.

    Args:
        year: Year string, e.g. ``"2022"``.
        keyword_jp: Search keyword forwarded to the monthly sampler (default: none).
        keyword_eng: Directory-name prefix forwarded to the monthly sampler (default: none).
    """
    month_groups = (
        (28, ["02"]),   # February is always treated as 28 days (Feb 29 is never sampled)
        (30, ["04", "06", "09", "11"]),
        (31, ["01", "03", "05", "07", "08", "10", "12"]),
    )
    for days_in_month, months in month_groups:
        scrape_month_sampled_tweets(year, months, days_in_month, keyword_jp, keyword_eng)

In [7]:
def concatenate_general_txt_files(year: str, keyword_eng=""):
    """Merge per-day ``.txt`` files into monthly files and one yearly file.

    The source directory is ``year`` when ``keyword_eng`` is empty and
    ``<keyword_eng>_<year>`` otherwise, the same rule the sampled scraper uses
    for its output. For each month sub-directory, its ``*.txt`` files are
    concatenated (sorted by name) into ``<year>-<month>.txt`` in the current
    working directory. Those monthly files are then concatenated, in month
    order, into ``<keyword_eng>_<year>-all.txt``.

    Previously the monthly files were written as ``<month>.txt`` while the
    yearly step looked for ``<year>-*.txt``, so the yearly file never received
    any content. The copy is done in Python instead of shelling out to ``cat``.

    A missing source directory yields an empty yearly file rather than an error.

    Args:
        year: Year string, e.g. ``"2022"``.
        keyword_eng: Directory-name / output-name prefix (default: none).
    """
    source_dir = year if keyword_eng == "" else keyword_eng + "_" + year

    def _concatenate(sources, destination):
        # byte-for-byte copy, like `cat`; chunked so large scrapes are not held in memory
        with open(destination, "wb") as out:
            for source in sources:
                with open(source, "rb") as part:
                    for chunk in iter(lambda: part.read(1 << 20), b""):
                        out.write(chunk)

    month_names = []
    if os.path.isdir(source_dir):
        month_names = sorted(
            entry for entry in os.listdir(source_dir)
            if os.path.isdir(os.path.join(source_dir, entry))
        )

    # concatenate .txt files into one file per month
    month_files = []
    for name in month_names:
        month_path = os.path.join(source_dir, name)
        day_files = sorted(
            os.path.join(month_path, entry) for entry in os.listdir(month_path)
            # mirror the shell glob "*.txt": hidden files are not matched
            if entry.endswith(".txt") and not entry.startswith(".")
            and os.path.isfile(os.path.join(month_path, entry))
        )
        month_file = year + "-" + name + ".txt"
        _concatenate(day_files, month_file)
        month_files.append(month_file)

    # concatenate month .txt files into one file for the year
    _concatenate(month_files, keyword_eng + "_" + year + "-all.txt")

In [None]:
# General (keyword-free) 2022 sample, followed by concatenation of the per-day files.
scrape_year_sampled_tweets(year="2022")
concatenate_general_txt_files(year="2022")

In [8]:
# scrape every tweet containing the keyword for each given month
def scrape_keyword_month_tweets(year: str, months: List[str], days_in_month: int, keyword_jp: str, keyword_eng: str):
    """Run one ``twscrape search`` per month covering the whole month.

    The query spans ``<year>-<month>-01 00:00:00 UTC`` through
    ``<year>-<month>-<days_in_month> 23:59:59 UTC`` with ``lang:ja``. Output is
    redirected to ``<keyword_eng>_<year>/<month>.txt``; that directory must
    already exist for the shell redirect to succeed.

    Args:
        year: Year string, e.g. ``"2022"``.
        months: Zero-padded month strings that all have ``days_in_month`` days.
        days_in_month: Last day of the month used in the ``until:`` bound.
        keyword_jp: Search keyword placed at the start of the query.
        keyword_eng: Prefix of the output directory name.
    """
    for m in months:
        query = (
            f"{keyword_jp} since:{year}-{m}-01_00:00:00_UTC "
            f"until:{year}-{m}-{days_in_month}_23:59:59_UTC lang:ja"
        )
        out_path = f"{keyword_eng}_{year}/{m}.txt"
        # The separator before ">" must be an ASCII space. The previous
        # full-width space (U+3000) is not a shell word separator, so it
        # ended up attached to the search query itself.
        command = f'twscrape search "{query}" > {out_path}'
        os.system(command)

In [9]:
# scrape all tweets containing keyword in given year
def scrape_keyword_sampled_tweets(year: str, keyword_jp: str, keyword_eng: str):
    """Scrape every month of ``year`` for ``keyword_jp``, one search per month.

    The output directory ``<keyword_eng>_<year>/`` is created if it is missing,
    so the per-month shell redirects have somewhere to write. February uses
    29 days in leap years so that tweets from Feb 29 are not dropped.

    Args:
        year: Year string, e.g. ``"2022"``; must be parseable as an integer.
        keyword_jp: Search keyword forwarded to the monthly scraper.
        keyword_eng: Prefix of the output directory name.

    Raises:
        ValueError: If ``year`` is not an integer string.
    """
    os.makedirs(keyword_eng + "_" + year, exist_ok=True)

    # February: 28 days, or 29 in a leap year
    february_days = 29 if calendar.isleap(int(year)) else 28
    scrape_keyword_month_tweets(year, ["02"], february_days, keyword_jp, keyword_eng)

    # 30 day months
    months_30 = ["04", "06", "09", "11"]
    scrape_keyword_month_tweets(year, months_30, 30, keyword_jp, keyword_eng)

    # 31 day months
    months_31 = ["01", "03", "05", "07", "08", "10", "12"]
    scrape_keyword_month_tweets(year, months_31, 31, keyword_jp, keyword_eng)

In [10]:
def concatenate_keyword_txt_files(year: str, keyword_eng=""):
    """Concatenate ``<keyword_eng>_<year>/*.txt`` into ``<keyword_eng>_<year>.txt``.

    The work is delegated to the shell's ``cat``; the merged file is written to
    the current working directory, next to the source directory.

    Args:
        year: Year string, e.g. ``"2022"``.
        keyword_eng: Prefix of the directory / output file name.
    """
    base_name = f"{keyword_eng}_{year}"
    # merge the monthly files into a single file for the year
    os.system(f"cat {base_name}/*.txt > {base_name}.txt")

In [11]:
# Zainichi Korean set: keyword scrape for every month of 2022.
scrape_keyword_sampled_tweets(year="2022", keyword_jp="在日コリアン", keyword_eng="zainichi")

2024-03-12 20:42:43.213 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 20:51:39
2024-03-12 20:54:21.106 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_2 on queue SearchTimeline
2024-03-12 22:42:49.096 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 22:49:23
2024-03-12 22:49:24.743 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_3 on queue SearchTimeline
2024-03-12 22:50:06.204 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 22:50:15
2024-03-12 22:50:16.225 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_4 on queue SearchTimeline
2024-03-12 22:52:07.815 | INFO     | twscrape.accounts_pool:g

In [13]:
# Merge the zainichi_2022/ monthly files into zainichi_2022.txt.
concatenate_keyword_txt_files(year="2022", keyword_eng="zainichi")

In [18]:
# ainu
# makedirs(exist_ok=True) rather than mkdir: re-running the cell must not stop
# with FileExistsError before any scraping happens.
os.makedirs("ainu_2022", exist_ok=True)
scrape_keyword_sampled_tweets("2022", "アイヌ", "ainu")
concatenate_keyword_txt_files("2022", "ainu")

2024-03-13 23:54:00.375 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 23:56:34
2024-03-13 23:56:36.562 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_2 on queue SearchTimeline
2024-03-13 23:57:36.571 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 23:58:27
2024-03-13 23:58:31.750 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_3 on queue SearchTimeline
2024-03-14 00:00:18.336 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 00:01:22
2024-03-14 00:02:37.450 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_5 on queue SearchTimeline
2024-03-14 00:09:36.593 | INFO     | twscrape.accounts_pool:g

In [19]:
# ryukyu
# makedirs(exist_ok=True) rather than mkdir: re-running the cell must not stop
# with FileExistsError before any scraping happens.
os.makedirs("ryukyujin_2022", exist_ok=True)
scrape_keyword_sampled_tweets("2022", "琉球人", "ryukyujin")
concatenate_keyword_txt_files("2022", "ryukyujin")

Traceback (most recent call last):
  File "/Users/taylory/.local/share/virtualenvs/thesis-hw276cHk/lib/python3.11/site-packages/httpx/_transports/default.py", line 69, in map_httpcore_exceptions
    yield
  File "/Users/taylory/.local/share/virtualenvs/thesis-hw276cHk/lib/python3.11/site-packages/httpx/_transports/default.py", line 373, in handle_async_request
    resp = await self._pool.handle_async_request(req)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/taylory/.local/share/virtualenvs/thesis-hw276cHk/lib/python3.11/site-packages/httpcore/_async/connection_pool.py", line 216, in handle_async_request
    raise exc from None
  File "/Users/taylory/.local/share/virtualenvs/thesis-hw276cHk/lib/python3.11/site-packages/httpcore/_async/connection_pool.py", line 196, in handle_async_request
    response = await connection.handle_async_request(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/taylory/.local/share/virtualenvs/thesis-hw276cHk/lib/p

In [20]:
# haafu
# makedirs(exist_ok=True) rather than mkdir: re-running the cell must not stop
# with FileExistsError before any scraping happens.
os.makedirs("haafu_2022", exist_ok=True)
scrape_keyword_sampled_tweets("2022", "ハーフ", "haafu")
concatenate_keyword_txt_files("2022", "haafu")

2024-03-14 23:45:29.275 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 23:51:39
2024-03-14 23:52:05.702 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_2 on queue SearchTimeline
Traceback (most recent call last):
  File "/Users/taylory/.local/share/virtualenvs/thesis-hw276cHk/lib/python3.11/site-packages/httpx/_transports/default.py", line 69, in map_httpcore_exceptions
    yield
  File "/Users/taylory/.local/share/virtualenvs/thesis-hw276cHk/lib/python3.11/site-packages/httpx/_transports/default.py", line 373, in handle_async_request
    resp = await self._pool.handle_async_request(req)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/taylory/.local/share/virtualenvs/thesis-hw276cHk/lib/python3.11/site-packages/httpcore/_async/connection_pool.py", line 216, in handle_async_request
    raise exc from None
  File "/Users/taylory/.loca

In [15]:
import json

In [16]:
# Count tweets dated after 2022 (the "2024 tweet ads") in the merged zainichi file.
# Nothing is removed here: `total` is the number of parsed tweets and `spam`
# the number whose date falls in a later year.
total = 0
spam = 0
# Explicit encoding so decoding does not depend on the platform's locale default.
with open('zainichi_2022.txt', 'r', encoding='utf-8') as tweets_file:
    for line in tweets_file:
        if not line.strip():
            # a blank line would make json.loads raise; it is not a tweet, so skip it
            continue
        total += 1
        tweet = json.loads(line)
        # 'date' starts with the year ("YYYY-..."); anything after 2022 is out of range
        if int(tweet['date'].split("-")[0]) > 2022:
            spam += 1
        # TODO: for the remaining tweets, preprocess tweet['rawContent'] and
        # tokenize it with MeCab (not implemented yet).
# no explicit close(): the with-block already closed the file


In [17]:
# Report the tweet count followed by the out-of-range ("spam") count, one per line.
print(total, spam, sep="\n")

37483
5582
