### Setup

In [4]:
import os
import random
import subprocess
from typing import List

import twscrape

### Case 1: Scraping general tweet set for given year

In [15]:
def set_up_directories(keyword_eng: str, year: str):
    """Create the output directory tree for one scraping run.

    The root directory is ``<year>`` when ``keyword_eng`` is empty and
    ``<keyword_eng>_<year>`` otherwise. Inside it, one sub-directory per
    month is created, named "01" through "12".

    Directories that already exist are left untouched, so this can be called
    again (e.g. after a kernel restart) without raising ``FileExistsError``.

    Args:
        keyword_eng: Label used as directory prefix; "" for the general set.
        year: Year as a string, e.g. "2015".
    """
    dir_path = year if keyword_eng == "" else keyword_eng + "_" + year

    for month in range(1, 13):
        # zero-padded month names ("01".."12"), matching the labels used by the scrapers
        os.makedirs(os.path.join(dir_path, f"{month:02d}"), exist_ok=True)

In [14]:
# create the month folders for the general (no keyword) 2015 set
set_up_directories(keyword_eng="", year="2015")

In [4]:
# remove directories if needed
# !rm -rf test_2022/

In [7]:
# scrape randomly sampled tweets for 10 days in given month
def scrape_month_sampled_tweets(year: str, months: List[str], days_in_month: int, keyword_jp: str, keyword_eng: str):
    """Scrape a random sample of Japanese-language tweets for each given month.

    For every month in ``months``, 10 distinct days and 10 hours of day
    (hours may repeat) are drawn at random. For each (day, hour) pair
    ``twscrape search`` is run with the query
    ``<keyword_jp> since:<year>-<month>-<day>_<hour>:00:00_UTC lang:ja`` and
    ``--limit=4500``; its standard output is written to
    ``<dir>/<month>/<year>-<month>-<day>.txt`` where ``<dir>`` is ``<year>``
    for an empty ``keyword_eng`` and ``<keyword_eng>_<year>`` otherwise.

    twscrape is invoked through an argument list (no shell), so keywords
    containing quotes or shell metacharacters are passed through verbatim.
    Month folders are created on demand.

    NOTE(review): the query only carries a lower bound (``since:``) and no
    ``until:``; confirm that the search returns tweets starting at the sampled
    instant rather than the most recent tweets.

    Args:
        year: Year as a string, e.g. "2015".
        months: Zero-padded month labels, e.g. ["04", "06"].
        days_in_month: Highest day number that may be sampled; must be >= 10.
        keyword_jp: Search keyword; "" searches without a keyword.
        keyword_eng: Label used as output directory prefix; "" for none.

    Raises:
        ValueError: If ``days_in_month`` is smaller than 10 (from random.sample).
        FileNotFoundError: If the ``twscrape`` executable cannot be found.
    """
    sample_size = 10
    day_pool = range(1, days_in_month + 1)
    hour_pool = range(24)
    dir_path = year if keyword_eng == "" else keyword_eng + "_" + year

    for m in months:
        month_path = os.path.join(dir_path, m)
        os.makedirs(month_path, exist_ok=True)

        days = sorted(random.sample(day_pool, k=sample_size))   # no replacement
        hours = random.choices(hour_pool, k=sample_size)        # with replacement

        for day_num, hour_num in zip(days, hours):
            day = f"{day_num:02d}"
            hour = f"{hour_num:02d}"
            day_path = os.path.join(month_path, f"{year}-{m}-{day}.txt")

            # skip the keyword when it is empty so the query has no leading blank
            query_parts = [keyword_jp, f"since:{year}-{m}-{day}_{hour}:00:00_UTC", "lang:ja"]
            query = " ".join(part for part in query_parts if part)

            with open(day_path, "wb") as out_file:
                result = subprocess.run(
                    ["twscrape", "search", query, "--limit=4500"],
                    stdout=out_file,
                    check=False,
                )
            if result.returncode != 0:
                # keep going with the remaining days, but make the failure visible
                print(f"twscrape exited with status {result.returncode} for {day_path}")

In [8]:
# scrape random sample of tweets (general content or keyword-search) from given year
def scrape_year_sampled_tweets(year: str, keyword_jp="", keyword_eng=""):
    """Run the per-month sampled scrape for all twelve months of ``year``.

    Months are handled in three groups according to their length; February
    is always treated as having 28 days, so a leap day is never sampled.

    Args:
        year: Year as a string, e.g. "2015".
        keyword_jp: Search keyword forwarded to the month scraper ("" for none).
        keyword_eng: Directory label forwarded to the month scraper ("" for none).
    """
    # (month labels, day count) in the order the groups are scraped
    month_groups = (
        (["02"], 28),
        (["04", "06", "09", "11"], 30),
        (["01", "03", "05", "07", "08", "10", "12"], 31),
    )
    for month_labels, day_count in month_groups:
        scrape_month_sampled_tweets(year, month_labels, day_count, keyword_jp, keyword_eng)

In [9]:
def _concatenate_files(source_paths, destination_path):
    """Write the bytes of every file in ``source_paths``, in order, to ``destination_path``."""
    with open(destination_path, "wb") as destination:
        for source_path in source_paths:
            with open(source_path, "rb") as source:
                destination.write(source.read())


def concatenate_general_txt_files(year: str, keyword_eng=""):
    """Merge the per-day tweet files of one year into month files and one year file.

    The input directory is ``<year>`` for an empty ``keyword_eng`` and
    ``<keyword_eng>_<year>`` otherwise. For every month sub-directory, its
    ``*.txt`` files are concatenated in name order into
    ``<input dir>-<month>.txt`` in the current working directory. Those month
    files are then concatenated, in month order, into
    ``<keyword_eng>_<year>-all.txt``.

    Month files carry the directory name as prefix so that runs for different
    years or keywords do not overwrite each other, and the year file is built
    from exactly the month files written by this call.

    Args:
        year: Year as a string, e.g. "2015".
        keyword_eng: Directory label used when scraping; "" for the general set.

    Raises:
        FileNotFoundError: If the input directory does not exist.
    """
    dir_path = year if keyword_eng == "" else keyword_eng + "_" + year
    if not os.path.isdir(dir_path):
        raise FileNotFoundError("no scraped data directory: " + dir_path)

    month_files = []
    for month in sorted(os.listdir(dir_path)):
        month_dir = os.path.join(dir_path, month)
        if not os.path.isdir(month_dir):
            continue

        # same selection as the shell glob "*.txt": hidden files are skipped
        day_files = sorted(
            os.path.join(month_dir, file_name)
            for file_name in os.listdir(month_dir)
            if file_name.endswith(".txt") and not file_name.startswith(".")
        )
        month_file = dir_path + "-" + month + ".txt"
        _concatenate_files(day_files, month_file)
        month_files.append(month_file)

    # concatenate month .txt files into one file for the year
    _concatenate_files(month_files, keyword_eng + "_" + year + "-all.txt")

In [None]:
# general 2015 set: scrape the sampled days, then merge the resulting files
scrape_year_sampled_tweets(year="2015")
concatenate_general_txt_files(year="2015")

In [None]:
# general 2022 set
# Only the 2015 folders are created earlier in the notebook; make sure the
# 2022 month folders exist before the scraper writes its per-day files.
if not os.path.isdir("2022"):
    set_up_directories("", "2022")
scrape_year_sampled_tweets("2022")
concatenate_general_txt_files("2022")

### Case 2: Scraping minority keyword-related tweet set

In [13]:
# scrape tweets containing keyword from every month of given year
def scrape_keyword_month_tweets(year: str, months: List[str], days_in_month: int, keyword_jp: str, keyword_eng: str):
    """Scrape Japanese-language tweets containing a keyword, one file per month.

    For every month in ``months`` ``twscrape search`` is run with the query
    ``<keyword_jp> since:<year>-<month>-01_00:00:00_UTC
    until:<year>-<month>-<days_in_month>_23:59:59_UTC lang:ja`` and its
    standard output is written to ``<keyword_eng>_<year>/<month>.txt``.

    twscrape is invoked through an argument list (no shell), so the keyword is
    passed through verbatim. The output directory is created if missing.

    Args:
        year: Year as a string, e.g. "2022".
        months: Zero-padded month labels, e.g. ["04", "06"].
        days_in_month: Last day of the month(s), used for the ``until:`` bound.
        keyword_jp: Search keyword.
        keyword_eng: Label used as output directory prefix.

    Raises:
        FileNotFoundError: If the ``twscrape`` executable cannot be found.
    """
    out_dir = keyword_eng + "_" + year
    os.makedirs(out_dir, exist_ok=True)

    for m in months:
        query = (
            f"{keyword_jp} since:{year}-{m}-01_00:00:00_UTC "
            f"until:{year}-{m}-{days_in_month}_23:59:59_UTC lang:ja"
        )
        out_path = os.path.join(out_dir, m + ".txt")
        with open(out_path, "wb") as out_file:
            result = subprocess.run(["twscrape", "search", query], stdout=out_file, check=False)
        if result.returncode != 0:
            # keep going with the remaining months, but make the failure visible
            print(f"twscrape exited with status {result.returncode} for {out_path}")

In [21]:
# scrape all tweets containing keyword in given year
def scrape_keyword_sampled_tweets(year: str, keyword_jp: str, keyword_eng: str):
    """Scrape keyword tweets for every month of ``year`` into ``<keyword_eng>_<year>/``.

    The output directory is created if it does not exist yet; an existing
    directory is reused, so the function can be called again for the same
    keyword and year (month files are rewritten by the month scraper).

    February is always treated as having 28 days, so tweets from a leap day
    are not covered.

    Args:
        year: Year as a string, e.g. "2022".
        keyword_jp: Search keyword.
        keyword_eng: Label used to name the output directory.
    """
    os.makedirs(keyword_eng + "_" + year, exist_ok=True)

    # 28 day months
    scrape_keyword_month_tweets(year, ["02"], 28, keyword_jp, keyword_eng)

    # 30 day months
    months_30 = ["04", "06", "09", "11"]
    scrape_keyword_month_tweets(year, months_30, 30, keyword_jp, keyword_eng)

    # 31 day months
    months_31 = ["01", "03", "05", "07", "08", "10", "12"]
    scrape_keyword_month_tweets(year, months_31, 31, keyword_jp, keyword_eng)

In [9]:
def concatenate_keyword_txt_files(year: str, keyword_eng=""):
    """Concatenate the month files of a keyword set into one file for the year.

    Runs ``cat`` over every ``*.txt`` in ``<keyword_eng>_<year>/`` and
    redirects the result to ``<keyword_eng>_<year>.txt`` in the current
    working directory.

    Args:
        year: Year as a string, e.g. "2022".
        keyword_eng: Label that prefixes the directory and output file name.
    """
    set_name = keyword_eng + "_" + year
    os.system(f"cat {set_name}/*.txt > {set_name}.txt")

In [None]:
# zainichi korean set 2022: scrape every month, then merge into one file
scrape_keyword_sampled_tweets(year="2022", keyword_jp="在日コリアン", keyword_eng="zainichi")
concatenate_keyword_txt_files(year="2022", keyword_eng="zainichi")

In [16]:
# zainichi korean set 2015
# scrape_keyword_sampled_tweets creates zainichi_2015/ itself; creating it here
# as well would make the scraper's own directory creation fail.
scrape_keyword_sampled_tweets("2015", "在日コリアン", "zainichi")
concatenate_keyword_txt_files("2015", "zainichi")

2024-03-21 12:37:43.566 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 12:44:36
2024-03-21 12:44:39.506 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_2 on queue SearchTimeline
2024-03-21 12:49:34.908 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 12:49:42
2024-03-21 12:49:44.950 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_research on queue SearchTimeline
2024-03-21 12:50:41.514 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 12:50:44
2024-03-21 12:50:46.531 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account nlp542381374579 on queue SearchTimeline
2024-03-21 12:52:46.032 | INFO     | twscrape.accounts_

In [None]:
# ainu 2022
# scrape_keyword_sampled_tweets creates ainu_2022/ itself; creating it here
# as well would make the scraper's own directory creation fail.
scrape_keyword_sampled_tweets("2022", "アイヌ", "ainu")
concatenate_keyword_txt_files("2022", "ainu")

In [None]:
# ainu 2015
# scrape_keyword_sampled_tweets creates ainu_2015/ itself; creating it here
# as well would make the scraper's own directory creation fail.
scrape_keyword_sampled_tweets("2015", "アイヌ", "ainu")
concatenate_keyword_txt_files("2015", "ainu")

In [None]:
# ryukyujin 2022
# scrape_keyword_sampled_tweets creates ryukyujin_2022/ itself; creating it here
# as well would make the scraper's own directory creation fail.
scrape_keyword_sampled_tweets("2022", "琉球人", "ryukyujin")
concatenate_keyword_txt_files("2022", "ryukyujin")

In [None]:
# haafu 2022 (written to haafu_2_2022/)
# scrape_keyword_sampled_tweets creates haafu_2_2022/ itself; creating it here
# as well would make the scraper's own directory creation fail.
scrape_keyword_sampled_tweets("2022", "ハーフ", "haafu_2")
concatenate_keyword_txt_files("2022", "haafu_2")

In [19]:
# haafu 2015
# scrape_keyword_sampled_tweets creates haafu_2015/ itself; creating it here
# as well would make the scraper's own directory creation fail.
scrape_keyword_sampled_tweets("2015", "ハーフ", "haafu")
concatenate_keyword_txt_files("2015", "haafu")

2024-03-21 13:27:36.813 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 13:29:46
2024-03-21 13:29:47.238 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_2 on queue SearchTimeline
2024-03-21 13:30:39.229 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 13:30:47
2024-03-21 13:30:49.245 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_3 on queue SearchTimeline
2024-03-21 13:31:38.390 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 13:31:43
2024-03-21 13:31:48.591 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_4 on queue SearchTimeline
2024-03-21 13:32:37.564 | INFO     | twscrape.accounts_pool:g

In [3]:
# merge the haafu_2015 month files by hand into a second output file
os.system("cat haafu_2015/*.txt > haafu_2015_2.txt")

0

In [20]:
# second scraping pass for the haafu 2015 set (no concatenation step here)
scrape_keyword_sampled_tweets(year="2015", keyword_jp="ハーフ", keyword_eng="haafu")

2024-03-21 19:55:59.274 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 20:05:45
2024-03-21 20:05:50.726 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_2 on queue SearchTimeline
2024-03-21 20:09:04.404 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 20:09:05
2024-03-21 20:09:09.417 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_research on queue SearchTimeline
2024-03-21 20:11:02.428 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 20:20:51
2024-03-21 20:20:54.000 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_2 on queue SearchTimeline
2024-03-21 20:22:47.365 | INFO     | twscrape.accounts_poo

In [11]:
# okinawajin 2022
# scrape_keyword_sampled_tweets creates okinawajin_2022/ itself; creating it here
# as well would make the scraper's own directory creation fail.
scrape_keyword_sampled_tweets("2022", "沖縄人", "okinawajin")
concatenate_keyword_txt_files("2022", "okinawajin")

2024-03-17 21:36:42.438 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 21:46:33
2024-03-17 21:46:38.481 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_2 on queue SearchTimeline
2024-03-17 21:51:46.588 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 22:01:39
2024-03-17 22:01:42.821 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_2 on queue SearchTimeline
2024-03-17 22:07:06.737 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 22:16:43
2024-03-17 22:16:47.742 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_2 on queue SearchTimeline
2024-03-17 22:18:05.182 | INFO     | twscrape.accounts_pool:g

In [None]:
# gaijin 2022
# scrape_keyword_sampled_tweets creates gaijin_2022/ itself; creating it here
# as well would make the scraper's own directory creation fail.
scrape_keyword_sampled_tweets("2022", "外人", "gaijin")
concatenate_keyword_txt_files("2022", "gaijin")