### Setup

In [2]:
import os
import random
from typing import List
import twscrape

### Case 1: Scraping general tweet set for given year

In [3]:
def set_up_directories(keyword_eng: str, year: str):
    dir_path = year if keyword_eng == "" else keyword_eng + "_" + year
    os.mkdir(dir_path)

    # single digit month
    for i in range(1,10):
        path = dir_path + "/" + "0" + str(i)
        os.mkdir(path) 

    # double digit month
    for j in range(10,13):
        path = dir_path + "/" + str(j)
        os.mkdir(path) 

In [4]:
# scrape randomly sampled tweets for 10 days in given month
def scrape_month_sampled_tweets(year: str, months: List[str], days_in_month: int):
    range_days = list(range(1,days_in_month+1))
    range_times = list(range(0,24))

    # sample 10 random days and times of day for each month
    for m in months: 
        month_path = year + "/" + m
        days = sorted(random.sample(range_days, k=10))   # random days of month (no replacement)
        times = random.choices(range_times, k=10)   # random hours of day (replacement)
        
        # scrape tweets for the 10 picked days and times
        for t in range(10):     
            day = "0" + str(days[t]) if days[t] < 10 else str(days[t])
            time = "0" + str(times[t]) if times[t] < 10 else str(times[t])
            date_string = year + '-' + m + '-' + day
            day_path = month_path + "/" + year + "-" + m + "-" + day + ".txt"
            command = 'twscrape search "since:' + date_string + '_' + time + ':00:00_UTC until:' + date_string + '_' + time + ':59:59_UTC lang:ja"　> ' + day_path + ' --limit=4500'
            os.system(command)

In [5]:
# scrape random sample of tweets (general content or keyword-search) from given year
def scrape_year_sampled_tweets(year: str):
    # 28 day months
    scrape_month_sampled_tweets(year,["02"], 28)    # omit leap year 29th days for simplicity...?

    # 30 day months
    months_30 = ["04","06","09","11"]
    scrape_month_sampled_tweets(year,months_30, 30)

    # 31 day months
    months_31 = ["01","03","05","07","08","10","12"]
    scrape_month_sampled_tweets(year,months_31, 31)

In [6]:
def concatenate_general_txt_files(year: str, keyword_eng=""):
  # concatenate .txt files into one file per month
  for root, dirs, files in os.walk("./" + year):
      for name in dirs:
        month_path = os.path.join(root, name)
        os.system("cat " + month_path + "/*.txt > " + year + "/" + name + ".txt")
  
  # concatenate month .txt files into one file for the year
  os.system("cat " + year + "/*.txt > " + year + "-all.txt")

In [31]:
# last took 1790m
scrape_year_sampled_tweets("2015")
concatenate_general_txt_files("2015")

sh: 2015/02/2015-02-05.txt: No such file or directory
sh: 2015/02/2015-02-06.txt: No such file or directory
sh: 2015/02/2015-02-07.txt: No such file or directory
sh: 2015/02/2015-02-10.txt: No such file or directory
sh: 2015/02/2015-02-11.txt: No such file or directory
sh: 2015/02/2015-02-15.txt: No such file or directory
sh: 2015/02/2015-02-18.txt: No such file or directory
sh: 2015/02/2015-02-19.txt: No such file or directory
sh: 2015/02/2015-02-23.txt: No such file or directory
sh: 2015/02/2015-02-27.txt: No such file or directory
sh: 2015/04/2015-04-01.txt: No such file or directory
sh: 2015/04/2015-04-03.txt: No such file or directory
sh: 2015/04/2015-04-06.txt: No such file or directory
sh: 2015/04/2015-04-07.txt: No such file or directory
sh: 2015/04/2015-04-16.txt: No such file or directory
sh: 2015/04/2015-04-19.txt: No such file or directory
sh: 2015/04/2015-04-21.txt: No such file or directory
sh: 2015/04/2015-04-23.txt: No such file or directory
sh: 2015/04/2015-04-26.txt: 

In [32]:
scrape_year_sampled_tweets("2022")
concatenate_general_txt_files("2022")

sh: 2022/02/2022-02-01.txt: No such file or directory
sh: 2022/02/2022-02-02.txt: No such file or directory
sh: 2022/02/2022-02-04.txt: No such file or directory
sh: 2022/02/2022-02-07.txt: No such file or directory
sh: 2022/02/2022-02-12.txt: No such file or directory
sh: 2022/02/2022-02-16.txt: No such file or directory
sh: 2022/02/2022-02-17.txt: No such file or directory
sh: 2022/02/2022-02-18.txt: No such file or directory
sh: 2022/02/2022-02-19.txt: No such file or directory
sh: 2022/02/2022-02-27.txt: No such file or directory
sh: 2022/04/2022-04-01.txt: No such file or directory
sh: 2022/04/2022-04-08.txt: No such file or directory
sh: 2022/04/2022-04-09.txt: No such file or directory
sh: 2022/04/2022-04-12.txt: No such file or directory
sh: 2022/04/2022-04-14.txt: No such file or directory
sh: 2022/04/2022-04-16.txt: No such file or directory
sh: 2022/04/2022-04-21.txt: No such file or directory
sh: 2022/04/2022-04-22.txt: No such file or directory
sh: 2022/04/2022-04-26.txt: 

### Case 2: Scraping minority keyword-related tweet set

In [7]:
# scrape tweets containing keyword from every month of given year
def scrape_keyword_month_tweets(year: str, months: List[str], days_in_month: int, keyword_jp: str, keyword_eng: str):
    for m in months: 
        command = 'twscrape search "' + keyword_jp + ' since:' + year + '-' + m + '-01_00:00:00_UTC until:' + year + '-' + m + '-' + str(days_in_month) + '_23:59:59_UTC lang:ja"　> ' + keyword_eng + '_' + year + '/' + m + '.txt'
        os.system(command)

In [8]:
# scrape all tweets containing keyword in given year
def scrape_keyword_sampled_tweets(year: str, keyword_jp: str, keyword_eng: str):
    os.mkdir(keyword_eng + "_" + year) 

    # 28 day months
    scrape_keyword_month_tweets(year,["02"], 28, keyword_jp, keyword_eng)    # omit leap year 29th days for simplicity...?

    # 30 day months
    months_30 = ["04","06","09","11"]
    scrape_keyword_month_tweets(year,months_30, 30, keyword_jp, keyword_eng)

    # 31 day months
    months_31 = ["01","03","05","07","08","10","12"]
    scrape_keyword_month_tweets(year,months_31, 31, keyword_jp, keyword_eng)

In [9]:
def concatenate_keyword_txt_files(year: str, keyword_eng=""):
  # concatenate month .txt files into one file for the year
  os.system("cat " + keyword_eng + "_" + year + "/" + "*.txt > " + keyword_eng + "_" + year + ".txt")

In [None]:
# zainichi korean set 2022 [DONE]
scrape_keyword_sampled_tweets("2022", "在日コリアン", "zainichi")
concatenate_keyword_txt_files("2022", "zainichi")

# zainichi korean set 2015 [DONE]
scrape_keyword_sampled_tweets("2015", "在日コリアン", "zainichi")
concatenate_keyword_txt_files("2015", "zainichi")

In [None]:
# ainu 2022 [DONE]
scrape_keyword_sampled_tweets("2022", "アイヌ", "ainu")
concatenate_keyword_txt_files("2022", "ainu")

# ainu 2015 [redo? sufficiently large though]
scrape_keyword_sampled_tweets("2015", "アイヌ", "ainu")
concatenate_keyword_txt_files("2015", "ainu")

In [None]:
# ryukyujin [small dataset; probably done]
scrape_keyword_sampled_tweets("2022", "琉球人", "ryukyujin")
concatenate_keyword_txt_files("2022", "ryukyujin")

# [small dataset; probably done]
scrape_keyword_sampled_tweets("2015", "琉球人", "ryukyujin")
concatenate_keyword_txt_files("2015", "ryukyujin")

In [None]:
# 1, 2, 4, 6, 11 are fine
# redo 3, 5, 7, 8, 10, 12 if time
scrape_keyword_sampled_tweets("2015", "琉球", "ryukyu")
concatenate_keyword_txt_files("2015", "ryukyu")

# redo if time
scrape_keyword_sampled_tweets("2022", "琉球", "ryukyu")
concatenate_keyword_txt_files("2022", "ryukyu")

In [None]:
# okinawajin [DONE]
scrape_keyword_sampled_tweets("2022", "沖縄人", "okinawajin")
concatenate_keyword_txt_files("2022", "okinawajin")

# [DONE; could redo some bc it didn't finish]
scrape_keyword_sampled_tweets("2015", "沖縄人", "okinawajin")
concatenate_keyword_txt_files("2015", "okinawajin")

In [10]:
# haafu
scrape_keyword_sampled_tweets("2022", "ハーフ", "haafu_2")
concatenate_keyword_txt_files("2022", "haafu_2")

2024-03-29 12:15:24.542 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 12:19:05
2024-03-29 12:19:04.980 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_6 on queue SearchTimeline
2024-03-29 12:20:21.851 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 12:27:15
2024-03-29 12:20:41.887 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_6 on queue SearchTimeline
2024-03-29 12:21:16.651 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:275 - No account available for queue "SearchTimeline". Next available at 12:27:15
2024-03-29 12:27:17.387 | INFO     | twscrape.accounts_pool:get_for_queue_or_wait:281 - Continuing with account jp_nlp_res_3 on queue SearchTimeline
2024-03-29 12:30:28.096 | INFO     | twscrape.accounts_pool:g

In [None]:
# haafu [DONE]
scrape_keyword_sampled_tweets("2015", "ハーフ", "haafu")
concatenate_keyword_txt_files("2015", "haafu")

In [None]:
# vietnam
scrape_keyword_sampled_tweets("2022", "ベトナム人", "vietnamjin")
concatenate_keyword_txt_files("2022", "vietnamjin")

scrape_keyword_sampled_tweets("2015", "ベトナム人", "vietnamjin")
concatenate_keyword_txt_files("2015", "vietnamjin")

In [None]:
# philippines
scrape_keyword_sampled_tweets("2022", "フィリピン人", "philippinejin")
concatenate_keyword_txt_files("2022", "philippinejin")

scrape_keyword_sampled_tweets("2015", "フィリピン人", "philippinejin")
concatenate_keyword_txt_files("2015", "philippinejin")

In [None]:
# nepal
scrape_keyword_sampled_tweets("2022", "ネパール人", "nepaljin")
concatenate_keyword_txt_files("2022", "nepaljin")

scrape_keyword_sampled_tweets("2015", "ネパール人", "nepaljin")
concatenate_keyword_txt_files("2015", "nepaljin")

In [None]:
# indonesia
scrape_keyword_sampled_tweets("2022", "インドネシア人", "indonesiajin")
concatenate_keyword_txt_files("2022", "indonesiajin")

scrape_keyword_sampled_tweets("2015", "インドネシア人", "indonesiajin")
concatenate_keyword_txt_files("2015", "indonesiajin")

In [None]:
# gaijin [didn't do; hold]
scrape_keyword_sampled_tweets("2022", "外人", "gaijin")
concatenate_keyword_txt_files("2022", "gaijin")

# [DONE]
scrape_keyword_sampled_tweets("2015", "外人", "gaijin")
concatenate_keyword_txt_files("2015", "gaijin")