<a href="https://colab.research.google.com/github/tbahng/IST718-FinalProject/blob/master/Download_Open_Resource_Covid_19_Twitter_Chatter_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download Open Resource Covid-19 Twitter Chatter Dataset
## Author: Thomas Bahng
## Date: August 8, 2020

This notebook downloads the original data from the source repository and writes the tweet IDs of English tweets created in 2020 to a text file.
* The data is the cleaned version with no retweets; the original data only includes tweet IDs, which need to be hydrated
* 'full_dataset_clean.tsv.gz' file (142,360,288 unique tweet IDs)
* https://zenodo.org/record/3977558#.XzGTbUF7mUk

In [1]:
# mount google drive
# Makes Google Drive available under /content/drive. The later cells read from
# and write to 'drive/Shared drives/...', so this cell has to run first.
# NOTE(review): those later paths are relative, so they presumably rely on the
# kernel's working directory being /content -- confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# set working directory + view files in directory
import os
# Team shared-drive folder; the download target and the output file used by
# the later cells are both built from `wd`.
# NOTE(review): the path is relative, so it presumably relies on the kernel's
# working directory being /content (where Drive is mounted) -- confirm.
wd = 'drive/Shared drives/IST718-Summer2020-Team'
# Bare last expression, so the notebook displays the folder contents.
os.listdir(path=wd)

['Sample Colab Notebook.ipynb',
 '551982_1359228_bundle_archive.zip',
 'Kaggle',
 'Project Check in 1 Rubric.pdf',
 'Project Check In 1 ROUGH DRAFT.docx',
 'Group1-Section2-Week5ProjectCheckin-2020.docx',
 'IST718-Team-Contact-Info.gdoc',
 'Project Check In 2 ROUGH DRAFT.docx',
 'Copy of Final Project Workbook .ipynb',
 'Twitter']

In [3]:
import requests
# Direct link to the gzipped TSV in the Zenodo record.
url = "https://zenodo.org/record/3977558/files/full_dataset_clean.tsv.gz"
# Destination: the team's Twitter folder, keeping the remote file's own name
# (the last path segment of the URL).
remote_name = url.rsplit("/", 1)[-1]
filename = "{}/Twitter/{}".format(wd, remote_name)

In [4]:
%%time
# download gz file from zenodo repository
# The response is streamed to disk in 1 MiB pieces rather than read through
# `r.content`, which would hold the whole archive in memory at once.
print("File being saved to: {:s}".format(filename))
with requests.get(url, stream=True, timeout=60) as r:
  # Fail on an HTTP error *before* the destination is opened, so a failed
  # request neither truncates an existing download nor saves an error page
  # under the dataset's filename.
  r.raise_for_status()
  with open(filename, "wb") as f:
    for block in r.iter_content(chunk_size=1024 * 1024):
      f.write(block)

File being saved to: drive/Shared drives/IST718-Summer2020-Team/Twitter/full_dataset_clean.tsv.gz
CPU times: user 4.12 s, sys: 4.12 s, total: 8.24 s
Wall time: 32.4 s


In [5]:
%%time
import pandas as pd
from tqdm import tqdm

# Read the gzipped TSV lazily, 1,000,000 rows per chunk. Only the reader is
# created here; rows are parsed when `chunks` is iterated.
# With 142,360,288 tweet ids in the file there should be 143 iterations.
chunk_size = 10 ** 6
chunks = pd.read_csv(
    filename,
    sep='\t',
    compression='gzip',
    chunksize=chunk_size,
    iterator=True,
)

CPU times: user 9.5 ms, sys: 144 ms, total: 154 ms
Wall time: 214 ms


In [6]:
%%time
# collect all english tweets in 2020
import json
import bz2
import gc

# Tweet ids of every English-language tweet dated 2020, accumulated chunk by
# chunk so the full table is never held in memory.
idx_list = []

# Open a fresh reader here instead of consuming the `chunks` iterator built in
# an earlier cell: a chunked reader is single-pass, so re-running this cell on
# an exhausted iterator would silently leave `idx_list` empty. Only the three
# columns the filter needs are parsed.
reader = pd.read_csv(filename, compression='gzip', sep='\t',
                     chunksize=chunk_size,
                     usecols=['tweet_id', 'date', 'lang'])

for chunk in reader:
  # Vectorised year extraction replaces the per-row Python loop; a missing
  # date gives NaN for the year and therefore never matches 2020.
  year = pd.to_datetime(chunk['date']).dt.year
  # filter for english language tweets created in 2020
  keep = (chunk['lang'] == 'en') & (year == 2020)
  idx_list.extend(chunk.loc[keep, 'tweet_id'])
  # Release the chunk-sized temporaries before the next chunk is parsed.
  del chunk, year, keep
  gc.collect()

CPU times: user 8min 52s, sys: 12.1 s, total: 9min 4s
Wall time: 9min 5s


In [8]:
# Report how many tweet ids passed the language/year filter.
n_english_2020 = len(idx_list)
print("There are a total of {:d} english tweets created in 2020".format(n_english_2020))

There are a total of 81083863 english tweets created in 2020


In [13]:
%%time
# write to file
fname = wd + '/Twitter/tweet_ids/english2020.txt'
# Create the output folder on first use; open() would otherwise raise
# FileNotFoundError when 'tweet_ids' does not exist yet.
os.makedirs(os.path.dirname(fname), exist_ok=True)
with open(fname, 'w') as f:
    # One id per line, fed lazily so no second full-size list is built.
    f.writelines("%s\n" % item for item in idx_list)

CPU times: user 31.1 s, sys: 1.52 s, total: 32.6 s
Wall time: 40.3 s


In [15]:
# Confirm where the id list was written.
saved_msg = "Tweet IDs saved to:\n {:s}"
print(saved_msg.format(fname))

Tweet IDs saved to:
 drive/Shared drives/IST718-Summer2020-Team/Twitter/tweet_ids/english2020.txt
