In [1]:
import scrapy
import requests
import bz2
import json

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from pathlib import Path

from greenspectors.util.io import download_file, parse_size_to_mb
from greenspectors.env import COMPANY_NAMES, DATA_PATH
from greenspectors.scraping.twitter_archive import TwitterArchiveSpider, TwitterSingleArchiveSpider

In [2]:
# Start from the project's Scrapy settings, then raise both the global and
# the per-domain concurrency limit to the same value.
settings = get_project_settings()
for option_name in ('CONCURRENT_REQUESTS', 'CONCURRENT_REQUESTS_PER_DOMAIN'):
    settings[option_name] = 100

In [3]:
# One crawler process for the whole notebook, configured with the settings
# object built in the previous cell.
process = CrawlerProcess(settings=settings)

2021-10-16 02:50:33 [scrapy.utils.log] INFO: Scrapy 2.5.1 started (bot: scrapybot)
2021-10-16 02:50:33 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19042-SP0
2021-10-16 02:50:33 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor


In [3]:
# Module-level accumulator: the spider's download-page callback appends the
# URL of every size-filtered archive file it finds to this list.
archive_urls = list()

In [5]:
# Module-level accumulator: the spider's final callback stores each response
# it receives here so later cells can inspect them.
responses = list()

In [6]:
class TwitterArchiveSpider(scrapy.Spider):
    """Walk archive.org "twitterstream" result pages down to per-file listings.

    Flow, one callback per hop:

    1. ``start_requests`` requests two result pages of the ``twitterstream``
       listing.
    2. ``parse`` turns every matched item link into a
       ``https://archive.org/download/<archive_name>`` request.
    3. ``parse_download_page`` filters the directory listing by size, records
       the matching file URLs in the module-level ``archive_urls`` list and
       follows the first ``max_files_per_archive`` of them.
    4. ``parse_archive_listing`` stores the response in the module-level
       ``responses`` list.

    NOTE(review): this class shadows the ``TwitterArchiveSpider`` imported from
    ``greenspectors.scraping.twitter_archive`` in the import cell.
    """

    # Scrapy identifies spiders by name; ``scrapy.Spider.__init__`` raises
    # ``ValueError`` when neither a class-level nor a passed-in name exists.
    name = "twitter_archive"

    def __init__(self, *args, max_files_per_archive=1, **kwargs):
        """Initialise the spider.

        Args:
            *args, **kwargs: forwarded to ``scrapy.Spider.__init__`` so that
                spider arguments given to ``process.crawl(...)`` are accepted.
            max_files_per_archive: how many of the size-filtered files of each
                archive are followed. Defaults to 1 (only the first file is
                followed); ``None`` follows all of them.
        """
        # Let the base class set up ``name``, ``start_urls`` and any keyword
        # spider arguments instead of silently skipping that initialisation.
        super().__init__(*args, **kwargs)
        self._collected_tweets = []
        # Spider arguments may arrive as strings, so normalise to int.
        self.max_files_per_archive = (
            None if max_files_per_archive is None else int(max_files_per_archive)
        )

    def start_requests(self):
        """Yield the initial requests for the two collection result pages."""
        urls = [
            "https://archive.org/details/twitterstream?&sort=-week&page=1",
            "https://archive.org/details/twitterstream?&sort=-week&page=2"
            # "https://archive.org/download/archiveteam-twitter-stream-2019-02"
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Request the download page of every archive linked on a result page."""
        archive_links = response.css("div.results div.item-ia div.C234 a::attr(href)").getall()
        # The archive name is taken as the stem of the link's last path component.
        archive_names = [Path(archive_link).stem for archive_link in archive_links]

        for archive_name in archive_names:
            yield scrapy.Request(f"https://archive.org/download/{archive_name}", callback=self.parse_download_page)

    def parse_download_page(self, response):
        """Record size-filtered file URLs and follow a limited number of them.

        Every URL that passes the size filter is appended to the module-level
        ``archive_urls`` list; only the first ``max_files_per_archive`` of
        them are requested.
        """
        sizes = response.css('table.directory-listing-table tr td:last-child::text').getall()
        # presumably megabytes, going by the helper's name -- confirm in greenspectors.util.io
        sizes = [parse_size_to_mb(size) for size in sizes]

        file_links = response.css('table.directory-listing-table tr td:first-child a:first-child::attr(href)').getall()[1:]  # Ignore first link, as it just points back to parent directory
        file_urls = [f"{response.url}/{file}/" for file, size in zip(file_links, sizes) if 100 < size < 5000]

        archive_urls.extend(file_urls)
        # Slicing with ``None`` keeps every URL; the default of 1 follows only the first.
        for file_url in file_urls[:self.max_files_per_archive]:
            yield scrapy.Request(file_url, callback=self.parse_archive_listing)

    def parse_archive_listing(self, response):
        """Store the response in the module-level ``responses`` list."""
        responses.append(response)

In [4]:
# Schedule the single-archive spider for one specific archive ...
process.crawl(
    TwitterSingleArchiveSpider,
    archive_name='archiveteam-twitter-stream-2018-06',
)
# ... and run it; this call blocks until the crawling is finished.
process.start()

2021-10-16 02:50:33 [scrapy.crawler] INFO: Overridden settings:
{'CONCURRENT_REQUESTS': 100, 'CONCURRENT_REQUESTS_PER_DOMAIN': 100}
2021-10-16 02:50:33 [scrapy.extensions.telnet] INFO: Telnet Password: 9716c7e6df429fb1
2021-10-16 02:50:33 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2021-10-16 02:50:33 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.Redire

Found samples: 1
Found samples: 2
Found samples: 3
Found samples: 4
Found samples: 5
Found samples: 6
Found samples: 7
Found samples: 8
Found samples: 9
Found samples: 10
Found samples: 11
Found samples: 12
Found samples: 13
Found samples: 14
Found samples: 15
Found samples: 16
Found samples: 17
Found samples: 18
Found samples: 19
Found samples: 20
Found samples: 21
Found samples: 22
Found samples: 23
Found samples: 24
Found samples: 25
Found samples: 26
Found samples: 27
Found samples: 28
Found samples: 29
Found samples: 30
Found samples: 31
Found samples: 32


In [17]:
# Gather the href of every link inside each response's ``table.archext`` and
# prepend "https:" to it (presumably the hrefs are protocol-relative -- confirm).
json_urls = []
for response in responses:
    hrefs = response.css('table.archext tr td a::attr(href)').getall()
    json_urls += ["https:" + href for href in hrefs]

In [42]:
# NOTE(review): TMP_DATA_FOLDER is not defined in any visible cell -- confirm
# where it is set, otherwise this raises NameError on a fresh kernel.
# parents=True creates missing intermediate directories too, so a missing
# parent no longer raises FileNotFoundError; exist_ok=True keeps re-runs safe.
Path(TMP_DATA_FOLDER).mkdir(parents=True, exist_ok=True)

In [48]:
# Download the first collected URL into the temporary data folder.
url = json_urls[0]
# Drop the first three path parts of the URL and join the remaining ones
# with underscores to form a flat local file name.
url_tail_parts = Path(url).parts[3:]
file_name = f"{TMP_DATA_FOLDER}/{'_'.join(url_tail_parts)}"
download_file(url, file_name)

2021-10-15 23:27:39 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): archive.org:443
2021-10-15 23:27:39 [urllib3.connectionpool] DEBUG: https://archive.org:443 "GET /download/archiveteam-twitter-stream-2018-06/twitter-2018-06-06.tar/2018%2F06%2F06%2F10%2F44.json.bz2 HTTP/1.1" 302 None
2021-10-15 23:27:39 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): ia800704.us.archive.org:443
2021-10-15 23:27:40 [urllib3.connectionpool] DEBUG: https://ia800704.us.archive.org:443 "GET /view_archive.php?archive=/29/items/archiveteam-twitter-stream-2018-06/twitter-2018-06-06.tar&file=2018%2F06%2F06%2F10%2F44.json.bz2 HTTP/1.1" 200 None


In [70]:
# Only the first line is needed, so read just that line instead of
# decompressing the whole file into a list with readlines().
# Note: for an empty file this yields b"" rather than raising IndexError.
with bz2.open(file_name, 'rb') as f:
    asdf = f.readline()

In [76]:
# Parse the line held in `asdf` as JSON and display its 'text' field.
json.loads(asdf)['text']

'@LuuStessens  https://t.co/JlFh74cnME'

In [20]:
# Tally how many of the collected archive URLs end in .tar and in .zip.
suffixes = [Path(url).suffix for url in archive_urls]
tar_count = suffixes.count('.tar')
zip_count = suffixes.count('.zip')
print(f".tar files: {tar_count}")
print(f".zip files: {zip_count}")

.tar files: 854
.zip files: 478


In [70]:
# Text of the last cell of every directory-listing row in the first stored
# response, converted with the project's size parser.
size_texts = responses[0].css('table.directory-listing-table tr td:last-child::text').getall()
sizes = list(map(parse_size_to_mb, size_texts))

In [61]:
# First link of the first cell of every directory-listing row in the first
# stored response.
listing_hrefs = responses[0].css('table.directory-listing-table tr td:first-child a:first-child::attr(href)').getall()
# Ignore first link, as it just points back to parent directory
file_links = listing_hrefs[1:]

In [73]:
# Display the (file, size) pairs whose size lies strictly between 100 and 5000.
list(filter(lambda pair: 100 < pair[1] < 5000, zip(file_links, sizes)))

[('twitter_stream_2019_02_01.tar', 607.4),
 ('twitter_stream_2019_02_02.tar', 1800.0),
 ('twitter_stream_2019_02_03.tar', 1800.0),
 ('twitter_stream_2019_02_04.tar', 1600.0),
 ('twitter_stream_2019_02_05.tar', 1800.0),
 ('twitter_stream_2019_02_06.tar', 1800.0),
 ('twitter_stream_2019_02_07.tar', 1800.0),
 ('twitter_stream_2019_02_08.tar', 1800.0),
 ('twitter_stream_2019_02_09.tar', 1800.0),
 ('twitter_stream_2019_02_10.tar', 2000.0),
 ('twitter_stream_2019_02_11.tar', 1800.0),
 ('twitter_stream_2019_02_12.tar', 1800.0),
 ('twitter_stream_2019_02_13.tar', 1800.0),
 ('twitter_stream_2019_02_14.tar', 1800.0),
 ('twitter_stream_2019_02_15.tar', 1700.0),
 ('twitter_stream_2019_02_16.tar', 1700.0),
 ('twitter_stream_2019_02_17.tar', 1800.0),
 ('twitter_stream_2019_02_18.tar', 1800.0),
 ('twitter_stream_2019_02_19.tar', 1800.0),
 ('twitter_stream_2019_02_20.tar', 1800.0),
 ('twitter_stream_2019_02_21.tar', 1800.0),
 ('twitter_stream_2019_02_22.tar', 1800.0),
 ('twitter_stream_2019_02_23.tar'

In [11]:
# Build a standalone Scrapy request for `url`.
# NOTE(review): `parse` is not defined at module level in any visible cell
# (only as a spider method) -- this looks like leftover scratch code and would
# raise NameError on a fresh kernel; confirm the intended callback.
request = scrapy.Request(url=url, callback=parse)

In [18]:
# NOTE(review): looks like a leftover, incomplete attribute lookup; the output
# shown beneath this cell is the `scrapy.http.request` module, which does not
# correspond to this expression -- confirm whether the cell can be dropped.
scrapy.cr

<module 'scrapy.http.request' from 'p:\\programming\\miniconda\\envs\\greenspectors\\lib\\site-packages\\scrapy\\http\\request\\__init__.py'>