In [8]:
import magic
import subprocess
import re
import matplotlib.pyplot as plt
from ipfsapi import ipfsapi
from enum import Enum
from tempfile import NamedTemporaryFile
from textblob import TextBlob
from typing import List, Optional

api = ipfsapi.connect('127.0.0.1', 5001)

class IPFSHashType(Enum):
    DATA = 0
    DIR = 1
    ERR = 2

class WikiType(Enum):
    HTMLPAGE = 0
    CRAWLED = 1
    NONE = 2

class Source(Enum):
    DHT = 0
    WEB = 1


In [9]:
wiki_hashes = set()

with open("wikipedia_results.txt", "r") as f:
    for wiki_hash in f.read().split():
        wiki_hashes.add(wiki_hash)

In [10]:
done_hashes = {}

In [11]:
class FileHash:
    ipfs_hash: str
    level: int
    hash_type: IPFSHashType = IPFSHashType.DATA
    refs = []
    source: Source
    file_type: str = ''
    wiki: WikiType = WikiType.NONE
    language_code: Optional[str] = None
    
    def __init__(self, ipfs_hash: str, source: Source, level: int = 0):
        try:
            self.ipfs_hash = ipfs_hash
            self.level = level
            self.source = source

            if ipfs_hash in done_hashes:
                return

            done_hashes[ipfs_hash] = None
            print("{} {}".format(len(done_hashes), ipfs_hash))
            self.readline()
            done_hashes[ipfs_hash] = self
        except Exception as e:
            print(ipfs_hash)
            print(e)

    def __str__(self) -> str:
        return '{} {} {}'.format(self.ipfs_hash, self.hash_type, self.file_type)

    def readline(self) -> None:
        try:
            line = api.cat(self.ipfs_hash, length = 10000)
            self.set_file_type(line)
            self.set_wikipedia(line)
        except ipfsapi.exceptions.Error as e:
            self.hash_type = IPFSHashType.DIR
            
            # Only go one level deep on the refs
            if level <= 0:
                self.add_refs()
    
    def add_refs(self):
        res = api.ls(self.ipfs_hash)['Objects'][0]['Links']
        res = map(lambda link: link['Hash'], res)
        for h in res:
            self.refs.append(FileHash(h, self.source, self.level + 1))
        
    def set_file_type(self, line: bytes):
        if self.hash_type != IPFSHashType.DATA:
            raise Exception('cannot get file type of non-file')
        f = NamedTemporaryFile()
        f.write(line)
        f.seek(0)
        self.file_type = magic.from_file(f.name);

    def set_wikipedia(self, line: bytes):
        if b'mediawiki' in line:
            self.wiki = WikiType.HTMLPAGE
            regex = r'.*(<p id=.*</p>).*'
            match = re.match(regex, str(line))
            if len(match.groups()) > 0:
                self.language_code = TextBlob(match.groups()[0]).detect_language()
        elif self.ipfs_hash in wiki_hashes:
            self.wiki = WikiType.CRAWLED
        

In [12]:
def analyze(source: Source, hashes: List[str]) -> None:
    for h in hashes:
        FileHash(h, source = source)

In [13]:
dht_hashes = []
with open("ipfs_dht_results.txt", "r") as f:
    for line in f.read().split('\n'):
        if line == '':
            continue
        if line[0] == ' ':  # only accept root hashes
            continue
        file_hash = line.split()[-1]
        dht_hashes.append(file_hash)

In [14]:
analyze(Source.DHT, dht_hashes)

1 QmbXM2h2ZUWTgwm4QWwGfVymRqy4XRNq88L8qiuGn91n4V
QmbXM2h2ZUWTgwm4QWwGfVymRqy4XRNq88L8qiuGn91n4V
request() got an unexpected keyword argument 'length'
2 QmNb8n9aQncEZU9hULkQZycVBCEvnbNPG4cFw992KDvBYb
QmNb8n9aQncEZU9hULkQZycVBCEvnbNPG4cFw992KDvBYb
request() got an unexpected keyword argument 'length'
3 QmWjbzihPPVaBNRrboYhFBcpqLWGjyJysR9UJCLT3dCPCv
QmWjbzihPPVaBNRrboYhFBcpqLWGjyJysR9UJCLT3dCPCv
request() got an unexpected keyword argument 'length'
4 QmWA75VQn1zqxXziTeFDi7FXXqQe71nHaenT5k3YmUHjWz
QmWA75VQn1zqxXziTeFDi7FXXqQe71nHaenT5k3YmUHjWz
request() got an unexpected keyword argument 'length'
5 QmYxCGoyxr8QM5QXxEs5SP4CCqVuFRmGsypQQ6hKYFnATa
QmYxCGoyxr8QM5QXxEs5SP4CCqVuFRmGsypQQ6hKYFnATa
request() got an unexpected keyword argument 'length'
6 QmajhgrBLYtYHUPhevtmVuGrmHHkQfeBqNErSrR8M3ZACe
QmajhgrBLYtYHUPhevtmVuGrmHHkQfeBqNErSrR8M3ZACe
request() got an unexpected keyword argument 'length'
7 QmaZwLTo97ar4b36zRkRQ9sCrVmKaUKWdoDA43iWCMdi3q
QmaZwLTo97ar4b36zRkRQ9sCrVmKaUKWdoDA43iWCMdi3q
requ