In [8]:
import magic
import subprocess
import re
import myipfsapi
import matplotlib.pyplot as plt
from enum import Enum
from tempfile import NamedTemporaryFile
from textblob import TextBlob
from typing import List, Optional

api = myipfsapi.connect('127.0.0.1', 5001)

class IPFSHashType(Enum):
    DATA = 0
    DIR = 1
    ERR = 2

class WikiType(Enum):
    HTMLPAGE = 0
    CRAWLED = 1
    NONE = 2

class Source(Enum):
    DHT = 0
    WEB = 1


In [9]:
wiki_hashes = set()

with open("wikipedia_results.txt", "r") as f:
    for wiki_hash in f.read().split():
        wiki_hashes.add(wiki_hash)

In [15]:
done_hashes = {}
bad_hashes = [
    'zCT5htkeDzF9WSWfTFxUfXjUudd9WNckGmnUnBnWCo1u9wb7aiM1',
    'QmSEXfAyvLwsPpWEjzeFp3KHpadGDre2y7zxMVbY5cman1'
]
for h in bad_hashes:
    done_hashes[h] = None

In [16]:
class FileHash:
    ipfs_hash: str
    level: int
    hash_type: IPFSHashType = IPFSHashType.ERR
    refs = []
    source: Source
    file_type: str = ''
    wiki: WikiType = WikiType.NONE
    language_code: Optional[str] = None
    
    def __init__(self, ipfs_hash: str, source: Source, level: int = 0):
        try:
            self.ipfs_hash = ipfs_hash
            self.level = level
            self.source = source

            if ipfs_hash in done_hashes:
                return

            done_hashes[ipfs_hash] = self
            print("{} {}".format(len(done_hashes), ipfs_hash))
            self.readline()
        except Exception as e:
            print(ipfs_hash)
            print(e)

    def __str__(self) -> str:
        return '{} {} {}'.format(self.ipfs_hash, self.hash_type, self.file_type)

    def readline(self) -> None:
        try:
            line = api.cat(self.ipfs_hash, length = 10000)
            self.hash_type = IPFSHashType.DATA
            self.set_file_type(line)
            self.set_wikipedia(line)
        except myipfsapi.exceptions.Error as e:
            self.hash_type = IPFSHashType.DIR
            
            # Only go one level deep on the refs
            if self.level <= 0:
                self.add_refs()
    
    def add_refs(self):
        res = api.ls(self.ipfs_hash)['Objects'][0]['Links']
        res = map(lambda link: link['Hash'], res)
        for h in res:
            self.refs.append(FileHash(h, self.source, self.level + 1))
        
    def set_file_type(self, line: bytes):
        if self.hash_type != IPFSHashType.DATA:
            raise Exception('cannot get file type of non-file')
        f = NamedTemporaryFile()
        f.write(line)
        f.seek(0)
        self.file_type = magic.from_file(f.name);

    def set_wikipedia(self, line: bytes):
        if b'mediawiki' in line:
            self.wiki = WikiType.HTMLPAGE
            regex = r'.*(<p id=.*</p>).*'
            match = re.match(regex, str(line))
            if len(match.groups()) > 0:
                self.language_code = TextBlob(match.groups()[0]).detect_language()
        elif self.ipfs_hash in wiki_hashes:
            self.wiki = WikiType.CRAWLED
        

In [17]:
def analyze(source: Source, hashes: List[str]) -> None:
    for h in hashes:
        FileHash(h, source = source)

In [18]:
dht_hashes = []
with open("ipfs_dht_results.txt", "r") as f:
    for line in f.read().split('\n'):
        if line == '':
            continue
        if line[0] == ' ':  # only accept root hashes
            continue
        file_hash = line.split()[-1]
        dht_hashes.append(file_hash)

In [19]:
analyze(Source.DHT, dht_hashes)

3 QmbXM2h2ZUWTgwm4QWwGfVymRqy4XRNq88L8qiuGn91n4V
4 QmNb8n9aQncEZU9hULkQZycVBCEvnbNPG4cFw992KDvBYb
5 QmWjbzihPPVaBNRrboYhFBcpqLWGjyJysR9UJCLT3dCPCv
6 QmWA75VQn1zqxXziTeFDi7FXXqQe71nHaenT5k3YmUHjWz
7 zb2rhncL9aKqRJfX1YJeRh3ejoHrLfdtobC27MQcXBTWKjGyV
8 zb2rhcw53HcfzT4YnaKKChqGKKggL7X1Zgmd4ZJQkeQmPbxiF
9 zb2rhokijGgdSRe4bUbw2x3Ws9zXewx9RJ9QXRMjxgLuxMz83
10 zb2rhkhRLcgGwWNRu7g17xb2wqxAYTm4qaH3zwkLcK1CeeQFK
11 zb2rhgbBnpG5b4yxFAoPwxSd6FvKQBBsXeYxGTVDAVUdyTCcG
12 zb2rhbN4CWw9VLK5SJnotxexchemBnWxNqD1gYzQkzYuz37Z3
13 zb2rhjd7ZduUyDadkcVFnZVVmdUzVa7CHU4SwNqNd5JUL54sa
14 zb2rhd1wKCePUTNzuzzFcn5Etghfgtna1jaBYUDWydds66YZt
15 zb2rhaUitadBVKz52pQchiCNT3vGz2PCEe3DcdiZvz9Qb7LKD
16 zb2rhkAjTYFajPsXZaYP1HhzuhRtuqaAb8DEFzkLe2dSeFq8E
17 zb2rhbUM9tZR4bd1hjZifUgSi1DAoXHBbJ9YAkmjzA4uP1Q43
18 zb2rhn1oDyU1WmRTkEEVAVuufdapYsMQ8z1bVv3iY7LGq8ALP
19 zb2rhfr9tMUVgC87Zd9FMQRHKYAiwDvHwR4itrZNDrEiSnT5C
20 zb2rhfdvquFSeuWAfenvai2WX5iRXKNpsYVRKPdKLMAfex5kJ
21 zb2rhjAi2yvz5hdthamuxuBm36Ep5ZFcqshC7C28hQFVpnNHW
22 zb2rhiwwg

163 zb2rhgjWsXN9kYw7arfodiij9zTfjwThcajArVuuy2bwXEKs5
164 zb2rhg52D6BcUckmYrqknuAH9n2T8aKJor9Hf4rh3syLBnvQF
165 zb2rhZe3Ed6nhkr21798Evn4CMZ51Bd33884gYmd8KY1tzBGK
166 zb2rhaDpogXxELEaCHttBMNGEjcVjmP51XXD3KwvTb9nCqtau
167 zb2rhnBV7v66Dy5R3xX7hvoo69CvwiHJgBfVH4sDT6zPS3G8q
168 zb2rhmvtBefFxq5XTBgHTGUdgnWw9qLYQsji744B1G7DnQQf9
169 zb2rhZurykfB8dHaMp6rJZD3NgVv6Y5dbHEUzzRcrK1ppE8xq
170 zb2rhf662umvTzKwGkDPvsUo9PPUc6Q1CmjMZAacrGw3qN2Sd
171 zb2rhYFHnqFnvkroP7xp2zY2US8cjJJmoo7PxQvsbktjAZFEK
172 zb2rhaNfrnGgmpx5mZfFg4oeQEK6jrxznr8kbtpsquGjoBU8o
173 zb2rhWgv193BiZzLj5CVvPkdFt3njeyCdivHHnTsB6Q715nad
174 zb2rhnDoQ7a6SBTfb4KGSvZjApQy3VvJAwarVosck3D8Y3xbg
175 zb2rhkNw85m8UvwkBPb17GTAV3fDS9Msj6yF2Xq883Zuf7FQk
176 zb2rhh7xiJiyKkQFJunNvVFz3Df4vJQGh9mXCzVuZWRuRbGib
177 zb2rhnbHtoijah9qtMto9Xn8LE9FCxwGSxjSLeNnmZn7e3GL3
178 zb2rhbjcs6mWiCLvLn7CeJah2ZHCaikJDFVVpbmYvr733s81h
179 zb2rhXXVCSwSAaN2SRQ7MQ3wmXAXPfkmZBUcHz77HkapE5tkM
180 zb2rhi5mWT9PtGrn2rML47FmEY16yeg1XG31Jr9ycwdqCfkXV
181 zb2rhXCmd8joKDMNQimPmrsP

314 zb2rhkwCKTHP4Zk5xaoLbm8qettuaT5ML4RbhyKKYYwYvC4T3
315 zb2rheAesH67WNRWu9QM9ew3mjQJg5EoRYGgokYLLZ4M3YXrB
316 zb2rhicdjGfq6R6K9MZkzTrosCEcexBSuiUnxwTHPF5LXpDu4
317 zb2rhevjjQFgku9w6y4m9puTfWXki8ea2L7ctCqgtiNw65Suv
318 zb2rhdSRVMXvKnqDnsKHzLbfP8DttKahfw5QsHZ5boCaEVoA8
319 zb2rhjeKFAQyjf2AB6oHsz3xSH7EX78vcTCP2TjGSJcWwmVvB
320 zb2rhaECDvtazAHDcjgnJq7JRkrAoVHQYew3p8wD3mSYFokFq
321 zb2rhjErj6NKZh5jjbXVzeZ6Akb88ujUKVik611X3tF2BamQw
322 zb2rhkpKBquhb9yXQ4SJgYbhwVbGtywghySrmoTcAhqmNvMhM
323 zb2rhYZ1Bz1C3yAJNkPtUG3j8cjMkp8CMSPDC5ksjtHftyqrU
324 zb2rhksK15hyrtAHSa5wHYp4mMeZoR5MAbcaoss2H6YFZBg9V
325 zb2rhbyFyXu5gyhH8DT2nVU5fRPiBqBKQx3BpJ29CGmzJBrpp
326 zb2rhkTZkrs2fssdhq87HM9SZh4sEeRNdw7ybTBabHtQUMShV
327 zb2rhjn5pgQsFUrGnfBvfkWjRgau9kZTQkPzM18PJFHuC9Xc9
328 zb2rhfwK6CspvjLzngadJGiqUPWgQ1dJRjho8JFnbWXfDEnbp
329 zb2rhX8QEF25oiY8JhbuUWT4MH34uitQhZMKXJAqWRxVxFLHL
330 zb2rhnsK8K5ZXYTYSMVnT2MK8VCFbtymY4ZZmo8QwoLAQTPTV
331 zb2rhXK1Wn6BfzdA9annT1jRLrWZQBeLFjaKXxiQThSEe6Efa
332 zb2rhdPEaAC64LSrUig19pmm

zb2rhYJsA4bL5URLA8TDQJCJaTq3kqAD1YrYFu9gS3KuHw2CZ
HTTP Error 503: Service Unavailable
400 zb2rhmJpgrv6xag17nPAqQouRGspNgzP86qaSr1B3iRXNk64a
zb2rhmJpgrv6xag17nPAqQouRGspNgzP86qaSr1B3iRXNk64a
HTTP Error 503: Service Unavailable
401 zb2rhYkvSL1qjmQe3n86EqfY38ubJYYAdSf7TiEGjTzDN2BKW
zb2rhYkvSL1qjmQe3n86EqfY38ubJYYAdSf7TiEGjTzDN2BKW
HTTP Error 503: Service Unavailable
402 zb2rhaA1NdxzkVcoEY868CySKz7TMTGxkBsH5RnZCxJign8hC
zb2rhaA1NdxzkVcoEY868CySKz7TMTGxkBsH5RnZCxJign8hC
HTTP Error 503: Service Unavailable
403 zb2rhYPvRjyTEijP7h6LvgrbGm6wF9mp3V456UBzcswWJ5cqB
zb2rhYPvRjyTEijP7h6LvgrbGm6wF9mp3V456UBzcswWJ5cqB
HTTP Error 503: Service Unavailable
404 zb2rhnRusZUneK6J6cgcp7sYX1kZoskuPQgErbgFWNBkFWJA7
zb2rhnRusZUneK6J6cgcp7sYX1kZoskuPQgErbgFWNBkFWJA7
HTTP Error 503: Service Unavailable
405 zb2rhhHA5oemVR2Rs86DbT7F7T7wHosM62MXz9jQLGDEJ3Ydk
zb2rhhHA5oemVR2Rs86DbT7F7T7wHosM62MXz9jQLGDEJ3Ydk
HTTP Error 503: Service Unavailable
406 zb2rhavmLUW7MsRiU9h4CP6wcujx1sALuYGPEU96M9Baexovn
zb2rhavmLUW7MsRiU9h4

zb2rhdafi25h3TUjaqac75hZE1F8kN8EjJFfG3WYf3m6nynsX
HTTP Error 503: Service Unavailable
459 zb2rhePUaJpdo7anP8ijyt6AyzBXuNRoyJHPBY7EBkTntCd2t
zb2rhePUaJpdo7anP8ijyt6AyzBXuNRoyJHPBY7EBkTntCd2t
HTTP Error 503: Service Unavailable
460 zb2rhhAxyNgv3F5W5MuYXXKSfBk9SyS8rztWoTWb5hbwcdQqm
zb2rhhAxyNgv3F5W5MuYXXKSfBk9SyS8rztWoTWb5hbwcdQqm
HTTP Error 503: Service Unavailable
461 zb2rhavJpRMJjeqJZNbnZasZrebgapSTiVPg6a6GS7wBPyAa6
zb2rhavJpRMJjeqJZNbnZasZrebgapSTiVPg6a6GS7wBPyAa6
HTTP Error 503: Service Unavailable
462 zb2rhnhJFhyNNUgHJpiMDtvxQ15YTe4fnKUT6hY1hy8TBShiD
zb2rhnhJFhyNNUgHJpiMDtvxQ15YTe4fnKUT6hY1hy8TBShiD
HTTP Error 503: Service Unavailable
463 zb2rhiJtQbBxmwUvygWNZwaVst1pbwqPF4XobC2RLEPFF8VnA
zb2rhiJtQbBxmwUvygWNZwaVst1pbwqPF4XobC2RLEPFF8VnA
HTTP Error 503: Service Unavailable
464 zb2rheFgTeeseVD5yVTJRcgDESWvd8ukWeZXQo3AvGquhkEfZ
zb2rheFgTeeseVD5yVTJRcgDESWvd8ukWeZXQo3AvGquhkEfZ
HTTP Error 503: Service Unavailable
465 zb2rhboe4HzeFGpe28iNwb2N6xcq42YPqNWSnW1bvKFcHkGyP
zb2rhboe4HzeFGpe28iN

zb2rhhaZ1sBNbkqMEyLVV49PHJTmwL7JqRSd256Uktjp5ndZt
HTTP Error 503: Service Unavailable
518 zb2rhiPimb57C2XYmmy7ANuXXhKxypSJZXvXk5jLFvP72xz18
zb2rhiPimb57C2XYmmy7ANuXXhKxypSJZXvXk5jLFvP72xz18
HTTP Error 503: Service Unavailable
519 zb2rhmXNbSa74ZdcYKbFCL4w8YpK6thuRqzDkXa4TZru573Pw
zb2rhmXNbSa74ZdcYKbFCL4w8YpK6thuRqzDkXa4TZru573Pw
HTTP Error 503: Service Unavailable
520 zb2rhk3cCe5f57yF4qtscg97hkZJM8jiKMnf818DMNaEafMj1
zb2rhk3cCe5f57yF4qtscg97hkZJM8jiKMnf818DMNaEafMj1
HTTP Error 503: Service Unavailable
521 zb2rhmRThWuUzJfaatQkabwMQhDcSP6R93FWVEERHnh4kUeT6
zb2rhmRThWuUzJfaatQkabwMQhDcSP6R93FWVEERHnh4kUeT6
HTTP Error 503: Service Unavailable
522 zb2rhfHTJb5iEhCgDhiii1p6EHHrjVm6wyd6vxzC7seAgofqe
zb2rhfHTJb5iEhCgDhiii1p6EHHrjVm6wyd6vxzC7seAgofqe
HTTP Error 503: Service Unavailable
523 zb2rhY6QK9nHvnxJ8fJ4XYSxSk7m16yHfeLAoPuXmeopiW51T
zb2rhY6QK9nHvnxJ8fJ4XYSxSk7m16yHfeLAoPuXmeopiW51T
'NoneType' object has no attribute 'groups'
524 zb2rhak5UvvquPfJEZU3sEaYjuL59ysfqLCyY1JzTMc4GeLsM
zb2rhak5Uvvq

zb2rho7HRCviv1n8kRqNfqex3LBTQhS53gCtr464i3ZYTXr9z
HTTP Error 503: Service Unavailable
577 zb2rhgcYUU9twtMJqr2mxHicEjzsUiGb2zcX68VAzU43gRCgd
zb2rhgcYUU9twtMJqr2mxHicEjzsUiGb2zcX68VAzU43gRCgd
HTTP Error 503: Service Unavailable
578 zb2rheVfpfawkYPa2pvmy2D4p2nbhzDPusQZPoby82RBogaEU
zb2rheVfpfawkYPa2pvmy2D4p2nbhzDPusQZPoby82RBogaEU
HTTP Error 503: Service Unavailable
579 zb2rheLTfjJm2xjrkTmFRpdyMBZZ9wVCTjTvjFWShQJSJFe2K
zb2rheLTfjJm2xjrkTmFRpdyMBZZ9wVCTjTvjFWShQJSJFe2K
HTTP Error 503: Service Unavailable
580 zb2rhXXgWCSR2tQm7KThmzn8aB1ZGNpivn5TDnFwDM4nzyxw9
zb2rhXXgWCSR2tQm7KThmzn8aB1ZGNpivn5TDnFwDM4nzyxw9
HTTP Error 503: Service Unavailable
581 zb2rhc7x6Nd636EG7PjjG3XtfefwnnSVV74w9K9cGhYg5VrFR
zb2rhc7x6Nd636EG7PjjG3XtfefwnnSVV74w9K9cGhYg5VrFR
HTTP Error 503: Service Unavailable
582 zb2rhkiVgpGa2a7edzhNfYUef8Bwe4Xm11xcEV9mk2u7NCKKL
zb2rhkiVgpGa2a7edzhNfYUef8Bwe4Xm11xcEV9mk2u7NCKKL
HTTP Error 503: Service Unavailable
583 zb2rhmZpYr4oBvuG3DBMGKqrFkEe1JkgvqYC22KB3Nou1CSMz
584 QmWJSeg1cU7ewWCP

KeyboardInterrupt: 