In [42]:
import magic
import subprocess
import ipfsapi
import re
from enum import Enum
from tempfile import NamedTemporaryFile
from textblob import TextBlob
from typing import List, Optional

# Shared client for the IPFS HTTP API at 127.0.0.1:5001; FileHash uses it
# for its `cat` and `ls` calls.
api = ipfsapi.connect(host='127.0.0.1', port=5001)

class IPFSHashType(Enum):
    """How a FileHash classified its IPFS object.

    DATA (0) is the FileHash default, DIR (1) is assigned when `api.cat`
    raises an ipfsapi error, and ERR (2) is declared but not assigned
    anywhere in the code visible here.
    """
    DATA, DIR, ERR = range(3)

class WikiType(Enum):
    """Wikipedia classification that FileHash.set_wikipedia assigns.

    HTMLPAGE (0) means the fetched bytes contain b'mediawiki', CRAWLED (1)
    means the hash is listed in `wiki_hashes`, and NONE (2) is the default
    when neither applies.
    """
    HTMLPAGE, CRAWLED, NONE = range(3)


In [43]:
# Whitespace-separated hashes read from ls_results.txt; FileHash marks any
# hash found in this set as WikiType.CRAWLED.
wiki_hashes = set()
# Registry written by FileHash.__init__: ipfs hash -> FileHash instance
# (the value is None while that instance is still being constructed).
done_hashes = {}

with open("ls_results.txt", "r") as f:
    wiki_hashes.update(f.read().split())


In [61]:
class FileHash:
    """One IPFS object, fetched and classified as soon as it is constructed.

    Construction tries to `cat` the hash through the module-level `api`
    client. On success the bytes are stored in `line` and described with
    libmagic. If `cat` raises an ipfsapi error the hash is treated as a
    directory and a `FileHash` is built for each of its links. Every
    instance records itself in the module-level `done_hashes` dict.

    Attributes:
        ipfs_hash: The hash this instance describes.
        root: True when the hash was passed in directly by the caller,
            False when it was reached through a directory link.
        hash_type: DATA by default; DIR once `api.cat` has failed.
        line: Raw bytes returned by `api.cat` (empty for directories).
        refs: Child `FileHash` objects, one per directory link.
        file_type: libmagic description of `line` ('' for directories).
        wiki: Result of `set_wikipedia`.
        language_code: Language reported by TextBlob for HTMLPAGE objects,
            otherwise None.
    """
    ipfs_hash: str
    root: bool = False
    hash_type: IPFSHashType = IPFSHashType.DATA
    line: bytes = b''
    # Annotation only: the list itself is created per instance in __init__.
    refs: List['FileHash']

    file_type: str = ''
    wiki: WikiType = WikiType.NONE
    language_code: Optional[str] = None

    def __init__(self, ipfs_hash: str, root: bool = False):
        """Fetch, classify and register `ipfs_hash`.

        Args:
            ipfs_hash: Hash to look up through the `api` client.
            root: Whether this hash is a crawl entry point.
        """
        self.ipfs_hash = ipfs_hash
        self.root = root
        # Must be per-instance: a class-level `refs = []` is a single list
        # shared by every FileHash, so all children of all directories
        # would end up in the same list.
        self.refs = []

        # NOTE: a hash already present in done_hashes is fetched again and
        # its entry overwritten; there is no de-duplication.
        # The placeholder keeps the hash visible in done_hashes even if one
        # of the calls below raises.
        done_hashes[ipfs_hash] = None
        self.readline()
        self.set_wikipedia()
        done_hashes[ipfs_hash] = self

    def __str__(self) -> str:
        """Return hash, hash type, file type, wiki type and language, space-separated."""
        return '{} {} {} {} {}'.format(self.ipfs_hash, self.hash_type, self.file_type, self.wiki, self.language_code)

    def readline(self) -> None:
        """Load the object's bytes, or fall back to treating it as a directory.

        Any `ipfsapi.exceptions.Error` from `api.cat` is taken to mean the
        hash is a directory; errors raised while listing it propagate.
        """
        try:
            self.line = api.cat(self.ipfs_hash)
        except ipfsapi.exceptions.Error:
            self.hash_type = IPFSHashType.DIR
            self.add_refs()
        else:
            self.set_file_type()

    def add_refs(self) -> None:
        """Build a child FileHash for every link listed under this hash."""
        links = api.ls(self.ipfs_hash)['Objects'][0]['Links']
        for link in links:
            self.refs.append(FileHash(link['Hash'], root=False))

    def set_file_type(self) -> None:
        """Store libmagic's description of `line` in `file_type`.

        Raises:
            Exception: If this object is not of type DATA.
        """
        if self.hash_type != IPFSHashType.DATA:
            raise Exception('cannot get file type of non-file')
        # The context manager closes (and thereby deletes) the temp file
        # deterministically instead of leaving it to garbage collection.
        with NamedTemporaryFile() as tmp:
            tmp.write(self.line)
            # Make sure the bytes are on disk before libmagic opens the path.
            tmp.flush()
            self.file_type = magic.from_file(tmp.name)

    def set_wikipedia(self) -> None:
        """Classify the object as a wiki page and, for HTML pages, detect its language."""
        if b'mediawiki' in self.line:
            self.wiki = WikiType.HTMLPAGE
            # Decode instead of str(bytes): str() yields the "b'...'" repr in
            # which every non-ASCII character is a \xNN escape, which is not
            # usable text for language detection.
            text = self.line.decode('utf-8', errors='replace')
            # DOTALL lets '.' cross real newlines; the greedy leading '.*'
            # makes the group start at the last '<p id=' in the document.
            match = re.match(r'.*(<p id=.*</p>).*', text, re.DOTALL)
            # Pages without such a paragraph keep language_code = None
            # rather than failing on a None match.
            if match is not None:
                self.language_code = TextBlob(match.group(1)).detect_language()
        elif self.ipfs_hash in wiki_hashes:
            self.wiki = WikiType.CRAWLED


In [62]:
def analyze(hashes: List[str]) -> None:
    """Construct a root-level FileHash for each entry of `hashes`.

    The instances are not returned; FileHash.__init__ stores each one in
    `done_hashes`.
    """
    for root_hash in hashes:
        FileHash(ipfs_hash=root_hash, root=True)

In [63]:
# Entry-point hashes handed to analyze(); each becomes a root FileHash.
hashes = ['QmdXunjg8mhwS9j71Hc4aaUCuh3rBHmjx2RsEZXCgBghS3']

analyze(hashes)

In [64]:
# One line per registered hash, formatted by FileHash.__str__.
for h, fh in done_hashes.items():
    print(fh)

QmdXunjg8mhwS9j71Hc4aaUCuh3rBHmjx2RsEZXCgBghS3 IPFSHashType.DIR  WikiType.NONE None
zb2rhbmdu1EAoLc1D7gQsrPoouH6wT6LhhPwgarGWzWSRbifZ IPFSHashType.DATA HTML document, UTF-8 Unicode text, with very long lines WikiType.HTMLPAGE en
zb2rhYPXSZPNWEXrULeKynz8udHSJ6MvNdFh1fXZFcC2AzQiH IPFSHashType.DATA HTML document, UTF-8 Unicode text, with very long lines WikiType.HTMLPAGE tr
zb2rhYsHu5LBbrVN1ivxqKiE1GAZHbeyUapp47eXDFvkMXPMa IPFSHashType.DATA HTML document, UTF-8 Unicode text, with very long lines WikiType.HTMLPAGE tr
zb2rhgQmffmc7qiXVrMGdMLrVRcsYtd5B3seNUchMd7t3SeZJ IPFSHashType.DATA HTML document, UTF-8 Unicode text, with very long lines WikiType.HTMLPAGE tr
zb2rheTqcxpnDwpcoHP1h5PzU23n3B6tX6RMvkxHLjE3XtJsB IPFSHashType.DATA HTML document, UTF-8 Unicode text, with very long lines WikiType.HTMLPAGE en
zb2rhhxkUm98WpJf9r645fThqsBzGziNy5bpbEMFXhmx1zhPg IPFSHashType.DATA HTML document, UTF-8 Unicode text, with very long lines WikiType.HTMLPAGE en
zb2rhnVhvdq3rtDfkNidEQ9Y4aSNFbxhF2EAKtRwiUMMzk