## Note
Parts of this notebook expect to be run from within the email collector Django project (so that the correct models and data are accessible).

# Import dataset

In [None]:
import json

# Read the company dataset from the JSON file in the working directory.
with open('companies.json', mode='r') as dataset_file:
    companies = json.load(dataset_file)

# Cross Match with Collector Data

## First, from the database

In [None]:
from collector.models import Domain

# Total number of Domain rows in the collector database (shown as cell output).
Domain.objects.count()

In [None]:
from common.domains.domain_resolver import extract_domain_from_url

print('All companies: %s' % len(companies))

total = len(companies)
cross_match = []

# Deliberately simple: one database query per company. Slow, but nothing
# faster is needed at the moment.
for position, company in enumerate(companies):
    # NOTE(review): the URL is passed through extract_domain_from_url twice
    # (once here, once in the filter below). Kept as-is; confirm whether the
    # second pass is intentional.
    website_domain = extract_domain_from_url(company['website'])
    matches = Domain.objects.filter(name=extract_domain_from_url(website_domain))
    if matches:
        # Remember the id of the first matching Domain on the company record.
        company['domain'] = matches[0].id
        cross_match.append(company)
    # Progress report every 10000 companies (including the very first one).
    if index_is_checkpoint := (position % 10000 == 0):
        print("%s%%" % (position / total * 100))

print('Cross match: %s' % len(cross_match))

## Save the cross match to file

In [None]:
import json

# Persist the cross-matched companies so later cells can reload them.
with open('companies_cross.json', mode='w') as output_file:
    json.dump(cross_match, output_file)

## Open cross match from file

In [None]:
import json

# Reload the previously saved cross-match list from disk.
with open('companies_cross.json', mode='r') as input_file:
    cross_match = json.load(input_file)

## Check crawled HTMLs

In [None]:
import os

from tqdm import tqdm

from crawler.file_storage import hostname_to_path
from common.domains.domain_resolver import extract_domain_from_url

count = 0

# The check below is very slow (might take ~3h), so only companies without a
# recorded path are examined. Iterate over `cross_match` itself instead of
# `pending` to re-check every company.
pending = [entry for entry in cross_match if not entry.get('path_to')]

for company in tqdm(pending):
    hostname = extract_domain_from_url(company['website'])
    # Look for a `.tar.gz` file at the hostname's storage path.
    archive = hostname_to_path(hostname, root='/srv/data/email_collector/crawler') + '.tar.gz'
    found = os.path.exists(archive)
    company['path_to'] = archive if found else None
    if found:
        count += 1

In [None]:
import os
from scandir import walk

from tqdm import tqdm

from crawler.file_storage import hostname_to_path
# Imported in this cell too, so it does not depend on an earlier cell having
# been run in the same kernel.
from common.domains.domain_resolver import extract_domain_from_url

count = 0
root = '/srv/data/email_collector/crawler/'

# Memoised directory listings: parent directory -> set of entry names, or
# None when the directory could not be listed. Each parent directory is read
# from the filesystem at most once, instead of issuing one existence check
# per company (~400k calls).
file_path_cache = {}


def _exists_cached(path):
    """Return whether `path` exists, reusing cached parent-directory listings.

    The result matches `os.path.exists(path)` except that a dangling symlink
    is reported as existing, because only the directory entry is inspected.
    If the parent directory cannot be listed, the check falls back to
    `os.path.exists`.
    """
    parent, name = os.path.split(os.path.normpath(path))
    if not name:
        # Nothing to look up in a listing (e.g. the filesystem root).
        return os.path.exists(path)
    if parent not in file_path_cache:
        try:
            file_path_cache[parent] = set(os.listdir(parent or os.curdir))
        except OSError:
            # Missing or unreadable parent: remember that listing failed.
            file_path_cache[parent] = None
    names = file_path_cache[parent]
    if names is None:
        return os.path.exists(path)
    return name in names


for company in tqdm(cross_match):
    domain = extract_domain_from_url(company['website'])
    path_to = hostname_to_path(domain, root=root)
    if _exists_cached(path_to):
        count += 1
        company['path_to'] = path_to
    else:
        company['path_to'] = None

In [None]:
# Number of cross-matched companies currently in memory (shown as cell output).
len(cross_match)

## Copy to local machine

In [None]:
from shutil import copytree

from tqdm import tqdm


collector_root = '/srv/data/email_collector/crawler/'
local_root = '/mnt/hugedrive/masters-data/'

# Only companies that have a stored path are copied.
companies_with_path = [entry for entry in cross_match if entry['path_to']]

for company in tqdm(companies_with_path):
    source_dir = company['path_to']
    # Mirror the collector layout under the local root.
    target_dir = source_dir.replace(collector_root, local_root)

    try:
        copytree(source_dir, target_dir)
    except Exception as error:
        # Best effort: report the failure and leave `path_to` unchanged.
        print(str(error))
    else:
        company['path_to'] = target_dir

    

## Getting additional data from collector

In [None]:
from tqdm import tqdm

from collector.models import Site


# Attach the number of Site rows linked to each company's matched domain.
# Only companies with a `path_to` value are processed.
companies_with_path = [entry for entry in cross_match if entry['path_to']]

for company in tqdm(companies_with_path):
    company['site_count'] = Site.objects.filter(domain=company['domain']).count()

## Count how many sites we have

In [None]:
import os

from scandir import walk
from tqdm import tqdm


# Count the `.gz` files stored under each company's path; companies without a
# path get a count of 0.
for company in tqdm(cross_match):
    crawl_dir = company['path_to']
    gz_total = 0
    if crawl_dir:
        gz_total = sum(
            1
            for _, _, names in walk(crawl_dir)
            for name in names
            if name.endswith('.gz')
        )
    company['current_site_count'] = gz_total