# Load dataset

In [None]:
import json

# JSON text is UTF-8 by specification; pin the encoding so that loading the
# dataset does not depend on the platform's locale default.
with open('companies_cross.json', 'r', encoding='utf-8') as f:
    companies = json.load(f)

In [None]:
from collections import Counter

# Keep only companies for which we currently have a website on disk
# (a non-empty 'path_to', i.e. at least some pages).
companies = [c for c in companies if c.get('path_to')]

# Also limit the number of sites per company, because this data is not
# cleaned yet (a missing count is treated as 0 and the company is kept).
companies = [c for c in companies if c.get('current_site_count', 0) < 100]

# Keep only well represented industries (more than 500 companies each).
# Records without an 'industry' key are left out of the count and dropped,
# instead of aborting the whole cell with a KeyError.
industries = Counter(c['industry'] for c in companies if 'industry' in c)
companies = [
    c for c in companies
    if 'industry' in c and industries[c['industry']] > 500
]

## Some helper functions

In [None]:
import os
import gzip

import scandir
from bs4 import BeautifulSoup

from ipyparallel import require

import logging

# Set the root logger level to DEBUG.
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)

# Set up a logger named after this module that writes to
# 'extract_features.log'; the file handler itself only accepts INFO and above.
# NOTE(review): every run of this cell attaches one more FileHandler to the
# same logger - confirm whether duplicated log lines matter here.
logger = logging.getLogger(__name__)
handler = logging.FileHandler('extract_features.log')
handler.setLevel(logging.INFO)
logger.addHandler(handler)


@require(BeautifulSoup, 'gzip')
def get_soup_from_html_gz(path):
    """Parse a gzip-compressed HTML file into a BeautifulSoup tree.

    The file is handed to the parser as raw bytes rather than decoded text:
    not every page is UTF-8 encoded, and when it receives bytes BeautifulSoup
    works out the encoding itself instead of the read failing on the first
    undecodable byte.

    :param path: path to the gzip-compressed HTML file
    :return: the parsed ``BeautifulSoup`` document
    """
    with gzip.open(path, 'rb') as f:
        soup = BeautifulSoup(f, 'html.parser')
    return soup


@require('os')
def save_texts_for_domain(text, path, file_name=None):
    """Write ``text`` into a file inside the domain directory.

    :param text: content to write
    :param path: directory that receives the file
    :param file_name: name of the file; 'full_texts.txt' when empty or None
    :return: full path of the written file
    """
    if not file_name:
        file_name = 'full_texts.txt'
    target = os.path.join(path, file_name)
    with open(target, 'w') as out:
        out.write(text)
    return target


@require('scandir', 'os')
def sites_for_domain(path):
    """Yield the path of every '.gz' file found anywhere below ``path``.

    :param path: root directory to walk
    :return: generator of file paths (directory joined with file name)
    """
    for root, _subdirs, names in scandir.walk(path):
        yield from (
            os.path.join(root, name)
            for name in names
            if name.endswith('.gz')
        )


def get_nav_menu(soup):
    """Extract the text of the navigation menu entries from a parsed page.

    The menu is expected inside a header, so a header is looked up first: a
    ``<header>`` tag, then any element with class ``header`` or ``menu``, then
    an element with id ``header``. The menu is searched in the first header
    found, or in the whole body when there is none.

    :param soup: parsed page (BeautifulSoup document)
    :return: ``None`` when the page has no ``<body>``; otherwise the list of
        stripped strings of every ``<li>`` of the menu. When no menu element
        is found, the ``<li>`` items of the header are used, and the list is
        empty when there is no header either.
    """
    body = soup.body
    if not body:
        return None

    # First look for possible headers.
    headers = body.find_all('header')
    if not headers:
        # Filter on the CSS class. Passing the dict positionally would make it
        # the tag-name filter, which never matches an element by class.
        headers = body.find_all(class_=['header', 'menu'])
    if not headers:
        headers = body.find_all(id='header')

    # Search inside the first header, or anywhere in the body without one.
    has_header = bool(headers)
    container = headers[0] if has_header else body

    nav = container.find('nav')
    if not nav:
        nav = body.find('nav')
    # Fall back to common class/id names, in order of preference.
    for criteria in ({'class_': 'nav'}, {'id': 'nav'},
                     {'class_': 'menu'}, {'class_': 'main-menu'}):
        if nav:
            break
        nav = container.find(**criteria)

    if nav:
        items_root = nav
    elif has_header:
        items_root = container
    else:
        return []
    # TODO this doesn't deal with submenus
    return [item for li in items_root.find_all('li') for item in li.stripped_strings]


@require('os', save_texts_for_domain)
def get_texts_for_domain(path, get_text, force_read=False, file_name=None):
    """Return the cached text of a domain, extracting and caching it if needed.

    The text is read from ``file_name`` inside ``path`` when that file exists;
    otherwise (or when ``force_read`` is set) it is produced by ``get_text``
    and saved there.

    :param path: directory of the domain
    :param get_text: callable that takes ``path`` and returns the text
    :param force_read: when true, extract again even if the file exists
    :param file_name: name of the cache file; 'full_texts.txt' when empty
    :return: ``(text, text_file)``; ``(None, '')`` when extracting or saving
        the text failed
    """
    # Imported here so the function does not rely on a module-level name that
    # the @require decorator does not list.
    import logging

    file_name = file_name or 'full_texts.txt'
    text_file = os.path.join(path, file_name)
    if force_read or not os.path.exists(text_file):
        try:
            text = get_text(path)
            text_file = save_texts_for_domain(text, path, file_name=file_name)
        except Exception:
            # Best effort: one broken domain must not stop the whole run, but
            # the failure is reported instead of disappearing silently.
            logging.getLogger('extract_features').warning(
                'Could not extract %s for %s', file_name, path, exc_info=True)
            text = None
            text_file = ''
    else:
        with open(text_file, 'r') as f:
            text = f.read()
    return text, text_file


@require('os', save_texts_for_domain)
def combine_metas(path):
    """Concatenate the cached meta texts of a domain into 'metas.txt'.

    Reads 'descriptions.txt', 'keywords.txt' and 'titles.txt' from ``path``
    (those that exist and are not empty), joins them with a newline after each
    one and saves the result as 'metas.txt' in the same directory.

    :param path: directory of the domain
    :return: ``(full_metas, text_file)`` where ``text_file`` is the path of the
        written 'metas.txt', or ``''`` when there was nothing to write
    """
    file_names = ['descriptions.txt', 'keywords.txt', 'titles.txt']
    parts = []
    for file_name in file_names:
        source = os.path.join(path, file_name)
        if not os.path.exists(source):
            continue
        with open(source, 'r') as f:
            text = f.read()
        if text:
            parts.append(text + '\n')
    full_metas = ''.join(parts)

    # Only report a file path when 'metas.txt' was really written; previously
    # the path of the last input file leaked out in that case.
    text_file = ''
    if full_metas:
        text_file = save_texts_for_domain(full_metas, path, file_name='metas.txt')
    return full_metas, text_file


@require(get_soup_from_html_gz, sites_for_domain)
def get_full_text(path):
    """Return the body text of every page of a domain, one page after another.

    Pages without a ``<body>`` are skipped; the text of each remaining page is
    followed by a newline.

    :param path: directory of the domain
    :return: the concatenated text ('' when no page has a body)
    """
    chunks = []
    for site_path in sites_for_domain(path):
        body = get_soup_from_html_gz(site_path).body
        if not body:
            continue
        # Script contents are assumed to be noise: empty them before
        # collecting the text.
        for tag in body.find_all('script'):
            tag.clear()
        chunks.append(body.get_text())
        chunks.append('\n')
    return ''.join(chunks)


@require(get_soup_from_html_gz, sites_for_domain, get_nav_menu)
def get_nav_text(path):
    """Return the navigation menu of the first page of a domain that has one.

    :param path: directory of the domain
    :return: the menu entries joined with single spaces, or '' when no page
        yields a non-empty menu
    """
    for site_path in sites_for_domain(path):
        menu = get_nav_menu(get_soup_from_html_gz(site_path))
        if menu:
            return ' '.join(menu)
    return ''


def get_meta_keywords(soup):
    """Return the keywords of the page's ``<meta name="keywords">`` tag.

    The ``content`` attribute is split on commas and each entry is stripped.
    Empty entries (blank content, doubled or trailing commas) are dropped, so
    a tag without real keywords gives an empty list rather than ``['']``,
    which callers would otherwise take for a usable result.

    :param soup: parsed page (BeautifulSoup document)
    :return: list of keywords, or ``None`` when the page has no such tag
    """
    keywords = soup.find('meta', attrs={'name': 'keywords'})
    if keywords is None:
        return None
    stripped = (k.strip() for k in keywords.get('content', '').split(','))
    return [k for k in stripped if k]


def get_meta_description(soup):
    """Return the ``content`` of the page's ``<meta name="description">`` tag.

    :param soup: parsed page (BeautifulSoup document)
    :return: the attribute value, or ``None`` when the tag or its ``content``
        attribute is missing
    """
    tag = soup.find('meta', attrs={'name': 'description'})
    return tag.get('content') if tag else None


def get_site_title(soup):
    """Return the string of the page's ``<title>`` tag.

    :param soup: parsed page (BeautifulSoup document)
    :return: the ``string`` attribute of the first ``<title>`` tag, or
        ``None`` when the page has no such tag
    """
    tag = soup.find('title')
    return tag.string if tag else None


@require(get_soup_from_html_gz, sites_for_domain, get_meta_keywords)
def get_meta_keywords_text(path):
    """Return the meta keywords of the first page of a domain that has some.

    :param path: directory of the domain
    :return: the keywords joined with single spaces, or '' when no page
        yields a non-empty keyword list
    """
    for site_path in sites_for_domain(path):
        keywords = get_meta_keywords(get_soup_from_html_gz(site_path))
        if keywords:
            return ' '.join(keywords)
    return ''


@require(get_soup_from_html_gz, sites_for_domain, get_site_title)
def get_titles_text(path):
    """Return the titles of all pages of a domain, one per line.

    Pages without a title (or with an empty one) contribute nothing.

    :param path: directory of the domain
    :return: every title followed by a newline, concatenated
    """
    lines = []
    for site_path in sites_for_domain(path):
        title = get_site_title(get_soup_from_html_gz(site_path))
        if title:
            lines.append(title + '\n')
    return ''.join(lines)


@require(get_soup_from_html_gz, sites_for_domain, get_meta_description)
def get_meta_descriptions_text(path):
    """Return the meta descriptions of a domain's pages, one per line.

    A description equal to the previously kept one is skipped, so a
    description repeated on consecutive pages appears only once.

    :param path: directory of the domain
    :return: every kept description followed by a newline, concatenated
    """
    kept = []
    last_seen = None
    for site_path in sites_for_domain(path):
        description = get_meta_description(get_soup_from_html_gz(site_path))
        if not description or description == last_seen:
            continue
        last_seen = description
        kept.append(description + '\n')
    return ''.join(kept)

### Scoring function for company_size

In [None]:
def scoring_neighbour(estimator, data, actual):
    """Score size predictions, giving partial credit to neighbouring classes.

    Labels are compared through their rank in the module-level ``sizes``
    mapping: an exact match counts 1 and a prediction one rank away counts
    0.5. The two cases are exclusive, so the score stays within [0, 1]
    (previously an exact match was counted in both and earned 1.5).

    :param estimator: fitted estimator exposing ``predict``
    :param data: samples to predict
    :param actual: true labels, aligned with ``data``
    :return: total credit divided by the number of samples; 0.0 when
        ``actual`` is empty
    :raises KeyError: when a label is not a key of ``sizes``
    """
    if len(actual) == 0:
        return 0.0
    predicted = estimator.predict(data)
    credit = 0
    for p, a in zip(predicted, actual):
        distance = abs(sizes[p] - sizes[a])
        if distance < 1:
            credit += 1
        elif distance < 2:
            credit += 0.5
    return credit / len(actual)

## Set up parallel computing

In [None]:
from ipyparallel import Client


# Let's parallelize this: create an ipyparallel client and two views on it.
c = Client()
# View obtained by slicing the whole client, switched to non-blocking mode.
dv = c[:]
dv.block = False
dv  # NOTE(review): bare expression; it is not the cell's last line, so nothing is displayed
# Load-balanced view; the later cells submit their per-company tasks through it.
lview = c.load_balanced_view()

In [None]:
# Parallel processing of meta tags (to speed up further computing)
from tqdm import tqdm, tqdm_notebook

# Submit one task per company through the load-balanced view and keep what
# `apply` returns, so that `.get()` can be called on it below.
# The active call runs combine_metas; the commented-out calls are the other
# extraction steps (full text, descriptions, keywords, titles, nav menu),
# each writing its own file through get_texts_for_domain.
results = []
for company in tqdm(companies):
    text = ''  # NOTE(review): never read in this cell
    path = company['path_to']
    ar = lview.apply(combine_metas, path)
#     ar = lview.apply(get_texts_for_domain, path, get_full_text, file_name='full_texts.txt', force_read=True)
    results.append(ar)
#     ar = lview.apply(get_texts_for_domain, path, get_meta_descriptions_text, file_name='descriptions.txt')
#     results.append(ar)
#     ar = lview.apply(get_texts_for_domain, path, get_meta_keywords_text, file_name='keywords.txt', force_read=True)
#     results.append(ar)
#     ar = lview.apply(get_texts_for_domain, path, get_titles_text, file_name='titles.txt')
#     results.append(ar)
#     ar = lview.apply(get_texts_for_domain, path, get_nav_text, file_name='nav_menu.txt', force_read=True)
#     results.append(ar)

# Call .get() on every submitted task; the returned values are discarded.
for r in tqdm(results):
    r.get()

# Extract different features from websites

In [None]:
from tqdm import tqdm
import numpy as np
from sklearn.utils import shuffle
from ipyparallel import require


# Shuffle the companies and use (at most) the first 120000 as training data.
# NOTE(review): shuffle is called without random_state, so presumably the
# split differs between runs. The test cell takes companies[-10000:], which
# overlaps this slice when fewer than 130000 companies are left - confirm the
# dataset size.
companies = shuffle(companies)
train_data = companies[:120000]
# Labels, filled by the collection loop of this cell.
target = []

# Feature rows, filled by the same loop.
features = []

@require(sites_for_domain, get_soup_from_html_gz)
def get_features(path):
    """Count structural HTML elements over all pages of a domain.

    :param path: directory of the domain
    :return: ``[sites, image_count, avg_image_count, form_count, link_count,
        meta_count]`` where ``sites`` is the number of pages read, the counts
        are totals over those pages of ``<img>``, ``<form>``, ``<link>`` and
        ``<meta>`` tags, and ``avg_image_count`` is images per page (0 when
        there are no pages)
    """
    # NOTE(review): an <a> count used to be accumulated here but was never
    # part of the returned row; the dead search was removed. Add it to the
    # return value (and retrain) if it was meant to be a feature.
    sites = 0
    image_count = 0
    form_count = 0
    link_count = 0
    meta_count = 0
    for site_path in sites_for_domain(path):
        sites += 1
        soup = get_soup_from_html_gz(site_path)
        image_count += len(soup.find_all('img'))
        form_count += len(soup.find_all('form'))
        link_count += len(soup.find_all('link'))
        meta_count += len(soup.find_all('meta'))
    avg_image_count = image_count / sites if sites else 0
    return [sites, image_count, avg_image_count, form_count, link_count, meta_count]


# Submit one feature-extraction task per training company.
# (Experiment kept from before: only submit companies with
# company['founded'] > 1800.)
results = [
    lview.apply_async(get_features, company['path_to'])
    for company in tqdm(train_data)
]

# Collect the results in submission order. The feature row and its label are
# both computed before either list is touched: if the label lookup fails
# after the row was already appended, `features` and `target` end up with
# different lengths and every later label is shifted against its row.
for company, async_result in zip(tqdm(train_data), results):
    try:
        row = async_result.get()
        age = 'young' if company['founded'] > 2007 else 'old'
    except Exception as e:
        print(e)
    else:
        features.append(row)
        # Regression variant: target.append(company['founded'])
        target.append(age)

# Accuracy 25% for company_size
# 26% for balanced classifier


### Test with dummy estimators for baseline

In [None]:
# Dummy
from sklearn.dummy import DummyClassifier
from sklearn.metrics import recall_score, accuracy_score, f1_score, precision_score

# Baseline that always predicts 'young', fitted on the training rows/labels.
dummy = DummyClassifier(strategy='constant', constant='young')
dummy.fit(features, target)

# NOTE(review): features_test is not assigned in any visible cell (its
# construction is commented out in the test cell) - presumably left over from
# an earlier run of the notebook; confirm before relying on these numbers.
# scoring_neighbour(dummy, features_test, target_test)
predicted = dummy.predict(features_test)
# Recall, precision and F1 with 'young' as the positive class.
print(recall_score(target_test, predicted, pos_label='young'))
print(precision_score(target_test, predicted, pos_label='young'))
print(f1_score(target_test, predicted, pos_label='young'))

In [None]:
# Dummy #2
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error

# Baseline regressor with default settings, scored by mean absolute error on
# the test data.
# NOTE(review): the visible cells build 'young'/'old' labels, while a mean
# absolute error needs numbers - presumably this cell was run with the
# commented-out `founded` targets; confirm.
dummy = DummyRegressor()
dummy.fit(features, target)
mean_absolute_error(target_test, dummy.predict(features_test))

### A real estimator this time

In [None]:
# Try SGD Classifier
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.preprocessing import StandardScaler
# NOTE(review): the scaler is created here, but every fit/transform call in
# this cell is commented out, so the classifier is trained on the raw counts.
scaler = StandardScaler()

# Regression variant kept for reference:
# clf = SGDRegressor(loss='huber', penalty='l2', alpha=0.001,
#                                   epsilon=0.25)

# Hinge-loss linear classifier trained with SGD.
# NOTE(review): per the scikit-learn documentation `epsilon` only applies to
# the huber and epsilon-insensitive losses - presumably it has no effect with
# loss='hinge'; confirm.
clf = SGDClassifier(loss='hinge', n_jobs=6, alpha=0.1, epsilon=1)

# Alternative fits kept for reference (scaled features, size targets):
# scaler.fit(features)
# clf.fit(scaler.transform(features), [sizes_cont[c] for c in target])
# clf.fit(scaler.transform(features), target)
clf.fit(features, target)  # 27% - 62%
# clf.fit(features, [sizes_small[c] for c in target])
# clf.fit(features, [sizes_small[c] for c in target])

## Time to test

In [None]:
texts = []
# Test on the last 10000 companies of the shuffled list
test_companies = companies[-10000:]
results = []

# NOTE(review): building features_test is commented out below, so this cell
# only rebuilds the labels; the later cells read features_test from an
# earlier run of the notebook - confirm.
# features_test = []
target_test = []

# for i, company in enumerate(tqdm(test_companies)):
#     path = company['path_to']
#     ar = lview.apply_async(get_features, path)
#     results.append(ar)

# Label each test company 'young' (founded after 2007) or 'old'; a company
# whose label cannot be computed is printed and skipped.
for i, company in enumerate(tqdm(test_companies)):
    try:
#         features_test.append(results[i].get())
        age = 'young' if company['founded'] > 2007 else 'old'
        target_test.append(age)
#         target_test.append(company['company_size_clean'])
#         target_test.append(company['founded'])
    except Exception as e:
        print(e)

# Scoring variants kept for reference:
# score = clf.score(scaler.transform(features_test), [sizes_cont[c] for c in target_test])
# score = clf.score(features_test, target_test)
# score = rf.score(features_test, target_test)
# score = clf.score(features_test, [sizes_small[c] for c in target_test])

# Mean absolute error for founded - 21.38 years
# F1 for young vs old - 32% (72% recall, 43% accuracy)
# Mean absolute error for company size - 657.78

In [None]:
from sklearn.metrics import classification_report

# True labels and predictions for the test companies.
# Variants kept for reference:
# expected = [sizes_small[c] for c in target_test]
# predicted = ['Not' if p > 0 else 'Food & Beverages' for p in predicted_prob]
expected = target_test
predicted = clf.predict(features_test)

report = classification_report(expected, predicted)
print("Classification report for classifier:\n%s\n" % report)


In [None]:
from sklearn.metrics import recall_score, accuracy_score, f1_score, precision_score

# Label the test rows with a custom cut-off on the decision function instead
# of clf.predict: scores below -1.25 become 'old', everything else 'young'.
# (The former clf.predict call was dropped: its result was overwritten by the
# next statement, so it only cost one more pass over the test set.)
# Scaled variant kept for reference:
# predicted = clf.predict(scaler.transform(features_test))
predicted = ['old' if a < -1.25 else 'young' for a in clf.decision_function(features_test)]

# Recall and F1 use 'young' as the positive class.
print(recall_score(target_test, predicted, pos_label='young'))
print(accuracy_score(target_test, predicted))
print(f1_score(target_test, predicted, pos_label='young'))

# sum([1 if a == 'old' else 0 for a in predicted])


In [None]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error between the sizes_cont value of each test label and the
# prediction made on the scaled test features.
# NOTE(review): this needs target_test to hold size labels and clf to be a
# regressor fitted on scaled features, while the visible cells build
# 'young'/'old' labels - presumably from another experiment; confirm.
mean_absolute_error([sizes_cont[c] for c in target_test], clf.predict(scaler.transform(features_test)))

In [None]:
# Neighbour score of the classifier, computed separately for every size label.
# Mean-absolute-error variant kept for reference:
# print(mean_absolute_error([sizes_cont[c] for c in current_target], clf.predict(scaler.transform(current_features))))
for size in sizes:
    # Test samples whose true label is this size.
    pairs = [(t, f) for t, f in zip(target_test, features_test) if t == size]
    current_target = [t for t, _ in pairs]
    current_features = [f for _, f in pairs]
    print(size)
    if not current_target:
        print('no samples')
        continue
    print(scoring_neighbour(clf, current_features, current_target))

In [None]:
# Score the regressor the way a classifier would be scored: turn every
# continuous prediction into a size label, then count the predictions that
# are at most one size rank away from the true label.
predicted_cont = grid.predict(scaler.transform(features_test))

# The first bucket whose upper bound is not exceeded gives the label; values
# above every bound fall into '10,001+'.
predicted = [
    next((label for upper, label in inv_sizes if pc <= upper), '10,001+')
    for pc in predicted_cont
]

correct = sum(
    1
    for actual, predict in zip(target_test, predicted)
    if abs(sizes.get(predict) - sizes.get(actual)) < 2
)

correct/len(target_test)

In [None]:
# Rank (1-9) of each size label - presumably the company_size labels of the
# dataset; verify against the data. The scoring code compares labels through
# these ranks.
sizes = {
    '1': 1,
    '1-10': 2,
    '11-50': 3,
    '51-200': 4,
    '201-500': 5,
    '501-1000': 6,
    '1001-5000': 7,
    '5001-10,000': 8,
    '10,001+': 9,
}

# Coarse three-way grouping of the same labels.
sizes_small = {
    '1': 'small',
    '1-10': 'small',
    '11-50': 'small',
    '51-200': 'medium',
    '201-500': 'medium',
    '501-1000': 'big',
    '1001-5000': 'big',
    '5001-10,000': 'big',
    '10,001+': 'big',
}

# One number per label, used as the regression target.
# NOTE(review): the values look like rough midpoints of each range, with
# 10000 for the open-ended top class - confirm.
sizes_cont = {
    '1': 1,
    '1-10': 5,
    '11-50': 30,
    '51-200': 125,
    '201-500': 350,
    '501-1000': 750,
    '1001-5000': 3000,
    '5001-10,000': 7500,
    '10,001+': 10000,
}

# Reverse lookup: number -> label.
inv_sizes_cont = {v: k for k, v in sizes_cont.items()}

# (upper bound, label) pairs in ascending order. The regressor-scoring cell
# walks them to turn a continuous prediction into a label and falls back to
# '10,001+' above the last bound.
inv_sizes = [
    (1, '1'),
    (10, '1-10'),
    (50, '11-50'),
    (200, '51-200'),
    (500, '201-500'),
    (1000, '501-1000'),
    (5000, '1001-5000'),
    (10000, '5001-10,000')
            ]