# Features
There are many features that we want to extract from the websites:

* Size (total, text, css, js) - we have to ignore css and js for now, since we don't have them
* Number of images
* Number of forms
* Number of mailto links
* Bag of words (the whole website)
* Only specific words (based on tags)
* Number of items in navigation menu
* Navigation menu
* Meta description
* Meta keywords

Optionally, we might want to extend this to:

* Topics (that we extract)
* Genres of the sites (extracted as well)
* Check for iframes, Bootstrap, React, etc.
* Check for Google Analytics
* Check for Facebook/Twitter/Google+ meta elements
* How much are HTML5 and CSS3 elements used?

# Load dataset

In [1]:
import json

with open('companies_cross.json', 'r') as f:
    companies = json.load(f)

In [2]:
from collections import Counter

# Let's use only companies that we currently have website of (at least some pages)
companies = [c for c in companies if c.get('path_to')]

# Let's also limit the number of sites (because this is not cleaned yet)
companies = [c for c in companies if c.get('current_site_count', 0) < 100]

# Let's use only well represented industries
industries = Counter([c['industry'] for c in companies])
companies = [c for c in companies if industries[c.get('industry')] > 500]

In [97]:
for c in companies:
    if c['founded'] < 1000:
        print(c)
        break

{'path_to': '/mnt/hugedrive/masters-data/fr/monnaiedeparis/www', 'id': 3196368, 'current_site_count': 6, 'company_size_clean': '501-1000', 'site_count': 1, 'domain': 971280, 'industry': 'Museums and Institutions', 'company_size': '501-1000 employees', 'founded': 864, 'website': 'http://www.monnaiedeparis.fr', 'company_name': 'Monnaie de Paris', 'website_lang': 'en'}


## Some helper functions

In [9]:
import os
import gzip

import scandir
from bs4 import BeautifulSoup

from ipyparallel import require

import logging

# set root logger level
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)

# setup custom logger
logger = logging.getLogger(__name__)
handler = logging.FileHandler('extract_features.log')
handler.setLevel(logging.INFO)
logger.addHandler(handler)


@require(BeautifulSoup, 'gzip')
def get_soup_from_html_gz(path):
    with gzip.open(path, 'rt') as f:  # TODO Apparently, not all files are utf-8 encoded? :o
        soup = BeautifulSoup(f, 'html.parser')
    return soup


@require('os')
def save_texts_for_domain(text, path, file_name=None):
    file_name = file_name or 'full_texts.txt'
    file_name = os.path.join(path, file_name)
    with open(file_name, 'w') as f:
        f.write(text)
    return file_name


@require('scandir', 'os')
def sites_for_domain(path):
    for dirpath, dirs, file_names in scandir.walk(path):
        for file_name in file_names:
            if file_name.endswith('.gz'):
                yield os.path.join(dirpath, file_name)


def get_nav_menu(soup):
    if not soup.body:
        return None
    # We expect nav menu in header
    # First look for possible headers
    header = soup.body.find_all('header')
    if not header:
        header = soup.body.find_all({'class': ['header', 'menu']})
    if not header:
        header = soup.body.find_all(id='header')
    
    # Then look for nav menu in that header
    if header:
        header = header[0]
        has_header = True
    # If not found, look for a menu anywhere
    else:
        has_header = False
        header = soup.body
        
    nav = header.find('nav')
    if not nav:
        nav = soup.body.find('nav')
    if not nav:
        nav = header.find(class_='nav')
    if not nav:
        nav = header.find(id='nav')
    if not nav:
        nav = header.find(class_='menu')
    if not nav:
        nav = header.find(class_='main-menu')

    if nav:
        return [item for li in nav.find_all('li') for item in li.stripped_strings]  # TODO this doesn't deal with submenus
    elif has_header:
        return [item for li in header.find_all('li') for item in li.stripped_strings]
    else:
        return []


@require('os', save_texts_for_domain)
def get_texts_for_domain(path, get_text, force_read=False, file_name=None):
    file_name = file_name or 'full_texts.txt'
    text_file = os.path.join(path, file_name)
    if not os.path.exists(text_file) or force_read:
        try:
            text = get_text(path)
            text_file = save_texts_for_domain(text, path, file_name=file_name)
        except Exception as e:
#             print(e)
            text = None
            text_file = ''
    else:
        with open(text_file, 'r') as f:
            text = f.read()
    return text, text_file


@require(get_soup_from_html_gz, sites_for_domain)
def get_full_text(path):
    text = ''
    for site_path in sites_for_domain(path):
        body = get_soup_from_html_gz(site_path).body
        if body:
            # We assume this is noise
            for script in body.find_all('script'):
                script.clear()
            text += body.get_text()
            text += '\n'
    return text


@require(get_soup_from_html_gz, sites_for_domain, get_nav_menu)
def get_nav_text(path):
    text = ''
    for site_path in sites_for_domain(path):
        soup = get_soup_from_html_gz(site_path)
        text = get_nav_menu(soup)
        if text:
            text = ' '.join(text)
            break
        else:
            text = ''
    return text


def get_meta_keywords(soup):
    keywords = soup.find('meta', attrs={'name':'keywords'})
    if keywords:
        return [k.strip() for k in keywords.get('content', '').split(',')]
    else:
        return None


def get_meta_description(soup):
    description = soup.find('meta', attrs={'name':'description'})
    if description:
        return description.get('content')
    else:
        return None


def get_site_title(soup):
    title = soup.find('title')
    if title:
        return title.string
    else:
        return None


@require(get_soup_from_html_gz, sites_for_domain, get_meta_keywords)
def get_meta_keywords_text(path):
    text = ''
    for site_path in sites_for_domain(path):
        soup = get_soup_from_html_gz(site_path)
        text = get_meta_keywords(soup)
        if text:
            text = ' '.join(text)
            break
        else:
            text = ''
    return text


@require(get_soup_from_html_gz, sites_for_domain, get_site_title)
def get_titles_text(path):
    text = ''
    for site_path in sites_for_domain(path):
        soup = get_soup_from_html_gz(site_path)
        title = get_site_title(soup)
        if title:
            text += title
            text += '\n'
    return text


@require(get_soup_from_html_gz, sites_for_domain, get_meta_description)
def get_meta_descriptions_text(path):
    text = ''
    prev_description = None
    for site_path in sites_for_domain(path):
        soup = get_soup_from_html_gz(site_path)
        description = get_meta_description(soup)
        if description:
            if prev_description != description:
                prev_description = description
                text += description
                text += '\n'
    return text

## Set up parallel computing

In [4]:
from ipyparallel import Client


#  Let's parallelize this
c = Client()
dv = c[:]
dv.block = False
dv
lview = c.load_balanced_view()

In [11]:
yes = 0
for company in tqdm(companies):
    nav, path = get_texts_for_domain(company['path_to'], None, file_name='keywords.txt')
    if nav:
        yes += 1

yes / len(companies)

# 64% nav
# 93% titles
# 60% descriptions
# 32% keywords

100%|██████████| 171266/171266 [00:16<00:00, 10355.58it/s]


0.3290962596195392

In [6]:
# Parallel processing of meta tags (to speed up further computing)
from tqdm import tqdm, tqdm_notebook

results = []
for company in tqdm(companies):
    text = ''
    path = company['path_to']
    ar = lview.apply(get_texts_for_domain, path, get_full_text, file_name='full_texts.txt', force_read=True)
    results.append(ar)
#     ar = lview.apply(get_texts_for_domain, path, get_meta_descriptions_text, file_name='descriptions.txt')
#     results.append(ar)
    ar = lview.apply(get_texts_for_domain, path, get_meta_keywords_text, file_name='keywords.txt', force_read=True)
#     results.append(ar)
#     ar = lview.apply(get_texts_for_domain, path, get_titles_text, file_name='titles.txt')
#     results.append(ar)
#     ar = lview.apply(get_texts_for_domain, path, get_nav_text, file_name='nav_menu.txt', force_read=True)
    results.append(ar)

for r in tqdm(results):
    r.get()

100%|██████████| 171266/171266 [2:50:24<00:00, 16.75it/s]
100%|██████████| 342532/342532 [01:59<00:00, 2860.47it/s]


# Get bag of words for websites

In [None]:
from tqdm import tqdm, tqdm_notebook
from sklearn.utils import shuffle

# Let's shuffle the data first
companies = shuffle(companies)
train_data = companies[:100000]
target = []

texts = []
for company in tqdm(train_data):
    path = company['path_to']
    try:
        text, file_path = get_texts_for_domain(path, get_full_text)
        
#         if text:
#             texts.append(file_path)
#             target.append(company['industry'])
        path = file_path if text else '/dev/null'
        texts.append(path)
        target.append(company['industry'])
    except Exception as e:
        print(e)
        logger.error(str(e))
    
# Accuracy about 47%. 48% with removed scripts
# 42% with balanced classifier
# 46% with ngrams 45% 1-4
# 45% lemmatize


1. Precision, Recall, curve (ROC), lift curve, confusion matrix, mogoce tudi clustering confusion matrixa
2. Rocno dolocanje napake
3. Stacking (potrebujem verjetnost)
4. 

# Get bag of words on nav menus

In [None]:
from tqdm import tqdm
from sklearn.utils import shuffle

# Let's shuffle the data first
companies = shuffle(companies)
train_data = companies[:160000]
target = []

texts = []
for company in tqdm(train_data):
    text = ''
    path = company['path_to']
    try:
        text, file_path = get_texts_for_domain(path, get_nav_text, file_name='nav_menu.txt')
        # If we can't extract, fallback to full website
#         if not text:
#             text, file_path = get_texts_for_domain(path, get_full_text)
        if text:
            target.append(company['industry'])
            texts.append(file_path)
    except Exception as e:
        print(e)
        logger.error(str(e))

# 36% with replacing when no nav menu
# Same with stop words and ngrams
# 30% balanced

# Get bag of words on meta tags

In [None]:
from tqdm import tqdm
from sklearn.utils import shuffle


# Let's shuffle the data first
companies = shuffle(companies)
train_data = companies[:160000]
target = []

texts = []
for company in tqdm(train_data):
    text = ''
    path = company['path_to']
    try:
#         text, file_path = get_texts_for_domain(path, get_meta_descriptions_text, file_name='descriptions.txt')
#         text, file_path = get_texts_for_domain(path, get_titles_text, file_name='titles.txt')
        text, file_path = get_texts_for_domain(path, get_meta_keywords_text, file_name='keywords.txt')
        if text:
            target.append(company['industry'])
            texts.append(file_path)
    except Exception as e:
        logger.error(str(e))

# Accuracy about 41% for descriptions
# 35% for titles
# 36% for keywords
# Problem with all of these is that the accuracy is less, because a lot don't have these meta tags


In [None]:
# Create transformers
from sklearn.preprocessing import FunctionTransformer

def extract_from_company(companies, func, file_name=None):
    paths = []
    for company in companies:
        text, file_path = get_texts_for_domain(company['path_to'], func, file_name)
        path = file_path if text else '/dev/null'
        paths.append(path)
    print(len(paths))
    return paths

def get_full_text_from_company(companies):
    return extract_from_company(companies, get_full_text)

def get_nav_menus_from_company(companies):
    return extract_from_company(companies, get_nav_text, file_name='nav_menu.txt')

def get_descriptions_from_company(companies):
    return extract_from_company(companies, get_meta_descriptions_text, file_name='descriptions.txt')

def get_titles_from_company(companies):
    return extract_from_company(companies, get_titles_text, file_name='titles.txt')


full_text_transformer = FunctionTransformer(get_full_text_from_company, validate=False)
nav_menus_transformer = FunctionTransformer(get_nav_menus_from_company, validate=False)
descriptions_transformer = FunctionTransformer(get_descriptions_from_company, validate=False)
titles_transformer = FunctionTransformer(get_titles_from_company, validate=False)

# Extract different features from websites

In [12]:
from tqdm import tqdm
import numpy as np
from sklearn.utils import shuffle
from ipyparallel import require


companies = shuffle(companies)
train_data = companies[:120000]
target = []

features = []

@require(sites_for_domain, get_soup_from_html_gz)
def get_features(path):
    image_count = 0
    sites = 0
    form_count = 0
    a_count = 0
    link_count = 0
    meta_count = 0
    for site_path in sites_for_domain(path):
        sites += 1
        soup = get_soup_from_html_gz(site_path)
        image_count += len(soup.find_all('img'))
        form_count += len(soup.find_all('form'))
        a_count += len(soup.find_all('a'))
        link_count += len(soup.find_all('link'))
        meta_count += len(soup.find_all('meta'))
    avg_image_count = image_count / sites if sites else 0
    return [sites, image_count, avg_image_count, form_count, link_count, meta_count]


results = []
for i, company in enumerate(tqdm(train_data)):
#     if company['founded'] > 1800:
    path = company['path_to']
    ar = lview.apply_async(get_features, path)
    results.append(ar)

for i, company in enumerate(tqdm(train_data)):
    try:
        features.append(results[i].get())
#         age = 'young' if company['founded'] > 2010 else 'old'
#         target.append(age)
        target.append(company['company_size_clean'])
    except Exception as e:
        print(e)

# Accuracy 25% for company_size
# 26% for balanced classifier


100%|██████████| 120000/120000 [20:07<00:00, 99.37it/s]
 23%|██▎       | 27820/120000 [00:00<00:01, 90085.10it/s]

UnicodeDecodeError('utf-8' codec can't decode byte 0xed in position 8606: invalid continuation byte)
UnicodeDecodeError('utf-8' codec can't decode byte 0xed in position 14263: invalid continuation byte)


 50%|█████     | 60093/120000 [01:36<1:47:55,  9.25it/s]

UnicodeDecodeError('utf-8' codec can't decode byte 0xed in position 11803: invalid continuation byte)


 71%|███████▏  | 85614/120000 [10:19<51:44, 11.08it/s]

UnicodeDecodeError('utf-8' codec can't decode byte 0xed in position 120756: invalid continuation byte)


 82%|████████▏ | 98001/120000 [14:27<29:56, 12.25it/s]

UnicodeDecodeError('utf-8' codec can't decode byte 0xed in position 24177: invalid continuation byte)


 90%|████████▉ | 107490/120000 [17:33<15:58, 13.06it/s]

UnicodeDecodeError('utf-8' codec can't decode byte 0xed in position 6738: invalid continuation byte)


 96%|█████████▌| 115469/120000 [20:17<13:48,  5.47it/s]

UnicodeDecodeError('utf-8' codec can't decode byte 0xed in position 2957: invalid continuation byte)


100%|██████████| 120000/120000 [21:44<00:00, 91.99it/s]


In [24]:
features[0]

[5, 63, 12.6, 0, 26, 5]

In [95]:
# Try SGD Classifier
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# clf = SGDRegressor(loss='huber', penalty='l2', alpha=0.001,
#                                   epsilon=0.25)

clf = SGDClassifier(loss='hinge', n_jobs=6, alpha=0.1, epsilon=1)

scaler.fit(features)
# clf.fit(scaler.transform(features), [sizes_cont[c] for c in target])
# clf.fit(scaler.transform(features), target)
# clf.fit(features, target)  # 27% - 62%
clf.fit(features, [sizes_small[c] for c in target])

SGDClassifier(alpha=0.1, average=False, class_weight=None, epsilon=1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=6,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [91]:
from sklearn.grid_search import GridSearchCV

params = {
    'alpha': [0.1, 0.001, 0.0001],
    'epsilon': [1, 1.5, 5]
}

grid = GridSearchCV(clf, param_grid=params, n_jobs=5)
grid.fit(features, [sizes_small[c] for c in target])

GridSearchCV(cv=None, error_score='raise',
       estimator=SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=6,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=5,
       param_grid={'alpha': [0.1, 0.001, 0.0001], 'epsilon': [1, 1.5, 5]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [73]:
from sklearn.grid_search import GridSearchCV

params = {
    'alpha': [0.1, 0.001, 0.0001],
    'epsilon': [1, 1.5, 5]
}

grid = GridSearchCV(clf, param_grid=params, n_jobs=5, scoring='mean_absolute_error')
grid.fit(scaler.transform(features), [sizes_cont[c] for c in target])

GridSearchCV(cv=None, error_score='raise',
       estimator=SGDRegressor(alpha=0.001, average=False, epsilon=0.25, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='huber', n_iter=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=5,
       param_grid={'alpha': [0.1, 0.001, 0.0001], 'epsilon': [1, 1.5, 5]},
       pre_dispatch='2*n_jobs', refit=True, scoring='mean_absolute_error',
       verbose=0)

In [92]:
grid.grid_scores_

[mean: 0.63213, std: 0.00097, params: {'alpha': 0.1, 'epsilon': 1},
 mean: 0.57112, std: 0.04588, params: {'alpha': 0.1, 'epsilon': 1.5},
 mean: 0.60108, std: 0.03052, params: {'alpha': 0.1, 'epsilon': 5},
 mean: 0.47769, std: 0.03408, params: {'alpha': 0.001, 'epsilon': 1},
 mean: 0.53987, std: 0.08037, params: {'alpha': 0.001, 'epsilon': 1.5},
 mean: 0.58302, std: 0.04839, params: {'alpha': 0.001, 'epsilon': 5},
 mean: 0.46948, std: 0.15469, params: {'alpha': 0.0001, 'epsilon': 1},
 mean: 0.62754, std: 0.00392, params: {'alpha': 0.0001, 'epsilon': 1.5},
 mean: 0.40717, std: 0.15753, params: {'alpha': 0.0001, 'epsilon': 5}]

In [None]:
# Try RandomForests
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs=6)

rf.fit(features, target)  # 26% - 67%

In [82]:
texts = []
# Test on 1000 samples
test_companies = companies[-10000:]
results = []

features_test = []
target_test = []

# for i, company in enumerate(tqdm(test_companies)):
#     path = company['path_to']
#     ar = lview.apply_async(get_features, path)
#     results.append(ar)

# for i, company in enumerate(tqdm(test_companies)):
#     try:
#         features_test.append(results[i].get())
# #         age = 'young' if company['founded'] > 2010 else 'old'
# #         target_test.append(age)
#         target_test.append(company['company_size_clean'])
#     except Exception as e:
#         print(e)

# score = clf.score(scaler.transform(features_test), [sizes_cont[c] for c in target_test])
# score = clf.score(features_test, target_test)
# score = rf.score(features_test, target_test)
score = clf.score(features_test, [sizes_small[c] for c in target_test])

# Mean absolute error for founded - 21.38 years
# F1 for young vs old - 32% (72% recall, 43% accuracy)
# Mean absolute error for company size - 657.78

100%|██████████| 10000/10000 [00:58<00:00, 170.13it/s]
 87%|████████▋ | 8661/10000 [03:22<00:37, 36.03it/s]

UnicodeDecodeError('utf-8' codec can't decode byte 0xed in position 17238: invalid continuation byte)


100%|██████████| 10000/10000 [03:52<00:00, 43.00it/s]


In [96]:
from sklearn.metrics import classification_report

expected = [sizes_small[c] for c in target_test]
predicted = clf.predict(features_test)
# predicted = ['Not' if p > 0 else 'Food & Beverages' for p in predicted_prob]

print("Classification report for classifier:\n%s\n"
      % classification_report(expected, predicted))


Classification report for classifier:
             precision    recall  f1-score   support

        big       0.12      0.00      0.01      1036
     medium       0.30      0.39      0.34      2618
      small       0.66      0.68      0.67      6345

avg / total       0.51      0.53      0.51      9999




In [80]:
score

0.58879439719859927

In [None]:
clf.decision_function(features_test)

In [None]:
from sklearn.metrics import recall_score, accuracy_score, f1_score

# predicted = clf.predict(scaler.transform(features_test))
predicted = clf.predict(features_test)
predicted = ['old' if a < -24 else 'young' for a in clf.decision_function(features_test)]

print(recall_score(target_test, predicted, pos_label='young'))
print(accuracy_score(target_test, predicted))
print(f1_score(target_test, predicted, pos_label='young'))

# sum([1 if a == 'old' else 0 for a in predicted])


In [54]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error([sizes_cont[c] for c in target_test], clf.predict(scaler.transform(features_test)))

434.32175009589594

In [72]:

predicted_cont = grid.predict(scaler.transform(features_test))
predicted = []
for pc in predicted_cont:
    p = '10,001+'
    for s in inv_sizes:
        if pc <= s[0]:
            p = s[1]
            break
    predicted.append(p)

correct = 0
for actual, predict in zip(target_test, predicted):
    actual_c = sizes.get(actual)
    predict_c = sizes.get(predict)
    if abs(predict_c - actual_c) < 2:
        correct += 1

correct/len(target_test)

0.7788894447223612

In [88]:
sizes = {
    '1': 1,
    '1-10': 2,
    '11-50': 3,
    '51-200': 4,
    '201-500': 5,
    '501-1000': 6,
    '1001-5000': 7,
    '5001-10,000': 8,
    '10,001+': 9,
}

sizes_small = {
    '1': 'small',
    '1-10': 'small',
    '11-50': 'small',
    '51-200': 'medium',
    '201-500': 'medium',
    '501-1000': 'big',
    '1001-5000': 'big',
    '5001-10,000': 'big',
    '10,001+': 'big',
}

sizes_cont = {
    '1': 1,
    '1-10': 5,
    '11-50': 30,
    '51-200': 125,
    '201-500': 350,
    '501-1000': 750,
    '1001-5000': 3000,
    '5001-10,000': 7500,
    '10,001+': 10000,
}

inv_sizes = [
    (1, '1'),
    (10, '1-10'),
    (50, '11-50'),
    (200, '51-200'),
    (500, '201-500'),
    (1000, '501-1000'),
    (5000, '1001-5000'),
    (10000, '5001-10,000')
            ]

In [18]:
target_test[143]

'1-10'

In [19]:
# Score not so harshly
correct = 0

predicted = clf.predict(features_test)
for actual, predict in zip(target_test, predicted):
    actual_c = sizes.get(actual)
    predict_c = sizes.get(predict)
    if abs(predict_c - actual_c) < 2:
        correct += 1

correct/len(target_test)

0.6818409204602301

In [None]:
# See the mistakes
for i, com in enumerate(target_test[:200]):
    predicted = clf.predict([features_test[i]])[0]
    actual = com['company_size_clean']
    if not predicted == actual:
        print("Predicted - Actual: %s - %s" % (predicted, actual))

In [None]:
# See the industries
from collections import Counter

industries = Counter([c['industry'] for c in companies])
industries.most_common()

In [None]:
for k in sorted(industries.most_common()):
    print(k[0])

In [None]:
predicted