# Features
There are many features that we want to extract from the websites:

* Size (total, text, css, js) - we have to ignore css and js for now, since we don't have them
* Number of images
* Number of forms
* Number of mailto links
* Bag of words (the whole website)
* Only specific words (based on tags)
* Number of items in navigation menu
* Navigation menu
* Meta description
* Meta keywords

Optionally, we might want to extend this to:

* Topics (that we extract)
* Genres of the sites (extracted as well)
* Check for iframes, Bootstrap, React, etc.
* Check for Google Analytics
* Check for Facebook/Twitter/Google+ meta elements
* How much are HTML5 and CSS3 elements used?

# Load dataset

In [None]:
import json

# JSON text is UTF-8 by specification; name the encoding explicitly so that
# loading does not depend on the platform's default locale encoding.
with open('companies_cross.json', 'r', encoding='utf-8') as f:
    companies = json.load(f)

In [None]:
from collections import Counter

# Let's use only companies that we currently have a website for (at least some pages)
companies = [c for c in companies if c.get('path_to')]

# Let's also limit the number of sites (because this is not cleaned yet)
companies = [c for c in companies if c.get('current_site_count', 0) < 100]

# Let's use only well represented industries (more than 500 companies each).
# Records without an 'industry' key are skipped while counting and dropped by
# the filter, instead of aborting the whole cell with a KeyError.
industries = Counter(c['industry'] for c in companies if 'industry' in c)
companies = [
    c for c in companies
    if 'industry' in c and industries[c['industry']] > 500
]

## Some helper functions

In [None]:
import os
import gzip

import scandir
from bs4 import BeautifulSoup

from ipyparallel import require

import logging

# Set the root logger level to DEBUG
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)

# Set up the custom logger, which writes INFO and above to a log file.
# Re-running this cell must not stack a second FileHandler on the same logger
# (every record would then be written twice), so an already attached handler
# for the same file is reused when there is one.
logger = logging.getLogger(__name__)
log_path = os.path.abspath('extract_features.log')
handler = next(
    (h for h in logger.handlers
     if isinstance(h, logging.FileHandler) and h.baseFilename == log_path),
    None,
)
if handler is None:
    handler = logging.FileHandler(log_path)
    logger.addHandler(handler)
handler.setLevel(logging.INFO)


# NOTE: the helpers are ordered so that every function object handed to
# ``@require(...)`` is already defined when the decorator is evaluated;
# otherwise this cell raises NameError on a fresh kernel.


@require(BeautifulSoup, 'gzip')
def get_soup_from_html_gz(path):
    """Parse a gzip-compressed HTML file into a BeautifulSoup tree.

    The file is handed over as raw bytes so that BeautifulSoup detects the
    document encoding itself; decoding in text mode fails on pages that are
    not stored in the default encoding.

    :param path: path to a gzip-compressed HTML file
    :return: the parsed ``BeautifulSoup`` object
    """
    with gzip.open(path, 'rb') as f:
        return BeautifulSoup(f, 'html.parser')


@require('os')
def save_texts_for_domain(text, path, file_name=None):
    """Write *text* to a file inside the directory *path*.

    :param text: the text to store
    :param path: directory the file is written to
    :param file_name: name of the file; defaults to ``'full_texts.txt'``
    :return: the full path of the written file
    """
    file_name = file_name or 'full_texts.txt'
    file_name = os.path.join(path, file_name)
    # Website text can contain any Unicode character, and the vectorizers in
    # this notebook read these files back as UTF-8 (the HashingVectorizer
    # default), so do not depend on the locale encoding here.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(text)
    return file_name


@require('scandir', 'os')
def sites_for_domain(path):
    """Yield the path of every ``.gz`` file found anywhere below *path*.

    :param path: root directory to walk
    """
    for dirpath, dirs, file_names in scandir.walk(path):
        for file_name in file_names:
            if file_name.endswith('.gz'):
                yield os.path.join(dirpath, file_name)


def get_nav_menu(soup):
    """Return the texts of the navigation menu items of a parsed page.

    The menu is expected inside a header, so a header element is looked up
    first (``<header>`` tag, then class ``header``/``menu``, then
    ``id="header"``) and the navigation is searched inside it; without a
    header the whole body is searched.

    :param soup: parsed page (``BeautifulSoup`` object)
    :return: ``None`` when the page has no body, otherwise a list with the
        stripped strings of all ``<li>`` elements of the navigation (or of
        the header when no navigation element is found); an empty list when
        neither a navigation element nor a header exists
    """
    body = soup.body
    if not body:
        return None

    # We expect the nav menu in the header, so look for possible headers first.
    # The class lookup has to use ``class_``: a dict passed positionally is
    # taken as the tag-name filter and never matches by CSS class.
    headers = (
        body.find_all('header')
        or body.find_all(class_=['header', 'menu'])
        or body.find_all(id='header')
    )
    header = headers[0] if headers else None

    # Search inside the header when there is one, otherwise anywhere in the body
    scope = header if header is not None else body

    nav = (
        scope.find('nav')
        or body.find('nav')
        or scope.find(class_='nav')
        or scope.find(id='nav')
    )

    if nav:
        menu_root = nav  # TODO this doesn't deal with submenus
    elif header is not None:
        # No dedicated nav element: fall back to the list items of the header
        menu_root = header
    else:
        return []
    return [item for li in menu_root.find_all('li') for item in li.stripped_strings]


@require('os', save_texts_for_domain)
def get_texts_for_domain(path, get_text, force_read=False, file_name=None):
    """Return the text of a domain, using a text file in *path* as a cache.

    :param path: directory holding the downloaded pages of one domain
    :param get_text: callable taking *path* and returning the text to cache
    :param force_read: when true, recompute the text even if the file exists
    :param file_name: name of the cache file; defaults to ``'full_texts.txt'``
    :return: tuple ``(text, text_file)`` with the text and the cache file path
    """
    file_name = file_name or 'full_texts.txt'
    text_file = os.path.join(path, file_name)
    if not os.path.exists(text_file) or force_read:
        text = get_text(path)
        text_file = save_texts_for_domain(text, path, file_name=file_name)
    else:
        # Read with the same encoding the cache file is written with
        with open(text_file, 'r', encoding='utf-8') as f:
            text = f.read()
    return text, text_file


@require(get_soup_from_html_gz, sites_for_domain)
def get_full_text(path):
    """Return the body text of all pages of a domain, one page per chunk.

    Pages without a ``<body>`` are skipped; every page text is followed by a
    newline.

    :param path: directory holding the downloaded pages of one domain
    :return: the concatenated text (``''`` when no page has a body)
    """
    pages = []
    for site_path in sites_for_domain(path):
        body = get_soup_from_html_gz(site_path).body
        if body:
            pages.append(body.get_text())
    return ''.join(page + '\n' for page in pages)


@require(get_soup_from_html_gz, sites_for_domain, get_nav_menu)
def get_nav_text(path):
    """Return the navigation menu of a domain as one space-separated string.

    Pages are tried in the order ``sites_for_domain`` yields them and the
    first page with a non-empty navigation menu wins.

    :param path: directory holding the downloaded pages of one domain
    :return: the menu item texts joined by spaces, or ``''`` when no page
        yields a menu
    """
    for site_path in sites_for_domain(path):
        items = get_nav_menu(get_soup_from_html_gz(site_path))
        if items:
            return ' '.join(items)
    return ''

    

## Set up parallel computing

In [None]:
from ipyparallel import Client


# Let's parallelize this: create an ipyparallel client.
# NOTE(review): the name `c` is rebound to a Counter in the last cell of this
# notebook; re-run this cell before using `c` as a client again.
c = Client()
# View over all engines, switched to non-blocking calls
dv = c[:]
dv.block = False
# Bare expression; presumably meant to display the view, but it is not the
# last expression of the cell
dv
# Load-balanced view, used further down to dispatch the feature extraction
lview = c.load_balanced_view()

# Get bag of words for websites

In [None]:
from tqdm import tqdm, tqdm_notebook
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.utils import shuffle


# Let's shuffle the data first
companies = shuffle(companies)
train_data = companies[:150000]
# Industry labels, kept aligned with `texts` (both are appended together)
target = []

# `texts` collects file paths, not text: the vectorizer is built with
# input='filename' and reads the cached text files itself.
texts = []
# NOTE(review): `non_negative` is only accepted by older scikit-learn releases
# (newer ones use `alternate_sign=False`) - confirm against the installed version.
hv = HashingVectorizer(non_negative=True, input='filename', stop_words='english', ngram_range=(1,2))
for company in tqdm(train_data):
    path = company['path_to']
    try:
        # Returns the text and the path of the text file cached in `path`
        text, file_path = get_texts_for_domain(path, get_full_text)
    
        texts.append(file_path)
        target.append(company['industry'])
    except Exception as e:
        # Best effort: a company whose pages cannot be processed is skipped
        print(e)
        logger.error(str(e))

# NOTE(review): if anything in this block fails, the error is only printed and
# logged, and `bag_train_tf` stays undefined (or stale from an earlier run).
try:
    bag = hv.transform(texts)
    del texts
    tf_transformer = TfidfTransformer()
    bag_train_tf = tf_transformer.fit_transform(bag)
except Exception as e:
    print(e)
    logger.error(str(e))
    
# Accuracy about 43%
# ngrams, stop_words 44%


In [None]:
# Show how many companies are left after filtering
len(companies)

In [None]:
# Fit a fresh tf-idf transformer on the current `bag` (the same two
# statements also appear in the bag-of-words cell above)
tf_transformer = TfidfTransformer()
bag_train_tf = tf_transformer.fit_transform(bag)

## Get bag of words on nav menus

In [None]:
from tqdm import tqdm
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.utils import shuffle


# Let's shuffle the data first
# NOTE(review): the evaluation cell further down takes companies[-2000:] from
# this same shuffled list; with fewer than 162000 companies that slice
# overlaps the training slice below - confirm the dataset size.
companies = shuffle(companies)
train_data = companies[:160000]
# Industry labels, kept aligned with `texts` (both are appended together)
target = []

# `texts` collects file paths, not text: the vectorizer is built with
# input='filename' and reads the cached text files itself.
texts = []
# NOTE(review): `non_negative` is only accepted by older scikit-learn releases
# (newer ones use `alternate_sign=False`) - confirm against the installed version.
hv = HashingVectorizer(non_negative=True, input='filename')
for company in tqdm(train_data):
    text = ''
    path = company['path_to']
    try:
        # No file_name is passed, so the nav text is cached under the default
        # name 'full_texts.txt' - the same file the full-text cell uses.
        text, file_path = get_texts_for_domain(path, get_nav_text)
        
        # Only companies with a non-empty nav text are used for training
        if text:
            target.append(company['industry'])
            texts.append(file_path)
    except Exception as e:
        # Best effort: a company whose pages cannot be processed is skipped
        print(e)
        logger.error(str(e))

bag = hv.transform(texts)
del texts

tf_transformer = TfidfTransformer()
bag_train_tf = tf_transformer.fit_transform(bag)

# Accuracy about 61%

# Accuracy about 61%


In [None]:
# Try SGD Classifier
from sklearn.linear_model import SGDClassifier

# Hinge loss, trained on the tf-idf matrix and the industry labels.
# NOTE(review): `n_iter` is only accepted by older scikit-learn releases
# (newer ones use `max_iter`) - confirm against the installed version.
clf = SGDClassifier(loss='hinge', n_iter=100, alpha=0.01)
clf.fit(bag_train_tf, target)

In [None]:
# Leftover scratch cell (presumably a check that the kernel responds)
print(1)

In [None]:
# Try naive bayes classifier
from sklearn.naive_bayes import MultinomialNB

# Rebinds `clf`: whichever classifier cell ran last is the one evaluated below
clf = MultinomialNB().fit(bag_train_tf, target)

In [None]:
# Test on the last 2000 companies of the shuffled list.
# NOTE(review): the nav-menu training cell keeps only companies with a
# non-empty nav text, while every company is kept here - confirm that
# evaluating on empty documents is intended.
test_companies = companies[-2000:]
texts = []
tested = []
for company in tqdm(test_companies):
    path = company['path_to']
    try:
        text, file_path = get_texts_for_domain(path, get_nav_text)

        texts.append(file_path)
        tested.append(company)
    except Exception as e:
        # Best effort: skip the company, but record the failure in the log
        # file as the training cells do instead of only printing it
        print(e)
        logger.error(str(e))
bag = hv.transform(texts)
del texts

# Reuse the tf-idf weights fitted on the training data
bag_test_tf = tf_transformer.transform(bag)

# True labels, in the same order as the rows of `bag_test_tf`
test_target = [c['industry'] for c in tested]
predicted = clf.predict(bag_test_tf)
score = clf.score(bag_test_tf, test_target)

In [None]:
# Show the score returned by clf.score() in the evaluation cell above
score

In [None]:
# Count predictions that are exact matches or match through the manually
# maintained `similar` mapping (industry -> industries accepted as equivalent).
similar = {}
correct = 0
for company, predict in zip(tested, predicted):
    actual = company['industry']
    is_match = (
        actual == predict
        or actual in similar.get(predict, [])
        or predict in similar.get(actual, [])
    )
    if is_match:
        correct += 1
        continue

    print('%s - predicted %s' % (actual, predict))
    # The interactive prompt is switched off, so the answer is always 'n':
    #     add = input('Add? yN ')
    add = 'n'
    if add == 'y':
        # Register the pair as equivalent in both directions
        similar.setdefault(actual, []).append(predict)
        similar.setdefault(predict, []).append(actual)
        correct += 1

correct

# Extract different features from websites

In [None]:
import numpy as np

# Scratch check: assigning a list to one row of a 10x3 array of ones
a = np.ones([10, 3])
a[1, :] = [0, 0, 0]
a

In [None]:
from tqdm import tqdm
import numpy as np
from ipyparallel import require


# `shuffle` is not imported in this cell; it comes from the sklearn.utils
# import in the bag-of-words cells above.
companies = shuffle(companies)
# Small sample of 100 companies for the feature extraction
train_data = companies[:100]
target = []

# One row per company, one column per value returned by get_features()
n_features = 4
features = np.ones([len(train_data), n_features])


@require(sites_for_domain, get_soup_from_html_gz)
def get_features(path):
    """Compute simple count features over all pages of one domain.

    :param path: directory holding the downloaded pages of the domain
    :return: list ``[pages, images, images per page, forms]``; the images per
        page value is 0 when the domain has no pages
    """
    page_total = 0
    img_total = 0
    form_total = 0
    for page_path in sites_for_domain(path):
        page_total += 1
        page = get_soup_from_html_gz(page_path)
        img_total += len(page.find_all('img'))
        form_total += len(page.find_all('form'))

    # Guard against domains without any page
    if page_total:
        img_per_page = img_total / page_total
    else:
        img_per_page = 0
    return [page_total, img_total, img_per_page, form_total]


# Submit one asynchronous task per company to the load-balanced view
results = [
    lview.apply_async(get_features, company['path_to'])
    for company in tqdm(train_data)
]

# Collect the results in submission order; each .get() waits for its task
for row, async_result in enumerate(tqdm(results)):
    features[row, :] = async_result.get()



# Extract navigational menu items

In [None]:
from tqdm import tqdm

# Map company name -> nav menu items of the first page that yields a menu
nav = {}
for company in tqdm(companies[100:200]):
    # Lazy generator: pages are parsed one at a time, and parsing stops as
    # soon as a non-empty menu is found
    menus = (
        get_nav_menu(get_soup_from_html_gz(site_path))
        for site_path in sites_for_domain(company['path_to'])
    )
    first_menu = next((menu for menu in menus if menu), None)
    if first_menu:
        nav[company['company_name']] = first_menu


In [None]:
# Show the extracted menus (company name -> list of menu item texts)
nav

In [None]:
# Debugging aid: run the nav menu extraction on the first page of one company
path_to = companies[107]['path_to']
sites = sites_for_domain(path_to)
# Raises StopIteration when the company directory holds no .gz file
path = next(sites)
soup = get_soup_from_html_gz(path)
get_nav_menu(soup)

In [None]:
# Number of companies with a non-empty nav menu in `nav`
len([n for n, na in nav.items() if na])

In [None]:
from collections import Counter

# Industry distribution of the current training sample, most frequent first.
# NOTE(review): this rebinds `c`, which the parallel-computing cell uses for
# the ipyparallel Client.
c = Counter([c['industry'] for c in train_data])
c.most_common()