# Features
There are many features that we want to extract from the websites:

* Size (total, text, css, js) - we have to ignore css and js for now, since we don't have them
* Number of images
* Number of forms
* Number of mailto links
* Bag of words (the whole website)
* Only specific words (based on tags)
* Number of items in navigation menu
* Navigation menu
* Meta description
* Meta keywords

Optionally, we might want to extend this to:

* Topics (that we extract)
* Genres of the sites (extracted as well)
* Check for iframes, Bootstrap, React, etc.
* Check for Google Analytics
* Check for Facebook/Twitter/Google+ meta elements
* How much are HTML5 and CSS3 elements used?

# Load dataset

In [None]:
import json

# Read the company records from disk; the cells below iterate over them as a
# list of dicts.
with open('companies_cross.json') as source:
    companies = json.load(source)

In [None]:
from collections import Counter

# Keep only companies for which at least part of the website is available.
companies = [c for c in companies if c.get('path_to')]

# Also cap the number of pages per site (this data is not cleaned yet).
companies = [c for c in companies if c.get('current_site_count', 0) < 100]

# Keep only well-represented industries. The counter is built with .get(), the
# same way the filter below reads the key, so a record without an 'industry'
# is dropped (its count is 0) instead of raising KeyError.
industries = Counter(c['industry'] for c in companies if c.get('industry'))
companies = [c for c in companies if industries[c.get('industry')] > 500]

## Some helper functions

In [None]:
import os
import gzip

import scandir
from bs4 import BeautifulSoup

from ipyparallel import require

import logging

# Let every record through at the root logger; handlers do their own filtering.
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)

# Logger of this notebook/module, writing INFO and above to a file.
logger = logging.getLogger(__name__)
# Re-running this cell must not attach a second FileHandler to the same logger,
# otherwise every message ends up in the log once per execution of the cell.
if not any(isinstance(h, logging.FileHandler) for h in logger.handlers):
    handler = logging.FileHandler('extract_features.log')
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)


@require(BeautifulSoup, 'gzip')
def get_soup_from_html_gz(path):
    """Parse one gzip-compressed HTML file into a BeautifulSoup tree.

    The file is opened in binary mode on purpose: not every stored page is
    UTF-8, and decoding in text mode with the default codec fails on those.
    Given raw bytes, BeautifulSoup detects the document encoding itself.

    :param path: path of the ``.gz`` file holding the HTML document
    :return: the parsed ``BeautifulSoup`` object
    """
    with gzip.open(path, 'rb') as f:
        soup = BeautifulSoup(f, 'html.parser')
    return soup


@require('os')
def save_texts_for_domain(text, path, file_name=None):
    """Write ``text`` to a file inside ``path`` and return that file's path.

    :param text: the string to store
    :param path: directory the file is created in
    :param file_name: name of the file; ``full_texts.txt`` when not given
    :return: the joined path of the written file
    """
    target = os.path.join(path, file_name if file_name else 'full_texts.txt')
    with open(target, 'w') as out:
        out.write(text)
    return target


@require('scandir', 'os')
def sites_for_domain(path):
    """Yield the path of every ``.gz`` file found anywhere below ``path``."""
    for root, _subdirs, names in scandir.walk(path):
        for name in names:
            if not name.endswith('.gz'):
                continue
            yield os.path.join(root, name)


def get_nav_menu(soup):
    """Return the text labels found in a page's navigation menu.

    The menu is searched for inside a header element first and, if no header
    can be identified, anywhere in the body.

    :param soup: parsed page (BeautifulSoup object)
    :return: ``None`` when the page has no ``<body>``; otherwise a list with
        the stripped strings of all ``<li>`` items of the menu. The list is
        empty when neither a menu nor a header was found.
    """
    body = soup.body
    if not body:
        return None

    # We expect the nav menu in the header, so look for possible headers first.
    header = body.find_all('header')
    if not header:
        # Filter on the CSS class. Passing the {'class': ...} dict positionally
        # made it a tag-name filter, so this fallback never matched by class.
        header = body.find_all(class_=['header', 'menu'])
    if not header:
        header = body.find_all(id='header')

    # Then look for the nav menu in that header
    if header:
        header = header[0]
        has_header = True
    # If not found, look for a menu anywhere
    else:
        has_header = False
        header = body

    nav = header.find('nav')
    if not nav:
        nav = body.find('nav')
    if not nav:
        nav = header.find(class_='nav')
    if not nav:
        nav = header.find(id='nav')
    if not nav:
        nav = header.find(class_='menu')
    if not nav:
        nav = header.find(class_='main-menu')

    if nav:
        # TODO this doesn't deal with submenus
        return [item for li in nav.find_all('li') for item in li.stripped_strings]
    elif has_header:
        # No menu element, but a header exists: use the list items of the header.
        return [item for li in header.find_all('li') for item in li.stripped_strings]
    else:
        return []


@require('os', save_texts_for_domain)
def get_texts_for_domain(path, get_text, force_read=False, file_name=None):
    """Return a domain's text, reusing the copy stored on disk when possible.

    :param path: directory of the domain
    :param get_text: callable taking ``path`` and returning the text
    :param force_read: extract again even if the stored file exists
    :param file_name: name of the stored file; ``full_texts.txt`` by default
    :return: ``(text, text_file)``; ``(None, '')`` when extracting or saving
        raised an exception (the exception is printed, not re-raised)
    """
    file_name = file_name or 'full_texts.txt'
    text_file = os.path.join(path, file_name)

    # Stored copy available and no refresh requested: just read it back.
    if os.path.exists(text_file) and not force_read:
        with open(text_file, 'r') as cached:
            return cached.read(), text_file

    try:
        text = get_text(path)
        text_file = save_texts_for_domain(text, path, file_name=file_name)
    except Exception as e:
        print(e)
        return None, ''
    return text, text_file


@require(get_soup_from_html_gz, sites_for_domain)
def get_full_text(path):
    """Concatenate the body text of every stored page of a domain.

    Each page's text is followed by a newline; pages without a ``<body>`` are
    skipped.
    """
    chunks = []
    for site_path in sites_for_domain(path):
        body = get_soup_from_html_gz(site_path).body
        if body:
            chunks.append(body.get_text())
            chunks.append('\n')
    return ''.join(chunks)


@require(get_soup_from_html_gz, sites_for_domain, get_nav_menu)
def get_nav_text(path):
    """Return the navigation menu of the first page that has one.

    The menu items are joined with single spaces. An empty string is returned
    when no page of the domain yields a menu.
    """
    for site_path in sites_for_domain(path):
        items = get_nav_menu(get_soup_from_html_gz(site_path))
        if items:
            return ' '.join(items)
    return ''


def get_meta_keywords(soup):
    """Return the keywords listed in the page's ``<meta name="keywords">`` tag.

    :param soup: parsed page (BeautifulSoup object)
    :return: ``None`` when the page has no keywords meta tag, otherwise the
        list of comma-separated keywords with surrounding whitespace removed.
        Empty entries are dropped, so a tag with missing or blank ``content``
        gives ``[]`` rather than the truthy ``['']``.
    """
    keywords = soup.find('meta', attrs={'name': 'keywords'})
    if not keywords:
        return None
    content = keywords.get('content') or ''
    stripped = (k.strip() for k in content.split(','))
    return [k for k in stripped if k]


def get_meta_description(soup):
    """Return the ``content`` of the page's description meta tag, or ``None``.

    ``None`` is returned both when the tag is absent and when it carries no
    ``content`` attribute.
    """
    tag = soup.find('meta', attrs={'name': 'description'})
    return tag.get('content') if tag else None


def get_site_title(soup):
    """Return the ``.string`` of the page's ``<title>`` tag, or ``None`` if there is no such tag."""
    title_tag = soup.find('title')
    if not title_tag:
        return None
    return title_tag.string


@require(get_soup_from_html_gz, sites_for_domain, get_meta_keywords)
def get_meta_keywords_text(path):
    """Return the meta keywords of the first page that declares any.

    The keywords are joined with single spaces. An empty string is returned
    when no page of the domain yields keywords.
    """
    for site_path in sites_for_domain(path):
        found = get_meta_keywords(get_soup_from_html_gz(site_path))
        if found:
            return ' '.join(found)
    return ''


@require(get_soup_from_html_gz, sites_for_domain, get_site_title)
def get_titles_text(path):
    """Collect the titles of all pages of a domain, one title per line.

    Pages for which no title string could be read are skipped.
    """
    lines = []
    for site_path in sites_for_domain(path):
        title = get_site_title(get_soup_from_html_gz(site_path))
        if title:
            lines.append(title)
            lines.append('\n')
    return ''.join(lines)


@require(get_soup_from_html_gz, sites_for_domain, get_meta_description)
def get_meta_descriptions_text(path):
    """Collect the meta descriptions of a domain's pages, one per line.

    A description equal to the one taken from the previously accepted page is
    not repeated; only such consecutive duplicates are removed.
    """
    pieces = []
    last_seen = None
    for site_path in sites_for_domain(path):
        description = get_meta_description(get_soup_from_html_gz(site_path))
        if not description or description == last_seen:
            continue
        last_seen = description
        pieces.append(description)
        pieces.append('\n')
    return ''.join(pieces)

## Set up parallel computing

In [None]:
from ipyparallel import Client


# Let's parallelize this: connect to the ipyparallel cluster.
c = Client()
# Direct view over all engines, switched to non-blocking calls.
dv = c[:]
dv.block = False
dv
# Load-balanced view; the cells below submit their per-company tasks through it.
lview = c.load_balanced_view()

In [None]:
# Parallel processing of meta tags (to speed up further computing)
from tqdm import tqdm, tqdm_notebook

# Handles of the submitted tasks, one per company. This cell only submits the
# work; it does not wait for the results.
results = []
for company in tqdm(companies):
    text = ''
    path = company['path_to']
#     ar = lview.apply(get_texts_for_domain, path, get_meta_descriptions_text, file_name='descriptions.txt')
#     results.append(ar)
#     ar = lview.apply(get_texts_for_domain, path, get_meta_keywords_text, file_name='keywords.txt')
#     results.append(ar)
#     ar = lview.apply(get_texts_for_domain, path, get_titles_text, file_name='titles.txt')
#     results.append(ar)
    # force_read=True re-extracts the menu even when nav_menu.txt already exists.
    ar = lview.apply(get_texts_for_domain, path, get_nav_text, file_name='nav_menu.txt', force_read=True)
    results.append(ar)


# Get bag of words for websites

In [None]:
from tqdm import tqdm, tqdm_notebook
from sklearn.utils import shuffle

# Let's shuffle the data first
# NOTE(review): shuffle() is called without random_state, so every run selects
# a different training sample.
companies = shuffle(companies)
# NOTE(review): the Test cell evaluates on companies[-8000:]; with fewer than
# 128000 companies the two slices overlap - confirm the dataset size.
train_data = companies[:120000]
target = []

# `texts` collects file paths (not the text itself); `target` the matching industries.
texts = []
for company in tqdm(train_data):
    path = company['path_to']
    try:
        text, file_path = get_texts_for_domain(path, get_full_text)

        # Domains without any extracted text are left out of the training set.
        if text:
            texts.append(file_path)
            target.append(company['industry'])
    except Exception as e:
        print(e)
        logger.error(str(e))
    
# Accuracy about 47%. 48% with removed scripts
# 42% with balanced classifier
# 46% with ngrams 45% 1-4
# 45% lemmatize


1. Precision, recall, ROC curve, lift curve, confusion matrix; possibly also clustering of the confusion matrix
2. Manual error analysis
3. Stacking (requires class probabilities)
4. 

# Get bag of words on nav menus

In [None]:
from tqdm import tqdm
from sklearn.utils import shuffle

# Let's shuffle the data first
# NOTE(review): shuffle() is called without random_state, so every run selects
# a different training sample.
companies = shuffle(companies)
# NOTE(review): the Test cell evaluates on companies[-8000:]; with fewer than
# 168000 companies the two slices overlap - confirm the dataset size.
train_data = companies[:160000]
target = []

# `texts` collects file paths (not the text itself); `target` the matching industries.
texts = []
for company in tqdm(train_data):
    text = ''
    path = company['path_to']
    try:
        text, file_path = get_texts_for_domain(path, get_nav_text, file_name='nav_menu.txt')
        # If we can't extract, fallback to full website
#         if not text:
#             text, file_path = get_texts_for_domain(path, get_full_text)
        # Domains without a navigation menu are left out of the training set.
        if text:
            target.append(company['industry'])
            texts.append(file_path)
    except Exception as e:
        print(e)
        logger.error(str(e))

# 36% with replacing when no nav menu
# Same with stop words and ngrams
# 30% balanced

# Get bag of words on meta tags

In [None]:
from tqdm import tqdm
from sklearn.utils import shuffle


# Let's shuffle the data first
# NOTE(review): shuffle() is called without random_state, so every run selects
# a different training sample.
companies = shuffle(companies)
# NOTE(review): the Test cell evaluates on companies[-8000:]; with fewer than
# 168000 companies the two slices overlap - confirm the dataset size.
train_data = companies[:160000]
target = []

# `texts` collects file paths (not the text itself); `target` the matching industries.
texts = []
for company in tqdm(train_data):
    text = ''
    path = company['path_to']
    try:
        # Exactly one of the three meta-tag sources is active at a time.
#         text, file_path = get_texts_for_domain(path, get_meta_descriptions_text, file_name='descriptions.txt')
#         text, file_path = get_texts_for_domain(path, get_titles_text, file_name='titles.txt')
        text, file_path = get_texts_for_domain(path, get_meta_keywords_text, file_name='keywords.txt')
        # Domains without the selected meta tag are left out of the training set.
        if text:
            target.append(company['industry'])
            texts.append(file_path)
    except Exception as e:
        # Unlike the other training cells, failures are only logged here, not printed.
        logger.error(str(e))

# Accuracy about 41% for descriptions
# 35% for titles
# 36% for keywords
# Problem with all of these is that the accuracy is less, because a lot don't have these meta tags


# Classify

## Create a pipe for transformer + classifier

### Bag of Words (HashingVectorizer + TF-IDF) with a linear SVM (SGDClassifier, hinge loss)

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# input='filename': the pipeline is fed file paths and reads the documents itself.
# NOTE(review): newer scikit-learn releases replaced `non_negative` (HashingVectorizer)
# and `n_iter` (SGDClassifier) - confirm against the installed version before upgrading.
hv = HashingVectorizer(non_negative=True, input='filename')
tf_transformer = TfidfTransformer()
# Hinge loss makes the SGD classifier a linear SVM.
clf = SGDClassifier(loss='hinge', n_iter=100, alpha=0.01, n_jobs=5)

pipe = Pipeline([('HV', hv), ('tfid', tf_transformer), ('svc', clf)])

## Train

In [None]:
# Fit on whichever `texts`/`target` pair the most recently run "Get bag of words" cell produced.
pipe.fit(texts, target)

## Test

In [None]:
texts = []
# Test on the last 8000 companies of the shuffled list
test_companies = companies[-8000:]
# Companies that actually produced text; kept aligned with `texts`.
tested = []
for company in tqdm(test_companies):
    text = ''
    path = company['path_to']
    try:
        # Keep the active source in sync with the one the pipeline was trained on.
        text, file_path = get_texts_for_domain(path, get_nav_text, file_name='nav_menu.txt')
#         text, file_path = get_texts_for_domain(path, get_full_text)
#         text, file_path = get_texts_for_domain(path, get_meta_descriptions_text, file_name='descriptions.txt')
#         text, file_path = get_texts_for_domain(path, get_titles_text, file_name='titles.txt')
#         text, file_path = get_texts_for_domain(path, get_meta_keywords_text, file_name='keywords.txt')
#         if not text:
#             text, file_path = get_texts_for_domain(path, get_full_text)
        if text:
            texts.append(file_path)
            tested.append(company)
    except Exception as e:
        print(e)

# Score of the pipeline on the companies that yielded text.
score = pipe.score(texts, [c['industry'] for c in tested])

In [None]:
# Display the score computed in the previous cell.
score

In [None]:
# Predicted industry for every test document (same order as `texts` / `tested`).
predicted = pipe.predict(texts)

In [None]:
# True industry of every tested company, aligned with `predicted`.
expected = [c['industry'] for c in tested]

### Evaluate results

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, recall_score, precision_score

# Print scikit-learn's per-class report for the test set, then the raw confusion matrix.
print("Classification report for classifier:\n%s\n"
      % classification_report(expected, predicted))
print("Confusion matrix:\n%s" % confusion_matrix(expected, predicted))

In [None]:
# Lenient per-industry score: an exact hit counts 1, a prediction listed in
# `similar_industries` for the true industry counts 0.5, anything else 0.
# Requires the `similar_industries` cell below to have been run first.
expected_count = Counter(expected)
results = {}
for actual, guess in zip(expected, predicted):
    if actual == guess:
        credit = 1
    elif guess in similar_industries.get(actual, []):
        credit = 0.5
    else:
        continue
    results[actual] = results.get(actual, 0) + credit

# Turn the collected credit into a fraction of each industry's test examples.
for industry in results:
    results[industry] = results[industry] / expected_count[industry]

print(recall_score(expected, predicted, average='weighted'))
# Lenient score over the whole test set, weighted by industry frequency.
sum(share * expected_count[industry] / len(expected) for industry, share in results.items())

In [None]:
# Display the per-industry lenient scores.
results

#### What are we confusing for what?

In [None]:
# Start with empty tables: industry -> list of industries regarded as similar /
# list of confusions that were looked at and skipped.
similar_industries = {}
skipped = {}

In [None]:
# Use this to determine if a mistake is big or small: each industry maps to the
# industries that count as a "near miss" when predicted instead of it.
# The relation is listed per industry and is not necessarily symmetric.
similar_industries = {
    'Accounting': ['Financial Services'],
    'Airlines/Aviation': ['Aviation & Aerospace', 'Leisure, Travel & Tourism'],
    'Apparel & Fashion': ['Consumer Goods', 'Design', 'Luxury Goods & Jewelry', 'Retail'],
    'Architecture & Planning': ['Construction', 'Furniture'],
    'Automotive': ['Machinery', 'Mechanical or Industrial Engineering'],
    'Aviation & Aerospace': [
        'Airlines/Aviation',
        'Leisure, Travel & Tourism',
        'Mechanical or Industrial Engineering',
    ],
    'Banking': ['Financial Services'],
    'Broadcast Media': ['Newspapers'],
    'Building Materials': ['Construction'],
    'Business Supplies and Equipment': ['Furniture', 'Printing', 'Retail'],
    'Chemicals': ['Cosmetics', 'Plastics'],
    'Civic & Social Organization': [
        'Government Administration',
        'Nonprofit Organization Management',
    ],
    'Civil Engineering': [
        'Architecture & Planning',
        'Construction',
        'Renewables & Environment',
    ],
    'Computer Games': ['Information Technology and Services', 'Internet'],
    'Computer Software': [
        'Computer Games',
        'Information Services',
        'Information Technology and Services',
        'Internet',
        'Research',
    ],
    'Construction': [
        'Architecture & Planning',
        'Civil Engineering',
        'Real Estate',
        'Building Materials',
    ],
    'Consumer Electronics': [
        'Business Supplies and Equipment',
        'Electrical/Electronic Manufacturing',
        'Entertainment',
        'Information Technology and Services',
    ],
    'Consumer Goods': [
        'Computer Games',
        'Consumer Services',
        'Food & Beverages',
        'Wine and Spirits',
        'Retail',
        'Wholesale',
    ],
    'Consumer Services': [
        'Hospitality',
        'Leisure, Travel & Tourism',
        'Retail',
        'Wholesale',
    ],
    'Cosmetics': ['Health, Wellness and Fitness', 'Hospital & Health Care'],
    'Design': ['Apparel & Fashion', 'Photography'],
    'E-Learning': [
        'Computer Games',
        'Computer Software',
        'Education Management',
        'Information Technology and Services',
        'Higher Education',
        'Internet',
    ],
    'Education Management': ['Management Consulting', 'Higher Education'],
    'Electrical/Electronic Manufacturing': [
        'Consumer Electronics',
        'Industrial Automation',
    ],
    'Entertainment': [
        'Apparel & Fashion',
        'Broadcast Media',
        'Computer Games',
        'Events Services',
        'Hospitality',
        'Leisure, Travel & Tourism',
        'Motion Pictures and Film',
        'Music',
        'Performing Arts',
    ],
    'Environmental Services': ['Renewables & Environment'],
    'Events Services': [
        'Entertainment',
        'Food & Beverages',
        'Hospitality',
        'Leisure, Travel & Tourism',
        'Music',
        'Restaurants',
    ],
    'Financial Services': [
        'Accounting',
        'Banking',
        'Insurance',
        'Legal Services',
    ],
    'Food & Beverages': [
        'Events Services',
        'Food Production',
        'Restaurants',
        'Wine and Spirits',
    ],
    'Food Production': ['Consumer Goods', 'Food & Beverages', 'Wine and Spirits'],
    'Furniture': ['Architecture & Planning'],
    'Government Administration': [
        'Civic & Social Organization',
        'Education Management',
        'Environmental Services',
        'Higher Education',
        'Hospital & Health Care',
        'Renewables & Environment',
    ],
    'Graphic Design': ['Design', 'Printing'],
    'Health, Wellness and Fitness': [
        'Cosmetics',
        'Hospital & Health Care',
        'Hospitality',
        'Leisure, Travel & Tourism',
    ],
    'Higher Education': ['Education Management', 'Research'],
    'Hospital & Health Care': [
        'Health, Wellness and Fitness',
        'Individual & Family Services',
        'Medical Devices',
    ],
    'Hospitality': ['Events Services', 'Leisure, Travel & Tourism'],
    'Human Resources': [
        'Government Administration',
        'Professional Training & Coaching',
        'Staffing and Recruiting',
    ],
    'Industrial Automation': ['Mechanical or Industrial Engineering'],
    'Information Services': [
        'Computer Software',
        'E-Learning',
        'Information Technology and Services',
        'Internet',
    ],
    'Information Technology and Services': [
        'Computer Games',
        'Computer Software',
        'Information Services',
        'Internet',
    ],
    'Insurance': ['Banking', 'Financial Services'],
    'International Trade and Development': ['Government Administration'],
    'Internet': [
        'Computer Software',
        'Information Services',
        'Information Technology and Services',
    ],
    'Investment Management': ['Financial Services', 'Insurance'],
    'Law Practice': ['Legal Services'],
    'Legal Services': ['Law Practice'],
    'Leisure, Travel & Tourism': ['Entertainment', 'Museums and Institutions'],
    'Luxury Goods & Jewelry': ['Apparel & Fashion'],
    'Machinery': [
        'Automotive',
        'Mechanical or Industrial Engineering',
        'Mining & Metals',
    ],
    'Management Consulting': ['Professional Training & Coaching'],
    'Maritime': ['Leisure, Travel & Tourism'],
    'Mechanical or Industrial Engineering': [
        'Airlines/Aviation',
        'Automotive',
        'Industrial Automation',
        'Plastics',
        'Research',
        'Machinery',
    ],
    'Media Production': [
        'Broadcast Media',
        'Motion Pictures and Film',
        'Newspapers',
    ],
    'Medical Devices': ['Hospital & Health Care'],
    'Mining & Metals': ['Building Materials', 'Machinery', 'Oil & Energy'],
    'Motion Pictures and Film': [
        'Entertainment',
        'Media Production',
        'Photography',
    ],
    'Newspapers': ['Broadcast Media'],
    'Nonprofit Organization Management': ['Civic & Social Organization'],
    'Oil & Energy': ['Chemicals', 'Mining & Metals'],
    'Online Media': ['Entertainment', 'Media Production', 'Newspapers'],
    'Pharmaceuticals': ['Cosmetics', 'Hospital & Health Care'],
    'Printing': ['Business Supplies and Equipment'],
    'Professional Training & Coaching': [
        'Education Management',
        'Human Resources',
        'Management Consulting',
    ],
    'Public Relations and Communications': ['Marketing and Advertising'],
    'Publishing': ['Media Production', 'Printing'],
    'Renewables & Environment': ['Environmental Services'],
    'Research': ['Higher Education'],
    'Restaurants': ['Food & Beverages', 'Hospitality'],
    'Retail': ['Apparel & Fashion'],
    'Security and Investigations': ['Information Technology and Services'],
    'Sports': ['Health, Wellness and Fitness', 'Leisure, Travel & Tourism'],
    'Staffing and Recruiting': ['Human Resources', 'Management Consulting'],
    'Telecommunications': ['Information Technology and Services'],
    'Transportation/Trucking/Railroad': ['Automotive', 'Logistics and Supply Chain'],
}

In [None]:
# List, per true industry, every wrong prediction that occurred more than twice
# and is not already registered as a "similar" industry.
industries_names = sorted(industries)
confusion = confusion_matrix(expected, predicted, labels=industries_names)

for act, row in zip(industries_names, confusion):
    print('******** %s ********' % act)
    for pred, value in zip(industries_names, row):
        # Skip the diagonal (correct predictions) and rare confusions.
        if pred == act or value <= 2:
            continue
        if pred in similar_industries.get(act, []):  # and pred not in skipped.get(act, []):
            continue
        print("%s but predicted %s: %s times" % (act, pred, value))
        # An interactive variant of this loop asked whether to register the pair:
#         add = input('Add %s? ' % pred)
#         if add == 'y':
#             similar_industries[act] = similar_industries.get(act, []) + [pred]
#             print('Added %s' % pred)
#         else:
#             skipped[act] = skipped.get(act, []) + [pred]


### Check accuracy for specific industries

In [None]:
from collections import Counter, defaultdict

# Number of tested companies per industry.
test_industries = Counter(c['industry'] for c in tested)

# Number of exactly correct predictions per true industry.
hits = Counter(
    company['industry']
    for company, predict in zip(tested, predicted)
    if company['industry'] == predict
)

# Accuracy in percent; industries without a single hit are absent and read as 0
# through the defaultdict.
company_accuracy = defaultdict(int)
for industry, n_correct in hits.items():
    company_accuracy[industry] = round(n_correct / test_industries[industry] * 100, 2)

In [None]:
# Show the wrong predictions for industries whose accuracy is below 20 %.
for company, predict in zip(tested, predicted):
    industry = company['industry']
    if industry == predict:
        continue
    if company_accuracy[industry] < 20:
        print("%s: predicted %s" % (industry, predict))

In [None]:
# Pair each industry's company count (x) with its accuracy (y), most common
# industry first.
industries = Counter(c['industry'] for c in companies)
x, y = [], []
for industry, common in industries.most_common():
    accuracy = company_accuracy[industry]
    print("%s - %s" % (industry, accuracy))
    x.append(common)
    y.append(accuracy)

In [None]:
%matplotlib inline

from matplotlib import pyplot as plt

# Accuracy (y, in percent) against the number of companies in the industry (x).
plt.plot(x, y)

# Extract different features from websites

In [None]:
from tqdm import tqdm
import numpy as np
from sklearn.utils import shuffle
from ipyparallel import require


# NOTE(review): unseeded shuffle; the test cell below uses companies[-4000:],
# which overlaps this slice when there are fewer than 164000 companies.
companies = shuffle(companies)
train_data = companies[:160000]
# Here `target` collects the company records themselves (labels are read later).
target = []

features = []

@require(sites_for_domain, get_soup_from_html_gz)
def get_features(path):
    """Count pages, images and forms over all stored pages of a domain.

    :param path: directory of the domain
    :return: ``[sites, image_count, avg_image_count, form_count]`` where
        ``avg_image_count`` is images per page (0 when there are no pages)
    """
    sites = image_count = form_count = 0
    for site_path in sites_for_domain(path):
        sites += 1
        soup = get_soup_from_html_gz(site_path)
        image_count += len(soup.find_all('img'))
        form_count += len(soup.find_all('form'))

    if sites:
        avg_image_count = image_count / sites
    else:
        avg_image_count = 0
    return [sites, image_count, avg_image_count, form_count]


# Submit one feature-extraction task per company ...
results = [
    lview.apply_async(get_features, company['path_to'])
    for company in tqdm(train_data)
]

# ... then wait for them in submission order. A company whose task failed is
# left out of both `features` and `target`.
for company, pending in zip(tqdm(train_data), results):
    try:
        row = pending.get()
    except Exception as e:
        print(e)
    else:
        features.append(row)
        target.append(company)

# Accuracy 25% for company_size
# 26% for balanced classifier


In [None]:
# Try SGD Classifier
from sklearn.linear_model import SGDClassifier

# NOTE(review): this rebinds `clf`, the estimator object that is also a step of `pipe`.
# NOTE(review): newer scikit-learn releases replaced `n_iter` - confirm against the installed version.
clf = SGDClassifier(loss='hinge', n_iter=100, alpha=0.01, n_jobs=5)#, class_weight='balanced')
# Predict the company size class from the page/image/form counts.
clf.fit(features, [c['company_size_clean'] for c in target])

In [None]:
texts = []
# Test on the last 4000 companies of the shuffled list
test_companies = companies[-4000:]
results = []

features_test = []
target_test = []

# Submit one feature-extraction task per test company ...
for i, company in enumerate(tqdm(test_companies)):
    path = company['path_to']
    ar = lview.apply_async(get_features, path)
    results.append(ar)

# ... and collect the results; failed companies are left out of both lists.
for i, company in enumerate(tqdm(test_companies)):
    try:
        features_test.append(results[i].get())
        target_test.append(company)
    except Exception as e:
        print(e)

score = clf.score(features_test, [c['company_size_clean'] for c in target_test])

In [None]:
# Display the score of the size classifier.
score

In [None]:
# Ordinal rank of every company-size label, so that the distance between a
# predicted and an actual size class can be measured.
sizes = {
    '1': 1,
    '1-10': 2,
    '11-50': 3,
    '51-200': 4,
    '201-500': 5,
    '501-1000': 6,
    '1001-5000': 7,
    '5001-10,000': 8,
    '10,001+': 9,
}

In [None]:
# Score not so harshly: a prediction counts as correct when it is at most two
# size classes away from the actual one.
# Predict all test rows in one call instead of one call per sample.
predicted_sizes = clf.predict(features_test) if features_test else []

correct = 0
for com, predicted_size in zip(target_test, predicted_sizes):
    predict = sizes.get(predicted_size)
    actual = sizes.get(com['company_size_clean'])
    # A label missing from `sizes` has no rank; count it as a miss instead of
    # failing with a TypeError on the subtraction below.
    if predict is None or actual is None:
        continue
    if abs(predict - actual) <= 2:
        correct += 1

# Share of lenient hits; 0.0 when there is nothing to score.
correct / len(target_test) if target_test else 0.0

In [None]:
# See the mistakes among the first 200 test companies.
# NOTE: the loop rebinds the notebook-level name `predicted`, replacing the
# industry predictions made earlier with a single size label.
for com, sample in zip(target_test[:200], features_test):
    predicted = clf.predict([sample])[0]
    actual = com['company_size_clean']
    if predicted != actual:
        print("Predicted - Actual: %s - %s" % (predicted, actual))

In [None]:
# See the industries
from collections import Counter

# Number of companies per industry, most frequent first.
industries = Counter(c['industry'] for c in companies)
industries.most_common()

In [None]:
# Print the industry names in alphabetical order.
for name in sorted(industries):
    print(name)

In [None]:
# Display the current value of `predicted`.
# NOTE(review): after the "See the mistakes" cell this is a single size label, not the industry predictions.
predicted