# Features
There are many features that we want to extract from the websites:

* Size (total, text, css, js) - we have to ignore css and js for now, since we don't have them
* Number of images
* Number of forms
* Number of mailto links
* Bag of words (the whole website)
* Only specific words (based on tags)
* Number of items in navigation menu
* Navigation menu
* Meta description
* Meta keywords

Optionally, we might want to extend this to:

* Topics (that we extract)
* Genres of the sites (extracted as well)
* Check for iframes, Bootstrap, React, etc.
* Check for Google Analytics
* Check for Facebook/Twitter/Google+ meta elements
* How much are HTML5 and CSS3 elements used?

# Load dataset

In [1]:
import json

with open('companies_cross.json', 'r') as f:
    companies = json.load(f)

In [9]:
# Let's use only companies that we currently have website of (at least some pages)
companies = [c for c in companies if c.get('path_to')]

# Get bag of words for websites

In [30]:
import os
import gzip

from scandir import walk
from bs4 import BeautifulSoup
from tqdm import tqdm
from sklearn.feature_extraction.text import HashingVectorizer


# Some helper functions
def get_soup_from_html_gz(path):
    with gzip.open(path, 'rt') as f:
        soup = BeautifulSoup(f, 'html.parser')
    return soup


texts = []
hv = HashingVectorizer()
for company in tqdm(companies[:1000]):
    text = ''
    path = company['path_to']
    for dirpath, dirs, file_names in walk(path):
        for file_name in file_names:
            if file_name.endswith('.gz'):
                text += get_soup_from_html_gz(os.path.join(dirpath, file_name)).get_text()
                text += '\n'
    
    # NOTE: It might be better to first extract text, save it in file, then pass list of paths to HashingVectorizer
    texts.append(text)
bag = hv.transform(texts)
del texts


100%|██████████| 1000/1000 [03:24<00:00,  4.89it/s]


In [31]:
bag

<1000x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 1218460 stored elements in Compressed Sparse Row format>

# Extract different features from websites

In [None]:
import os
import gzip

from scandir import walk
from bs4 import BeautifulSoup
from tqdm import tqdm


# Some helper functions
def get_soup_from_html_gz(path):
    with gzip.open(path, 'rt') as f:
        soup = BeautifulSoup(f, 'html.parser')
    return soup


image_counts = []
hv = HashingVectorizer()
for company in tqdm(companies[:100]):
    image_count = 0
    sites = 0
    path = company['path_to']
    for dirpath, dirs, file_names in walk(path):
        for file_name in file_names:
            if file_name.endswith('.gz'):
                sites += 1
                image_count += len(get_soup_from_html_gz(os.path.join(dirpath, file_name)).find_all('form'))                
    image_counts.append(image_count / sites if sites else 0)



 20%|██        | 20/100 [00:02<00:10,  7.91it/s]

In [37]:
image_counts

[51.81818181818182,
 1.5,
 8.571428571428571,
 9.0,
 4.857142857142857,
 0.0,
 9.0,
 8.25,
 5.5,
 8.333333333333334,
 11.11111111111111,
 0,
 9.5,
 19.545454545454547,
 8.0,
 8.0,
 2.0,
 1.0,
 7.0,
 6.2,
 2.0,
 3.3333333333333335,
 28.72222222222222,
 2.3846153846153846,
 1.0,
 1.8571428571428572,
 7.666666666666667,
 0,
 14.333333333333334,
 23.0,
 13.0,
 3.3333333333333335,
 3.0,
 11.678571428571429,
 12.1,
 4.0,
 12.666666666666666,
 6.25,
 27.6,
 34.875,
 19.571428571428573,
 11.714285714285714,
 0.0,
 12.7,
 13.0,
 0,
 7.875,
 19.88888888888889,
 19.0,
 4.0,
 5.0,
 5.5,
 11.818181818181818,
 7.75,
 7.8,
 8.0,
 12.5,
 3.3333333333333335,
 2.2,
 14.166666666666666,
 3.1818181818181817,
 5.142857142857143,
 3.625,
 69.33333333333333,
 9.0,
 13.0,
 5.428571428571429,
 0,
 15.142857142857142,
 10.0,
 16.333333333333332,
 17.375,
 2.3333333333333335,
 0,
 9.090909090909092,
 10.583333333333334,
 5.181818181818182,
 22.142857142857142,
 3.2857142857142856,
 0.0,
 34.111111111111114,
 5.0