# Import data

In [None]:
import json

# Load the company records from the JSON dump (path is relative to the
# current working directory).
with open('companies_cross.json') as companies_file:
    companies = json.load(companies_file)

In [None]:
from collections import Counter

# Keep only companies for which we currently have website data (at least
# some pages), i.e. a non-empty 'path_to' entry.
companies = [c for c in companies if c.get('path_to')]

# Also cap the number of sites per company, because this field has not been
# cleaned yet.
companies = [c for c in companies if c.get('current_site_count', 0) < 100]

# Keep only well-represented industries (more than 500 companies).
# Records without an 'industry' are skipped while counting instead of raising
# KeyError; the Counter yields 0 for them below, so they are filtered out.
industries = Counter(c['industry'] for c in companies if c.get('industry'))
companies = [c for c in companies if industries[c.get('industry')] > 500]

### Some helper methods

In [None]:
import os

import logging

# Lower the root logger threshold to DEBUG.
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)

# Module-level logger with a file handler that writes records of level INFO
# and above to 'extract_features.log'.
# NOTE(review): re-running this cell attaches another FileHandler to the same
# logger, so each record would then be written more than once.
logger = logging.getLogger(__name__)
handler = logging.FileHandler('extract_features.log')
handler.setLevel(logging.INFO)
logger.addHandler(handler)


def get_texts_for_domain(path, file_name, force_read=False):
    """Read one of the text files stored for a domain.

    Args:
        path: Directory that holds the domain's text files.
        file_name: Name of the file inside ``path``.
        force_read: When true, the file is treated as missing even if it
            exists. The condition is unchanged from the original code, which
            read this flag from a name that was never defined.
            NOTE(review): the name suggests the opposite meaning -- confirm
            the intended semantics.

    Returns:
        A ``(text, text_file)`` tuple. For a readable file this is its
        content and its path; otherwise ``(None, os.devnull)``, so callers
        that pass the path on still get a path that can be opened.
    """
    text_file = os.path.join(path, file_name)
    if force_read or not os.path.exists(text_file):
        # os.devnull is '/dev/null' on POSIX and stays valid elsewhere.
        return None, os.devnull
    with open(text_file, 'r') as f:
        return f.read(), text_file


def get_full_text(path):
    """Return ``(text, file_path)`` for the domain's ``full_texts.txt``."""
    return get_texts_for_domain(path=path, file_name='full_texts.txt')


def get_nav_text(path):
    """Return ``(text, file_path)`` for the domain's ``nav_menu.txt``."""
    return get_texts_for_domain(path=path, file_name='nav_menu.txt')


def get_meta_descriptions_text(path):
    """Return ``(text, file_path)`` for the domain's ``descriptions.txt``."""
    return get_texts_for_domain(path=path, file_name='descriptions.txt')


def get_meta_keywords_text(path):
    """Return ``(text, file_path)`` for the domain's ``keywords.txt``."""
    return get_texts_for_domain(path=path, file_name='keywords.txt')


def get_titles_text(path):
    """Return ``(text, file_path)`` for the domain's ``titles.txt``."""
    return get_texts_for_domain(path=path, file_name='titles.txt')

# Classify

## Prepare pipeline

In [None]:
# Create transformers
from sklearn.preprocessing import FunctionTransformer
from tqdm import tqdm

def extract_from_company(companies, func):
    """Collect one text-file path per company.

    Args:
        companies: Iterable of company dicts; each must have a 'path_to' key.
        func: Callable that takes a company's 'path_to' value and returns a
            ``(text, file_path)`` tuple, e.g. ``get_full_text``.

    Returns:
        List with the ``file_path`` returned by ``func`` for every company,
        in input order.
    """
    # Only the path is kept; the text part of the tuple is discarded.
    # tqdm shows progress while the companies are processed.
    return [func(company['path_to'])[1] for company in tqdm(companies)]

def get_full_text_from_company(companies):
    """Run ``extract_from_company`` with the ``get_full_text`` reader."""
    return extract_from_company(companies, func=get_full_text)

def get_nav_menus_from_company(companies):
    """Run ``extract_from_company`` with the ``get_nav_text`` reader."""
    return extract_from_company(companies, func=get_nav_text)

def get_descriptions_from_company(companies):
    """Run ``extract_from_company`` with ``get_meta_descriptions_text``."""
    return extract_from_company(companies, func=get_meta_descriptions_text)

def get_titles_from_company(companies):
    """Run ``extract_from_company`` with the ``get_titles_text`` reader."""
    return extract_from_company(companies, func=get_titles_text)


# Wrap each extractor in a FunctionTransformer so it can serve as the first
# step of a scikit-learn pipeline. validate=False is passed because the input
# is a list of company dicts, not a numeric array.
full_text_transformer = FunctionTransformer(
    func=get_full_text_from_company,
    validate=False,
)
nav_menus_transformer = FunctionTransformer(
    func=get_nav_menus_from_company,
    validate=False,
)
descriptions_transformer = FunctionTransformer(
    func=get_descriptions_from_company,
    validate=False,
)
titles_transformer = FunctionTransformer(
    func=get_titles_from_company,
    validate=False,
)

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Hashed bag-of-words features; input='filename' makes the vectorizer take
# file paths and read the files itself.
# NOTE(review): `non_negative` was deprecated in scikit-learn 0.19 in favour
# of `alternate_sign=False` -- confirm against the installed version.
hv = HashingVectorizer(non_negative=True, input='filename')
tf_transformer = TfidfTransformer()
# SGD-trained linear classifier with hinge loss.
# NOTE(review): `n_iter` was deprecated in scikit-learn 0.19 in favour of
# `max_iter` -- confirm against the installed version.
clf = SGDClassifier(loss='hinge', n_iter=100, alpha=0.01, n_jobs=10)

# Steps shared by every per-source pipeline: vectorize -> tf-idf -> classify.
pipeline_elements = [('HV', hv), ('tfid', tf_transformer), ('svc', clf)]
pipe = Pipeline(pipeline_elements)

In [None]:
from sklearn.ensemble import VotingClassifier

# One pipeline per text source: a source-specific path extractor followed by
# the shared vectorizer / tf-idf / classifier steps.
pipe_full = Pipeline([('Full', full_text_transformer), *pipeline_elements])
pipe_nav_menus = Pipeline([('Navs', nav_menus_transformer), *pipeline_elements])
pipe_descriptions = Pipeline(
    [('Descriptions', descriptions_transformer), *pipeline_elements]
)
pipe_titles = Pipeline([('Titles', titles_transformer), *pipeline_elements])

# Hard-voting ensemble over the four per-source pipelines.
voting = VotingClassifier(
    estimators=[
        ('full', pipe_full),
        ('navs', pipe_nav_menus),
        ('desc', pipe_descriptions),
        ('titles', pipe_titles),
    ],
    voting='hard',
)

## Fit and test

In [None]:
from sklearn.utils import shuffle

# Shuffle so that the train and test slices are random samples.
companies = shuffle(companies)

# Train on at most 100000 companies, but never on the last 8000: the test
# cell evaluates on companies[-8000:], and a plain companies[:100000] slice
# would overlap it whenever there are fewer than 108000 companies.
# NOTE(review): with 8000 or fewer companies this leaves no training data.
train_data = companies[:-8000][:100000]

target = [company['industry'] for company in train_data]

voting.fit(train_data, target)

In [None]:
from sklearn.metrics import accuracy_score

# Evaluate on the last 8000 companies of the shuffled list.
test_data = companies[-8000:]

# Names used by the evaluation cells below: `tested` holds the evaluated
# companies, `expected` their true industries, `predicted` the ensemble output
# (aligned element by element).
tested = test_data
expected = [c['industry'] for c in test_data]
predicted = voting.predict(test_data)

# Same value as voting.score(test_data, expected) (mean accuracy), computed
# from the stored predictions so the test files are only processed once.
score = accuracy_score(expected, predicted)

## Evaluate

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, recall_score, precision_score

# Text report of the classification metrics for the test predictions.
report = classification_report(expected, predicted)
print("Classification report for classifier:\n%s\n" % report)

# Confusion matrix of true versus predicted industries.
matrix = confusion_matrix(expected, predicted)
print("Confusion matrix:\n%s" % matrix)

In [None]:
# Per-industry score with partial credit: an exact match counts 1, a
# prediction listed as similar to the true industry counts 0.5.
# NOTE(review): `similar_industries` is not defined in the visible cells --
# presumably a dict mapping an industry to its related industries; confirm.
expected_count = Counter(expected)
results = {}
for truth, guess in zip(expected, predicted):
    if truth == guess:
        credit = 1
    elif guess in similar_industries.get(truth, []):
        credit = 0.5
    else:
        continue
    results[truth] = results.get(truth, 0) + credit

# Normalize the accumulated credit by the number of examples per industry.
for industry in list(results):
    results[industry] = results[industry] / expected_count[industry]

# Plain weighted recall, followed by the partial-credit equivalent (left as
# the last expression so the notebook displays it).
print(recall_score(expected, predicted, average='weighted'))
sum(v * expected_count[key] / len(expected) for key, v in results.items())

### Check accuracy for specific industries

In [None]:
from collections import Counter, defaultdict

# `tested` and `predicted` must be aligned: predicted[i] is the prediction
# for tested[i].

# Number of evaluated companies per true industry.
test_industries = Counter(c['industry'] for c in tested)

# Number of correct predictions per true industry.
correct_counts = Counter(
    company['industry']
    for company, predict in zip(tested, predicted)
    if company['industry'] == predict
)

# Accuracy in percent for every industry in the test set. Industries without
# a single correct prediction are reported as 0.0 instead of being left out.
company_accuracy = defaultdict(int)
for industry, total in test_industries.items():
    company_accuracy[industry] = round(correct_counts[industry] / total * 100, 2)