# Import data

In [None]:
import json

# Load the company records; later cells treat the result as a list of dicts.
with open('companies_cross.json') as f:
    companies = json.loads(f.read())

In [None]:
from collections import Counter

# Keep only companies for which we already have website data on disk
# (at least some pages), i.e. a non-empty 'path_to'.
companies = [c for c in companies if c.get('path_to')]

# Also cap the number of sites per company (this field is not cleaned yet).
# `or 0` treats both a missing key and a JSON null as "no sites".
companies = [c for c in companies if (c.get('current_site_count') or 0) < 100]

# Keep only well represented industries.  Records without an 'industry' are
# left out of the tally instead of raising KeyError; Counter reports 0 for
# them, so the threshold below drops them as well.
industries = Counter(c['industry'] for c in companies if c.get('industry'))
companies = [c for c in companies if industries[c.get('industry')] > 500]

In [None]:
# Mapping consulted by the scoring functions through `.get(label, [])`.
with open('similar_industries.json') as f:
    similar_industries = json.loads(f.read())

### Some helper methods

In [None]:
import os

import logging

# Let every record through the root logger; handlers do the filtering.
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)

# Module logger that writes INFO and above to 'extract_features.log'.
logger = logging.getLogger(__name__)
_log_path = os.path.abspath('extract_features.log')
# Re-running this cell must not stack a second FileHandler on the same logger
# (each record would then be written twice), so reuse an existing one.
handler = next(
    (h for h in logger.handlers
     if isinstance(h, logging.FileHandler) and h.baseFilename == _log_path),
    None,
)
if handler is None:
    handler = logging.FileHandler(_log_path)
    logger.addHandler(handler)
handler.setLevel(logging.INFO)


def get_texts_for_domain(path, file_name):
    """Read one text file stored under a domain's directory.

    Returns a ``(text, text_file)`` pair.  When ``file_name`` exists under
    ``path`` this is the file's content and its joined path; otherwise it is
    ``(None, '/dev/null')``.
    """
    candidate = os.path.join(path, file_name)
    if os.path.exists(candidate):
        with open(candidate, 'r') as handle:
            return handle.read(), candidate
    # Missing file: no text, plus a placeholder path for callers that only
    # want something they can pass on as a filename.
    return None, '/dev/null'


def get_full_text(path):
    """Return the ``(text, text_file)`` pair for 'full_texts.txt' under ``path``."""
    return get_texts_for_domain(path, 'full_texts.txt')


def get_nav_text(path):
    """Return the ``(text, text_file)`` pair for 'nav_menu.txt' under ``path``."""
    return get_texts_for_domain(path, 'nav_menu.txt')


def get_meta_descriptions_text(path):
    """Return the ``(text, text_file)`` pair for 'descriptions.txt' under ``path``."""
    return get_texts_for_domain(path, 'descriptions.txt')


def get_meta_keywords_text(path):
    """Return the ``(text, text_file)`` pair for 'keywords.txt' under ``path``."""
    return get_texts_for_domain(path, 'keywords.txt')


def get_titles_text(path):
    """Return the ``(text, text_file)`` pair for 'titles.txt' under ``path``."""
    return get_texts_for_domain(path, 'titles.txt')


def get_metas_text(path):
    """Return the ``(text, text_file)`` pair for 'metas.txt' under ``path``."""
    return get_texts_for_domain(path, 'metas.txt')

### Scoring functions

In [None]:
def _credit_per_class(estimator, data, actual, similar=None):
    """Sum the credit earned for each true label.

    An exact prediction earns 1, a prediction listed in ``similar`` for the
    true label earns 0.5, anything else earns nothing.  Labels that never
    earn credit are absent from the returned dict.
    """
    if similar is None:
        # Default to the notebook-level mapping loaded from JSON.
        similar = similar_industries
    predicted = estimator.predict(data)
    credit = {}
    for truth, guess in zip(actual, predicted):
        if truth == guess:
            credit[truth] = credit.get(truth, 0) + 1
        elif guess in similar.get(truth, []):
            credit[truth] = credit.get(truth, 0) + 0.5
    return credit


def score_similar(estimator, data, actual, similar=None):
    """Score predictions, giving half credit for a "similar" label.

    Usable as a scikit-learn ``scoring`` callable: it takes
    ``(estimator, data, actual)`` and returns a single number.

    :param estimator: fitted object exposing ``predict(data)``.
    :param data: samples handed to ``estimator.predict``.
    :param actual: true labels, aligned with ``data``.
    :param similar: optional ``{label: [similar labels]}`` mapping; defaults
        to the module-level ``similar_industries``.
    :return: total credit divided by ``len(actual)``; 0 when ``actual`` is
        empty.
    """
    total = len(actual)
    if total == 0:
        return 0
    # The original normalised per class and then re-weighted by class size,
    # which is simply the total credit over the number of samples.
    return sum(_credit_per_class(estimator, data, actual, similar).values()) / total


def score_similar_single(estimator, data, actual, similar=None):
    """Return the per-label version of :func:`score_similar`.

    :return: ``{label: credit / number of samples with that true label}``.
        Labels that earned no credit at all are not included.
    """
    expected_count = Counter(actual)
    credit = _credit_per_class(estimator, data, actual, similar)
    return {label: earned / expected_count[label] for label, earned in credit.items()}

# Ordinal rank of each company-size bucket, smallest to largest.
sizes = {
    '1': 1,
    '1-10': 2,
    '11-50': 3,
    '51-200': 4,
    '201-500': 5,
    '501-1000': 6,
    '1001-5000': 7,
    '5001-10,000': 8,
    '10,001+': 9,
}

def scoring_neighbour(estimator, data, actual):
    """Fraction of predictions that hit the true size bucket or an adjacent one.

    :param estimator: fitted object exposing ``predict(data)``.
    :param data: samples handed to ``estimator.predict``.
    :param actual: true size labels (keys of ``sizes``), aligned with ``data``.
    :return: share of samples whose predicted and true ranks differ by less
        than 2; ``0.0`` when ``actual`` is empty.

    A label that is not a key of ``sizes`` (on either side) counts as a miss
    instead of failing with ``None - int``.
    """
    total = len(actual)
    if total == 0:
        return 0.0
    predicted = estimator.predict(data)
    correct = 0
    for p, a in zip(predicted, actual):
        rank_p, rank_a = sizes.get(p), sizes.get(a)
        if rank_p is None or rank_a is None:
            continue
        if abs(rank_p - rank_a) < 2:
            correct += 1
    return correct / total

# Classify

## Prepare pipeline

In [None]:
# Create transformers
from sklearn.preprocessing import FunctionTransformer
from tqdm import tqdm

def extract_from_company(companies, func):
    """Map each company to the path reported by ``func``.

    ``func`` is called with the company's ``'path_to'`` value and returns a
    ``(text, path)`` pair; only the path is kept, the text is discarded.
    Iteration is wrapped in ``tqdm``.
    """
    return [func(entry['path_to'])[1] for entry in tqdm(companies)]

def get_full_text_from_company(companies):
    """Return the paths ``get_full_text`` reports for each company."""
    return extract_from_company(companies, get_full_text)

def get_nav_menus_from_company(companies):
    """Return the paths ``get_nav_text`` reports for each company."""
    return extract_from_company(companies, get_nav_text)

def get_descriptions_from_company(companies):
    """Return the paths ``get_meta_descriptions_text`` reports for each company."""
    return extract_from_company(companies, get_meta_descriptions_text)

def get_titles_from_company(companies):
    """Return the paths ``get_titles_text`` reports for each company."""
    return extract_from_company(companies, get_titles_text)

def get_keywords_from_company(companies):
    """Return the paths ``get_meta_keywords_text`` reports for each company."""
    return extract_from_company(companies, get_meta_keywords_text)

def get_metas_from_company(companies):
    """Return the paths ``get_metas_text`` reports for each company."""
    return extract_from_company(companies, get_metas_text)


# Wrap each extractor as a pipeline step that turns a list of company dicts
# into a list of file paths.  validate=False skips scikit-learn's array check,
# which the list of dicts is presumably not meant to go through.
full_text_transformer = FunctionTransformer(get_full_text_from_company, validate=False)
nav_menus_transformer = FunctionTransformer(get_nav_menus_from_company, validate=False)
descriptions_transformer = FunctionTransformer(get_descriptions_from_company, validate=False)
titles_transformer = FunctionTransformer(get_titles_from_company, validate=False)
keywords_transformer = FunctionTransformer(get_keywords_from_company, validate=False)
metas_transformer = FunctionTransformer(get_metas_from_company, validate=False)

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.pipeline import Pipeline, FeatureUnion
from numpy import concatenate

# Vectorizers read their documents from disk: input='filename' means each
# sample they receive is a path, which is what the *_transformer steps emit.
hv = HashingVectorizer(non_negative=True, input='filename')
tfv = TfidfVectorizer(input='filename', max_features=3000)
tf_transformer = TfidfTransformer()
# clf = SGDClassifier(loss='hinge', penalty='l2', n_iter=10, alpha=0.001, n_jobs=10)#, class_weight='balanced')
clf2 = SGDClassifier(loss='hinge', n_iter=10, alpha=0.001, n_jobs=10)#, class_weight='balanced')
lda = LatentDirichletAllocation(n_topics=100, learning_method='batch', max_iter=5, n_jobs=5, learning_offset=50.)
# Candidate meta-classifiers for stacking (hinge and log loss variants).
st_clf = SGDClassifier(loss='hinge', penalty='l2', n_iter=10, alpha=0.001, n_jobs=10)
st_clf2 = SGDClassifier(loss='log', penalty='l2', n_iter=10, alpha=0.001, n_jobs=10)
lsa = TruncatedSVD(n_components=100)

# Shared tail of every per-source pipeline: hash -> tf-idf -> linear classifier.
# NOTE(review): `clf` is used below, but its assignment above is commented out;
# the only live definition in this file is in the later CustomSGD cell, so that
# cell has to be run first -- confirm the intended cell order.
# pipeline_elements = [('HV', hv), ('tfid', tf_transformer), ('lsa', lsa), ('svc', clf)]
pipeline_elements = [('HV', hv), ('tfid', tf_transformer), ('svc', clf)]
pipe = Pipeline(pipeline_elements)

In [None]:
# We need to do this, to make our classifier return "something" with .predict_proba()
# The default version doesn't allow that for this specific classifier
# We need it for VotingClassifier, to enable voting='soft'
class CustomSGD(SGDClassifier):
    """SGDClassifier whose ``predict_proba`` forwards to ``decision_function``."""

    def predict_proba(self, X):
        """Return ``self.decision_function(X)`` unchanged.

        NOTE(review): these look like decision scores rather than normalised
        probabilities -- confirm soft voting behaves acceptably with them.
        """
        return self.decision_function(X)

# Classifier used as the 'svc' step of the shared pipeline elements.
clf = CustomSGD(loss='hinge', penalty='l2', n_iter=10, alpha=0.001, n_jobs=10)

In [None]:
from sklearn.ensemble import VotingClassifier
from mlxtend.classifier import StackingClassifier


from sklearn.base import clone


def _fresh_steps():
    """Return unfitted copies of the shared vectorise / tf-idf / classify steps."""
    return [(name, clone(step)) for name, step in pipeline_elements]


# One pipeline per text source.  Each gets its own copies of the shared steps:
# built on the very same instances, fitting one pipeline directly would
# overwrite the fitted tf-idf and classifier state of all the others.
pipe_full = Pipeline([('Full', full_text_transformer)] + _fresh_steps())
pipe_nav_menus = Pipeline([('Navs', nav_menus_transformer)] + _fresh_steps())
pipe_descriptions = Pipeline([('Descriptions', descriptions_transformer)] + _fresh_steps())
pipe_titles = Pipeline([('Titles', titles_transformer)] + _fresh_steps())
pipe_keywords = Pipeline([('Keywords', keywords_transformer)] + _fresh_steps())
pipe_meta = Pipeline([('Metas', metas_transformer)] + _fresh_steps())

# Pass-through transformer, so that the union below yields the LDA output
# side by side with the untouched tf-idf features.
identity = FunctionTransformer(None, validate=False)
lda_union = FeatureUnion([('lda', lda), ('identity', identity)])
# NOTE(review): the first step is `pipe_meta`, a complete pipeline ending in a
# classifier, and its output is fed to `tfv`, which is configured with
# input='filename'.  `metas_transformer` (which yields file paths) may have
# been intended here -- confirm.
pipe_lda = Pipeline([('Full', pipe_meta),
                     ('tfv', tfv),
                     ('lda_union', lda_union),
#                      ('lda', lda),
                     ('clf', clf2)
                    ])


# Stack the full-text and meta pipelines under the hinge-loss meta-classifier.
# For some reason this is not working as it should. Explore why
stacking = StackingClassifier(classifiers=[pipe_full,
                                           pipe_meta],
                             meta_classifier=st_clf, use_probas=True, average_probas=False)

# Soft-voting ensemble; currently only the full-text and meta pipelines vote.
# Might be able to get better results by adding some other
# estimators. Find out with GridSearchCV. Try different weights too
voting = VotingClassifier(estimators=[('full', pipe_full),
#                                       ('navs', pipe_nav_menus),
#                                       ('desc', pipe_descriptions),
#                                       ('titles', pipe_titles),
#                                       ('keywords', pipe_keywords),
                                      ('meta', pipe_meta),
#                                       ('lda', pipe_lda)
                                     ],
                         voting='soft')

## Fit and test

In [None]:
# Run this cell if you want to test only on English websites.
# The unfiltered list stays available as `companies_orig`.
companies_orig = companies

# .get() so that records without a 'website_lang' key are skipped rather than
# aborting the cell with KeyError (the other filters use .get() as well).
companies = [c for c in companies if c.get('website_lang') == 'en']

In [None]:
# Integer encoding of the industry labels and its inverse.
# This is needed only for StackingClassifier (it throws error on non-int labels...)
target_int_map = {name: index for index, name in enumerate(industries)}
inv_target_int_map = {index: name for name, index in target_int_map.items()}

In [None]:
from sklearn.utils import shuffle
from scipy.sparse import hstack


# Shuffle once; everything except the last 10000 entries is used for training
# (the last 10000 are read back as the test set further down).
companies = shuffle(companies)

target = []
train_data = []

# Keep only companies whose full-text file exists and is non-empty; the text
# itself is discarded, the pipelines re-read it from disk.
for company in tqdm(companies[:-10000]):
    text, file_name = get_full_text(company['path_to'])
    if text:
        train_data.append(company)
        industry = company['industry']
        target.append(industry)
#         target.append(company['company_size_clean'])

# NOTE(review): `int_target` is used by the stacking cross-validation cell,
# but this is its only definition and it is commented out.
# int_target = [target_int_map[industry] for industry in target]

# Fit
# clf2.fit(new, target)

# pipe_lda_final.fit(train_data, target)
voting.fit(train_data, target)
# pipe_full.fit(train_data, target)
# pipe_full.fit(train_data, target)

In [None]:
# Select the best in stacking
from sklearn.cross_validation import cross_val_score

# StackingClassifier needs integer labels, so every candidate in this cell is
# scored on the integer-encoded targets.  Built here because no other live
# line in the notebook defines `int_target`.
# NOTE(review): with integer labels, score_similar only finds "similar"
# matches if `similar_industries` is keyed the same way -- confirm.
int_target = [target_int_map[industry] for industry in target]

classifiers = [pipe_full,
               pipe_meta]

# Baseline: each member pipeline on its own.
for c in classifiers:
    score_stacking = cross_val_score(c, train_data, int_target, scoring=score_similar, n_jobs=5)
    print(score_stacking.mean())

# The stacked ensemble, for comparison with the members above.
score_stacking = cross_val_score(stacking, train_data, int_target, scoring=score_similar, n_jobs=5)
print("***** score_stacking *****")
print(score_stacking.mean())

# params = [
#     {'use_probas': [True, False]},
#     {'use_probas': [True], 'average_probas': [True, False]}
# ]

In [None]:
# Select the best in voting
from sklearn.grid_search import GridSearchCV

# Two candidate grids; only the first (hard vs. soft voting) is searched,
# because `params` is narrowed to element 0 right below.
# NOTE(review): the weight vectors in the second grid have 4 entries while
# `voting` currently has 2 live estimators -- adjust before enabling it.
params = [{'voting': ['hard', 'soft']},
          {'voting': ['hard'],
          'weights': [[1, 1, 1, 1], [2, 1.5, 1.2, 1.2], [1, 1.2, 1.2, 1], [1.5, 1, 1, 1]]}]
params = params[0]

grid = GridSearchCV(voting, param_grid=params, scoring=score_similar)
grid.fit(train_data, target)

## Test time!

In [None]:
from sklearn.cross_validation import cross_val_score

# Cross-validated similar-industry score of the meta-tag pipeline alone.
score_meta = cross_val_score(pipe_meta, train_data, target, scoring=score_similar, n_jobs=5)
print("***** score_meta *****")
print(score_meta.mean())
# score_lda = cross_val_score(pipe_lda, train_data, target, scoring=score_similar, n_jobs=5)
# print("***** score_lda *****")
# print(score_lda.mean())


In [None]:
from sklearn.cross_validation import cross_val_score


# Cross-validate every single-source pipeline with the same scorer, so the
# printed means are directly comparable.
score_full = cross_val_score(pipe_full, train_data, target, scoring=score_similar, n_jobs=5)
print("***** score_full *****")
print(score_full.mean())
score_nav = cross_val_score(pipe_nav_menus, train_data, target, scoring=score_similar, n_jobs=5)
print("***** score_nav *****")
print(score_nav.mean())
score_titles = cross_val_score(pipe_titles, train_data, target, scoring=score_similar, n_jobs=5)
print("***** score_titles *****")
print(score_titles.mean())
score_descriptions = cross_val_score(pipe_descriptions, train_data, target, scoring=score_similar, n_jobs=5)
print("***** score_descriptions *****")
print(score_descriptions.mean())
score_keywords = cross_val_score(pipe_keywords, train_data, target, scoring=score_similar, n_jobs=5)
print("***** score_keywords *****")
print(score_keywords.mean())

# params = {
#     'voting': ['hard'],
#     'weights': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 0], [1.7, 1, 1, 1, 1], [1, 1, 0.5, 0.5, 0.5], [1, 0.5, 1, 1, 0.5]]
# }

# grid = GridSearchCV(voting, param_grid=params, scoring=score_similar)
# grid.fit(train_data, target)

In [None]:
from sklearn.cross_validation import cross_val_score


# Cross-validated score of the voting ensemble.  Stored and labelled as
# `score_voting`: the estimator is `voting`, and reusing `score_full` would
# overwrite the full-text pipeline's result from the cell above.
score_voting = cross_val_score(voting, train_data, target, scoring=score_similar, n_jobs=5)
print("***** score_voting *****")
print(score_voting.mean())

# score_full = cross_val_score(pipe_full, train_data, target, scoring=scoring_neighbour, n_jobs=5)
# print("***** score_full *****")
# print(score_full.mean())
# score_nav = cross_val_score(pipe_nav_menus, train_data, target, scoring=scoring_neighbour, n_jobs=5)
# print("***** score_nav *****")
# print(score_nav.mean())
# score_titles = cross_val_score(pipe_titles, train_data, target, scoring=scoring_neighbour, n_jobs=5)
# print("***** score_titles *****")
# print(score_titles.mean())
# score_descriptions = cross_val_score(pipe_descriptions, train_data, target, scoring=scoring_neighbour, n_jobs=5)
# print("***** score_descriptions *****")
# print(score_descriptions.mean())
# score_keywords = cross_val_score(pipe_keywords, train_data, target, scoring=scoring_neighbour, n_jobs=5)
# print("***** score_keywords *****")
# print(score_keywords.mean())

In [None]:
# Select the best
from sklearn.grid_search import GridSearchCV

params = {
#     'svc__penalty': ['l1', 'l2'],
#     'svc__alpha': [0.001, 0.01, 0.1],
#     'svc__n_iter': [5, 10, 100],
#     'svc__class_weight': [None, 'balanced']
#     'HV__stop_words': [None, 'english']
#     'svc__loss': ['log'],
#     'svc__epsilon': [0.1, 5]
    'lsa__n_components': [90, 200]
         }

# 'lsa__n_components' needs a step named 'lsa'.  pipe_meta has none, so the
# search runs on a variant of it with the TruncatedSVD step inserted before
# the classifier (the layout of the commented-out pipeline_elements line).
pipe_meta_lsa = Pipeline([('Metas', metas_transformer),
                          ('HV', hv),
                          ('tfid', tf_transformer),
                          ('lsa', lsa),
                          ('svc', clf)])

grid = GridSearchCV(pipe_meta_lsa, param_grid=params, scoring=score_similar, n_jobs=3)
grid.fit(train_data, target)

In [None]:
# Try getting something valuable out of LDA

# Two independent one-dimensional searches: vocabulary size, then topic count.
# The LDA step lives inside the 'lda_union' FeatureUnion of pipe_lda, so its
# parameter has to be addressed through that step.
lda_params = [
    {'tfv__max_features': [1000, 2000, 4000]},
    {'lda_union__lda__n_topics': [10, 50, 90]},
]

grid_lda = GridSearchCV(pipe_lda, param_grid=lda_params, scoring=score_similar)#, n_jobs=3)

# Only the first 70000 training samples are used for this search.
grid_lda.fit(train_data[:70000], target[:70000])

In [None]:
# Single-point "search": cross-validates pipe_lda with a 10000-term vocabulary
# on the full training set.
lda_params ={
    'tfv__max_features': [10000],
#     'lda_union__lda__learning_offset': [10., 50.],
#     'clf__alpha': [0.01, 0.001]
}

grid_lda = GridSearchCV(pipe_lda, param_grid=lda_params, scoring=score_similar)#, n_jobs=3)

grid_lda.fit(train_data, target)

In [None]:
# Hold-out set: those of the last 10000 companies whose full-text file
# exists and is non-empty.
test_data = [
    company
    for company in tqdm(companies[-10000:])
    if get_full_text(company['path_to'])[0]
]

target_test = [company['industry'] for company in test_data]
# score_voting = voting.score(test_data, [c['industry'] for c in test_data])  # 39% -> 40% eng  44% adjusted
# score_full = pipe_full.score(test_data, [c['company_size_clean'] for c in test_data])  # 52.3% -> 53.8% eng
# nav_menus - 21%
# descriptions - 28%
# cutoff industries 500 - 750 - 2%
# added lda (4000, 90) - 26%

In [None]:
# Similar-industry score of the fitted voting ensemble on the hold-out set.
score_similar(voting, test_data, [c['industry'] for c in test_data])

### Try a dummy estimator too

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline scores noted from earlier runs:
# Always most frequent - 8.3%
# Based on freq - 4.2%
# Random 2.4%

# NOTE(review): per the scikit-learn docs `constant` is only used by
# strategy='constant', so it presumably has no effect with 'stratified'.
dummy = DummyClassifier(strategy='stratified', constant='Marketing and Advertising')
dummy.fit(train_data, target)

## Evaluate

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, recall_score, precision_score

# Per-class precision/recall and the confusion matrix on the hold-out set.
expected = [c['industry'] for c in test_data]
# NOTE(review): the only live fit call in this file is voting.fit(); the
# direct pipe_full.fit() is commented out, so confirm pipe_full is actually
# fitted before this cell is run.
predicted = pipe_full.predict(test_data)
# predicted = ['Not' if p > 0 else 'Food & Beverages' for p in predicted_prob]

print("Classification report for classifier:\n%s\n"
      % classification_report(expected, predicted))
print("Confusion matrix:\n%s" % confusion_matrix(expected, predicted))

In [None]:
# Per-industry scores of the voting ensemble on the hold-out set
# (industries that earned no credit are missing from the dict).
results = score_similar_single(voting, test_data, target_test)

### See results based on the industry

In [None]:
# One line per industry in ascending name order: name, score in percent,
# and that industry's count from `industries`.
for key in sorted(results):
    print("%s----%.1f---%d" % (key, results[key] * 100, industries[key]))