## Note
This notebook must be run from within the scraper Django project so that it can access the correct models and data.

## Data from database

In [None]:
from scraper_app.models import Companies

# Keep only companies for which every field used later in this notebook is
# filled in, with one row per distinct website.
# NOTE(review): distinct() with field names is PostgreSQL-only in Django --
# confirm the project's database backend.
companies = (
    Companies.objects
    .exclude(company_name="")
    .exclude(website="")
    .exclude(website__isnull=True)
    .exclude(company_size="")
    .exclude(company_size=None)
    .exclude(industry='')
    .exclude(founded__isnull=True)
    .distinct('website')
)

# Fetch the rows as plain dicts restricted to the columns we need.
wanted_fields = ('company_name', 'website', 'company_size', 'industry', 'founded', 'id')
values = companies.values(*wanted_fields)


## Data from file

In [None]:
import json

# Read the company records from companies.json into `values`
# (read-only text mode is open()'s default).
with open('companies.json') as json_file:
    values = json.load(json_file)

## Cleaning

### Bad URLs

In [None]:
from django.core.exceptions import ValidationError
from django.core.validators import URLValidator

url_validator = URLValidator()

# Keep only the records whose 'website' passes Django's URL validation.
clean_values = []

for value in values:
    url = value.get('website')
    try:
        url_validator(url)
    except ValidationError:
        # Invalid (or missing) URL: dropping the record is the purpose of this
        # cell. Only ValidationError is caught, so unrelated errors still
        # surface instead of silently shrinking the dataset.
        continue
    clean_values.append(value)

values = clean_values

### Founded in the future

In [None]:
# Records founded in this year or later are discarded.
# NOTE(review): the cutoff is hard-coded; presumably it matches when the data
# was scraped -- confirm before reusing this notebook on newer data.
FOUNDED_CUTOFF_YEAR = 2017

values = clean_values = [
    company for company in values
    if company['founded'] < FOUNDED_CUTOFF_YEAR
]

### Company size values

In [None]:
# Maps the first space-separated token of a raw 'company_size' string to the
# label used for that size bucket. Tokens that are not listed are kept as-is.
size_map = {
    'Myself': '1',
    'Just': '1',
    '1,001-5,000': '1001-5000',
    '501-1,000': '501-1000',
    '5,001-10,000': '5001-10,000',
    '10,001': '10,001+',
    # split(' ') does not break on the non-breaking space (\xa0), so this
    # variant arrives as a single token and needs its own entry.
    '20-49\xa0Employees': '11-50',
    '20-49': '11-50',
}

# Store the normalized size of every record under 'company_size_clean'.
for v in values:
    first_token = v['company_size'].split(' ')[0]
    v['company_size_clean'] = size_map.get(first_token, first_token)


## Data distribution

In [None]:
from collections import Counter

# Frequency tables of the cleaned size label, the founding year and the
# industry across all remaining records (one Counter per field, in that order).
sizes, ages, industries = [
    Counter(record[field] for record in values)
    for field in ('company_size_clean', 'founded', 'industry')
]

### Use only the most common

In [None]:
# Labels of the 9 most frequent company sizes; records with any other size
# label are dropped.
common_sizes = [label for label, _count in sizes.most_common(9)]
values = clean_values = [
    record for record in values
    if record['company_size_clean'] in common_sizes
]

In [None]:
# For now, only use industries that have at least MIN_INDUSTRY_EXAMPLES
# examples (the previous `> 10` check required 11, contradicting this note).
# Counts are taken from the `industries` Counter as it currently stands.
# TODO: the industries should be defined differently; this probably won't be
# good enough.
MIN_INDUSTRY_EXAMPLES = 10

common_industries = [
    industry for industry, count in industries.most_common()
    if count >= MIN_INDUSTRY_EXAMPLES
]

# Set lookup keeps the filter linear in the number of records.
_common_industry_set = set(common_industries)
clean_values = [v for v in values if v['industry'] in _common_industry_set]
values = clean_values

## Save values

In [None]:
import json
# Write the cleaned records to companies.json, replacing any existing file of
# that name.
with open('companies.json', 'w') as out_file:
    out_file.write(json.dumps(values))