In [1]:
%reload_ext autoreload
%autoreload 2
import sys
# Put the parent directory of this notebook on the import path
# (presumably so the local skills_ml checkout is importable -- confirm).
sys.path.append('../')
from airflow.hooks import S3Hook
# S3 connection handed to every job_postings_chain(...) call below.
s3_conn = S3Hook().get_conn()
from skills_ml.datasets.job_postings import job_postings, job_postings_chain
import random
import json
from skills_ml.algorithms.corpus_creators.basic import Doc2VecGensimCorpusCreator, CorpusCreator
from collections import Counter
import numpy as np
# NOTE(review): JobSampler is imported a second time in the "Sampling from simple
# corpus" cell. random, json, np, job_postings and Doc2VecGensimCorpusCreator are
# not referenced by any cell visible in this part of the notebook.
from skills_ml.algorithms.sampling.jobs import JobSampler

[2017-11-03 13:19:06,459] {__init__.py:36} INFO - Using executor SequentialExecutor
[2017-11-03 13:19:07,693] {textcleaner.py:20} INFO - 'pattern' package not found; tag filters are not available for English


# Streaming Data to Create Corpus

## Simple Corpus

In [2]:
# Point at the 2011Q2 postings under the S3 test corpus. No S3 transfer is logged
# for this cell; the download logs only appear when the corpus is consumed below.
job_postings_generator = job_postings_chain(s3_conn, ['2011Q2'], 'open-skills-private/test_corpus')
# No filter_func is passed here (compare the filtered examples further down).
corpus = CorpusCreator(job_postings_generator)

In [3]:
# Consume the corpus into a list; this is the step that triggers the S3 reads logged below.
corpus = list(corpus)

[2017-11-03 13:19:08,681] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/
[2017-11-03 13:19:08,778] {s3.py:83} INFO - 0 bytes transferred out of 0 total
[2017-11-03 13:19:08,780] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/CB_6874581a-5942-48fd-ba7e-2b1cdc38563f
[2017-11-03 13:19:08,934] {s3.py:83} INFO - 0 bytes transferred out of 18588132 total
[2017-11-03 13:19:09,483] {s3.py:83} INFO - 2072576 bytes transferred out of 18588132 total
[2017-11-03 13:19:09,663] {s3.py:83} INFO - 4145152 bytes transferred out of 18588132 total
[2017-11-03 13:19:09,824] {s3.py:83} INFO - 6217728 bytes transferred out of 18588132 total
[2017-11-03 13:19:09,952] {s3.py:83} INFO - 8290304 bytes transferred out of 18588132 total
[2017-11-03 13:19:10,083] {s3.py:83} INFO - 10362880 bytes transferred out of 18588132 total
[2017-11-03 13:19:10,222] {s3.py:83} INFO - 12435456 bytes transferred out of 18588132 total
[2017-11-03 13:19:10,357] {s

  ' Beautiful Soup.' % markup)


[2017-11-03 13:19:22,661] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/NLX_b8384025-fa09-417d-ae10-96880fac86be
[2017-11-03 13:19:23,863] {s3.py:83} INFO - 0 bytes transferred out of 20171705 total
[2017-11-03 13:19:26,704] {s3.py:83} INFO - 2244608 bytes transferred out of 20171705 total
[2017-11-03 13:19:27,317] {s3.py:83} INFO - 4489216 bytes transferred out of 20171705 total
[2017-11-03 13:19:27,665] {s3.py:83} INFO - 6733824 bytes transferred out of 20171705 total
[2017-11-03 13:19:27,927] {s3.py:83} INFO - 8978432 bytes transferred out of 20171705 total
[2017-11-03 13:19:28,154] {s3.py:83} INFO - 11223040 bytes transferred out of 20171705 total
[2017-11-03 13:19:28,369] {s3.py:83} INFO - 13467648 bytes transferred out of 20171705 total
[2017-11-03 13:19:28,568] {s3.py:83} INFO - 15712256 bytes transferred out of 20171705 total
[2017-11-03 13:19:28,749] {s3.py:83} INFO - 17956864 bytes transferred out of 20171705 total
[2017-11-03 13:19:28,927] {

In [4]:
# Number of postings in the unfiltered corpus.
len(corpus)

10567

In [5]:
# Show the first posting to see the schema.org JobPosting fields the filters rely on.
corpus[0]

{'@context': 'http://schema.org',
 '@type': 'JobPosting',
 'alternateName': 'Payroll Specialist',
 'baseSalary': {'@type': 'MonetaryAmount', 'maxValue': 0.0, 'minValue': 0.0},
 'datePosted': '2012-06-20',
 'description': 'Position Summary: The Payroll Specialist is responsible for processing all payroll information for accurate and timely payroll distribution. Responsibilities: Process store-level biweekly payroll accurately and timely Import store employee info and time clock data Research and resolve employee payroll issues Process involuntary wage withholding correspondence Respond promptly to all requests and communications received Review and process adjustments to hours/earnings Maintain employee direct deposit records Assist with various projects as assigned GREAT OPPORTUNITY!',
 'educationRequirements': 'Graduate Degree',
 'employmentType': 'Full-Time',
 'experienceRequirements': 'Qualifications: 1-3 years of general payroll experience Positive attitude combined with excellent 

## Creating Corpus with Criteria

#### You can define your own filter function and filtering logic based on the common schema.

In [6]:
def major_group_filter_func(document):
    """Keep postings whose O*NET SOC major group is '11' or '13'.

    The major group is the first two characters of ``document['onet_soc_code']``.

    Args:
        document: job posting dict containing an ``'onet_soc_code'`` key.

    Returns:
        The same ``document`` object when it matches, otherwise ``None``
        (including when the SOC code is empty or ``None``).
    """
    soc_code = document['onet_soc_code']
    # An empty / missing code can never match a major group.
    if not soc_code:
        return None
    if soc_code[:2] in ('11', '13'):
        return document
    return None

def full_soc_code_filter_func(document):
    """Keep postings whose full O*NET SOC code is '11-9051.00' or '17-3026.00'.

    Args:
        document: job posting dict containing an ``'onet_soc_code'`` key.

    Returns:
        The same ``document`` object when its code is one of the two targets,
        otherwise ``None`` (including when the code is empty or ``None``).
    """
    wanted_codes = ('11-9051.00', '17-3026.00')
    soc_code = document['onet_soc_code']
    # Falsy codes are rejected before the membership check, as in the two-step original.
    return document if soc_code and soc_code in wanted_codes else None

def wage_filter_func(document):
    """Keep postings whose minimum base salary is at least 60000.

    Args:
        document: job posting dict with a ``'baseSalary'`` mapping that has a
            ``'minValue'`` entry.

    Returns:
        The same ``document`` object when ``minValue`` is truthy and converts to
        a float >= 60000.0, otherwise ``None``.
    """
    minimum = document['baseSalary']['minValue']
    # Falsy values (0, 0.0, '' or None) are skipped before float() is attempted.
    if minimum and float(minimum) >= 60000.0:
        return document
    return None

### Filtered by Major Groups

In [7]:
# Rebuild the 2011Q2 posting stream for this example.
job_postings_generator = job_postings_chain(s3_conn, ['2011Q2'], 'open-skills-private/test_corpus')
# Only documents accepted by major_group_filter_func (major groups '11' and '13') are kept.
corpus = CorpusCreator(job_postings_generator, filter_func=major_group_filter_func)

In [8]:
# Consume the filtered corpus into a list; the S3 reads are logged below.
corpus = list(corpus)

[2017-11-03 13:19:50,263] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/
[2017-11-03 13:19:50,365] {s3.py:83} INFO - 0 bytes transferred out of 0 total
[2017-11-03 13:19:50,367] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/CB_6874581a-5942-48fd-ba7e-2b1cdc38563f
[2017-11-03 13:19:50,685] {s3.py:83} INFO - 0 bytes transferred out of 18588132 total
[2017-11-03 13:19:51,255] {s3.py:83} INFO - 2072576 bytes transferred out of 18588132 total
[2017-11-03 13:19:51,467] {s3.py:83} INFO - 4145152 bytes transferred out of 18588132 total
[2017-11-03 13:19:51,621] {s3.py:83} INFO - 6217728 bytes transferred out of 18588132 total
[2017-11-03 13:19:51,761] {s3.py:83} INFO - 8290304 bytes transferred out of 18588132 total
[2017-11-03 13:19:51,944] {s3.py:83} INFO - 10362880 bytes transferred out of 18588132 total
[2017-11-03 13:19:52,079] {s3.py:83} INFO - 12435456 bytes transferred out of 18588132 total
[2017-11-03 13:19:52,278] {s

  ' Beautiful Soup.' % markup)


[2017-11-03 13:19:53,053] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/NLX_b8384025-fa09-417d-ae10-96880fac86be
[2017-11-03 13:19:53,179] {s3.py:83} INFO - 0 bytes transferred out of 20171705 total
[2017-11-03 13:19:53,410] {s3.py:83} INFO - 2244608 bytes transferred out of 20171705 total
[2017-11-03 13:19:53,575] {s3.py:83} INFO - 4489216 bytes transferred out of 20171705 total
[2017-11-03 13:19:53,753] {s3.py:83} INFO - 6733824 bytes transferred out of 20171705 total
[2017-11-03 13:19:53,900] {s3.py:83} INFO - 8978432 bytes transferred out of 20171705 total
[2017-11-03 13:19:54,073] {s3.py:83} INFO - 11223040 bytes transferred out of 20171705 total
[2017-11-03 13:19:54,212] {s3.py:83} INFO - 13467648 bytes transferred out of 20171705 total
[2017-11-03 13:19:54,364] {s3.py:83} INFO - 15712256 bytes transferred out of 20171705 total
[2017-11-03 13:19:54,516] {s3.py:83} INFO - 17956864 bytes transferred out of 20171705 total
[2017-11-03 13:19:54,663] {

In [9]:
# First posting that passed the major-group filter.
corpus[0]

{'@context': 'http://schema.org',
 '@type': 'JobPosting',
 'alternateName': 'General Manager',
 'baseSalary': {'@type': 'MonetaryAmount', 'maxValue': 0.0, 'minValue': 0.0},
 'datePosted': '2012-09-17',
 'description': 'If you are a personable, outgoing and service-oriented professional who wants to manage a dynamic office, come join the Kool Smiles family! Kool Smiles was founded in 2002 and was built on the belief that every family has the right to quality dental care in a clean, safe and fun environment. Our first goal is to provide quality and compliant dental care to individuals in communities that typically get overlooked by other dentists. Our services aren’t just for kids- we know that children are more likely to have great dental habits if mom and dad do too, so we happily care for the entire family. Kool Smiles is looking for a customer-service-oriented leader to fill the role of Office Manager. As the Office Manager, you will manage an office staff of 15 to 30 employees and p

In [10]:
# Major group = first two characters of each posting's O*NET SOC code.
major_group = [posting['onet_soc_code'][:2] for posting in corpus]

In [11]:
# Tally postings per major group to confirm only '11' and '13' remain.
Counter(major_group)

Counter({'11': 283, '13': 102})

### Filtered by Full O*NET SOC Code

In [12]:
# Rebuild the 2011Q2 posting stream for this example.
job_postings_generator = job_postings_chain(s3_conn, ['2011Q2'], 'open-skills-private/test_corpus')
# Only documents accepted by full_soc_code_filter_func ('11-9051.00' or '17-3026.00') are kept.
corpus = CorpusCreator(job_postings_generator, filter_func=full_soc_code_filter_func)

In [13]:
# Consume the filtered corpus into a list; the S3 reads are logged below.
corpus = list(corpus)

[2017-11-03 13:19:58,112] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/
[2017-11-03 13:19:58,370] {s3.py:83} INFO - 0 bytes transferred out of 0 total
[2017-11-03 13:19:58,371] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/CB_6874581a-5942-48fd-ba7e-2b1cdc38563f
[2017-11-03 13:19:58,484] {s3.py:83} INFO - 0 bytes transferred out of 18588132 total
[2017-11-03 13:19:59,604] {s3.py:83} INFO - 2072576 bytes transferred out of 18588132 total
[2017-11-03 13:19:59,880] {s3.py:83} INFO - 4145152 bytes transferred out of 18588132 total
[2017-11-03 13:20:00,071] {s3.py:83} INFO - 6217728 bytes transferred out of 18588132 total
[2017-11-03 13:20:00,213] {s3.py:83} INFO - 8290304 bytes transferred out of 18588132 total
[2017-11-03 13:20:00,343] {s3.py:83} INFO - 10362880 bytes transferred out of 18588132 total
[2017-11-03 13:20:00,478] {s3.py:83} INFO - 12435456 bytes transferred out of 18588132 total
[2017-11-03 13:20:00,614] {s

  ' Beautiful Soup.' % markup)


[2017-11-03 13:20:02,138] {s3.py:83} INFO - 2244608 bytes transferred out of 20171705 total
[2017-11-03 13:20:02,342] {s3.py:83} INFO - 4489216 bytes transferred out of 20171705 total
[2017-11-03 13:20:02,532] {s3.py:83} INFO - 6733824 bytes transferred out of 20171705 total
[2017-11-03 13:20:02,716] {s3.py:83} INFO - 8978432 bytes transferred out of 20171705 total
[2017-11-03 13:20:02,934] {s3.py:83} INFO - 11223040 bytes transferred out of 20171705 total
[2017-11-03 13:20:03,105] {s3.py:83} INFO - 13467648 bytes transferred out of 20171705 total
[2017-11-03 13:20:03,321] {s3.py:83} INFO - 15712256 bytes transferred out of 20171705 total
[2017-11-03 13:20:03,486] {s3.py:83} INFO - 17956864 bytes transferred out of 20171705 total
[2017-11-03 13:20:03,651] {s3.py:83} INFO - 20171705 bytes transferred out of 20171705 total
[2017-11-03 13:20:04,056] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/VA_c3b9dad2-ef6c-4439-a4e7-6dff0a55d362
[2017-11-03 13:20:04,

In [14]:
# Full O*NET SOC code of every posting that passed the filter.
soc = [posting['onet_soc_code'] for posting in corpus]

In [15]:
# Tally postings per full SOC code to confirm only the two targeted codes remain.
Counter(soc)

Counter({'11-9051.00': 79, '17-3026.00': 65})

### Filtered by Minimum Wage >= 60000

In [16]:
# Rebuild the 2011Q2 posting stream for this example.
job_postings_generator = job_postings_chain(s3_conn, ['2011Q2'], 'open-skills-private/test_corpus')
# Only documents accepted by wage_filter_func (baseSalary minValue >= 60000) are kept.
corpus = CorpusCreator(job_postings_generator, filter_func=wage_filter_func)

In [17]:
# Consume the filtered corpus into a list; the S3 reads are logged below.
corpus = list(corpus)

[2017-11-03 13:20:04,378] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/
[2017-11-03 13:20:04,540] {s3.py:83} INFO - 0 bytes transferred out of 0 total
[2017-11-03 13:20:04,542] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/CB_6874581a-5942-48fd-ba7e-2b1cdc38563f
[2017-11-03 13:20:04,687] {s3.py:83} INFO - 0 bytes transferred out of 18588132 total
[2017-11-03 13:20:04,855] {s3.py:83} INFO - 2072576 bytes transferred out of 18588132 total
[2017-11-03 13:20:05,018] {s3.py:83} INFO - 4145152 bytes transferred out of 18588132 total
[2017-11-03 13:20:05,177] {s3.py:83} INFO - 6217728 bytes transferred out of 18588132 total
[2017-11-03 13:20:05,311] {s3.py:83} INFO - 8290304 bytes transferred out of 18588132 total
[2017-11-03 13:20:05,459] {s3.py:83} INFO - 10362880 bytes transferred out of 18588132 total
[2017-11-03 13:20:05,599] {s3.py:83} INFO - 12435456 bytes transferred out of 18588132 total
[2017-11-03 13:20:05,763] {s

In [18]:
# Minimum base salary of every posting that passed the wage filter.
min_wage = [posting['baseSalary']['minValue'] for posting in corpus]

In [19]:
# Distribution of minimum salaries; every key should be >= 60000.
Counter(min_wage)

Counter({60000.0: 8,
         65000.0: 10,
         70000.0: 4,
         71110: 1,
         75000.0: 5,
         80000.0: 10,
         83221: 1,
         85000.0: 1,
         90000.0: 12,
         100000.0: 2,
         120000.0: 3,
         160000.0: 1})

# Sampling from Corpus

## Sampling from simple corpus

In [20]:
# Rebuild the unfiltered 2011Q2 posting stream to sample from.
job_postings_generator = job_postings_chain(s3_conn, ['2011Q2'], 'open-skills-private/test_corpus')
# No filter_func: the sampler below draws from the unfiltered corpus.
corpus = CorpusCreator(job_postings_generator)

In [21]:
# JobSampler is already imported in the first cell, so it is not re-imported here.
# Fixed random_state so the same sample can be drawn again.
job_sampler = JobSampler(corpus, random_state=42)
# Draw 100 postings. Each sampled item is a tuple whose first element is the
# posting dict (see corpus[0] and the c[0][...] lookups in the following cells).
corpus = job_sampler.sample(100)

[2017-11-03 13:20:08,536] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/
[2017-11-03 13:20:08,603] {s3.py:83} INFO - 0 bytes transferred out of 0 total
[2017-11-03 13:20:08,604] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/CB_6874581a-5942-48fd-ba7e-2b1cdc38563f
[2017-11-03 13:20:08,701] {s3.py:83} INFO - 0 bytes transferred out of 18588132 total
[2017-11-03 13:20:09,050] {s3.py:83} INFO - 2072576 bytes transferred out of 18588132 total
[2017-11-03 13:20:09,225] {s3.py:83} INFO - 4145152 bytes transferred out of 18588132 total
[2017-11-03 13:20:09,396] {s3.py:83} INFO - 6217728 bytes transferred out of 18588132 total
[2017-11-03 13:20:09,563] {s3.py:83} INFO - 8290304 bytes transferred out of 18588132 total
[2017-11-03 13:20:09,714] {s3.py:83} INFO - 10362880 bytes transferred out of 18588132 total
[2017-11-03 13:20:09,875] {s3.py:83} INFO - 12435456 bytes transferred out of 18588132 total
[2017-11-03 13:20:10,021] {s

  ' Beautiful Soup.' % markup)


[2017-11-03 13:20:22,493] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/NLX_b8384025-fa09-417d-ae10-96880fac86be
[2017-11-03 13:20:23,938] {s3.py:83} INFO - 0 bytes transferred out of 20171705 total
[2017-11-03 13:20:24,535] {s3.py:83} INFO - 2244608 bytes transferred out of 20171705 total
[2017-11-03 13:20:24,724] {s3.py:83} INFO - 4489216 bytes transferred out of 20171705 total
[2017-11-03 13:20:24,890] {s3.py:83} INFO - 6733824 bytes transferred out of 20171705 total
[2017-11-03 13:20:25,044] {s3.py:83} INFO - 8978432 bytes transferred out of 20171705 total
[2017-11-03 13:20:25,206] {s3.py:83} INFO - 11223040 bytes transferred out of 20171705 total
[2017-11-03 13:20:25,357] {s3.py:83} INFO - 13467648 bytes transferred out of 20171705 total
[2017-11-03 13:20:25,510] {s3.py:83} INFO - 15712256 bytes transferred out of 20171705 total
[2017-11-03 13:20:25,688] {s3.py:83} INFO - 17956864 bytes transferred out of 20171705 total
[2017-11-03 13:20:25,850] {

In [22]:
# First sampled item: a tuple whose first element is the posting dict.
corpus[0]

({'@context': 'http://schema.org',
  '@type': 'JobPosting',
  'alternateName': 'Customer Service Representative',
  'baseSalary': {'@type': 'MonetaryAmount',
   'maxValue': 11.0,
   'minValue': 11.0},
  'datePosted': '2011-08-25',
  'description': 'We are currently looking for bilingual (Spanish- English) customer service representative. You need to be fluent in both languages, clerical experience, be customer service oriented and have a professional appearance. If you think you have great customer service skills, have a strong work ethic and want to be part of a great team please contact me',
  'educationRequirements': 'Not Specified',
  'employmentType': 'Full-Time',
  'experienceRequirements': 'MINIMUM QUALIFICATIONS: High school diploma and some college coursework or equivalent Capable of solving a variety of customer service problems and supporting a range of personalities and customer types Bi-lingual: Spanish Ability to communicate effectively with others, orally and written',
 

In [23]:
# The posting dict is element 0 of each sampled tuple; pull its industry field.
industry = [sample[0]['industry'] for sample in corpus]

In [24]:
# Industry distribution of the 100 sampled postings.
Counter(industry)

Counter({'': 59,
         'Advertising, Sales - Marketing, Hospitality': 1,
         'Biotechnology, Pharmaceutical, Food': 1,
         'Consulting, Sales - Marketing, Hospitality': 1,
         'Consumer Products, Other, Food': 1,
         'Consumer Products, Sales - Marketing, Food': 2,
         'Education - Teaching - Administration, Social Services, Food': 1,
         'Entertainment, Food, Hospitality': 1,
         'Entertainment, Retail, Hospitality': 1,
         'Hospitality, Restaurant, Food': 4,
         'Not for Profit - Charitable, Healthcare - Health Services, Food': 1,
         'Packaging, Manufacturing, Food': 1,
         'Public Relations, Not for Profit - Charitable, Food': 1,
         'Restaurant, Food, Hospitality': 7,
         'Restaurant, Full Service Restaurant, Hospitality': 1,
         'Restaurant, Managed Care, Hospitality': 1,
         'Restaurant, Retail, Hospitality': 6,
         'Retail, Restaurant, Food': 1,
         'Sales - Marketing, Consulting, Hospitalit

## Sampling from filtered corpus

### Reservoir Sampling

In [25]:
# Rebuild the 2011Q2 posting stream for the sampling-after-filtering example.
job_postings_generator = job_postings_chain(s3_conn, ['2011Q2'], 'open-skills-private/test_corpus')
# Restrict the corpus to major groups '11' and '13' before sampling.
corpus = CorpusCreator(job_postings_generator, filter_func=major_group_filter_func)

In [26]:
# Unweighted sampler with a fixed seed (the section heading calls this reservoir sampling).
job_sampler = JobSampler(corpus, random_state=42)
# Draw 100 postings from the filtered stream; items are tuples with the posting dict first.
corpus = job_sampler.sample(100)

[2017-11-03 13:20:44,628] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/
[2017-11-03 13:20:44,723] {s3.py:83} INFO - 0 bytes transferred out of 0 total
[2017-11-03 13:20:44,724] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/CB_6874581a-5942-48fd-ba7e-2b1cdc38563f
[2017-11-03 13:20:44,812] {s3.py:83} INFO - 0 bytes transferred out of 18588132 total
[2017-11-03 13:20:45,782] {s3.py:83} INFO - 2072576 bytes transferred out of 18588132 total
[2017-11-03 13:20:48,578] {s3.py:83} INFO - 4145152 bytes transferred out of 18588132 total
[2017-11-03 13:20:50,924] {s3.py:83} INFO - 6217728 bytes transferred out of 18588132 total
[2017-11-03 13:20:54,693] {s3.py:83} INFO - 8290304 bytes transferred out of 18588132 total
[2017-11-03 13:20:57,952] {s3.py:83} INFO - 10362880 bytes transferred out of 18588132 total
[2017-11-03 13:21:00,905] {s3.py:83} INFO - 12435456 bytes transferred out of 18588132 total
[2017-11-03 13:21:04,599] {s

  ' Beautiful Soup.' % markup)


[2017-11-03 13:21:13,010] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/NLX_b8384025-fa09-417d-ae10-96880fac86be
[2017-11-03 13:21:13,079] {s3.py:83} INFO - 0 bytes transferred out of 20171705 total
[2017-11-03 13:21:17,645] {s3.py:83} INFO - 2244608 bytes transferred out of 20171705 total
[2017-11-03 13:21:19,741] {s3.py:83} INFO - 4489216 bytes transferred out of 20171705 total
[2017-11-03 13:21:21,937] {s3.py:83} INFO - 6733824 bytes transferred out of 20171705 total
[2017-11-03 13:21:24,104] {s3.py:83} INFO - 8978432 bytes transferred out of 20171705 total
[2017-11-03 13:21:25,724] {s3.py:83} INFO - 11223040 bytes transferred out of 20171705 total
[2017-11-03 13:21:26,244] {s3.py:83} INFO - 13467648 bytes transferred out of 20171705 total
[2017-11-03 13:21:26,619] {s3.py:83} INFO - 15712256 bytes transferred out of 20171705 total
[2017-11-03 13:21:26,904] {s3.py:83} INFO - 17956864 bytes transferred out of 20171705 total
[2017-11-03 13:21:27,186] {

In [27]:
# Major group (first two characters of the SOC code) of each sampled posting.
onet_soc_code = [sample[0]['onet_soc_code'][:2] for sample in corpus]

In [28]:
# Major-group mix of the unweighted sample.
Counter(onet_soc_code)

Counter({'11': 78, '13': 22})

### Weighted Reservoir Sampling

In [29]:
# Rebuild the 2011Q2 posting stream for the weighted-sampling example.
job_postings_generator = job_postings_chain(s3_conn, ['2011Q2'], 'open-skills-private/test_corpus')
# Restrict the corpus to major groups '11' and '13' before sampling.
corpus = CorpusCreator(job_postings_generator, filter_func=major_group_filter_func)

In [30]:
# Weighted sampling keyed on major group: group '13' gets weight 3.5 against 1 for
# group '11', to counter its smaller share of the filtered corpus.
# random_state is fixed, as in the other sampling cells, so the draw is repeatable.
# NOTE(review): the counts recorded below were produced without a seed and may
# differ after re-running.
job_sampler = JobSampler(corpus, major_group=True, weights={'11': 1, '13': 3.5}, random_state=42)
# Each sampled item is a tuple; element 1 is used below as the group key
# (presumably the posting's O*NET SOC code -- confirm against JobSampler).
sampled_corpus = job_sampler.sample(50)

[2017-11-03 13:21:28,695] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/
[2017-11-03 13:21:28,789] {s3.py:83} INFO - 0 bytes transferred out of 0 total
[2017-11-03 13:21:28,791] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/CB_6874581a-5942-48fd-ba7e-2b1cdc38563f
[2017-11-03 13:21:28,910] {s3.py:83} INFO - 0 bytes transferred out of 18588132 total
[2017-11-03 13:21:29,344] {s3.py:83} INFO - 2072576 bytes transferred out of 18588132 total
[2017-11-03 13:21:29,599] {s3.py:83} INFO - 4145152 bytes transferred out of 18588132 total
[2017-11-03 13:21:29,851] {s3.py:83} INFO - 6217728 bytes transferred out of 18588132 total
[2017-11-03 13:21:30,126] {s3.py:83} INFO - 8290304 bytes transferred out of 18588132 total
[2017-11-03 13:21:30,395] {s3.py:83} INFO - 10362880 bytes transferred out of 18588132 total
[2017-11-03 13:21:30,644] {s3.py:83} INFO - 12435456 bytes transferred out of 18588132 total
[2017-11-03 13:21:30,867] {s

  ' Beautiful Soup.' % markup)


[2017-11-03 13:21:31,887] {job_postings.py:36} INFO - Extracting job postings from key test_corpus/2011Q2/NLX_b8384025-fa09-417d-ae10-96880fac86be
[2017-11-03 13:21:31,987] {s3.py:83} INFO - 0 bytes transferred out of 20171705 total
[2017-11-03 13:21:32,365] {s3.py:83} INFO - 2244608 bytes transferred out of 20171705 total
[2017-11-03 13:21:32,632] {s3.py:83} INFO - 4489216 bytes transferred out of 20171705 total
[2017-11-03 13:21:32,914] {s3.py:83} INFO - 6733824 bytes transferred out of 20171705 total
[2017-11-03 13:21:33,219] {s3.py:83} INFO - 8978432 bytes transferred out of 20171705 total
[2017-11-03 13:21:33,533] {s3.py:83} INFO - 11223040 bytes transferred out of 20171705 total
[2017-11-03 13:21:33,813] {s3.py:83} INFO - 13467648 bytes transferred out of 20171705 total
[2017-11-03 13:21:34,056] {s3.py:83} INFO - 15712256 bytes transferred out of 20171705 total
[2017-11-03 13:21:34,360] {s3.py:83} INFO - 17956864 bytes transferred out of 20171705 total
[2017-11-03 13:21:34,650] {

In [31]:
# First two characters of element 1 of each sampled tuple, used as the major group.
major_group = [sample[1][:2] for sample in sampled_corpus]

In [32]:
# Major-group mix of the weighted sample; compare with the unweighted 78/22 split above.
Counter(major_group)

Counter({'11': 23, '13': 27})

In [33]:
# First sampled item: a tuple whose first element is the posting dict.
sampled_corpus[0]

({'@context': 'http://schema.org',
  '@type': 'JobPosting',
  'baseSalary': {'@type': 'MonetaryAmount', 'maxValue': '', 'minValue': ''},
  'datePosted': '2011-01-19',
  'description': 'Regional Staff Accountant Tracking Code 6466 Job Description Make your mark in Broadcasting and Digital Media. Sinclair Broadcast Group and Sinclair Digital Solutions are dedicated to making Sinclair a communications powerhouse! We are the largest and most diversified television broadcasting company in the country today. Sinclair owns and operates, programs or provides services to 162 stations located in 79 geographically diverse markets and our Digital group is focused on bringing the most engaging content to web, mobile and over-the-top broadcasting to audiences all over the country! Our success is the result of extraordinary employees and an exemplary management team who believe in a vision and are dedicated ensuring a great future for our employees. Whether you are an industry veteran or a just start