In [14]:
# Development convenience: (re)load IPython's autoreload extension and use mode 2,
# so edits to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

from airflow.hooks import S3Hook
# S3 connection obtained from an S3Hook constructed with no arguments; it is passed
# to JobPostingCollectionFromS3 when the sampled job postings are loaded.
s3_conn = S3Hook().get_conn()

from skills_ml.storage import FSStore, S3Store

from skills_ml.job_postings.common_schema import JobPostingCollectionFromS3, JobPostingCollectionSample
from skills_ml.job_postings.filtering import JobPostingFilterer

from skills_ml.algorithms.embedding.models import Word2VecModel
from skills_ml.algorithms.preprocessing import IterablePipeline
from skills_ml.algorithms.string_cleaners import nlp
from skills_ml.algorithms.occupation_classifiers.train import OccupationClassifierTrainer, create_training_set
from skills_ml.algorithms.occupation_classifiers import FullSOC

import random
from functools import partial

In [10]:
# S3 location of the sampled job postings.
SAMPLE_S3_PATH = "open-skills-private/sampled_jobpostings/samples_24k_v1"

# Materialise the collection into a list in a single step so that `job_24k` only
# ever names one kind of object; a list is needed because the postings are
# shuffled and sliced later on.
# NOTE: the log recorded for this cell shows a ~91 MB transfer taking about nine minutes.
job_24k = list(
    JobPostingCollectionFromS3(s3_conn=s3_conn, s3_paths=SAMPLE_S3_PATH)
)

[2018-08-06 16:21:25,327] {common_schema.py:139} INFO - Extracting job postings from key sampled_jobpostings/samples_24k_v1
[2018-08-06 16:21:25,614] {s3.py:86} INFO - 0 bytes transferred out of 91185578 total
[2018-08-06 16:22:24,745] {s3.py:86} INFO - 10133504 bytes transferred out of 91185578 total
[2018-08-06 16:23:20,504] {s3.py:86} INFO - 20267008 bytes transferred out of 91185578 total
[2018-08-06 16:24:26,725] {s3.py:86} INFO - 30400512 bytes transferred out of 91185578 total
[2018-08-06 16:25:30,899] {s3.py:86} INFO - 40534016 bytes transferred out of 91185578 total
[2018-08-06 16:28:21,668] {s3.py:86} INFO - 50667520 bytes transferred out of 91185578 total
[2018-08-06 16:28:49,675] {s3.py:86} INFO - 60801024 bytes transferred out of 91185578 total
[2018-08-06 16:29:41,369] {s3.py:86} INFO - 70934528 bytes transferred out of 91185578 total
[2018-08-06 16:30:13,232] {s3.py:86} INFO - 81068032 bytes transferred out of 91185578 total
[2018-08-06 16:30:30,690] {s3.py:86} INFO - 91

In [13]:
# Seed for the train/test shuffle, so the split survives "Restart Kernel -> Run All".
SPLIT_SEED = 42
# Number of postings used for training; everything after this index is held out.
# (19200 is 80% if the sample really contains 24,000 postings.)
N_TRAIN = 19200

# Shuffle a *copy* with a dedicated, seeded RNG:
#   * the split is reproducible across kernel restarts, and
#   * re-running this cell is idempotent because job_24k is no longer mutated in place.
shuffled_jobs = list(job_24k)
random.Random(SPLIT_SEED).shuffle(shuffled_jobs)

train_data = shuffled_jobs[:N_TRAIN]
test_data = shuffled_jobs[N_TRAIN:]

In [None]:
# Storage location holding the cached embedding models.
embedding_store = S3Store('open-skills-private/model_cache/embedding')

# Load the pre-trained Word2Vec model; it is used as the embedding model in the
# feature pipeline's vectorize step.
w2v = Word2VecModel.load(
    storage=embedding_store,
    model_name='word2vec_2018-07-23T16:33:35.522443.model',
)


## Full SOC

In [15]:
# Target-variable object shared by the rest of the notebook: filter1 uses its
# filter_func, pipe_y uses its transformer, and create_training_set receives it as
# target_variable.
# NOTE(review): the log recorded for this cell shows O*NET data being processed for
# about a minute, so construction does not appear to be cheap.
full_soc = FullSOC()

[2018-08-06 16:45:40,076] {onet.py:47} INFO - Processing Content Model Reference
[2018-08-06 16:45:43,463] {onet.py:51} INFO - Processing occupation data
[2018-08-06 16:45:45,972] {onet.py:69} INFO - Processing Knowledge, Skills, Abilities
[2018-08-06 16:46:45,842] {onet.py:82} INFO - Processing tools and technology


In [None]:
def filter1(doc):
    """Pass `doc` through when `full_soc.filter_func` accepts it.

    Args:
        doc: a job posting document handed in by the pipeline.

    Returns:
        `doc` itself if `full_soc.filter_func(doc)` is truthy, otherwise None.
    """
    return doc if full_soc.filter_func(doc) else None

    
# Posting fields handed to nlp.fields_join.
document_schema_fields = ['description', 'experienceRequirements', 'qualifications', 'skills']

# Steps that need extra arguments are bound up front, so each pipeline below reads
# as a flat list of callables.
join_text_fields = partial(nlp.fields_join, document_schema_fields=document_schema_fields)
embed_with_w2v = partial(nlp.vectorize, embedding_model=w2v)

# Feature pipeline: filter -> join fields -> clean -> tokenize -> vectorize with w2v.
# (Presumably IterablePipeline applies the steps in the order given - confirm.)
pipe_x = IterablePipeline(
    filter1,
    join_text_fields,
    nlp.clean_str,
    nlp.word_tokenize,
    embed_with_w2v,
)

# Label pipeline: the same filter as pipe_x, followed by full_soc.transformer.
pipe_y = IterablePipeline(filter1, full_soc.transformer)

# Build the training matrix from the training split only (test_data is not used
# here). The FullSOC target variable and both pipelines are handed to
# create_training_set; the result is what OccupationClassifierTrainer receives.
# Presumably pipe_x yields the features and pipe_y the labels - confirm against
# create_training_set.
matrix = create_training_set(
    job_postings_generator=train_data,
    target_variable=full_soc,
    pipe_x=pipe_x,
    pipe_y=pipe_y
)



In [None]:
def _tree_ensemble_grid():
    """Return a fresh copy of the hyper-parameter grid shared by both classifiers."""
    return {
        'n_estimators': [100, 500, 1000],
        'criterion': ['gini', 'entropy'],
        'max_depth': [1, 5, 10, 20, 50],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
    }


# Both classifiers use the same grid; each gets its own independent copy, keyed
# by the classifier's dotted import path.
grid_config = {
    classifier_path: _tree_ensemble_grid()
    for classifier_path in (
        'sklearn.ensemble.ExtraTreesClassifier',
        'sklearn.ensemble.RandomForestClassifier',
    )
}

In [None]:
# Train with 3-fold cross-validation.
# grid_config is passed explicitly: it was defined in the previous cell but never
# handed to the trainer, so the hyper-parameter grid written in this notebook was
# silently ignored.
# NOTE(review): this grid is large (2 classifiers x 180 combinations x 3 folds, with
# up to 1000 trees each), so expect a long run.
trainer = OccupationClassifierTrainer(
    matrix=matrix,
    k_folds=3,
    grid_config=grid_config,
)
trainer.train()