In [2]:
import pandas as pd
import numpy as np
import re
import string
import os
import json
from bidict import bidict
import pickle
import random
from math import ceil
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, classification_report
import itertools

In [37]:
# Profile ids that make up the train and test splits.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("/data/rali7/Tmp/solimanz/data/datasets/train_ids.pkl", mode="rb") as f:
    train_ids = pickle.load(f)
with open("/data/rali7/Tmp/solimanz/data/datasets/test_ids.pkl", mode="rb") as f:
    test_ids = pickle.load(f)

In [25]:
def preprocess_job_title_sequences(data_path="/data/rali7/Tmp/solimanz/data/pickles/",
                                   save_path="/data/rali7/Tmp/solimanz/data/datasets/",
                                   offset=False, save=False,
                                   ids_path="/data/rali7/Tmp/solimanz/data/datasets/",
                                   top_n=550):
    """Convert every profile's job-title history into a sequence of integer ids.

    Reads the pickled DataFrame ``clean_2017_11_28.pkl`` from ``data_path``
    (it must have the columns ``_id`` and ``transformed``), keeps the
    ``top_n`` most frequent values of ``transformed`` as the vocabulary and
    encodes the title list of every train / test profile with that vocabulary.

    Parameters
    ----------
    data_path : str
        Directory containing ``clean_2017_11_28.pkl``.
    save_path : str
        Directory where ``title_seq.json`` is written when ``save`` is true.
    offset : bool
        If true, title ids start at 1 instead of 0.
    save : bool
        If true, dump the result to JSON and return ``None``; otherwise
        return the result dict.
    ids_path : str
        Directory containing ``train_ids.pkl`` and ``test_ids.pkl``.
        Defaults to the location that used to be hard-coded.
    top_n : int or None
        Number of most frequent titles kept in the vocabulary
        (``None`` keeps all of them). Defaults to the former constant 550.

    Returns
    -------
    dict or None
        ``{'title_to_id': ..., 'train_data': ..., 'test_data': ...}`` when
        ``save`` is false, ``None`` when the dict was written to disk.

    Raises
    ------
    KeyError
        If a train/test id is missing from the DataFrame or one of its
        titles is not part of the ``top_n`` vocabulary.
    """
    print('Reading test and train ids...')
    # NOTE: pickle.load can execute arbitrary code; only point ids_path at
    # files you trust.
    with open(os.path.join(ids_path, "train_ids.pkl"), "rb") as f:
        train_ids = pickle.load(f)
    with open(os.path.join(ids_path, "test_ids.pkl"), "rb") as f:
        test_ids = pickle.load(f)

    print('Loading dataframe...')
    profiles = pd.read_pickle(os.path.join(data_path, "clean_2017_11_28.pkl"))

    # value_counts() sorts by descending frequency, so the first top_n rows
    # are the most common titles.
    top_titles = profiles["transformed"].value_counts().iloc[:top_n]

    print('Building mapping between job title name and a job title id...')
    first_id = 1 if offset else 0
    title_id = {title: i
                for i, title in enumerate(top_titles.index.values, start=first_id)}

    print('Getting list of job titles for every profile id')
    # Each profile's titles are reversed relative to their row order
    # (presumably rows are stored most-recent-first -- TODO confirm).
    func_series = profiles.groupby('_id')['transformed'].apply(
        lambda x: list(reversed(list(x))))

    print('Building training data list...')
    # .loc makes the profile lookup explicitly label-based.
    train_data = [[title_id[title] for title in func_series.loc[i]] for i in train_ids]
    print('Build test data...')
    test_data = [[title_id[title] for title in func_series.loc[i]] for i in test_ids]

    print('dumping json...')
    result = {
        'title_to_id': title_id,
        'train_data': train_data,
        'test_data': test_data
    }

    if save:
        with open(os.path.join(save_path, "title_seq.json"), 'w') as f:
            json.dump(result, f)
    else:
        return result

def build_data(dataset="train"):
    """Build a bag-of-titles feature matrix and next-title targets.

    Runs ``preprocess_job_title_sequences`` and takes the sequences stored
    under ``dataset + "_data"`` (``"train"`` or ``"test"``). For every
    sequence the last title id becomes the target and the preceding ids are
    counted into one row of the feature matrix.

    Returns
    -------
    X : numpy.ndarray
        Array of shape ``(n_sequences, vocab_size)``; ``X[i, t]`` is the
        number of times title id ``t`` occurs before the last position of
        sequence ``i``.
    targets : numpy.ndarray
        Last title id of every sequence (float array, since ``np.zeros``
        defaults to float64).
    title_to_id : bidict
        Two-way mapping between title strings and title ids.
    """
    corpus = preprocess_job_title_sequences(offset=False, save=False)
    sequences = corpus[dataset + "_data"]
    title_to_id = bidict(corpus["title_to_id"])

    n_sequences = len(sequences)
    n_titles = len(title_to_id)
    targets = np.zeros(n_sequences)
    X = np.zeros((n_sequences, n_titles))

    for row, seq in enumerate(sequences):
        # The final title is what the classifier has to predict ...
        targets[row] = seq[-1]
        # ... from the counts of every title that came before it.
        for title in seq[:-1]:
            X[row, title] += 1

    return X, targets, title_to_id

def multiomial_nb():
    """Train a Multinomial Naive Bayes model and print its test accuracy.

    Builds the train and test feature matrices with ``build_data``, fits
    ``MultinomialNB`` on the training split, predicts the test split and
    prints the fraction of predictions equal to the test targets.
    Returns nothing.
    """
    # build_data's only parameter is named ``dataset``; the previous
    # ``type=...`` keyword raised a TypeError before any training started.
    X_train, train_targets, _ = build_data(dataset="train")
    X_test, test_targets, _ = build_data(dataset="test")

    # Train
    multi_nb = MultinomialNB()
    print("Training Multinomial Naive Bayes...")
    multi_nb.fit(X_train, train_targets)

    # Test
    print("Running trained model on test dataset")
    predicted = multi_nb.predict(X_test)
    acc = np.mean(predicted == test_targets)

    print("Model Accuracy: " + str(acc))

In [26]:
# Build the title->id mapping and the encoded train/test sequences using the
# function's default paths; `save` defaults to False, so the dict is returned
# instead of being written to disk.
data = preprocess_job_title_sequences()

Reading test and train ids...
Loading dataframe...
Building mapping between job title name and a job title id...
Getting list of job titles for every profile id
Building training data list...
Build test data...
dumping json...


In [29]:
# Wrap the title->id mapping in a bidict so ids can be mapped back to titles
# through its inverse view, and pull out the encoded training sequences.
title_id = bidict(data["title_to_id"])
train = data["train_data"]

In [31]:
# Decode every training sequence from title ids back to title strings.
train_text = [[title_id.inv[i] for i in title_seq] for title_seq in train]

In [33]:
# Join each sequence of titles into one space-separated string.
# NOTE: not idempotent -- re-running this cell on already-joined strings
# would insert a space between every character.
train_text = [" ".join(title_seq) for title_seq in train_text]

In [40]:
# Replace underscores with spaces in every string (presumably the titles use
# "_" as a word separator -- TODO confirm).
train_text = [s.replace("_", " ") for s in train_text]

In [41]:
# Preview only the first few strings: displaying the whole list floods the
# notebook output and bloats the saved file.
train_text[:15]

['president managing partner real estate agent president',
 'server server human resources intern human resources assistant',
 'director of sales director of sales owner',
 'software developer developer software developer team lead senior software developer software engineer',
 'buyer buyer senior buyer buyer',
 'personal trainer security guard security guard security guard security guard security guard',
 'painter customer service representative customer service associate',
 'medical office assistant medical office assistant medical office assistant medical office assistant',
 'senior account manager articling student associate',
 'controller chief finance officer chief finance officer',
 'accountant accountant accountant',
 'mechanic owner owner',
 'purchasing manager marketing manager director of marketing',
 'president service manager sales representative sales representative sales representative',
 'customer service representative customer service representative customer service r