In [1]:
import pandas as pd
import numpy as np
import re
import string
import os
import json
from bidict import bidict
import pickle
import random
from math import ceil
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, classification_report
import itertools

In [2]:
# Read the pickled objects that are bound to train_ids / test_ids below.
# NOTE(review): pickle.load can execute arbitrary code; only point these at trusted files.
train_ids_file = "/data/rali7/Tmp/solimanz/data/datasets/top7000/train_ids.pkl"
test_ids_file = "/data/rali7/Tmp/solimanz/data/datasets/top7000/test_ids.pkl"

with open(train_ids_file, "rb") as f:
    train_ids = pickle.load(f)

with open(test_ids_file, "rb") as f:
    test_ids = pickle.load(f)

In [3]:
def preprocess_job_title_sequences(data_path="/data/rali7/Tmp/solimanz/data/pickles/",
                                   save_path="/data/rali7/Tmp/solimanz/data/datasets/",
                                   offset=False, save=False, top_n=550,
                                   ids_path="/data/rali7/Tmp/solimanz/data/datasets/"):
    """Turn the cleaned job-title dataframe into per-profile sequences of title ids.

    Reads ``clean_2017_11_28.pkl`` from ``data_path``, keeps the ``top_n`` most
    frequent values of its ``transformed`` column, assigns each an integer id,
    and converts the titles of every profile listed in the train/test id files
    into a list of those ids.

    Args:
        data_path: Directory containing ``clean_2017_11_28.pkl``.
        save_path: Directory where ``title_seq.json`` is written when ``save`` is true.
        offset: When true, ids start at 1 instead of 0.
        save: When true, also dump the result as JSON under ``save_path``.
        top_n: Number of most frequent titles that receive an id.
        ids_path: Directory containing ``train_ids.pkl`` and ``test_ids.pkl``.

    Returns:
        dict with keys ``'title_to_id'``, ``'train_data'`` and ``'test_data'``.
        The dict is returned whether or not it was also saved.

    Raises:
        KeyError: If a selected profile contains a title outside the ``top_n``
            vocabulary, or a profile id is missing from the dataframe.
    """
    print('Reading test and train ids...')
    # NOTE(review): pickle.load can execute arbitrary code; only use trusted files.
    with open(os.path.join(ids_path, "train_ids.pkl"), "rb") as f:
        train_ids = pickle.load(f)
    with open(os.path.join(ids_path, "test_ids.pkl"), "rb") as f:
        test_ids = pickle.load(f)

    print('Loading dataframe...')
    frame = pd.read_pickle(os.path.join(data_path, "clean_2017_11_28.pkl"))

    # value_counts sorts by descending frequency, so the head is the most common titles.
    top_titles = frame.transformed.value_counts().iloc[:top_n]

    print('Building mapping between job title name and a job title id...')
    first_id = 1 if offset else 0
    title_id = {title: i for i, title in enumerate(top_titles.index.values, start=first_id)}

    print('Getting list of job titles for every profile id')
    # Each profile's titles are reversed relative to their row order in the dataframe.
    # presumably rows are stored newest-first so this yields chronological order — TODO confirm
    func_series = frame.groupby('_id')['transformed'].apply(
        lambda titles: list(reversed(list(titles))))

    print('Building training data list...')
    train_data = [[title_id[title] for title in func_series[i]] for i in train_ids]
    print('Build test data...')
    test_data = [[title_id[title] for title in func_series[i]] for i in test_ids]

    dataset = {
        'title_to_id': title_id,
        'train_data': train_data,
        'test_data': test_data
    }

    if save:
        print('dumping json...')
        with open(os.path.join(save_path, "title_seq.json"), 'w') as f:
            json.dump(dataset, f)

    return dataset

def build_data(dataset="train"):
    """Build a bag-of-titles count matrix and last-title targets for one split.

    Args:
        dataset: Split name; ``dataset + "_data"`` is looked up in the dict
            returned by ``preprocess_job_title_sequences``.

    Returns:
        Tuple ``(X, targets, title_to_id)`` where ``X`` has one row per example
        and one column per title id, ``targets`` holds each example's final
        title id, and ``title_to_id`` is a bidict of the title mapping.
    """
    processed = preprocess_job_title_sequences(offset=False, save=False)
    sequences = processed[dataset + "_data"]
    title_to_id = bidict(processed["title_to_id"])

    n_examples = len(sequences)
    X = np.zeros((n_examples, len(title_to_id)))
    targets = np.zeros(n_examples)

    for row, seq in enumerate(sequences):
        # The final id is the label; every earlier id is counted as a feature.
        targets[row] = seq[-1]
        for title_idx in seq[:-1]:
            X[row, title_idx] += 1

    return X, targets, title_to_id

def multiomial_nb(X_train, train_targets, X_test, test_targets):
    """Fit a MultinomialNB on the training split and print its test accuracy.

    Args:
        X_train: Training feature matrix passed to ``fit``.
        train_targets: Training labels passed to ``fit``.
        X_test: Test feature matrix passed to ``predict``.
        test_targets: Labels compared element-wise against the predictions.

    Returns:
        None. The accuracy is only printed.
    """
    classifier = MultinomialNB()
    print("Training Multinomial Naive Bayes...")
    classifier.fit(X_train, train_targets)

    print("Running trained model on test dataset")
    predictions = classifier.predict(X_test)

    # Fraction of test rows whose predicted label equals the true label.
    is_correct = predictions == test_targets
    accuracy = np.mean(is_correct)

    print("Model Accuracy: " + str(accuracy))
    
def bernoulli_nb(X_train, train_targets, X_test, test_targets):
    """Fit a BernoulliNB on the training split and print its test accuracy.

    Args:
        X_train: Training feature matrix passed to ``fit``.
        train_targets: Training labels passed to ``fit``.
        X_test: Test feature matrix passed to ``predict``.
        test_targets: Labels compared element-wise against the predictions.

    Returns:
        None. The accuracy is only printed.
    """
    # Train
    nb = BernoulliNB()
    print("Training Bernoulli Naive Bayes...")
    nb.fit(X_train, train_targets)

    # Test
    print("Running trained model on test dataset")
    # Predict with the model fitted above; the earlier version called the
    # unrelated global `multi_nb` here and so never evaluated this classifier.
    predicted = nb.predict(X_test)
    acc = np.mean(predicted == test_targets)

    print("Model Accuracy: " + str(acc))

In [14]:
# NOTE(review): in the visible notebook X_test is only assigned in a cell further down,
# so on a fresh Restart & Run All this line raises NameError; move it below that cell.
X_test.shape

(167582, 1838)

In [15]:
# NOTE(review): in the visible notebook X_train is only assigned in a cell further down,
# so on a fresh Restart & Run All this line raises NameError; move it below that cell.
X_train.shape

(670328, 1839)

In [21]:
def make_X(sequences, cv, counts=False, text=False, vocab_size=550):
    """Build a feature matrix from title sequences.

    Args:
        sequences: When ``text`` is true, the documents handed to
            ``cv.transform``; otherwise an iterable of sequences of integer
            title ids, each id in ``range(vocab_size)``.
        cv: Fitted vectorizer exposing ``transform``. Only used when ``text``
            is true; may be ``None`` otherwise.
        counts: For id sequences, store occurrence counts when true and a 0/1
            presence indicator when false. Not consulted when ``text`` is true.
        text: Select the vectorizer path instead of the id path.
        vocab_size: Number of columns of the id-based matrix. Defaults to 550;
            pass the real vocabulary size for larger datasets, since any id
            ``>= vocab_size`` raises ``IndexError``.

    Returns:
        Whatever ``cv.transform`` returns when ``text`` is true, otherwise an
        ``int16`` array of shape ``(len(sequences), vocab_size)``.
    """
    if text:
        # The vectorizer's own configuration decides counts vs. presence here.
        return cv.transform(sequences)

    X = np.zeros((len(sequences), vocab_size), dtype=np.int16)
    for row, seq in enumerate(sequences):
        for title_idx in seq:
            if counts:
                X[row, title_idx] += 1
            else:
                X[row, title_idx] = 1

    return X

In [5]:
# Load the title-sequence JSON for this dataset directory into `data`.
ds1_path = "/data/rali7/Tmp/solimanz/data/datasets/top7000/1/"
ds1_file_name = "title_sequences"
ds1_json_path = os.path.join(ds1_path, f"{ds1_file_name}.json")

with open(ds1_json_path, 'r') as f:
    data = json.load(f)

In [6]:
# Two-way title <-> id mapping (title_id[title] -> id, title_id.inv[id] -> title)
# plus the raw id sequences of both splits.
title_id = bidict(data["title_to_id"])
train, test = data["train_data"], data["test_data"]

In [7]:
def _history_as_titles(id_sequences, id_to_title):
    """Translate all but the final id of each sequence into title strings."""
    return [[id_to_title[idx] for idx in ids[:-1]] for ids in id_sequences]


def _final_ids(id_sequences):
    """Collect the last id of each sequence (used as the prediction target)."""
    return [ids[-1] for ids in id_sequences]


def _titles_to_text(title_sequences):
    """Join each history into one space-separated string, underscores turned into spaces."""
    return [" ".join(titles).replace("_", " ") for titles in title_sequences]


train_seq = _history_as_titles(train, title_id.inv)
train_targets = _final_ids(train)
train_text = _titles_to_text(train_seq)

test_seq = _history_as_titles(test, title_id.inv)
test_targets = _final_ids(test)
test_text = _titles_to_text(test_seq)

In [17]:
# Every history string from both splits, train first, in one list.
all_t = [*train_text, *test_text]

In [22]:
# Learn a single vocabulary over all_t (train and test text together).
# binary=False keeps per-document term counts rather than 0/1 indicators.
cv = CountVectorizer(binary=False)
# Kept as the cell's last expression so the fitted vectorizer's repr is displayed.
cv.fit(all_t)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [23]:
# Vectorize both splits with the already-fitted CountVectorizer.
# With text=True, make_X hands the strings straight to cv.transform.
X_train = make_X(sequences=train_text, cv=cv, counts=True, text=True)
X_test = make_X(sequences=test_text, cv=cv, counts=True, text=True)

In [24]:
# Multinomial naive Bayes on the vectorized training text, default hyper-parameters.
multi_nb = MultinomialNB()
# Kept as the cell's last expression so the fitted estimator's repr is displayed.
multi_nb.fit(X_train, train_targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
# Bernoulli naive Bayes on the same training matrix, default hyper-parameters.
nb = BernoulliNB()
# Kept as the cell's last expression so the fitted estimator's repr is displayed.
nb.fit(X_train, train_targets)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [26]:
def top_k_acc(model, k=1, X=None, targets=None):
    """Top-k accuracy of a fitted probabilistic classifier.

    An example counts as correct when its true label is among the ``k`` classes
    the model assigns the highest probability.

    Args:
        model: Fitted estimator exposing ``predict_proba`` and ``classes_``.
        k: Number of highest-probability classes considered per example.
        X: Feature matrix to score. Defaults to the notebook-level ``X_test``.
        targets: True labels aligned with the rows of ``X``. Defaults to the
            notebook-level ``test_targets``.

    Returns:
        Fraction of examples whose true label is within the top ``k``.
    """
    if X is None:
        X = X_test
    if targets is None:
        targets = test_targets

    probs = model.predict_proba(X)
    top_cols = np.argsort(-probs, axis=1)[:, :k]

    # predict_proba columns follow model.classes_, not the raw label values, so
    # column positions must be translated to labels before comparing. Skipping
    # this is only correct when the training labels are exactly 0..n_classes-1.
    top_labels = np.asarray(model.classes_)[top_cols]

    truth = np.asarray(targets).reshape(-1, 1)
    hits = (top_labels == truth).any(axis=1)
    return np.mean(hits)

In [None]:
# Top-1 through top-5 accuracy (in percent) of the multinomial model.
for k in range(1, 6):
    label = "acc" if k == 1 else f"top {k}"
    print(f"{label}: {top_k_acc(multi_nb, k=k)*100:.2f}")

acc: 15.09
top 2: 22.07
top 3: 26.50
top 4: 29.77
top 5: 32.38


In [None]:
# Top-1 through top-5 accuracy (in percent) of the Bernoulli model.
for k in range(1, 6):
    label = "acc" if k == 1 else f"top {k}"
    print(f"{label}: {top_k_acc(nb, k=k)*100:.2f}")

acc: 13.04
top 2: 19.50
