In [104]:
import pandas as pd
import numpy as np
import json
import os
import random
from matplotlib import pyplot as plt
import seaborn as sns
from ggplot import *
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from bidict import bidict
from collections import Counter, defaultdict
from bidict import bidict
import pickle

In [92]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Stack two matplotlib style sheets; entries further right override earlier ones on conflict.
plt.style.use(['dark_background', 'ggplot'])
# NOTE(review): sns.set() applies seaborn's own default theme and may override the
# style sheets chosen on the previous line — confirm the resulting look is intended.
sns.set(color_codes=True)
# Make seaborn's 'dark' palette the default color cycle.
sns.set_palette(sns.color_palette('dark'))

# Load Datasets

In [2]:
# Root folders of the two pre-built datasets.
path550 = "/data/rali7/Tmp/solimanz/data/datasets/top550/"
path7k = "/data/rali7/Tmp/solimanz/data/datasets/reduced7000/"

# Both datasets keep their payload at <root>/jobid/data.json.
data_file = os.path.join("jobid", "data.json")

# Load data dicts
with open(os.path.join(path550, data_file), "r") as f:
    data550 = json.load(f)

with open(os.path.join(path7k, data_file), "r") as f:
    data7k = json.load(f)

In [16]:
# Two-way title <-> id mappings: index with a title to get its id, use `.inv` to go from id to title.
title_id550 = bidict(data550['title_to_id'])
title_id7k = bidict(data7k['title_to_id'])

In [139]:
# How often each title appears as the last element of a training sequence
# (that last element is what later cells use as the prediction target).
labels550_counts = Counter(
    title_id550.inv[sample[1][-1]] for sample in data550['train_data']
)
labels7k_counts = Counter(
    title_id7k.inv[sample[1][-1]] for sample in data7k['train_data']
)

# Statistical Exploration

## Tokenize Labels

In [133]:
# With gaps=True the pattern describes the separators, not the tokens:
# titles are split on runs of whitespace or on any single non-word character.
tokenizer = RegexpTokenizer(r'\s+|\W', gaps=True)

In [134]:
# Distinct titles that occur as a training target, in first-seen order.
labels550 = [*labels550_counts]
labels7k = [*labels7k_counts]

In [135]:
# Tokens to ignore: English stop words plus the empty string and a bare ampersand.
sw = {*stopwords.words('english'), '', '&'}

In [136]:
# Distinct word tokens found across all target titles of each dataset.
vocab550 = {*tokenizer.tokenize(" ".join(labels550))}
vocab7k = {*tokenizer.tokenize(" ".join(labels7k))}

In [137]:
# Drop the ignored tokens; a new set is built, the previous one is not modified.
vocab550 = vocab550.difference(sw)
vocab7k = vocab7k.difference(sw)

In [138]:
# "Labels" here means distinct non-stopword word tokens: the values printed are the
# sizes of the vocab550 / vocab7k token sets, not the number of distinct titles.
print(f"Number of possible labels in 550 dataset: {len(vocab550)}")
print(f"Number of possible labels in 7k dataset: {len(vocab7k)}")

Number of possible labels in 550 dataset: 326
Number of possible labels in 7k dataset: 1845


In [120]:
# NOTE: this rebinds vocab550 / vocab7k from the de-duplicated token sets built earlier
# to flat token lists WITH repeats (one entry per token occurrence over all training
# targets), which is the form needed for frequency counting.
train_titles550 = (title_id550.inv[sample[1][-1]] for sample in data550['train_data'])
train_titles7k = (title_id7k.inv[sample[1][-1]] for sample in data7k['train_data'])

vocab550 = [tok for tok in tokenizer.tokenize(" ".join(train_titles550)) if tok not in sw]
vocab7k = [tok for tok in tokenizer.tokenize(" ".join(train_titles7k)) if tok not in sw]

In [126]:
def normalize_counter(counts):
    """Convert raw counts into percentages of their total.

    Parameters
    ----------
    counts : dict-like (e.g. ``collections.Counter``)
        Maps each key to a numeric count. It is left untouched, so re-running
        the calling cell on the same object gives the same result.

    Returns
    -------
    A copy of ``counts`` (same mapping type, same key order) whose values are
    ``count / total * 100``. When the total is zero (all counts are zero) every
    value is ``0.0`` instead of raising ``ZeroDivisionError``; an empty input
    yields an empty copy.
    """
    total = sum(counts.values())
    # Work on a copy: the original implementation rescaled the caller's object in place.
    percentages = counts.copy()
    for key, count in counts.items():
        percentages[key] = count / total * 100 if total else 0.0

    return percentages

In [127]:
# Percentage share of each token among all entries of vocab550
# (at this point vocab550 is the token list with repeats, so Counter yields occurrence counts).
c = normalize_counter(Counter(vocab550))

In [50]:
def make_df(label_counts):
    """Build a per-title summary table.

    Parameters
    ----------
    label_counts : mapping of title -> occurrence count.

    Returns
    -------
    pandas.DataFrame with one row per title and the columns ``title``,
    ``count`` and ``n_words`` (number of ``word_tokenize`` tokens in the title
    that are not English stop words).
    """
    stop_words = set(stopwords.words('english'))

    def content_word_count(text):
        # Count tokens that survive stop-word filtering.
        return sum(1 for tok in word_tokenize(text) if tok not in stop_words)

    columns = {}
    for title, count in label_counts.items():
        row = (
            ('title', title),
            ('count', count),
            ('n_words', content_word_count(title)),
        )
        for column, value in row:
            columns.setdefault(column, []).append(value)

    return pd.DataFrame(columns)

In [53]:
# One row per distinct training target title: the title, how often it occurs,
# and how many non-stopword tokens it contains.
df550 = make_df(labels550_counts)
df7k = make_df(labels7k_counts)

# Prepare Dataset

### Map Word Tokens to IDs

In [152]:
# Token <-> id maps over the sorted, de-duplicated vocabulary.
# - set(...): when the notebook is run top to bottom, vocab550 / vocab7k are token lists
#   with repeats at this point; enumerating them directly would give each token the index
#   of its last occurrence, i.e. sparse ids that do not fit in a vector of len(label_id).
# - sorted(...): iteration order of a set of strings changes between kernel sessions
#   (string hash randomization), so sorting is what makes the ids reproducible.
label_id_550 = bidict({label: id_ for id_, label in enumerate(sorted(set(vocab550)))})
label_id_7k = bidict({label: id_ for id_, label in enumerate(sorted(set(vocab7k)))})

In [170]:
def map_titles_to_labels(title_id, label_id):
    """Translate every title into the ids of the known tokens it contains.

    Parameters
    ----------
    title_id : mapping of title string -> title id.
    label_id : mapping of token string -> token id.

    Returns
    -------
    dict mapping each title id to the list of token ids found in that title,
    in order of appearance. Tokens that are not keys of ``label_id`` are
    skipped; repeated tokens yield repeated ids.
    """
    # Split on whitespace runs or any single non-word character.
    splitter = RegexpTokenizer(r'\s+|\W', gaps=True)

    def known_token_ids(title):
        return [label_id[tok] for tok in splitter.tokenize(title) if tok in label_id]

    return {id_: known_token_ids(title) for title, id_ in title_id.items()}

In [171]:
# For each dataset: title id -> list of token ids occurring in that title.
title_labels_550 = map_titles_to_labels(title_id550, label_id_550)
title_labels_7k = map_titles_to_labels(title_id7k, label_id_7k)

In [199]:
def make_dataset(data_dict, label_id, title_labels):
    """Re-package a single-label dataset as a multi-label (token-level) one.

    Parameters
    ----------
    data_dict : dict with the keys ``title_to_id``, ``train_data``,
        ``test_data`` and ``maximum_seq_len``. Each sample is indexable as
        ``sample[0]`` (kept as-is) and ``sample[1]`` (a sequence whose last
        element is the target title id).
    label_id : mapping of token -> token id.
    title_labels : mapping of title id -> list of token ids.

    Returns
    -------
    dict with plain-dict copies of both mappings, the input sequences with
    their last element removed, the matching token-id targets, the maximum
    sequence length carried over from ``data_dict``, and ``n_labels``
    (the size of ``label_id``).
    """
    title_to_id = dict(data_dict['title_to_id'])
    token_to_id = dict(label_id)

    def split(samples):
        # Inputs drop the final element of the sequence; targets replace it
        # with the token ids of the title it refers to.
        inputs = [[s[0], s[1][:-1]] for s in samples]
        targets = [[s[0], title_labels[s[1][-1]]] for s in samples]
        return inputs, targets

    train_inputs, train_targets = split(data_dict['train_data'])
    test_inputs, test_targets = split(data_dict['test_data'])

    return {
        'title_to_id': title_to_id,
        'label_id': token_to_id,
        'train_data': train_inputs,
        'train_targets': train_targets,
        'test_data': test_inputs,
        'test_targets': test_targets,
        'maximum_seq_len': data_dict['maximum_seq_len'],
        'n_labels': len(label_id),
    }

In [203]:
# Assemble the multi-label version of each dataset (inputs, token-id targets, mappings).
data_550 = make_dataset(data550, label_id_550, title_labels_550)
data_7k = make_dataset(data7k, label_id_7k, title_labels_7k)

In [201]:
def save_data(data, dataset='top550', root="/data/rali7/Tmp/solimanz/data/datasets/multilabel"):
    """Write ``data`` as JSON to ``<root>/<dataset>/data.json``.

    Parameters
    ----------
    data : JSON-serialisable object to store.
    dataset : name of the sub-folder under ``root`` that receives the file.
    root : base output folder; the default is the location this notebook
        has always written to, so existing calls are unaffected.

    The target folder is created when it does not exist yet, so a first run
    on a fresh machine no longer fails with ``FileNotFoundError``.
    """
    out_dir = os.path.join(root, dataset)
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, "data.json"), "w") as f:
        json.dump(data, f)

In [204]:
# Persist both multi-label datasets.
# NOTE(review): the 7k output folder is 'reduced7k' while the input folder is
# 'reduced7000' — confirm the differing name is intended.
save_data(data_550, 'top550')
save_data(data_7k, 'reduced7k')

In [196]:
# Scratch check: a 3x5 all-zero int32 matrix to try out index-based assignment.
t = np.zeros((3,5), dtype=np.int32)

In [197]:
# Integer-list indexing: set columns 2 and 0 of row 1 to 1 in one assignment.
t[1,[2,0]] = 1

In [198]:
# Display the array to verify that only row 1, columns 0 and 2 were set.
t

array([[0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0],
       [0, 0, 0, 0, 0]], dtype=int32)