In [1]:
from WikiCatUtils import (
    Cache,
    read_categories,
    get_fullpath,
    resample_to_equal)
from os.path import exists
from os import mkdir
import numpy as np

if not exists('/tmp/WikiCat'):                                                               
    mkdir('/tmp/WikiCat')                                                                    
categories_to_download = read_categories(get_fullpath('example_cats.txt'))                 
cache = Cache(get_fullpath('/tmp/WikiCat/cache'), verbosity=0)                            
for category_uri in categories_to_download:                                                  
    cache.loadCategory(category_uri, only_use_cached=True)# maxlinks=100)


dset, label_map = cache.get_dataset(0.6, 0.2, 0.2)

  if 'order' in inspect.getargspec(np.copy)[0]:


In [2]:
# dataset statistics
print({split:len(dset[split][0]) for split in dset})
reverse_label_map = {label_map[category]:category for category in label_map}
indices = reverse_label_map.keys()
print(len(cache.contents))
print("\t%s" % ('\t'.join(map(str,indices))))
for split in dset:
    category_percentages = []
    for category_ind in indices:
        category_percentages.append(dset[split][1].count(category_ind)
                                    / len(dset[split][1]))
    print("%s\t%s" % (split, '\t'.join(map(lambda x: "%0.2f" % x, category_percentages))))

print(reverse_label_map)

{'train': 634, 'test': 215, 'val': 213}
1062
	0	1	2	3	4	5	6
test	0.03	0.03	0.06	0.05	0.10	0.10	0.63
train	0.03	0.03	0.06	0.05	0.10	0.10	0.64
val	0.03	0.03	0.06	0.05	0.10	0.10	0.63
{0: 'Category:Organs (anatomy)', 1: 'Category:Cancer', 2: 'Category:Medical devices', 3: 'Category:Machine learning algorithms', 4: 'Category:Infectious diseases', 5: 'Category:Congenital disorders', 6: 'Category:Rare diseases'}


In [3]:
# this is extremely unbalanced, but we'll see how well it does despite that
# let's try BoW first as a baseline

import re
from string import punctuation
punct_matcher = re.compile(r'[{}]+'.format(re.escape(punctuation)))
space_matcher = re.compile(r'\s+')
def tokenize(article):
    # instead of tokenizing by anything fancy we'll just separate by spaces
    # and remove punctuation / quotes / hyphens
    stripped = article.rstrip().lstrip()
    no_punct = re.sub(punct_matcher, '', stripped)
    tokenized = re.split(space_matcher, no_punct)
    return tokenized

def make_vocab(articles):
    vocab = {}
    for article in articles:
        tokenized_article = tokenize(article)
        for word in tokenized_article:
            if word not in vocab:
                vocab[word] = 1
            else:
                vocab[word] += 1
    return vocab

def convert_classes(labels):
    # most ml packages expect multiclass labels to be one-hot encoded
    n_classes = max(labels) + 1
    label_indices = list(zip(*[(ind, class_ind) for ind, class_ind in enumerate(labels)]))
    label_mat = np.zeros((len(labels), n_classes))
    label_mat[label_indices] = 1
    return label_mat


#train_vocab = make_vocab(dset['train'][0])
#print(len(train_vocab))
class BoW_transformer:
    def __init__(self, minsample=5, tfidf_weights=False):
        self.use_tfidf = tfidf_weights    
        self.minsample = minsample
        
    def get_idf(self, articles):
        # for every word in the vocab, get its inverse doc freq
        self.idf = np.zeros((1,self.vocab_size))
        vocab_appears_in = {word:np.zeros(len(articles), dtype=np.int) 
                            for word in self.lookup}
        for ind, article in enumerate(articles):
            for word in tokenize(article):
                if word in self.lookup:
                    vocab_appears_in[word][ind] = 1
        for word in self.lookup:
            self.idf[:,self.lookup[word]] = np.log(1 + len(articles) / np.sum(
                                                vocab_appears_in[word]))
            
    def fit(self, articles):
        # figure out the vocab and tfidf stuff
        self.vocab = make_vocab(articles)
        vocab_sorted = sorted(self.vocab.keys())
        vocab_minsampled = list(filter(lambda x: self.vocab[x] >= self.minsample, 
                                       vocab_sorted))
        self.lookup = {word:ind for ind, word in enumerate(vocab_minsampled)}
        self.vocab_size = len(self.lookup)
        if self.use_tfidf:
            self.get_idf(articles)
    
    def transform_single(self, tokenized_article):
        bowvec = np.zeros((1,self.vocab_size), dtype=np.float32)
        for word in tokenized_article:
            if word in self.lookup:
                word_ind = self.lookup[word]
                bowvec[:,word_ind] = 1
        if self.use_tfidf:
            # implement the log-scaled frequency
            bowvec = np.log(1 + bowvec)
            bowvec *= self.idf
        return bowvec
    
    def transform(self, articles):
        return np.vstack([self.transform_single(tokenize(article)) 
                          for article in articles])
                
bow_maker = BoW_transformer()
bow_maker.fit(dset['train'][0])
train_bow = bow_maker.transform(dset['train'][0])
train_labels = convert_classes(dset['train'][1])
print(train_bow.shape)

(634, 11091)


In [4]:
#print(tokenize(dset['train'][0][0]))
#print(dset['train'][0][0])

In [5]:
from sklearn.linear_model import LogisticRegression
bow_linear = LogisticRegression()
bow_linear.fit(train_bow, dset['train'][1])

  args, varargs, kw, default = inspect.getargspec(init)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [6]:
print(bow_linear.score(train_bow, dset['train'][1]))
# This result (~100% accuracy) shows the inherent issue with having a ton of features
# and very little data -- obviously we're well beyond LogisticRegression's VC dimension
print(np.histogram(bow_linear.predict(train_bow), bins=train_labels.shape[1])[0])
# this is real bad

1.0
[ 18  20  36  31  61  63 405]


In [7]:
# We can very easily see the overfitting here
val_bow = bow_maker.transform(dset['val'][0])
val_labels = convert_classes(dset['val'][1])

print(np.histogram(bow_linear.predict(val_bow), bins=val_labels.shape[1])[0])
print(np.histogram(np.argmax(val_labels, axis=1), bins=val_labels.shape[1])[0])
print(bow_linear.score(val_bow, dset['val'][1]))

[  4   3  10  11  16   6 163]
[  6   7  12  11  21  21 135]
0.821596244131


In [8]:
# Despite the overfitting it's worth noting that it performs rather well.
# Let's see how this holds up under cross validation
from sklearn.cross_validation import cross_val_score
cv_scores = cross_val_score(LogisticRegression(), train_bow, dset['train'][1], cv=5)
print(np.average(cv_scores))
print(np.std(cv_scores))

  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)


0.839497516657
0.0380556018582


  args, varargs, kw, default = inspect.getargspec(init)


In [9]:
# So it seems to be getting ~84% accuracy pretty regularly. We'll see if we can improve
# it using TF-IDF weights -- however we need to be careful since we need to learn these 
# weights only on the training set.

In [10]:
bow_tfidf_maker = BoW_transformer(tfidf_weights=True)
bow_tfidf_maker.fit(dset['train'][0])
train_bow_tfidf = bow_tfidf_maker.transform(dset['train'][0])


bow_linear_tfidf = LogisticRegression()
bow_linear_tfidf.fit(train_bow_tfidf, dset['train'][1])

  args, varargs, kw, default = inspect.getargspec(init)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [11]:
bow_linear_tfidf.score(train_bow_tfidf, dset['train'][1])

1.0

In [12]:
val_bow_tfidf = bow_tfidf_maker.transform(dset['val'][0])
bow_linear_tfidf.score(val_bow_tfidf, dset['val'][1])

0.82629107981220662

In [None]:
# just to be sure we'll make a simple pipeliner that can take the representer and learner
# in one so we can try out 