In [1]:
from __future__ import division

import os
import numpy as np
import time, timeit
import signal
import scipy.io as scio
from scipy import stats
from scipy.sparse import coo_matrix
from MDPD import *
from MDPD.readers import *
from MDPD import utils
import matplotlib.pyplot as plt
import matplotlib

----
## Read Data

In [2]:
# Root directory holding one sub-folder per crowdsourcing dataset (bird, dog, rte, trec, web).
# Set the CROWDSOURCING_DATA_DIR environment variable to point at a local copy, e.g.
# '/Users/vincent/Documents/Research/MDPD/crowdsourcing_datasets'. The original workstation
# path stays as the fallback so existing runs behave exactly as before.
folder = os.environ.get('CROWDSOURCING_DATA_DIR', '/media/vzhao/Data/crowdsourcing_datasets/')

In [None]:
# Bluebird dataset: read the crowd responses together with the ground-truth labels.
reader = Crowd_Sourcing_Readers(
    os.path.join(folder, 'bird', 'bluebird_crowd.txt'),
    os.path.join(folder, 'bird', 'bluebird_truth.txt')
)
train, label = reader.data, reader.labels
# Boolean mask shaped like one sample of `train`; it is handed to the MI / fit calls as `lock=`.
# The builtin `bool` replaces `np.bool`, an alias that NumPy removed in 1.24.
# No entry is set for this dataset, unlike the other loader cells which set the last column.
lock = np.zeros(train.shape[1:], dtype=bool)
print(train.shape)

In [3]:
# Dog dataset: read the crowd responses together with the ground-truth labels.
reader = Crowd_Sourcing_Readers(
    os.path.join(folder, 'dog', 'dog_crowd.txt'),
    os.path.join(folder, 'dog', 'dog_truth.txt')
)
train, label = reader.data, reader.labels
# Boolean mask shaped like one sample of `train`; it is handed to the MI / fit calls as `lock=`.
# The builtin `bool` replaces `np.bool`, an alias that NumPy removed in 1.24.
lock = np.zeros(train.shape[1:], dtype=bool)
# Flag the last entry along the final axis for every feature.
# NOTE(review): presumably this is the extra "missing value" label the reader reports creating — confirm.
lock[:, -1] = 1
print(train.shape)

2018-04-04 17:36:26,992 : INFO : Data has missing values. A new label is created to represent the missing values.


(807, 109, 5)


In [None]:
# RTE dataset: read the crowd responses together with the ground-truth labels.
reader = Crowd_Sourcing_Readers(
    os.path.join(folder, 'rte', 'rte_crowd.txt'),
    os.path.join(folder, 'rte', 'rte_truth.txt')
)
train, label = reader.data, reader.labels
# Boolean mask shaped like one sample of `train`; it is handed to the MI / fit calls as `lock=`.
# The builtin `bool` replaces `np.bool`, an alias that NumPy removed in 1.24.
lock = np.zeros(train.shape[1:], dtype=bool)
# Flag the last entry along the final axis for every feature.
# NOTE(review): presumably the reader's extra "missing value" label — confirm.
lock[:, -1] = 1
print(train.shape)

In [None]:
# TREC dataset: read the crowd responses together with the ground-truth labels.
reader = Crowd_Sourcing_Readers(
    os.path.join(folder, 'trec', 'trec_crowd.txt'),
    os.path.join(folder, 'trec', 'trec_truth.txt')
)
train, label = reader.data, reader.labels
# Boolean mask shaped like one sample of `train`; it is handed to the MI / fit calls as `lock=`.
# The builtin `bool` replaces `np.bool`, an alias that NumPy removed in 1.24.
lock = np.zeros(train.shape[1:], dtype=bool)
# Flag the last entry along the final axis for every feature.
# NOTE(review): presumably the reader's extra "missing value" label — confirm.
lock[:, -1] = 1
print(train.shape)

In [None]:
# Web dataset: read the crowd responses together with the ground-truth labels.
reader = Crowd_Sourcing_Readers(
    os.path.join(folder, 'web', 'web_crowd.txt'),
    os.path.join(folder, 'web', 'web_truth.txt')
)
train, label = reader.data, reader.labels
# Boolean mask shaped like one sample of `train`; it is handed to the MI / fit calls as `lock=`.
# The builtin `bool` replaces `np.bool`, an alias that NumPy removed in 1.24.
lock = np.zeros(train.shape[1:], dtype=bool)
# Flag the last entry along the final axis for every feature.
# NOTE(review): presumably the reader's extra "missing value" label — confirm.
lock[:, -1] = 1
print(train.shape)

---
## Analysis

### Original Mutual Information Residue

In [4]:
%%time
# Original mutual information residue (G score)
score_origin = utils.Feature_Selection.MI_score(train, rm_diag=True, lock=lock)
sigma_origin = score_origin.sum(axis=1)
dim = train.shape[1]
print np.sum(score_origin) / (dim * (dim-1))

7.624746746624678e+37
CPU times: user 75.1 ms, sys: 65.1 ms, total: 140 ms
Wall time: 50.3 ms


#### Reference G Statistics

In [None]:
# Reference thresholds derived from chi-square quantiles.
# Each quantile is divided by 2 * (number of samples) so it sits on the same scale as the
# plotted scores — NOTE(review): this relies on G = 2 * N * MI with MI in nats; confirm.
# NOTE(review): the 3 degrees of freedom are hard-coded; confirm they match the loaded dataset.
percentages = [99, 95, 90, 75, 50]
percentiles = [stats.chi2.ppf(x / 100., 3) / (2 * train.shape[0]) for x in percentages]

print('Reference G statistics at {} percentile'.format(percentages))
print(percentiles)

### Mutual Information Residue when the true labels are used as the posterior distribution

In [None]:
def label2logpost(label, ncomp):
    """Convert hard class labels into a one-hot log-posterior matrix.

    Parameters
    ----------
    label : array_like of int
        Class indices; the first axis indexes samples. Each value must be a
        valid column index, i.e. lie in ``[-ncomp, ncomp)``.
    ncomp : int
        Number of columns (components) in the result.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(nsample, ncomp)`` holding ``0.0`` (= log 1) at
        ``[i, label[i]]`` and ``-inf`` (= log 0) everywhere else.

    Raises
    ------
    IndexError
        If a label falls outside the valid column range.
    """
    label = np.asarray(label)
    nsample = label.shape[0]
    # Fill with log(0) directly instead of taking np.log of a zero matrix, which
    # produces the same values but emits a divide-by-zero RuntimeWarning.
    log_post = np.full((nsample, ncomp), -np.inf)
    # Row indices shaped to broadcast against `label`, so a 1-D label vector and
    # an (nsample, 1) column are handled the same way as the per-sample loop did.
    rows = np.arange(nsample).reshape((nsample,) + (1,) * (label.ndim - 1))
    log_post[rows, label] = 0.0
    return log_post
# Log-posterior built from the ground-truth labels, with one column per label value 0..label.max().
log_post = label2logpost(label, ncomp=label.max() + 1)
# The return value is not captured, so this presumably rewrites the -inf entries of
# log_post in place — TODO confirm against utils.log_replace_neginf.
utils.log_replace_neginf(log_post)

In [None]:
%%time
score, weighted = MDPD.utils.Feature_Selection.MI_score_conditional(train, log_post, rm_diag=True, lock=lock)
score_condition = score.sum(axis=1)
print 'Mutual Information Residue if use the true label as the posterior distribution'
print np.sum(score_condition * weighted[np.newaxis, :]) / (dim * (dim - 1))

### [Plot] Mutual Information Residue when the true labels are used as the posterior distribution vs. Raw Residue

In [None]:
plt.figure()
# Order the features by their raw score, largest first.
idx = np.argsort(sigma_origin)[::-1]
# One curve per column of score_condition. The bound comes from the array being plotted
# rather than from train.shape[2], so no column is skipped or over-indexed when the
# posterior has a different number of columns than the data has categories.
for k in range(score_condition.shape[1]):
    plt.plot(score_condition[idx, k] / (dim - 1), label='conditional, column {}'.format(k))
plt.plot(sigma_origin[idx] / (dim - 1), '--', label='raw')
# plot reference G statistics
for foo in percentiles[:3]:
    plt.plot([0, len(score)], [foo, foo], 'c--')
plt.xlabel('feature (sorted by raw score, descending)')
plt.ylabel('score / (dim - 1)')
plt.legend()
plt.show()

In [None]:
plt.figure()
# Recompute the feature ordering here so the cell does not depend on `idx` left over
# from a previously executed cell (same expression, same result).
idx = np.argsort(sigma_origin)[::-1]
plt.plot(sigma_origin[idx] / (dim - 1), '--', label='raw')
# Combine the columns of score_condition using `weighted` into one curve per feature.
plt.plot(np.sum(score_condition[idx, :] * weighted[np.newaxis, :], axis=1) / (dim - 1),
         label='conditional (weighted sum)')
# plot reference G statistics
for foo in percentiles[:3]:
    plt.plot([0, len(score)], [foo, foo], 'c--')
plt.xlabel('feature (sorted by raw score, descending)')
plt.ylabel('score / (dim - 1)')
plt.legend()
plt.show()

### Mixture Model with Feature Selection Performance

#### Feature Ranking

In [None]:
%%time
# Rank the features of `train`; later cells keep the first Ntop entries of `features`.
# This also rebinds `score`, which earlier cells assigned from MI_score_conditional.
# NOTE(review): unlike the MI_score / MI_score_conditional calls, `lock` is not passed
# here — confirm that is intended.
features, score = utils.Feature_Selection.MI_feature_ranking(train)

#### Accuracy and Mutual Information Residue

In [None]:
# Number of top-ranked features to keep; used below as features[:Ntop].
Ntop = 15

In [None]:
# Feature Selection: fit the model on the Ntop highest-ranked features only.
# NOTE(review): ncomp=5 here, while the vanilla model further down uses ncomp=4 — confirm.
model = MDPD.MDPD()
model.fit(
    train,
    ncomp=5,
    init='majority',
    verbose=False,
    features=features[:Ntop],
    epoch=50,
    lock=lock
)
# Evaluate against the ground-truth labels, then report the model's MI residue.
model.accuracy(train, label)
model.MI_residue(train)

In [None]:
# optional
# Pass every feature index 0..model.dim-1 to change_features, then re-check accuracy.
# Presumably this makes the already-fitted model use all features — TODO confirm.
model.change_features(train, features=range(model.dim))
model.accuracy(train, label)

#### [Plot] Mutual Information Residue vs the Residue of the Raw Data

In [None]:
# Log-posterior of the fitted feature-selection model, then MI scores conditioned on it.
log_post = model.log_posterior(train)
# The call goes through the explicitly imported `utils` module, like every other
# Feature_Selection call in this notebook, instead of the star-imported `MDPD` name.
score, weighted = utils.Feature_Selection.MI_score_conditional(train, log_post, rm_diag=True, lock=lock)
# Collapse axis 1 of the score array; the plotting cells index the result as [feature, column].
score_condition = score.sum(axis=1)
# The posterior here comes from the fitted model, not the true labels, so the message says so.
print('Mutual Information Residue if use the model posterior distribution')
# Weight each column by `weighted`, then normalise by the dim * (dim - 1) ordered feature pairs.
print(np.sum(score_condition * weighted[np.newaxis, :]) / (dim * (dim - 1)))

In [None]:
plt.figure()
# Order the features by their raw score, largest first.
idx = np.argsort(sigma_origin)[::-1]
# One curve per column of score_condition. The bound comes from the array being plotted
# rather than from train.shape[2]: the model above is fitted with its own ncomp, so the
# number of posterior columns need not match the number of data categories.
for k in range(score_condition.shape[1]):
    plt.plot(score_condition[idx, k] / (dim - 1), label='conditional, column {}'.format(k))
plt.plot(sigma_origin[idx] / (dim - 1), '--', label='raw')
# plot reference G statistics
for foo in percentiles[:3]:
    plt.plot([0, len(score)], [foo, foo], 'c--')
plt.xlabel('feature (sorted by raw score, descending)')
plt.ylabel('score / (dim - 1)')
plt.legend()
plt.show()

In [None]:
plt.figure()
# Recompute the feature ordering here so the cell does not depend on `idx` left over
# from a previously executed cell (same expression, same result).
idx = np.argsort(sigma_origin)[::-1]
plt.plot(sigma_origin[idx] / (dim - 1), '--', label='raw')
# Combine the columns of score_condition using `weighted` into one curve per feature.
plt.plot(np.sum(score_condition[idx, :] * weighted[np.newaxis, :], axis=1) / (dim - 1),
         label='conditional (weighted sum)')
# plot reference G statistics
for foo in percentiles[:3]:
    plt.plot([0, len(score)], [foo, foo], 'c--')
plt.xlabel('feature (sorted by raw score, descending)')
plt.ylabel('score / (dim - 1)')
plt.legend()
plt.show()

### Vanilla Model Performance
#### Accuracy

In [None]:
# Baseline: the same model with majority-vote initialisation and no `features=` restriction.
model_basic = MDPD.MDPD()
model_basic.fit(
    train,
    ncomp=4,
    init='majority',
    verbose=False,
    epoch=50,
    lock=lock
)
# Evaluate against the ground-truth labels.
model_basic.accuracy(train, label)

#### Mutual Information Residue

In [None]:
model_basic.MI_residue(train)

# Residue restricted to the Ntop highest-ranked features, even though the basic
# model itself was fitted without a feature restriction.
features, score = utils.Feature_Selection.MI_feature_ranking(train)
features = np.array(features)

# Condition the pairwise scores on the basic model's posterior (this rebinds `score`).
log_post = model_basic.log_posterior(train)
score, weights = utils.Feature_Selection.MI_score_conditional(
    train, log_post, rm_diag=True, lock=lock
)

# Keep only the (Ntop x Ntop) sub-block of feature pairs; trailing axes are untouched.
score_selected = score[np.ix_(features[:Ntop], features[:Ntop])]
print('MI residue conditional on the features (although basic model is using all features to learn the model)')
# Sum over both feature axes, weight by `weights`, normalise by the Ntop * (Ntop - 1) ordered pairs.
print(np.sum(score_selected.sum(axis=(0, 1)) * weights) / (Ntop * (Ntop - 1)))