In [1]:
from __future__ import division

import os
import numpy as np
import time, timeit
import signal
import scipy.io as scio
from scipy import stats
from scipy.sparse import coo_matrix
from MDPD.readers import *
from MDPD import utils, MDPD
import matplotlib.pyplot as plt
import matplotlib

----
## Read Data

In [2]:
# Root directory holding one sub-folder per crowdsourcing dataset
# ('bird', 'dog', 'rte', 'trec', 'web').  Set the MDPD_DATA_DIR environment
# variable to point at a different location without editing the notebook;
# the default is the original hard-coded path.
# e.g. MDPD_DATA_DIR=/Users/vincent/Documents/Research/MDPD/crowdsourcing_datasets
folder = os.environ.get('MDPD_DATA_DIR', '/media/vzhao/Data/crowdsourcing_datasets/')

In [None]:
# Bluebird dataset: crowd votes plus ground-truth labels.
bird_dir = os.path.join(folder, 'bird')
reader = Crowd_Sourcing_Readers(os.path.join(bird_dir, 'bluebird_crowd.txt'),
                                os.path.join(bird_dir, 'bluebird_truth.txt'))
train, label = reader.data, reader.labels
# Boolean mask with one entry per (feature, vocabulary slot); nothing is
# locked for this dataset.  The builtin `bool` replaces the `np.bool` alias,
# which was removed from NumPy.
lock = np.zeros(train.shape[1:], dtype=bool)
print(train.shape)

In [None]:
# Dog dataset: crowd votes plus ground-truth labels.
dog_dir = os.path.join(folder, 'dog')
reader = Crowd_Sourcing_Readers(os.path.join(dog_dir, 'dog_crowd.txt'),
                                os.path.join(dog_dir, 'dog_truth.txt'))
train, label = reader.data, reader.labels
# Boolean mask with one entry per (feature, vocabulary slot).  The builtin
# `bool` replaces the `np.bool` alias, which was removed from NumPy.
lock = np.zeros(train.shape[1:], dtype=bool)
# Lock the last vocabulary slot of every feature -- presumably the extra
# "missing value" label added by the reader; TODO confirm.
lock[:, -1] = 1
# Function-call form prints identically on Python 2 and also runs on Python 3.
print(train.shape)

In [None]:
# RTE dataset: crowd votes plus ground-truth labels.
rte_dir = os.path.join(folder, 'rte')
reader = Crowd_Sourcing_Readers(os.path.join(rte_dir, 'rte_crowd.txt'),
                                os.path.join(rte_dir, 'rte_truth.txt'))
train, label = reader.data, reader.labels
# Boolean mask with one entry per (feature, vocabulary slot).  The builtin
# `bool` replaces the `np.bool` alias, which was removed from NumPy.
lock = np.zeros(train.shape[1:], dtype=bool)
# Lock the last vocabulary slot of every feature -- presumably the extra
# "missing value" label added by the reader; TODO confirm.
lock[:, -1] = 1
print(train.shape)

In [None]:
# TREC dataset: crowd votes plus ground-truth labels.
trec_dir = os.path.join(folder, 'trec')
reader = Crowd_Sourcing_Readers(os.path.join(trec_dir, 'trec_crowd.txt'),
                                os.path.join(trec_dir, 'trec_truth.txt'))
train, label = reader.data, reader.labels
# Boolean mask with one entry per (feature, vocabulary slot).  The builtin
# `bool` replaces the `np.bool` alias, which was removed from NumPy.
lock = np.zeros(train.shape[1:], dtype=bool)
# Lock the last vocabulary slot of every feature -- presumably the extra
# "missing value" label added by the reader; TODO confirm.
lock[:, -1] = 1
# Function-call form prints identically on Python 2 and also runs on Python 3.
print(train.shape)

In [3]:
# Web dataset: crowd votes plus ground-truth labels.
web_dir = os.path.join(folder, 'web')
reader = Crowd_Sourcing_Readers(os.path.join(web_dir, 'web_crowd.txt'),
                                os.path.join(web_dir, 'web_truth.txt'))
train, label = reader.data, reader.labels
# Boolean mask with one entry per (feature, vocabulary slot).  The builtin
# `bool` replaces the `np.bool` alias, which was removed from NumPy.
lock = np.zeros(train.shape[1:], dtype=bool)
# Lock the last vocabulary slot of every feature -- presumably the extra
# "missing value" label added by the reader; TODO confirm.
lock[:, -1] = 1
print(train.shape)

2018-05-16 22:19:00,485 : INFO : Data has missing values. A new label is created to represent the missing values.


(2653, 177, 6)


#### Update global variables

In [4]:
# Cache the three axis lengths of the currently loaded dataset.
NSAMPLE, DIM, NVOCAB = train.shape
# Effective vocabulary size: one slot fewer when the reader reports missing
# values.
if reader.is_missing_value:
    EFF_NVOCAB = NVOCAB - 1
else:
    EFF_NVOCAB = NVOCAB

---
# Analysis

## Original Mutual Information Residue

In [None]:
%%time
# Pairwise mutual-information scores of the raw data (G score), with the
# diagonal removed.
score_origin = utils.Feature_Selection.MI_score(train, lock=lock, rm_diag=True)
# Per-feature total: sum of each feature's scores along axis 1.
sigma_origin = score_origin.sum(axis=1)
# Mean score over the DIM * (DIM - 1) ordered pairs of distinct features.
n_feature_pairs = DIM * (DIM - 1)
print(np.sum(score_origin) / n_feature_pairs)

#### Reference G Statistics

In [None]:
# Reference levels for the plots below: chi-square quantiles divided by
# 2 * NSAMPLE, which puts them on the same scale as the mutual-information
# scores.
percentages = [99, 95, 90, 75, 50]
# NOTE(review): the degrees of freedom are EFF_NVOCAB**2 - 1.  A test of
# independence between two EFF_NVOCAB-valued variables would normally use
# (EFF_NVOCAB - 1)**2 -- confirm which reference distribution is intended.
chi2_dof = EFF_NVOCAB**2 - 1
percentiles = [stats.chi2.ppf(x / 100., chi2_dof) / (2 * NSAMPLE) for x in percentages]
print('Reference G statistics at {} percentile'.format(percentages))
print(percentiles)

---
## Mutual Information Residue when the true label is used as the posterior distribution

In [None]:
# label to log_post
def label2logpost(label, ncomp):
    """Turn hard labels into a one-hot log-posterior matrix.

    Parameters
    ----------
    label : integer array whose first axis indexes the samples
        ``label[i]`` holds the component index (or indices) of sample ``i``.
    ncomp : int
        Number of components, i.e. the number of columns of the result.

    Returns
    -------
    numpy.ndarray of shape ``(label.shape[0], ncomp)``, dtype float64
        ``0.0`` (log 1) at ``[i, label[i]]`` and ``-inf`` (log 0) elsewhere.

    The matrix is built directly in log space.  The values are the same as
    ``np.log`` applied to a 0/1 matrix, but no "divide by zero" RuntimeWarning
    is raised for the intentional ``-inf`` entries.
    """
    nsample = label.shape[0]
    log_post = np.full((nsample, ncomp), -np.inf)
    if nsample:
        # One row index per sample, broadcast against that sample's label(s);
        # this matches the element-wise assignment post[i, label[i]] = 1.
        rows = np.arange(nsample)[:, np.newaxis]
        cols = np.asarray(label).reshape(nsample, -1)
        log_post[rows, cols] = 0.0
    return log_post
# One component per distinct label value, assuming labels run 0..max.
n_label_components = label.max() + 1
log_post = label2logpost(label, n_label_components)
# Post-process the -inf entries of log_post via the project helper (exact
# replacement value is defined in utils, not visible here).
utils.log_replace_neginf(log_post)

In [None]:
%%time
# Conditional pairwise scores given the posterior in `log_post` (built from
# the true labels in the previous cell).  Uses the imported `utils` directly,
# like the other scoring cells, rather than reaching it through the `MDPD`
# submodule.
score, weights = utils.Feature_Selection.MI_score_conditional(train, log_post, rm_diag=True, lock=lock)
# Sum over axis 1; the last axis is kept and later weighted by `weights`.
score_condition = score.sum(axis=1)
print('Mutual Information Residue if use the true label as the posterior distribution')
# Weighted sum over the last axis, averaged over DIM * (DIM - 1) feature pairs.
print(np.sum(score_condition * weights[np.newaxis, :]) / (DIM * (DIM - 1)))

#### [Plot] Mutual Information Residue when the true label is used as the posterior distribution vs. Raw Residue

In [None]:
fig, ax = plt.subplots()
# Order features by decreasing raw score so the curves share one x-axis.
idx = np.argsort(sigma_origin)[::-1]
# Skip the last slice of the final axis whenever any slot is locked.
n_curves = train.shape[2] - 1 if np.any(lock) else train.shape[2]
for k in range(n_curves):
    ax.plot(score_condition[idx, k] / (DIM - 1))
# Dashed line: raw (unconditional) per-feature score.
ax.plot(sigma_origin[idx] / (DIM - 1), '--')
# plot reference G statistics (first three reference levels)
for level in percentiles[:3]:
    ax.plot([0, len(score)], [level, level], 'c--')
plt.show()

In [None]:
fig, ax = plt.subplots()
# Dashed line: raw (unconditional) per-feature score, in the order given by `idx`.
ax.plot(sigma_origin[idx] / (DIM - 1), '--')
# Solid line: conditional score combined across the last axis with `weights`.
weighted_condition = np.sum(score_condition[idx, :] * weights[np.newaxis, :], axis=1)
ax.plot(weighted_condition / (DIM - 1))
# plot reference G statistics (first three reference levels)
for level in percentiles[:3]:
    ax.plot([0, len(score)], [level, level], 'c--')
plt.show()

---
## Mixture Model with Feature Selection

#### Feature Ranking

In [5]:
%%time
# Pairwise scores of the raw data, summed along axis 1 to give one total per
# feature.
score = utils.Feature_Selection.MI_score(train, lock=lock)
per_feature_total = score.sum(axis=1)
# Feature indices from highest to lowest total, and the totals in that order.
features = np.argsort(per_feature_total)[::-1]
sigma = per_feature_total[features]
# Alternative kept from the original cell (presumably equivalent; not verified):
# features, sigma = utils.Feature_Selection.MI_feature_ranking(train, lock=lock)
print(features)

[  2   7   0  11  12   6   1  15  25  13  10   3  44  32 102   5  69  38
  29  36   8  62 106  58 103  56  46  27  24  23  52  51  28  41  21  31
  74  68  33  35  77  99  75  64 114  50  82  17 142 109 117  65 107 100
  30 129  93  19  26 105  40  86  98  90  59  37  91 112  78  34  81  67
 120 126   4  70 134  61   9  83  47  18 130  71  49 135 124  96 127  92
 119  72  88 104  48 108  55  85 144  14 118  16  39  87 101  60 139  89
  54 155 140 132  63  84  94 113 138 145  95 158  22  42 116 165  80  57
  45  66 164 143  53 147  76 125 161 123 128 153 141 121  79  20  97 131
 111 160 122  43 157  73 115 133 110 156 148 150 173 137 146 162 168 159
 175 169 167 176 163 171 136 172 166 151 154 170 149 174 152]
CPU times: user 416 ms, sys: 41.3 ms, total: 457 ms
Wall time: 165 ms


  log_second_scaled = np.log(second_masked)


#### Accuracy and Mutual Information Residue

In [6]:
# Number of top-ranked entries of `features` used for the feature-selection
# model fit and for the "within selected features" residue.
Ntop = 15

In [7]:
# Feature Selection
model = MDPD.MDPD_standard()
# The number of components follows the effective vocabulary size instead of a
# hard-coded 5, so the cell stays correct when another dataset cell is run
# (EFF_NVOCAB is 5 for the web dataset, so that run is unchanged).  This
# matches the vanilla model fit, which also uses EFF_NVOCAB.
model.fit(train, EFF_NVOCAB, features=features[:Ntop], init='majority', verbose=True, epoch=50, lock=lock)
model.accuracy(train, label)
model.MI_residue(train)

2018-05-16 22:19:48,063 : INFO : Training an MDPD using batch EM 
	 dimension 177 
	 15 features 
	 sample size 2653 
	 vocab size 6 
	 the target number of components 5
  log_votes = np.log(votes)
  log_second = np.log(second)
  pmi = second * (log_second - log_first)
2018-05-16 22:19:49,915 : INFO : iteration 0; log-likelihood (feature selection) 97.025492; log_likelihood 569.561290;information residue 0.001060
2018-05-16 22:19:51,639 : INFO : iteration 1; log-likelihood (feature selection) 97.034741; log_likelihood 569.579014;information residue 0.001064
2018-05-16 22:19:53,304 : INFO : iteration 2; log-likelihood (feature selection) 97.037768; log_likelihood 569.584658;information residue 0.001066
2018-05-16 22:19:55,202 : INFO : iteration 3; log-likelihood (feature selection) 97.039161; log_likelihood 569.587125;information residue 0.001067
2018-05-16 22:19:57,076 : INFO : iteration 4; log-likelihood (feature selection) 97.039917; log_likelihood 569.588445;information residue 0.00

2018-05-16 22:21:23,120 : INFO : The mutual information residue (include all features) is 0.02540336806757374
2018-05-16 22:21:23,122 : INFO : The mutual information residue (within selected features) is 0.11764949348692147


In [None]:
# Save the fitted feature-selection model to 'tmp.p' (a path relative to the
# current working directory).
model.save('tmp.p')

In [None]:
# optional
# Switch the fitted model to use every feature, then re-evaluate accuracy.
# A concrete list is passed so the call receives the same kind of object on
# Python 3 (where range() is lazy) as it did on Python 2.
model.change_features(train, features=list(range(model.dim)))
model.accuracy(train, label)

#### [Plot] Mutual Information Residue vs the Residue of the Raw Data

In [None]:
# Conditional pairwise scores given the posterior of the feature-selection model.
log_post = model.log_posterior(train)
score, weights = utils.Feature_Selection.MI_score_conditional(train, log_post, rm_diag=True, lock=lock)
# Stored as `score_condition` because that is the name the two plotting cells
# that follow read; otherwise they would plot whatever an earlier section
# left in that variable.
score_condition = score.sum(axis=1)
# Previous name, kept as an alias so existing references keep working.
sigma_condition = score_condition
# Function-call form prints identically on Python 2 and also runs on Python 3.
print('Mutual Information Residue of the model with feature selection')
print(np.sum(score_condition * weights[np.newaxis, :]) / (DIM * (DIM - 1)))

In [None]:
fig, ax = plt.subplots()
# Order features by decreasing raw score so the curves share one x-axis.
idx = np.argsort(sigma_origin)[::-1]
# Skip the last slice of the final axis whenever any slot is locked.
n_curves = train.shape[2] - 1 if np.any(lock) else train.shape[2]
# range() instead of the Python-2-only xrange(), as in the earlier plot cell.
for k in range(n_curves):
    ax.plot(score_condition[idx, k] / (DIM - 1))
# Dashed line: raw (unconditional) per-feature score.
ax.plot(sigma_origin[idx] / (DIM - 1), '--')
# plot reference G statistics (first three reference levels)
for level in percentiles[:3]:
    ax.plot([0, len(score)], [level, level], 'c--')
plt.show()

In [None]:
fig, ax = plt.subplots()
# Dashed line: raw (unconditional) per-feature score, in the order given by `idx`.
ax.plot(sigma_origin[idx] / (DIM - 1), '--')
# Solid line: conditional score combined across the last axis with `weights`.
weighted_condition = np.sum(score_condition[idx, :] * weights[np.newaxis, :], axis=1)
ax.plot(weighted_condition / (DIM - 1))
# plot reference G statistics (first three reference levels)
for level in percentiles[:3]:
    ax.plot([0, len(score)], [level, level], 'c--')
plt.show()

## Vanilla Model Performance
#### Accuracy and Mutual Information Residue

In [None]:
# Baseline: a standard MDPD model initialised by majority vote, without
# restricting the feature set.
model_basic = MDPD.MDPD_standard()
basic_fit_options = dict(ncomp=EFF_NVOCAB, init='majority', verbose=False, epoch=50, lock=lock)
model_basic.fit(train, **basic_fit_options)
model_basic.accuracy(train, label)
model_basic.MI_residue(train)

In [None]:
# Conditional pairwise scores given the posterior of the vanilla model.  Uses
# the imported `utils` directly, like the other scoring cells.
log_post = model_basic.log_posterior(train)
score, weights = utils.Feature_Selection.MI_score_conditional(train, log_post, rm_diag=True, lock=lock)
score_condition = score.sum(axis=1)
# The message names the vanilla model: log_post comes from model_basic here,
# not from the true labels.
print('Mutual Information Residue of the vanilla model')
print(np.sum(score_condition * weights[np.newaxis, :]) / (DIM * (DIM - 1)))

print('Mutual Information Residue (within the selected features)')
# Restrict both feature axes of `score` to the Ntop highest-ranked features.
top_features = features[:Ntop]
score_select = score[top_features[:, np.newaxis], top_features, :]
res_select = np.sum(score_select.sum(axis=1) * weights[np.newaxis, :]) / (Ntop * (Ntop - 1))
print(res_select)

#### [Plot] Mutual Information Residue vs the Residue of the Raw Data

In [None]:
fig, ax = plt.subplots()
# Order features by decreasing raw score so the curves share one x-axis.
idx = np.argsort(sigma_origin)[::-1]
# Skip the last slice of the final axis whenever any slot is locked.
n_curves = train.shape[2] - 1 if np.any(lock) else train.shape[2]
# range() instead of the Python-2-only xrange(), as in the earlier plot cell.
for k in range(n_curves):
    ax.plot(score_condition[idx, k] / (DIM - 1))
# Dashed line: raw (unconditional) per-feature score.
ax.plot(sigma_origin[idx] / (DIM - 1), '--')
# plot reference G statistics (first three reference levels)
for level in percentiles[:3]:
    ax.plot([0, len(score)], [level, level], 'c--')
plt.show()

In [None]:
fig, ax = plt.subplots()
# Dashed line: raw (unconditional) per-feature score, in the order given by `idx`.
ax.plot(sigma_origin[idx] / (DIM - 1), '--')
# Solid line: conditional score combined across the last axis with `weights`.
weighted_condition = np.sum(score_condition[idx, :] * weights[np.newaxis, :], axis=1)
ax.plot(weighted_condition / (DIM - 1))
# plot reference G statistics (first three reference levels)
for level in percentiles[:3]:
    ax.plot([0, len(score)], [level, level], 'c--')
plt.show()

---
## Generate Image

#### NIPS 2018

Missing Values

In [None]:
# Image of 1 - train[..., -1]: the complement of the last vocabulary slot
# (presumably the missing-value indicator; TODO confirm), with rows as items
# and columns as workers.
plt.imshow(1 - train[..., -1], origin='lower', cmap='PuBu')
# 'auto' replaces the 'normal' option, which Matplotlib removed; it lets the
# image fill the axes instead of forcing square pixels.
plt.axis('auto')
plt.xlabel('Worker Index', fontsize=15)
plt.ylabel('Item Index', fontsize=15)

In [None]:
# Display the shape of `train` (the tuple unpacked into NSAMPLE, DIM, NVOCAB
# earlier in the notebook).
train.shape