In [None]:
from __future__ import division

import os
import numpy as np
import time, timeit
import signal
import scipy.io as scio
from scipy import stats
from scipy.sparse import coo_matrix
from MDPD import *
from MDPD.readers import *
from MDPD import utils
import matplotlib.pyplot as plt
import matplotlib

----
## Read Data

In [None]:
# Bluebird dataset: read the crowd annotations and the ground-truth labels.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
folder = '/media/vzhao/Data/crowdsourcing_datasets/bird'
train = Crowd_Sourcing_Readers.read_data(os.path.join(folder, 'bluebird_crowd.txt'))
label = Crowd_Sourcing_Readers.read_label(os.path.join(folder, 'bluebird_truth.txt'))
# All-zero array shaped like `train` without its leading axis; passed as `lock` to the MDPD calls below.
# presumably 0 means "not locked" -- TODO confirm against MDPD's lock semantics
lock = np.zeros(train.shape[1:])

In [None]:
# Dog dataset: read the crowd annotations and the ground-truth labels.
# NOTE(review): unlike the bird/rte/web cells, this cell does not define `lock`.
# Later cells that pass `lock=lock` will either raise NameError on a fresh kernel or
# silently reuse a `lock` built for another dataset -- define it here once the
# intended mask for this dataset is confirmed.
folder = '/media/vzhao/Data/crowdsourcing_datasets/dog'
train = Crowd_Sourcing_Readers.read_data(os.path.join(folder, 'dog_crowd.txt'))
label = Crowd_Sourcing_Readers.read_label(os.path.join(folder, 'dog_truth.txt'))


In [None]:
# RTE dataset: read the crowd annotations and the ground-truth labels.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
folder = '/media/vzhao/Data/crowdsourcing_datasets/rte'
train = Crowd_Sourcing_Readers.read_data(os.path.join(folder, 'rte_crowd.txt'))
label = Crowd_Sourcing_Readers.read_label(os.path.join(folder, 'rte_truth.txt'))
# Zero array shaped like `train` without its leading axis, with the last column set to 1.
# presumably the last category encodes a missing annotation and is locked -- TODO confirm
lock = np.zeros(train.shape[1:])
lock[:, -1] = 1

In [None]:
# TREC dataset: read the crowd annotations and the ground-truth labels.
# NOTE(review): unlike the bird/rte/web cells, this cell does not define `lock`.
# Later cells that pass `lock=lock` will either raise NameError on a fresh kernel or
# silently reuse a `lock` built for another dataset -- define it here once the
# intended mask for this dataset is confirmed.
folder = '/media/vzhao/Data/crowdsourcing_datasets/trec'
train = Crowd_Sourcing_Readers.read_data(os.path.join(folder, 'trec_crowd.txt'))
label = Crowd_Sourcing_Readers.read_label(os.path.join(folder, 'trec_truth.txt'))

In [None]:
# Web dataset: read the crowd annotations and the ground-truth labels.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
folder = '/media/vzhao/Data/crowdsourcing_datasets/web'
train = Crowd_Sourcing_Readers.read_data(os.path.join(folder, 'web_crowd.txt'))
label = Crowd_Sourcing_Readers.read_label(os.path.join(folder, 'web_truth.txt'))
# Zero array shaped like `train` without its leading axis, with the last column set to 1.
# presumably the last category encodes a missing annotation and is locked -- TODO confirm
lock = np.zeros(train.shape[1:])
lock[:, -1] = 1

---
## Analysis

### Original Mutual Information Residue

In [None]:
# Original mutual information residue (G score)
# Uses the explicitly imported `utils` module, like the feature-ranking cells,
# rather than reaching it through a name provided by a star import.
score = utils.Feature_Selection.MI_score(train, rm_diag=True, lock=lock)
dim = train.shape[1]
# Average over the dim * (dim - 1) ordered off-diagonal pairs.
# The parenthesised single-argument print behaves the same on Python 2 and 3.
print(np.sum(score) / (dim * (dim-1)))

In [None]:
# NOTE(review): this call passes only `train`, whereas the other calls to
# MI_score_conditional in this notebook also pass a log-posterior (and rm_diag).
# It looks like an unfinished scratch cell -- confirm it runs, or remove it.
score, weights = MDPD.utils.Feature_Selection.MI_score_conditional(train, )

In [None]:
# label to log_post
def label2logpost(label, ncomp):
    """Turn hard labels into a one-hot log-posterior matrix.

    Parameters
    ----------
    label : numpy.ndarray
        Integer class indices; the first axis indexes samples and
        ``label[i]`` selects the column of row ``i`` that gets probability 1.
    ncomp : int
        Number of columns (components) in the output.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(label.shape[0], ncomp)`` holding ``0.0`` at the
        labelled column of each row and ``-inf`` everywhere else.

    Raises
    ------
    IndexError
        If a label is outside ``[-ncomp, ncomp)``.
    """
    nsample = label.shape[0]
    post = np.zeros((nsample, ncomp))
    # enumerate() instead of xrange() so the function runs on Python 2 and 3.
    for i, cls in enumerate(label):
        post[i, cls] = 1
    # log(0) = -inf is the intended value for the non-labelled entries, so the
    # divide-by-zero RuntimeWarning numpy would raise here is only noise.
    with np.errstate(divide='ignore'):
        return np.log(post)
# One-hot log-posterior built from the ground-truth labels, with two components.
# NOTE(review): two components only fits a binary-label dataset -- adjust for the others.
log_post = label2logpost(label, ncomp=2)

In [None]:
# Mutual-information score conditional on the log-posterior built from the true labels.
# NOTE(review): no `lock` is passed here, unlike the later conditional call on the
# vanilla model's posterior -- confirm that is intentional.
score, weighted = MDPD.utils.Feature_Selection.MI_score_conditional(train, log_post, rm_diag=True)

In [None]:
# Sum the conditional score over axis 1, then scale by `weighted` broadcast along a new leading axis.
# assumes score is (dim, dim, ncomp) and weighted is (ncomp,) -- TODO confirm
ss = score.sum(axis=1) * weighted[np.newaxis, :]

In [None]:
# Sanity check: display the shape of the weighted, summed score.
ss.shape

In [None]:
# Compute the feature ranking in this cell so it does not depend on a later cell
# having been run first (top-to-bottom execution previously failed on `features`).
# A separate name keeps the conditional `score` from the cell above intact.
features, ranking_score = utils.Feature_Selection.MI_feature_ranking(train)
# Normalise by the number of other features instead of a hard-coded 38,
# matching the normalisation used in the feature-ranking plot.
norm = train.shape[1] - 1
# Conditional score of the first two components, in ranking order.
plt.plot(ss[features, 0] / norm)
plt.plot(ss[features, 1] / norm)
# Unconditional ranking score as a dashed reference curve.
plt.plot(ranking_score / norm, '--')
features

### Reference G Statistics

In [None]:
percentages = [99,95,90,75,50]
# Chi-square quantiles with 3 degrees of freedom, divided by 2 * train.shape[0].
# presumably this maps G = 2 * N * MI thresholds onto the MI scale, with N the
# number of samples -- TODO confirm the degrees of freedom for the dataset in use
percentiles = [stats.chi2.ppf(x/100.,3) / (2 * train.shape[0]) for x in percentages]

# Parenthesised single-argument prints behave the same on Python 2 and 3.
print('Reference G statistics at {} percentile'.format(percentages))
print(percentiles)

### Mixture Model with Feature Selection Performance

#### Feature Ranking

In [None]:
# Rank the features and plot their score, normalised by the number of other features.
features, score = utils.Feature_Selection.MI_feature_ranking(train)
n_other = train.shape[1] - 1
plt.plot(score / n_other)
# Overlay each reference G-statistic level as a dashed cyan horizontal line.
x_span = [0, len(score)]
for level in percentiles:
    plt.plot(x_span, [level] * 2, 'c--')
features

#### Accuracy and Mutual Information Residue

In [None]:
# Number of top-ranked features kept for the feature-selected model and the residue computation below.
Ntop = 9

In [None]:
# Feature Selection
# Fit a 5-component model on the Ntop best-ranked features, initialised from the majority vote.
# NOTE(review): ncomp differs from the vanilla model below (4) -- each value presumably
# matches a different dataset; keep them in sync with the dataset actually loaded.
model = MDPD.MDPD()
model.fit(train, ncomp=5, init='majority', verbose=False, features=features[:Ntop], niter=50, lock=lock)
# Not the last expression of the cell, so its return value is not displayed.
# presumably the method prints its result itself -- TODO confirm
model.accuracy(train, label)
model.MI_residue(train, lock)

In [None]:
# optional
# Switch the fitted model to every feature index in range(model.dim), then re-evaluate accuracy.
# NOTE(review): this mutates `model` in place, so re-running the previous cell is
# needed to get back to the feature-selected state.
model.change_features(train, features=range(model.dim))
model.accuracy(train, label)

### Vanilla Model Performance
#### Accuracy

In [None]:
# basic model with majority vote initialization
# No `features` argument is passed here, in contrast to the feature-selected fit above.
# NOTE(review): ncomp=4 here vs ncomp=5 in the feature-selected fit -- confirm both
# match the dataset currently loaded.
model_basic = MDPD.MDPD()
model_basic.fit(train, ncomp=4, init='majority', verbose=False, niter=50, lock=lock)
model_basic.accuracy(train, label)

#### Mutual Information Residue

In [None]:
# Not the last expression of the cell, so its return value is not displayed.
# presumably the method prints its result itself -- TODO confirm
model_basic.MI_residue(train, lock)
# MI residue conditional on the features (although basic model is using all features to learn the model)
# The ranking's own score is overwritten by the conditional score two lines below.
features, score = utils.Feature_Selection.MI_feature_ranking(train)
features = np.array(features)
log_post = model_basic.log_posterior(train)
score, weights = utils.Feature_Selection.MI_score_conditional(train,log_post,rm_diag=True, lock=lock)
# Keep only the Ntop x Ntop sub-block of pairwise scores among the top-ranked features.
score_selected = score[features[:Ntop, np.newaxis], features[:Ntop]]
# Parenthesised single-argument prints behave the same on Python 2 and 3.
print('MI residue conditional on the features (although basic model is using all features to learn the model)')
# Weight the per-component totals and average over the Ntop * (Ntop - 1) ordered pairs.
print(np.sum(score_selected.sum(axis=(0,1)) * weights) / (Ntop * (Ntop - 1)))

### 