In [1]:
import pandas as pd
import numpy as np

# ---------------------------------------------------------------------------
# Raw train / test tables for the three difficulty levels (no header row).
# ---------------------------------------------------------------------------
easy_train = pd.read_csv('./Data/EASY_TRAIN.csv', header=None)
easy_test = pd.read_csv('./Data/EASY_TEST.csv', header=None)

moderate_train = pd.read_csv('./Data/MODERATE_TRAIN.csv', header=None)
moderate_test = pd.read_csv('./Data/MODERATE_TEST.csv', header=None)

difficult_train = pd.read_csv('./Data/DIFFICULT_TRAIN.csv', header=None)
difficult_test = pd.read_csv('./Data/DIFFICULT_TEST.csv', header=None)


def _drop_first_column(path):
    """Read a blinded CSV and return every column except column 0 as an array."""
    return pd.read_csv(path, header=None).loc[:, 1:].values


# Blinded sets: column 0 is discarded, the remaining columns are the features.
easy_blind = _drop_first_column('./Data/EASY_BLINDED.csv')
moderate_blind = _drop_first_column('./Data/MODERATE_BLINDED.csv')
difficult_blind = _drop_first_column('./Data/DIFFICULT_BLINDED.csv')


def _features_and_labels(frame, label_col):
    """Split `frame` into (columns 0..label_col-1, column label_col) arrays.

    `.loc` slicing is label-based and inclusive, so `:label_col - 1` selects
    exactly the columns that precede the label column.
    """
    return frame.loc[:, :label_col - 1].values, frame.loc[:, label_col].values


# EASY and MODERATE keep the label in column 26; DIFFICULT keeps it in column 52.
easy_trainX, easy_trainY = _features_and_labels(easy_train, 26)
easy_testX, easy_testY = _features_and_labels(easy_test, 26)

moderate_trainX, moderate_trainY = _features_and_labels(moderate_train, 26)
moderate_testX, moderate_testY = _features_and_labels(moderate_test, 26)

difficult_trainX, difficult_trainY = _features_and_labels(difficult_train, 52)
difficult_testX, difficult_testY = _features_and_labels(difficult_test, 52)

# Label <-> integer code tables, derived from the EASY training labels only.
# NOTE(review): the same table is applied to MODERATE and DIFFICULT below, so a
# label that never occurs in easy_trainY raises KeyError -- confirm all three
# datasets share one label set.
_classes = np.unique(easy_trainY)
labelMapping = {label: code for code, label in enumerate(_classes)}
labelMapping_reverse = dict(enumerate(_classes))


def _encode(labels):
    """Translate an iterable of raw labels into an array of integer codes."""
    return np.array([labelMapping[label] for label in labels])


easy_trainY_numeric = _encode(easy_trainY)
easy_testY_numeric = _encode(easy_testY)

moderate_trainY_numeric = _encode(moderate_trainY)
moderate_testY_numeric = _encode(moderate_testY)

difficult_trainY_numeric = _encode(difficult_trainY)
difficult_testY_numeric = _encode(difficult_testY)

In [44]:
# Which difficulty level the rest of the notebook operates on.
# Must be one of 'EASY', 'MODERATE' or 'DIFFICULT'.
TAG = 'MODERATE'

# Bind the generic names used by every later cell to the chosen dataset.
if TAG == 'EASY':
    X_train, y_train = easy_trainX, easy_trainY_numeric
    X_test, y_test = easy_testX, easy_testY_numeric
    X_blind = easy_blind
elif TAG == 'MODERATE':
    X_train, y_train = moderate_trainX, moderate_trainY_numeric
    X_test, y_test = moderate_testX, moderate_testY_numeric
    X_blind = moderate_blind
elif TAG == 'DIFFICULT':
    X_train, y_train = difficult_trainX, difficult_trainY_numeric
    X_test, y_test = difficult_testX, difficult_testY_numeric
    X_blind = difficult_blind
else:
    # Without this branch a mistyped TAG would silently leave X_train & co.
    # undefined (fresh kernel) or pointing at a previously selected dataset.
    raise ValueError(
        "Unknown TAG %r; expected 'EASY', 'MODERATE' or 'DIFFICULT'" % (TAG,))

# Produce primary result

In [2]:
from libact.base.dataset import Dataset
from libact.models import LogisticRegression
from libact.query_strategies import UncertaintySampling, RandomSampling
from libact.labelers import IdealLabeler
from libact.query_strategies import ActiveLearningByLearning
from libact.query_strategies import HintSVM, QUIRE
from libact.query_strategies import UncertaintySampling, VarianceReduction
from libact.models import LogisticRegression
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import plotly

import copy

# Number of leading training samples whose labels are revealed up front.
n_labeled = 100

# Pool labels: the first n_labeled entries keep their (float) label, every
# remaining entry is None, i.e. still unlabeled.
_n_hidden = len(y_train) - n_labeled
_pool_labels = np.concatenate(
    [y_train[:n_labeled].astype('float64'), [None] * _n_hidden])

# One independent copy of the partially labeled pool per query strategy.
trn_ds = Dataset(X_train.astype('float64'), _pool_labels)
tst_ds = Dataset(X_test.astype('float64'), y_test)
trn_ds2 = copy.deepcopy(trn_ds)
trn_ds3 = copy.deepcopy(trn_ds)

# The fully labeled training set backs the labeler that answers queries.
fully_labeled_trn_ds = Dataset(X_train, y_train)
lbr = IdealLabeler(fully_labeled_trn_ds)

model = LogisticRegression()

# Uncertainty sampling with the 'sm' and 'lc' methods, plus a random baseline.
qs_us_sm = UncertaintySampling(
    trn_ds, method='sm', model=LogisticRegression())
qs_us_lc = UncertaintySampling(
    trn_ds2, method='lc', model=LogisticRegression())
qs_rnd = RandomSampling(trn_ds3)
# Disabled alternative strategy:
# qs_vr = VarianceReduction(trn_ds3, model=LogisticRegression())

def run(trn_ds, tst_ds, lbr, model, qs, quota):
    """Run `quota` rounds of active learning and record the error after each.

    Every round asks `qs` for the index of the next sample to label, obtains
    its label from `lbr`, writes it into `trn_ds`, retrains `model` on
    `trn_ds` and scores it on both datasets.  The round index is printed every
    100 rounds as a progress indicator.

    Parameters
    ----------
    trn_ds : pool dataset exposing `.data` (indexable sequence of
        (features, label) pairs) and `.update(index, label)`; it is modified
        in place.
    tst_ds : dataset passed to `model.score` for the out-of-sample error.
    lbr : labeler exposing `.label(features)`.
    model : learner exposing `.train(dataset)` and `.score(dataset)`.
    qs : query strategy exposing `.make_query()`.
    quota : int, number of queries to issue.

    Returns
    -------
    (E_in, E_out) : two 1-D float arrays of length `quota` holding
        `1 - model.score(trn_ds)` and `1 - model.score(tst_ds)` per round.
    """
    E_in, E_out = [], []

    for step in range(quota):
        if step % 100 == 0:
            # Function-call form works on both Python 2 and Python 3.
            print(step)

        # Standard usage of libact objects
        ask_id = qs.make_query()
        # Look the queried entry up directly instead of unzipping the whole
        # pool on every iteration.
        features = trn_ds.data[ask_id][0]
        lb = lbr.label(features)
        trn_ds.update(ask_id, lb)

        model.train(trn_ds)
        # Collect in lists (O(1) append) and convert once at the end;
        # np.append copies the whole array on every call.
        E_in.append(1 - model.score(trn_ds))
        E_out.append(1 - model.score(tst_ds))

    return np.asarray(E_in, dtype=float), np.asarray(E_out, dtype=float)


# Base learner that is retrained after every query (any libact model works,
# e.g. LogisticRegression, SVM ...).
model = LogisticRegression()

# Number of queries issued per strategy.
# Alternative that exhausts the pool: quota = len(y_train) - n_labeled
quota = 2400

# Compare the two UncertaintySampling variants with RandomSampling, each on
# its own copy of the pool, in the order SM -> LC -> random.
_results = [
    run(pool, tst_ds, lbr, model, strategy, quota)
    for pool, strategy in (
        (trn_ds, qs_us_sm),
        (trn_ds2, qs_us_lc),
        (trn_ds3, qs_rnd),
    )
]
(E_in_1, E_out_1), (E_in_2, E_out_2), (E_in_3, E_out_3) = _results


# Plot the learning curves of the UncertaintySampling strategies against
# RandomSampling. The x-axis is the number of queries issued so far and the
# y-axis is the corresponding error rate.

import os

# Read the plotly account from the environment instead of committing the API
# key with the notebook.  Set PLOTLY_USERNAME and PLOTLY_API_KEY before
# starting the kernel; a missing variable raises KeyError here.
# NOTE(review): the key that used to be hard-coded on this line is in the
# file's history and should be rotated.
plotly.tools.set_credentials_file(
    username=os.environ['PLOTLY_USERNAME'],
    api_key=os.environ['PLOTLY_API_KEY'],
)


def _error_trace(errors, name):
    """Return a lines+markers trace of `errors` against the 1-based query index."""
    return go.Scatter(
        # One x value per error value: 1 .. len(errors) inclusive.  The previous
        # range(1, len(errors)) was one element short of the y data.
        x = list(range(1, len(errors) + 1)),
        y = errors,
        mode = 'lines+markers',
        name = name
    )


trace_train_sm = _error_trace(E_in_1, 'Train error / uncertainty sampling (SM)')
trace_train_lc = _error_trace(E_in_2, 'Train error / uncertainty sampling (LC)')
trace_train_random = _error_trace(E_in_3, 'Train error / random')

trace_test_sm = _error_trace(E_out_1, 'Test error / uncertainty sampling (SM)')
trace_test_lc = _error_trace(E_out_2, 'Test error / uncertainty sampling (LC)')
trace_test_random = _error_trace(E_out_3, 'Test error / random')


# Train curves first, then test curves, same order as before.
a = [trace_train_sm,trace_train_lc,trace_train_random,trace_test_sm,trace_test_lc,trace_test_random]

py.iplot(a, filename='plot')

In [6]:
# Pristine partially labeled pool: the first n_labeled labels are revealed,
# the rest are None.  Later cells deep-copy this object per strategy.
trn_ds_ori = Dataset(
    X_train.astype('float64'),
    np.concatenate([
        y_train[:n_labeled].astype('float64'),
        [None] * (len(y_train) - n_labeled),
    ]),
)

# Different regularization terms

In [1]:
# Pristine partially labeled pool (first n_labeled labels revealed, rest None).
trn_ds_ori = Dataset(
    X_train.astype('float64'),
    np.concatenate([y_train[:n_labeled].astype('float64'), [None] * (len(y_train) - n_labeled)]))

# One independent copy of the pool per strategy compared below.
# range() instead of xrange() so the cell also runs on Python 3.
trn_dss = [copy.deepcopy(trn_ds_ori) for i in range(5)]

# ALBL chooses among three uncertainty samplers that differ only in the
# regularization strength C of their logistic-regression model.
qs_albl = ActiveLearningByLearning(
    trn_dss[0], # Dataset object
    query_strategies=[
        UncertaintySampling(trn_dss[0], method='sm', model=LogisticRegression(C=1.)),
        UncertaintySampling(trn_dss[0], method='sm', model=LogisticRegression(C=.01)),
        UncertaintySampling(trn_dss[0], method='sm', model=LogisticRegression(C=.1))
    ],
    model=LogisticRegression(),
    # Tie the ALBL budget to the number of queries actually issued by run()
    # instead of repeating the literal 2400.
    T = quota
)


qs_us_sm_1 = UncertaintySampling(trn_dss[1], method='sm', model=LogisticRegression(C=1.))
qs_us_sm_01 = UncertaintySampling(trn_dss[2], method='sm', model=LogisticRegression(C=.1))
qs_us_sm_001 = UncertaintySampling(trn_dss[3], method='sm', model=LogisticRegression(C=.01))
qs_random = RandomSampling(trn_dss[4])

qss = [qs_albl, qs_us_sm_1, qs_us_sm_01, qs_us_sm_001, qs_random]

labels = ['ALBL','C = 1', 'C = 0.1','C = 0.01', 'random']

traces = []
for i, qs in enumerate(qss):
    # NOTE: this rebinds the notebook-level name `trn_ds`; after the loop it
    # refers to the last pool in trn_dss (the one used by random sampling).
    trn_ds = trn_dss[i]
    E_in, E_out = run(trn_ds, tst_ds, lbr, model, qs, quota)
    # 1-based query index with one x value per recorded error; the previous
    # range(1, len(E_in)) was one element short of the y data.
    query_index = list(range(1, len(E_in) + 1))
    trace_train = go.Scatter(
        x = query_index,
        y = E_in,
        mode = 'lines+markers',
        name = 'Train error / ' + labels[i]
    )
    trace_test = go.Scatter(
        x = query_index,
        y = E_out,
        mode = 'lines+markers',
        name = 'Test error / '+ labels[i]
    )
    traces.append(trace_train)
    traces.append(trace_test)

py.iplot(traces, filename='plot')

# Predict by XGBoost

In [22]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Boolean mask over the pool: True where an entry already carries a label.
# NOTE(review): `trn_ds` is whatever pool the most recently executed cell bound
# to that name -- confirm it is the intended one before trusting the scores.
selected = np.array([entry[1] is not None for entry in trn_ds.data])

# Gather features and labels of the labeled entries directly.  The previous
# np.array(trn_ds.data)[selected, :] built an object array from ragged
# (feature-vector, label) pairs twice, which newer NumPy versions reject.
labeled_entries = [entry for entry, keep in zip(trn_ds.data, selected) if keep]
X_selected = np.asarray([entry[0] for entry in labeled_entries])
y_selected = np.asarray([entry[1] for entry in labeled_entries])

gbm = xgb.XGBClassifier(max_depth=8, n_estimators=400, learning_rate=0.05, colsample_bytree=0.3, \
                        gamma=0.15, subsample=0.5).fit(X_selected,y_selected)
# NOTE(review): a slice step of -100 keeps only every 100th index counting back
# from the most important feature; `[::-1]` (full descending order) may have
# been intended -- confirm.  The result is not used later in this cell.
indexes_xgboost = np.argsort(gbm.feature_importances_)[::-100]

y_pred_test = gbm.predict(X_test)
y_pred_train = gbm.predict(X_train)

y_pred_blind = gbm.predict(X_blind)

# Accuracy on the test set, then on the full training set.
print(accuracy_score(y_test, y_pred_test))
print(accuracy_score(y_train, y_pred_train))

# Map the predicted integer codes back to the original label values.
y_pred_blind_label = [labelMapping_reverse[i] for i in y_pred_blind]

# Two-column output: 1-based row id, predicted label.
output_blind = pd.DataFrame(np.transpose(np.vstack([np.array(range(1,y_pred_blind.shape[0]+1)),y_pred_blind_label])))
output_blind.to_csv(TAG+"_BLINDED_PRED.csv",header=False,index=False)

'Train error / ALBL'

In [3]:
from libact.query_strategies import *
import copy

# One independent copy of the pristine pool per strategy compared below.
# range() instead of xrange() so the cell also runs on Python 3.
trn_dss_2 = [copy.deepcopy(trn_ds_ori) for i in range(3)]

# Query-by-committee over three logistic regressions with different C.
qs_ct = QueryByCommittee(
    trn_dss_2[0],
    models=[
        LogisticRegression(C=1.0),
        LogisticRegression(C=.1),
        LogisticRegression(C=.01)
    ]
)

# ALBL choosing among three uncertainty samplers that differ only in C.
qs_albl = ActiveLearningByLearning(
    trn_dss_2[1], # Dataset object
    query_strategies=[
        UncertaintySampling(trn_dss_2[1], method='sm', model=LogisticRegression(C=1.)),
        UncertaintySampling(trn_dss_2[1], method='sm', model=LogisticRegression(C=.01)),
        UncertaintySampling(trn_dss_2[1], method='sm', model=LogisticRegression(C=.1))
    ],
    model=LogisticRegression(),
    # Tie the ALBL budget to the number of queries actually issued by run()
    # instead of repeating the literal 2400.
    T = quota
)

qs_random = RandomSampling(trn_dss_2[2])

qss_2 = [qs_ct, qs_albl, qs_random]
labels_2 = ["Commitee", "ALBL", "random"]

traces_2 = []
for i, qs in enumerate(qss_2):
    # NOTE: this rebinds the notebook-level name `trn_ds`; after the loop it
    # refers to the last pool in trn_dss_2 (the one used by random sampling).
    trn_ds = trn_dss_2[i]
    E_in, E_out = run(trn_ds, tst_ds, lbr, model, qs, quota)
    # 1-based query index with one x value per recorded error; the previous
    # range(1, len(E_in)) was one element short of the y data.
    query_index = list(range(1, len(E_in) + 1))
    trace_train = go.Scatter(
        x = query_index,
        y = E_in,
        mode = 'lines+markers',
        name = 'Train error / ' + labels_2[i]
    )
    trace_test = go.Scatter(
        x = query_index,
        y = E_out,
        mode = 'lines+markers',
        name = 'Test error / '+ labels_2[i]
    )
    traces_2.append(trace_train)
    traces_2.append(trace_test)

py.iplot(traces_2, filename='plot')