In [23]:
import numpy as np
import torch
from torchvision import datasets, transforms
from torch.utils.data import Dataset
from PIL import Image

import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker


In [24]:
# datasets
def get_dataset(name):
    """Load the train/test tensors of the dataset called ``name``.

    Parameters
    ----------
    name : str
        ``'MNIST'`` or ``'FashionMNIST'``.

    Returns
    -------
    tuple
        ``(X_tr, Y_tr, X_te, Y_te)`` as returned by the matching loader.

    Raises
    ------
    ValueError
        If ``name`` is not a supported dataset.  Previously the function fell
        through and returned ``None``, which only failed later when the
        caller tried to unpack the result.
    """
    if name == 'MNIST':
        return get_MNIST()
    if name == 'FashionMNIST':
        return get_FashionMNIST()
    raise ValueError(
        "Unknown dataset {!r}; expected 'MNIST' or 'FashionMNIST'".format(name))
    

def get_MNIST():
    """Download (if needed) MNIST into ``./MNIST`` and return its raw tensors.

    Returns
    -------
    tuple
        ``(X_tr, Y_tr, X_te, Y_te)``: the images and labels of the training
        split followed by those of the test split, exactly as exposed by
        torchvision (no transform is applied here).
    """
    raw_tr = datasets.MNIST('./MNIST', train=True, download=True)
    raw_te = datasets.MNIST('./MNIST', train=False, download=True)
    # `data` / `targets` are the current attribute names; the split-specific
    # aliases (train_data, train_labels, test_data, test_labels) are deprecated.
    X_tr = raw_tr.data
    Y_tr = raw_tr.targets
    X_te = raw_te.data
    Y_te = raw_te.targets
    return X_tr, Y_tr, X_te, Y_te


def get_FashionMNIST():
    """Download (if needed) FashionMNIST into ``./FashionMNIST`` and return its raw tensors.

    Returns
    -------
    tuple
        ``(X_tr, Y_tr, X_te, Y_te)``: the images and labels of the training
        split followed by those of the test split, exactly as exposed by
        torchvision (no transform is applied here).
    """
    raw_tr = datasets.FashionMNIST('./FashionMNIST', train=True, download=True)
    raw_te = datasets.FashionMNIST('./FashionMNIST', train=False, download=True)
    # `data` / `targets` are the current attribute names; the split-specific
    # aliases (train_data, train_labels, test_data, test_labels) are deprecated.
    X_tr = raw_tr.data
    Y_tr = raw_tr.targets
    X_te = raw_te.data
    Y_te = raw_te.targets
    return X_tr, Y_tr, X_te, Y_te


def get_handler(name):
    """Return the ``Dataset`` wrapper class used for the dataset ``name``.

    Both supported datasets share the same handler class.

    Raises
    ------
    ValueError
        If ``name`` is not a supported dataset (previously ``None`` was
        returned silently and the failure surfaced only when the handler
        was called).
    """
    if name in ('MNIST', 'FashionMNIST'):
        return DataHandler1
    raise ValueError(
        "Unknown dataset {!r}; expected 'MNIST' or 'FashionMNIST'".format(name))

class DataHandler1(Dataset):
    """Dataset wrapper that yields ``(sample, label, index)`` triples.

    The position of the item is returned alongside the data so that callers
    can write per-sample results back into a pre-allocated buffer.
    """

    def __init__(self, X, Y, transform=None):
        self.transform = transform
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        sample = self.X[index]
        label = self.Y[index]
        if self.transform is None:
            # No transform configured: hand back the stored sample untouched.
            return sample, label, index
        # The transform is applied to a PIL image built from the sample in
        # 'L' mode, so the stored tensor is converted first.
        image = Image.fromarray(sample.numpy(), mode='L')
        return self.transform(image), label, index



In [25]:
# models
def get_net(name):
    """Return the network class (not an instance) used for the dataset ``name``.

    Raises
    ------
    ValueError
        If ``name`` is not a supported dataset (previously ``None`` was
        returned silently and the failure surfaced only when the class was
        instantiated).
    """
    if name in ('MNIST', 'FashionMNIST'):
        return Net1
    raise ValueError(
        "Unknown dataset {!r}; expected 'MNIST' or 'FashionMNIST'".format(name))

class Net1(nn.Module):
    """Two-convolution classifier that returns logits and the fc1 embedding."""

    def __init__(self):
        super().__init__()
        # Layers are created in this exact order so that seeded parameter
        # initialisation is reproducible.
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def get_embedding_dim(self):
        """Width of the embedding returned as the second output of ``forward``."""
        return 50

    def forward(self, x):
        """Return ``(logits, embedding)`` for a batch ``x``.

        ``embedding`` is the ReLU-activated output of ``fc1`` taken *before*
        dropout; ``logits`` is the output of ``fc2`` applied after dropout.
        """
        stage1 = F.relu(F.max_pool2d(self.conv1(x), 2))
        conv2_out = self.conv2_drop(self.conv2(stage1))
        stage2 = F.relu(F.max_pool2d(conv2_out, 2))
        # Flatten to the 320 features expected by fc1.
        flat = stage2.view(-1, 320)
        e1 = F.relu(self.fc1(flat))
        # Functional dropout follows the module's train/eval mode.
        dropped = F.dropout(e1, training=self.training)
        logits = self.fc2(dropped)
        return logits, e1


In [26]:
# strategies
class Strategy:
    """Base class for pool-based active-learning query strategies.

    Parameters
    ----------
    X, Y
        Pool samples and their labels; ``len(Y)`` defines the pool size.
    idxs_lb
        Boolean mask over the pool marking the currently labeled samples.
    net
        Callable returning a fresh model whose ``forward`` yields
        ``(logits, embedding)`` and which provides ``get_embedding_dim()``.
    handler
        Dataset class called as ``handler(X, Y, transform=...)`` and yielding
        ``(x, y, index)`` items.
    args : dict
        Must provide ``'n_epoch'``, ``'transform'``, ``'loader_tr_args'``,
        ``'loader_te_args'`` and ``'optimizer_args'``.

    Subclasses override :meth:`query`.
    """

    def __init__(self, X, Y, idxs_lb, net, handler, args):
        self.X = X
        self.Y = Y
        self.idxs_lb = idxs_lb
        self.net = net
        self.handler = handler
        self.args = args
        self.n_pool = len(Y)
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")

    def query(self, n):
        """Select ``n`` pool samples to label; the base implementation does nothing."""
        pass

    def update(self, idxs_lb):
        """Replace the labeled-sample mask."""
        self.idxs_lb = idxs_lb

    def _train(self, epoch, loader_tr, optimizer):
        """Run one epoch of plain cross-entropy training over ``loader_tr``."""
        self.clf.train()
        for batch_idx, (x, y, idxs) in enumerate(loader_tr):
            x, y = x.to(self.device), y.to(self.device)
            optimizer.zero_grad()
            out, e1 = self.clf(x)
            loss = F.cross_entropy(out, y)
            loss.backward()
            optimizer.step()

    def _train_irt(self, epoch, loader_tr, optimizer, alpha, beta, a,  b):
        """Run one epoch with the affine-rescaled loss ``a + (b-a)*CE(alpha*(out-beta), y)``."""
        self.clf.train()
        for batch_idx, (x, y, idxs) in enumerate(loader_tr):
            x, y = x.to(self.device), y.to(self.device)
            optimizer.zero_grad()
            out, e1 = self.clf(x)
            loss = a + (b-a)*F.cross_entropy(alpha*(out-beta), y)
            loss.backward()
            optimizer.step()

    def _fresh_classifier(self):
        """Re-create ``self.clf`` from scratch and return an SGD optimizer for it."""
        self.clf = self.net().to(self.device)
        return optim.SGD(self.clf.parameters(), **self.args['optimizer_args'])

    def _labeled_loader(self):
        """Build the shuffled training loader over the currently labeled samples."""
        idxs_train = np.arange(self.n_pool)[self.idxs_lb]
        dataset = self.handler(self.X[idxs_train], self.Y[idxs_train],
                               transform=self.args['transform'])
        return DataLoader(dataset, shuffle=True, **self.args['loader_tr_args'])

    def _eval_loader(self, X, Y):
        """Build the ordered evaluation loader over ``(X, Y)``."""
        dataset = self.handler(X, Y, transform=self.args['transform'])
        return DataLoader(dataset, shuffle=False, **self.args['loader_te_args'])

    def _collect_probs(self, X, Y, logits_to_prob):
        """Evaluate the classifier on ``(X, Y)`` and gather per-class scores.

        The number of columns is taken from the model output rather than from
        the distinct values in ``Y``: a label subset that happens to miss a
        class would otherwise allocate too few columns and the row assignment
        would fail with a shape mismatch.
        """
        loader_te = self._eval_loader(X, Y)

        self.clf.eval()
        probs = None
        with torch.no_grad():
            for x, y, idxs in loader_te:
                x, y = x.to(self.device), y.to(self.device)
                out, e1 = self.clf(x)
                prob = logits_to_prob(out).cpu()
                if probs is None:
                    probs = torch.zeros([len(Y), prob.shape[1]])
                probs[idxs] = prob

        if probs is None:
            # No batches at all (empty input): keep returning an empty 2-D tensor.
            probs = torch.zeros([len(Y), 0])
        return probs

    def train(self):
        """Train a freshly initialised classifier on the labeled samples."""
        n_epoch = self.args['n_epoch']
        optimizer = self._fresh_classifier()
        loader_tr = self._labeled_loader()

        for epoch in range(1, n_epoch+1):
            self._train(epoch, loader_tr, optimizer)

    def train_irt(self, alpha, beta, a,  b):
        """Like :meth:`train`, but each epoch uses :meth:`_train_irt`."""
        n_epoch = self.args['n_epoch']
        optimizer = self._fresh_classifier()
        loader_tr = self._labeled_loader()

        for epoch in range(1, n_epoch+1):
            self._train_irt(epoch, loader_tr, optimizer, alpha, beta, a,  b)

    def predict(self, X, Y):
        """Return the arg-max class prediction for every sample, with ``Y``'s dtype."""
        loader_te = self._eval_loader(X, Y)

        self.clf.eval()
        P = torch.zeros(len(Y), dtype=Y.dtype)
        with torch.no_grad():
            for x, y, idxs in loader_te:
                x, y = x.to(self.device), y.to(self.device)
                out, e1 = self.clf(x)

                pred = out.max(1)[1]
                P[idxs] = pred.cpu()

        return P

    def predict_prob(self, X, Y):
        """Return the softmax class probabilities, one row per sample."""
        return self._collect_probs(X, Y, lambda out: F.softmax(out, dim=1))

    def predict_prob_irt(self, X, Y, alpha, beta, a,  b):
        """Return ``a + (b-a)*softmax(alpha*(logits-beta))``, one row per sample."""
        return self._collect_probs(
            X, Y, lambda out: a + (b-a)*F.softmax(alpha*(out-beta), dim=1))

    def get_embedding(self, X, Y):
        """Return the model's embedding (second ``forward`` output) for every sample."""
        loader_te = self._eval_loader(X, Y)

        self.clf.eval()
        embedding = torch.zeros([len(Y), self.clf.get_embedding_dim()])
        with torch.no_grad():
            for x, y, idxs in loader_te:
                x, y = x.to(self.device), y.to(self.device)
                out, e1 = self.clf(x)
                embedding[idxs] = e1.cpu()

        return embedding

In [27]:

class EntropySampling(Strategy):
    """Query the unlabeled samples whose predicted distribution has the highest entropy.

    The constructor is inherited unchanged from ``Strategy``.
    """

    def query(self, n):
        """Return the pool indices of the (at most) ``n`` highest-entropy unlabeled samples."""
        idxs_unlabeled = np.arange(self.n_pool)[~self.idxs_lb]
        probs = self.predict_prob(self.X[idxs_unlabeled], self.Y[idxs_unlabeled])
        # Clamp before the log: a probability that underflowed to exactly 0
        # would otherwise give 0 * -inf = NaN and poison the whole row's score.
        log_probs = torch.log(probs.clamp(min=torch.finfo(probs.dtype).tiny))
        # U is the negative entropy, so ascending order puts the most
        # uncertain samples first.
        U = (probs*log_probs).sum(1)
        # Convert the index tensor explicitly; NumPy may treat a one-element
        # tensor as a scalar index and return a scalar instead of an array.
        return idxs_unlabeled[U.sort()[1][:n].numpy()]

class LeastConfidence(Strategy):
    """Query the unlabeled samples whose top predicted probability is lowest.

    The constructor is inherited unchanged from ``Strategy``.
    """

    def query(self, n):
        """Return the pool indices of the (at most) ``n`` least confident unlabeled samples."""
        idxs_unlabeled = np.arange(self.n_pool)[~self.idxs_lb]
        probs = self.predict_prob(self.X[idxs_unlabeled], self.Y[idxs_unlabeled])
        # Confidence = probability of the predicted class; ascending order
        # puts the least confident samples first.
        U = probs.max(1)[0]
        # Convert the index tensor explicitly; NumPy may treat a one-element
        # tensor as a scalar index and return a scalar instead of an array.
        return idxs_unlabeled[U.sort()[1][:n].numpy()]


class InformationCapacity(Strategy):
    """Query strategy that ranks unlabeled samples by a Fisher-information-style score.

    ``alpha``, ``beta``, ``a`` and ``b`` parameterise the rescaled softmax
    ``a + (b-a)*softmax(alpha*(logits-beta))`` produced by
    ``predict_prob_irt``.
    """

    def __init__(self, X, Y, idxs_lb, net, handler, args, alpha=1, beta=0, a=0, b=1):
        super(InformationCapacity, self).__init__(X, Y, idxs_lb, net, handler, args)
        self.alpha = alpha
        self.beta = beta
        self.a = a
        self.b = b

    def query(self, n):
        """Return the pool indices of the (at most) ``n`` unlabeled samples with the smallest score."""
        idxs_unlabeled = np.arange(self.n_pool)[~self.idxs_lb]
        probs = self.predict_prob_irt(self.X[idxs_unlabeled], self.Y[idxs_unlabeled], self.alpha, self.beta, self.a, self.b)
        # d_probs algebraically equals alpha*(p-a)*(b-p)/(b-a); the original
        # operation order is kept so results stay bit-for-bit reproducible.
        # NOTE(review): divides by zero when a == b or a == 1.
        temp1 = (probs-self.a)*self.alpha
        temp2 = (1-probs)*(self.b-self.a)
        temp3 = (probs-self.a)*(1-self.b)
        temp4 = (self.b-self.a)*(1-self.a)
        d_probs = temp1*(temp2-temp3)/temp4
        # NOTE(review): with the defaults a=0, b=1 a probability of exactly
        # 0 or 1 makes this 0/0 = NaN; confirm how such rows should rank.
        J = d_probs**2/(probs*(1-probs))
        U = J.max(1)[0]
        # NOTE(review): the sort is ascending, so the samples with the
        # *smallest* per-sample maximum of J are returned - confirm intended.
        # Convert the index tensor explicitly; NumPy may treat a one-element
        # tensor as a scalar index and return a scalar instead of an array.
        return idxs_unlabeled[U.sort()[1][:n].numpy()]


In [28]:

def run_experiment(strategy_name, NUM_RUN, alpha, beta, a, b):
    """Run ``NUM_RUN`` independent active-learning runs and save the accuracies.

    Reads these notebook globals: ``DATA_NAME``, ``SEED``, ``n_pool``,
    ``NUM_INIT_LB``, ``NUM_QUERY``, ``NUM_ROUND``, ``X_tr``, ``Y_tr``,
    ``X_te``, ``Y_te``, ``net``, ``handler`` and ``args``.

    Parameters
    ----------
    strategy_name : str
        ``'EntropySampling'``, ``'LeastConfidence'`` or ``'InformationCapacity'``.
    NUM_RUN : int
        Number of independent runs (at least 1).
    alpha, beta, a, b
        Passed to ``InformationCapacity`` only; ignored by the other strategies.

    Returns
    -------
    (pandas.DataFrame, str)
        One column per run holding the test accuracy after rounds
        ``1..NUM_ROUND`` (round 0 is printed but not stored), and the name of
        the strategy class.  The same frame is written to
        ``<strategy>_<DATA_NAME>_rnd_<NUM_ROUND>_run_<NUM_RUN>_lb_<NUM_INIT_LB>.csv``.

    Raises
    ------
    ValueError
        If ``strategy_name`` is unknown or ``NUM_RUN < 1``.  Both cases used
        to surface as an ``UnboundLocalError`` on ``strategy`` further down.
    """
    known_strategies = ('EntropySampling', 'LeastConfidence', 'InformationCapacity')
    if strategy_name not in known_strategies:
        raise ValueError('Unknown strategy {!r}; expected one of {}'.format(
            strategy_name, ', '.join(known_strategies)))
    if NUM_RUN < 1:
        raise ValueError('NUM_RUN must be at least 1, got {!r}'.format(NUM_RUN))

    res_to_df0 = []

    # print info
    print(DATA_NAME)
    print('SEED {}'.format(SEED))

    for run_ in range(1, NUM_RUN+1):
        print('RUN {}'.format(run_))

        # generate initial labeled pool
        idxs_lb = np.zeros(n_pool, dtype=bool)
        idxs_tmp = np.arange(n_pool)
        np.random.shuffle(idxs_tmp)
        idxs_lb[idxs_tmp[:NUM_INIT_LB]] = True

        if strategy_name == 'EntropySampling':
            strategy = EntropySampling(X_tr, Y_tr, idxs_lb, net, handler, args)
        elif strategy_name == 'LeastConfidence':
            strategy = LeastConfidence(X_tr, Y_tr, idxs_lb, net, handler, args)
        else:  # 'InformationCapacity' (name validated above)
            strategy = InformationCapacity(X_tr, Y_tr, idxs_lb, net, handler, args, alpha, beta, a, b)

        print(type(strategy).__name__)

        # round 0 accuracy (every strategy is trained with the plain `train`)
        strategy.train()
        P = strategy.predict(X_te, Y_te)
        acc = np.zeros(NUM_ROUND+1)
        acc[0] = 1.0 * (Y_te==P).sum().item() / len(Y_te)
        print('Round 0\ntesting accuracy {}'.format(acc[0]))

        res_to_l = []
        for rd in range(1, NUM_ROUND+1):
            print('Round {}'.format(rd))

            # query
            q_idxs = strategy.query(NUM_QUERY)
            idxs_lb[q_idxs] = True

            # update, then retrain from scratch on the enlarged labeled set
            strategy.update(idxs_lb)
            strategy.train()

            # round accuracy
            P = strategy.predict(X_te, Y_te)
            acc[rd] = 1.0 * (Y_te==P).sum().item() / len(Y_te)
            print('testing accuracy {}'.format(acc[rd]))
            res_to_l.append(acc[rd])
        df = pd.DataFrame(res_to_l, columns=[type(strategy).__name__ + '_run_' + str(run_)])
        res_to_df0.append(df)

    # accuracies of the last run, including round 0
    print(acc)
    res_to_df0 = pd.concat(res_to_df0, axis=1)
    res_to_df0.to_csv(type(strategy).__name__ + '_' + DATA_NAME + '_rnd_' + str(NUM_ROUND) + '_run_' + str(NUM_RUN) + '_lb_' +str(NUM_INIT_LB) + '.csv', index=False)

    return res_to_df0, type(strategy).__name__


# plots
def plot_round(ax, strategy, num_run, num_round, NUM_INIT_LB, DATA_NAME, minx, maxx, miny, maxy, color, color_back, linestyle, label):
    """Plot the mean accuracy per round (with a +/- 1 std band) of one strategy.

    The results are read from
    ``<strategy>_<DATA_NAME>_rnd_<num_round>_run_<num_run>_lb_<NUM_INIT_LB>.csv``,
    whose columns are treated as runs and whose rows are treated as rounds.

    Parameters
    ----------
    ax
        Matplotlib axes to draw on.
    strategy, DATA_NAME, num_run, num_round, NUM_INIT_LB
        Identify the results file; ``num_round`` also sets the x-axis length.
    minx, maxx, miny, maxy
        Axis limits.
    color, color_back, linestyle, label
        Line color, band fill color, line style and legend label.
    """
    # Build the file name from the arguments.  It used to read the notebook
    # globals NUM_ROUND / NUM_RUN, silently ignoring num_round / num_run.
    csv_path = strategy + '_' + DATA_NAME + '_rnd_' + str(num_round) + '_run_' + str(num_run) + '_lb_' + str(NUM_INIT_LB) + '.csv'
    df = pd.read_csv(csv_path)
    df_mean = df.mean(axis = 1)
    df_std = df.std(axis = 1)

    x_axis = np.linspace(1, num_round, num_round)
    ax.plot(x_axis, df_mean, color = color, linestyle=linestyle, linewidth = 1.25, label = label)
    # `where=True` (a scalar) was dropped: omitting `where` fills everywhere,
    # and current Matplotlib rejects a `where` whose size differs from x.
    # `interpolate` only matters together with `where`, so it is dropped too.
    ax.fill_between(x_axis, df_mean + df_std, df_mean-df_std, edgecolor=color, facecolor=color_back, alpha = 0.3)

    ax.legend(loc = 'lower right', prop={'size': 16})
    ax.grid()
    ax.set_xlim([minx, maxx])
    ax.set_ylim([miny, maxy])

    ax.set_xlabel('round', size=16)
    ax.set_ylabel('accuracy', size=16)

    # Turn on the minor TICKS, which are required for the minor GRID
    ax.minorticks_on()

    # Customize the major grid
    ax.grid(which='major', linestyle='-', linewidth=0.25, color='k')

    ax.set_title('The number of labeled pool = '+str(NUM_INIT_LB), size=18)

# summary statistics
def stats_round(strategy, num_run, num_round, NUM_INIT_LB, DATA_NAME):
    """Summarise one results file as two scalars.

    The file
    ``<strategy>_<DATA_NAME>_rnd_<num_round>_run_<num_run>_lb_<NUM_INIT_LB>.csv``
    is read; for every row the mean and standard deviation across columns
    are computed, and both series are then averaged over the rows.

    Returns
    -------
    (float, float)
        ``(res_mean, res_std)``: the average of the per-row means and the
        average of the per-row standard deviations.
    """
    # Build the file name from the arguments.  It used to read the notebook
    # globals NUM_ROUND / NUM_RUN, silently ignoring num_round / num_run.
    csv_path = strategy + '_' + DATA_NAME + '_rnd_' + str(num_round) + '_run_' + str(num_run) + '_lb_' + str(NUM_INIT_LB) + '.csv'
    df = pd.read_csv(csv_path)
    df_mean = df.mean(axis = 1)
    df_std = df.std(axis = 1)

    res_mean = df_mean.mean()
    res_std = df_std.mean()

    return res_mean, res_std



In [29]:
# parameters
# Every name defined in this cell is a notebook global read by later cells
# (run_experiment in particular), so this cell must run before them.
SEED = 1

# Initial labeled-pool size, samples queried per round, number of rounds.
NUM_INIT_LB = 100
NUM_QUERY = 100
NUM_ROUND = 50

#DATA_NAME = 'MNIST'
DATA_NAME = 'FashionMNIST'


# Per-dataset training configuration: epochs per training call, input
# transform, DataLoader kwargs for training / evaluation, and SGD kwargs.
# NOTE(review): the FashionMNIST entry uses the same Normalize constants as
# the MNIST entry - confirm this is intended.
args_pool = {'MNIST':
                {'n_epoch': 10, 'transform': transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]),
                 'loader_tr_args':{'batch_size': 64, 'num_workers': 1},
                 'loader_te_args':{'batch_size': 1000, 'num_workers': 1},
                 'optimizer_args':{'lr': 0.01, 'momentum': 0.5}},
            'FashionMNIST':
                {'n_epoch': 10, 'transform': transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]),
                 'loader_tr_args':{'batch_size': 64, 'num_workers': 1},
                 'loader_te_args':{'batch_size': 1000, 'num_workers': 1},
                 'optimizer_args':{'lr': 0.01, 'momentum': 0.5}}
            }

args = args_pool[DATA_NAME]

# set seed
# Seeds NumPy (used for the initial labeled-pool shuffle) and PyTorch, and
# switches cuDNN off.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.enabled = False

# load dataset
# Only the first 10000 training samples form the pool; the test split is
# used in full.
X_tr_, Y_tr_, X_te, Y_te = get_dataset(DATA_NAME)
X_tr = X_tr_[:10000]
Y_tr = Y_tr_[:10000]

# start experiment
n_pool = len(Y_tr)
n_test = len(Y_te)
print('Number of labeled pool: {}'.format(NUM_INIT_LB))
print('Number of unlabeled pool: {}'.format(n_pool - NUM_INIT_LB))
print('Number of testing pool: {}'.format(n_test))

# load network
# `net` and `handler` are classes, not instances; Strategy instantiates them.
net = get_net(DATA_NAME)
handler = get_handler(DATA_NAME)


Number of labeled pool: 100
Number of unlabeled pool: 9900
Number of testing pool: 10000


#### Run Experiments (.csv files)


In [None]:
# Run a single experiment with the Information Capacity strategy.
# NOTE(review): the plotting / statistics cells below read result files
# named with run_10, whereas this cell writes a run_1 file.
NUM_RUN = 1
strategy = 'InformationCapacity'

# Parameters forwarded to the InformationCapacity strategy.
alpha = 0.25
beta = 4
a = 0.1
b = 0.9

# run_experiment writes the per-round accuracies to a CSV file and returns
# them; `strategy` is rebound to the strategy class name it returns.
res_to_df0, strategy = run_experiment(strategy, NUM_RUN, alpha, beta, a, b)



#### Plots (quantitative results)


In [None]:
# Compare the three strategies on MNIST, one panel per initial labeled-pool
# size.  Requires NUM_ROUND from the parameters cell and the matching
# result CSV files (10 runs each) on disk.
strategy1 = 'InformationCapacity'
strategy2 = 'EntropySampling'
strategy3 = 'LeastConfidence'


DATA_NAME = 'MNIST'

fig, axes = plt.subplots(1, 3, figsize=[20,6], squeeze=False)
matplotlib.rc('xtick', labelsize=14)
matplotlib.rc('ytick', labelsize=14)

# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in newer
# Matplotlib releases and the old names were later removed; use whichever
# name this installation provides.
for _style_name in ('seaborn-v0_8-dark-palette', 'seaborn-dark-palette'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

minx = 1
maxx = NUM_ROUND
miny1 = 0.05
miny2 = 0.3
miny3 = 0.65
maxy = 1.0
NUM_RUN = 10
NUM_INIT_LB1 = 100
NUM_INIT_LB2 = 500
NUM_INIT_LB3 = 1000

# (strategy, line color, band color, line style, legend label)
_curves = [
    (strategy1, '#e41a1c', '#fbb4ae', '-', 'Information capacity'),
    (strategy2, '#008837', '#a6dba0', ':', 'Entropy sampling'),
    (strategy3, '#377eb8', '#b3cde3', ':', 'Least Confidence'),
]
# (initial labeled-pool size, lower y-limit, panel tag)
_panels = [
    (NUM_INIT_LB1, miny1, '(a)'),
    (NUM_INIT_LB2, miny2, '(b)'),
    (NUM_INIT_LB3, miny3, '(c)'),
]

for _ax, (_num_init_lb, _miny, _tag) in zip(axes[0], _panels):
    for _strategy, _color, _color_back, _linestyle, _label in _curves:
        plot_round(_ax, _strategy, NUM_RUN, NUM_ROUND, _num_init_lb, DATA_NAME, minx, maxx, _miny, maxy, _color, _color_back, _linestyle, _label)
    _ax.text(-0.1, 0.98, _tag, size=16, horizontalalignment='right', verticalalignment='bottom', transform=_ax.transAxes)


fig.tight_layout()

#fig.savefig(DATA_NAME + '_result.pdf', format='pdf', dpi=1000, bbox_inches='tight')

# plt.show() instead of fig.show(): the latter only warns on non-GUI
# backends such as the inline one.
plt.show()



In [None]:
# Compare the three strategies on FashionMNIST, one panel per initial
# labeled-pool size.  Requires NUM_ROUND from the parameters cell and the
# matching result CSV files (10 runs each) on disk.
strategy1 = 'InformationCapacity'
strategy2 = 'EntropySampling'
strategy3 = 'LeastConfidence'

DATA_NAME = 'FashionMNIST'

fig, axes = plt.subplots(1, 3, figsize=[20,6], squeeze=False)
matplotlib.rc('xtick', labelsize=14)
matplotlib.rc('ytick', labelsize=14)

# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in newer
# Matplotlib releases and the old names were later removed; use whichever
# name this installation provides.
for _style_name in ('seaborn-v0_8-dark-palette', 'seaborn-dark-palette'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

minx = 1
maxx = NUM_ROUND
miny1 = 0.05
miny2 = 0.25
miny3 = 0.5
maxy = 0.8
NUM_RUN = 10
NUM_INIT_LB1 = 100
NUM_INIT_LB2 = 500
NUM_INIT_LB3 = 1000

# (strategy, line color, band color, line style, legend label)
_curves = [
    (strategy1, '#e41a1c', '#fbb4ae', '-', 'Information capacity'),
    (strategy2, '#008837', '#a6dba0', ':', 'Entropy sampling'),
    (strategy3, '#377eb8', '#b3cde3', ':', 'Least Confidence'),
]
# (initial labeled-pool size, lower y-limit, panel tag)
_panels = [
    (NUM_INIT_LB1, miny1, '(a)'),
    (NUM_INIT_LB2, miny2, '(b)'),
    (NUM_INIT_LB3, miny3, '(c)'),
]

for _ax, (_num_init_lb, _miny, _tag) in zip(axes[0], _panels):
    for _strategy, _color, _color_back, _linestyle, _label in _curves:
        plot_round(_ax, _strategy, NUM_RUN, NUM_ROUND, _num_init_lb, DATA_NAME, minx, maxx, _miny, maxy, _color, _color_back, _linestyle, _label)
    _ax.text(-0.1, 0.98, _tag, size=16, horizontalalignment='right', verticalalignment='bottom', transform=_ax.transAxes)


fig.tight_layout()

#fig.savefig(DATA_NAME + '_result.pdf', format='pdf', dpi=1000, bbox_inches='tight')

# plt.show() instead of fig.show(): the latter only warns on non-GUI
# backends such as the inline one.
plt.show()



#### Accuracy values over rounds (quantitative results)

In [30]:
# Settings that select which result files are summarised.
NUM_INIT_LB = 1000
NUM_ROUND = 50
NUM_RUN = 10

strategy1 = 'InformationCapacity'
strategy2 = 'EntropySampling'
strategy3 = 'LeastConfidence'


DATA_NAME = 'MNIST'

# Each section reads one strategy's result file, keeps the two summary
# values under strategy-specific names and prints them.

# Information Capacity
df_IC_mean, df_IC_std = stats_round(strategy1, NUM_RUN, NUM_ROUND, NUM_INIT_LB, DATA_NAME)
res_mean, res_std = df_IC_mean, df_IC_std
print("Information Capacity:")
print(res_mean, res_std)

# Entropy Sampling
df_ES_mean, df_ES_std = stats_round(strategy2, NUM_RUN, NUM_ROUND, NUM_INIT_LB, DATA_NAME)
res_mean, res_std = df_ES_mean, df_ES_std
print("Entropy Sampling:")
print(res_mean, res_std)

# Least Confidence
df_LC_mean, df_LC_std = stats_round(strategy3, NUM_RUN, NUM_ROUND, NUM_INIT_LB, DATA_NAME)
res_mean, res_std = df_LC_mean, df_LC_std
print("Least Confidence:")
print(res_mean, res_std)

Information Capacity:
0.9152993999999999 0.011759046712358476
Entropy Sampling:
0.9133281999999998 0.014076564919604784
Least Confidence:
0.9156454 0.012552382318871598


#### Test statistics (quantitative results)

In [33]:
from scipy.stats import wilcoxon

# Settings that select which result files are compared.
NUM_INIT_LB = 100
NUM_ROUND = 50
NUM_RUN = 10

strategy1 = 'InformationCapacity'
strategy2 = 'EntropySampling'
strategy3 = 'LeastConfidence'


DATA_NAME = 'MNIST'

# One results file per strategy, named the same way run_experiment writes
# them; columns are treated as runs and rows as rounds below.
df1 = pd.read_csv(strategy1 + '_' + DATA_NAME + '_rnd_' + str(NUM_ROUND) + '_run_' + str(NUM_RUN) + '_lb_' +str(NUM_INIT_LB) + '.csv')
df2 = pd.read_csv(strategy2 + '_' + DATA_NAME + '_rnd_' + str(NUM_ROUND) + '_run_' + str(NUM_RUN) + '_lb_' +str(NUM_INIT_LB) + '.csv')
df3 = pd.read_csv(strategy3 + '_' + DATA_NAME + '_rnd_' + str(NUM_ROUND) + '_run_' + str(NUM_RUN) + '_lb_' +str(NUM_INIT_LB) + '.csv')



# Row-wise mean and standard deviation across the columns of each file.
df1_mean = df1.mean(axis = 1)
df2_mean = df2.mean(axis = 1)
df3_mean = df3.mean(axis = 1)
df1_std = df1.std(axis = 1)
df2_std = df2.std(axis = 1)
df3_std = df3.std(axis = 1)

In [56]:
# One-sided Wilcoxon signed-rank test on the paired row-wise means of
# strategy1 (df1_mean) and strategy2 (df2_mean).
# alternative options: less or greater

w, p = wilcoxon(df1_mean, df2_mean, alternative='less', correction=True)
# Display the test statistic and the p-value as the cell output.
w, p

(942.0, 0.9983814874703646)