In [None]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.metrics import accuracy_score

import data
from algs import *

In [None]:
# --- Experiment configuration ---
# Name of the corpus to load; it is also a component of the results path.
dataset_name = '20newsgroups'
class_0 = ['rec.motorcycles']  # categories mapped to label 0
class_1 = ['sci.space']        # categories mapped to label 1

# Sorted so that categories[0] / categories[1] are in a stable order.
categories = sorted(class_0 + class_1)

# Vocabulary filtering: thresholds handed to dataset.remove_frequent_words
# and the number of words handed to dataset.keep_top_words.
sparsity_b1 = 0.03
sparsity_b2 = 0.002
num_words = 200

expansion = 'l'  # forwarded to classification(); also prefixes the result file name
n_cval = 5       # number of cross-validation folds

# Seed both global generators. The splitting helpers come from `algs` and are
# not visible here, so seeding only `random` would leave any NumPy-based
# randomness irreproducible across kernel restarts.
random.seed(0)
np.random.seed(0)

In [None]:
# Load the selected corpus, clean and filter it, and export a dense tf-idf
# matrix for the hypergraph construction.
print('--- Load the dataset. ---')
if dataset_name == '20newsgroups':
    dataset = data.Text20News(subset='all', categories=categories, remove=(), shuffle=True, random_state=42)
elif dataset_name == 'rcv1':
    dataset = data.TextRCV1(data_dir='./data/RCV1', subset='all', categories=categories)
else:
    # Fail loudly instead of silently loading RCV1 for a misspelled name.
    raise ValueError(
        "Unknown dataset_name {!r}; expected '20newsgroups' or 'rcv1'.".format(dataset_name))

print('\n--- Transform text to a-z (lowercase) and (single) whitespace. ---')
dataset.clean_text(num='substitute')

print('\n-- Count words. ---')
dataset.vectorize(stop_words='english')

print('\n--- Remove documents containing less than 20 words. ---')
dataset.remove_short_documents(nwords=20, vocab='full')

print('\n--- Remove documents containing images. ---')
dataset.remove_encoded_images()

print('\n--- Remove words appearing in more than {} percent and less than {} percent documents. ---'.format(sparsity_b1*100, sparsity_b2*100))
dataset.remove_frequent_words(sparsity_b1=sparsity_b1, sparsity_b2=sparsity_b2)

print('\n--- Keep top ' + str(num_words) + ' frequent words. ---')
dataset.keep_top_words(num_words, 10)

# Filtering the vocabulary may leave some documents without any selected word.
print('\n--- Remove documents containing less than 1 (selected) words. ---')
dataset.remove_short_documents(nwords=1, vocab='selected')

print('\n--- Compute tf-idf. ---\n')
dataset.compute_tfidf()

dataset.data_info(show_classes=True)

# Transposed and densified; size: (num of words) x (num of documents).
tfidf = dataset.tfidf.astype(np.float32).T.toarray()
# n_e = number of rows (words), n_v = number of columns (documents);
# presumably hyperedges and vertices of the hypergraph -- confirm against algs.
n_e, n_v = np.shape(tfidf)

In [None]:
# Binary ground truth per document: 0 when the document's category name is
# listed in class_0, 1 otherwise (i.e. class_1).
index2class = dict(enumerate(dataset.class_names))
true_classes = np.array(
    [0 if index2class[label] in class_0 else 1 for label in dataset.labels]
)

# Document indices belonging to each class, and the class sizes.
c0_index = list(np.flatnonzero(true_classes == 0))
c1_index = list(np.flatnonzero(true_classes == 1))
c0_size, c1_size = len(c0_index), len(c1_index)

In [None]:
# Classification experiment over several train ratios and random realizations.
# For each realization the parameter p is selected by n_cval-fold cross
# validation on the labeled documents; accuracy on the unlabeled documents is
# then recorded for the selected p (acc) and for the fixed p = 0 (acc0).
alpha = 0.15  # forwarded unchanged to classification()

para_p_list = np.arange(0, 5.2, 0.2)  # candidate values of p
n_p = len(para_p_list)

train_ratio_list = np.arange(0.1, 0.8, 0.1)
n_tr = len(train_ratio_list)

n_iter = 10  # realizations per train ratio
acc = np.zeros((n_tr, n_iter))   # p selected by cross validation
acc0 = np.zeros((n_tr, n_iter))  # cardinality-based, p=0

# Build the output path and create its directory before the long-running loop:
# np.save does not create missing directories, so a missing results folder
# would otherwise only surface after all the computation has finished.
file_name = 'results/'+dataset_name+'/'+expansion+'_'+categories[0]+'_'+categories[1]+'_'+str(sparsity_b1)+'_'+str(num_words)
os.makedirs(os.path.dirname(file_name), exist_ok=True)

for i_tr in range(n_tr):  # for different train ratio
    print('i_tr =', i_tr)
    train_ratio = train_ratio_list[i_tr]

    for i_iter in range(n_iter):  # for different realizations
        c0_labeled, c1_labeled, unlabeled = train_test_split(c0_index, c1_index, c0_size, c1_size, n_v, train_ratio)
        # Partition the labeled indices of each class into n_cval folds.
        c0_labeled_split = list(map(list, np.array_split(c0_labeled, n_cval)))
        c1_labeled_split = list(map(list, np.array_split(c1_labeled, n_cval)))

        # Validation accuracy for every (fold, p) pair.
        acc_temp = np.zeros((n_cval, n_p))
        for i_cval in range(n_cval):  # for cross validation
            c0_train, c1_train, total_val = train_val_split(c0_labeled_split, c1_labeled_split, n_cval, i_cval)
            for i_p in range(n_p):  # for different p
                para_p = para_p_list[i_p]
                # NOTE(review): build_hg receives only (tfidf, p), so it looks
                # loop-invariant and could be computed once per p -- confirm
                # that build_hg is deterministic and classification() does not
                # mutate its inputs before caching.
                incidence_list, parameter_list = build_hg(tfidf, para_p)
                pred_classes = classification(incidence_list, parameter_list, n_v, n_e, c0_train, c1_train, expansion, alpha)
                acc_temp[i_cval, i_p] = accuracy_score(true_classes[total_val], pred_classes[total_val])

        # 1: p with the best mean validation accuracy, trained on all labeled documents.
        p_opt = para_p_list[np.argmax(np.mean(acc_temp, 0))]
        incidence_list, parameter_list = build_hg(tfidf, p_opt)
        pred_classes = classification(incidence_list, parameter_list, n_v, n_e, c0_labeled, c1_labeled, expansion, alpha)
        acc[i_tr, i_iter] = accuracy_score(true_classes[unlabeled], pred_classes[unlabeled])

        # 2: baseline with p fixed to 0.
        incidence_list, parameter_list = build_hg(tfidf, 0)
        pred_classes = classification(incidence_list, parameter_list, n_v, n_e, c0_labeled, c1_labeled, expansion, alpha)
        acc0[i_tr, i_iter] = accuracy_score(true_classes[unlabeled], pred_classes[unlabeled])

# np.save appends the '.npy' extension to these names.
np.save(file_name+'_p_edvw', acc)
np.save(file_name+'_p_card', acc0)
    

In [None]:
# Mean accuracy over realizations versus train ratio, with the standard
# deviation over realizations as error bars.
x = train_ratio_list
y0 = np.mean(acc0, 1)
y0_err = np.std(acc0, 1)
y = np.mean(acc, 1)
y_err = np.std(acc, 1)

plt.figure()
myfig = plt.gcf()  # handle to the figure just created

# Label both series so the two curves can be told apart in the figure.
plt.errorbar(x, y0, y0_err, label='cardinality-based (p = 0)')
plt.errorbar(x, y, y_err, label='cross-validated p')
plt.xlabel('Train ratio')
plt.ylabel('Classification accuracy')
plt.legend()