In [None]:
import numpy as np
import random
import sklearn
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

import data
from algs import *

In [None]:
# ---------------------------------------------------------------------------
# Experiment configuration: everything a reader may want to tune lives here.
# ---------------------------------------------------------------------------

# Corpus to load ('20newsgroups' selects the 20 Newsgroups loader).
dataset_name = '20newsgroups'

# Categories forming the two sides of the binary classification task.
class_0 = ['rec.motorcycles']
class_1 = ['sci.space']

# Sorted union of both sides; this is what the dataset loader receives.
categories = sorted(class_0 + class_1)

# Vocabulary pruning. Words appearing in more than `sparsity_b1` or in fewer
# than `sparsity_b2` of the documents (both given as fractions) are dropped,
# after which only the `num_words` most frequent words are kept.
sparsity_b1 = 0.03
sparsity_b2 = 0.002
num_words = 200

# `expansion` is the mode flag forwarded to classification(); `train_ratio`
# is forwarded to train_test_split() (presumably the labeled fraction of each
# class -- TODO confirm against algs).
expansion = 'l'
train_ratio = 0.7

# Seed every global generator the experiment may draw from, so that
# "Restart Kernel -> Run All" reproduces the same splits regardless of whether
# the helpers sample through `random` or through `numpy.random`.
random.seed(0)
np.random.seed(0)

In [None]:
# Instantiate the corpus selected by `dataset_name`, restricted to `categories`.
print('--- Load the dataset. ---')
if dataset_name == '20newsgroups':
    dataset = data.Text20News(subset='all', categories=categories, remove=(), shuffle=True, random_state=42)
elif dataset_name == 'rcv1':
    dataset = data.TextRCV1(data_dir='./data/RCV1', subset='all', categories=categories)
else:
    # A misspelled name used to fall through to the RCV1 loader silently;
    # fail loudly instead so the wrong corpus is never analysed by accident.
    raise ValueError(
        "Unknown dataset_name {!r}; expected '20newsgroups' or 'rcv1'.".format(dataset_name)
    )

# Pre-processing pipeline. Every step is called for its effect on `dataset`
# (no return value is used), so the order below is significant.

# 1. Normalise the raw text.
print('\n--- Transform text to a-z (lowercase) and (single) whitespace. ---')
dataset.clean_text(num='substitute')

# 2. Build word counts, ignoring English stop words.
print('\n-- Count words. ---')
dataset.vectorize(stop_words='english')

# 3. Drop documents that are too short with respect to the full vocabulary.
print('\n--- Remove documents containing less than 20 words. ---')
dataset.remove_short_documents(nwords=20, vocab='full')

# 4. Drop documents that carry encoded images.
print('\n--- Remove documents containing images. ---')
dataset.remove_encoded_images()

# 5. Prune words by document frequency (thresholds are fractions of documents).
print('\n--- Remove words appearing in more than {} percent and less than {} percent documents. ---'.format(sparsity_b1*100, sparsity_b2*100))
dataset.remove_frequent_words(sparsity_b1=sparsity_b1, sparsity_b2=sparsity_b2)

# 6. Restrict the vocabulary to the most frequent words.
#    NOTE(review): the meaning of the second positional argument (10) is not
#    visible from this notebook -- confirm in the data module.
print('\n--- Keep top ' + str(num_words) + ' frequent words. ---')
dataset.keep_top_words(num_words, 10)

# 7. Drop documents left without any word of the selected vocabulary.
print('\n--- Remove documents containing less than 1 (selected) words. ---')
dataset.remove_short_documents(nwords=1, vocab='selected')

# 8. Compute the tf-idf weights (the log message previously read "tf-dif").
print('\n--- Compute tf-idf. ---\n')
dataset.compute_tfidf()

# Summary of the resulting corpus, including the per-class breakdown.
dataset.data_info(show_classes=True)

# Dense float32 tf-idf matrix laid out as (number of words) x (number of documents);
# n_e is therefore the word count and n_v the document count.
tfidf = dataset.tfidf.astype(np.float32).transpose().toarray()
n_e, n_v = tfidf.shape

In [None]:
# Collapse the dataset's own labels into a binary target:
# 0 when the document's category is listed in class_0, 1 otherwise.
index2class = dict(enumerate(dataset.class_names))
true_classes = np.array(
    [0 if index2class[label] in class_0 else 1 for label in dataset.labels]
)

# Document indices belonging to each side, and how many there are of each.
c0_index = list(np.flatnonzero(true_classes == 0))
c1_index = list(np.flatnonzero(true_classes == 1))
c0_size, c1_size = len(c0_index), len(c1_index)

In [None]:
# Build the incidence and parameter lists from the tf-idf matrix
# (build_hg comes from algs; para_p is handed to it unchanged).
para_p = 1
incidence_list, parameter_list = build_hg(tfidf, para_p)

# For each of the n_e entries of parameter_list: smallest value divided by
# the sum of all its values.
b = [
    np.min(weights) / np.sum(weights)
    for weights in (parameter_list[e_i] for e_i in range(n_e))
]

# Displayed as the cell output: the smallest of those ratios.
min(b)

In [None]:
# Candidate alpha values: 25 log-spaced points between 1e-4 and 1, with the
# three smallest and the two largest discarded (20 values remain).
alpha_list = np.logspace(start=-4, stop=0, num=25)[3:23]
n_alpha = alpha_list.shape[0]

print(alpha_list)

In [None]:
# Repeat the experiment over n_iter labeled/unlabeled splits.
# acc[r, k] stores the accuracy measured on the unlabeled documents for
# realization r and alpha_list[k].
n_iter = 10
acc = np.zeros((n_iter, n_alpha))

for i_iter in range(n_iter):
    print('i_iter =', i_iter)
    # Fresh split for this realization.
    c0_labeled, c1_labeled, unlabeled = train_test_split(
        c0_index, c1_index, c0_size, c1_size, n_v, train_ratio
    )

    # Disabled alternative run with mode 'h' (no alpha argument):
    # pred_classes = classification(incidence_list, parameter_list, n_v, n_e, c0_labeled, c1_labeled, 'h')
    # acc_h = accuracy_score(true_classes[unlabeled], pred_classes[unlabeled])
    # print(acc_h)

    # Sweep every alpha on the same split.
    for i_alpha, alpha in enumerate(alpha_list):
        pred_classes = classification(
            incidence_list, parameter_list, n_v, n_e,
            c0_labeled, c1_labeled, expansion, alpha,
        )
        acc[i_iter, i_alpha] = accuracy_score(
            true_classes[unlabeled], pred_classes[unlabeled]
        )
        print(acc[i_iter, i_alpha])

# Disabled: persist the accuracy table to disk.
# file_name = 'results/'+dataset_name+'/'+expansion+'_'+str(train_ratio)+'_'+categories[0]+'_'+categories[1]+'_'+str(sparsity_b1)+'_'+str(num_words)
# np.save(file_name+'_alpha', acc)


In [None]:
# Mean accuracy per alpha across realizations, with one standard deviation
# as the error bar, drawn on a logarithmic alpha axis.
x = alpha_list
y = acc.mean(axis=0)
yerr = acc.std(axis=0)

myfig, ax = plt.subplots()
ax.errorbar(x, y, yerr)
ax.set_xlabel('alpha')
ax.set_ylabel('Classification accuracy')
ax.set_xscale('log')
ax.grid()

# Disabled: write the figure to disk.
# fig_name = 'results/'+dataset_name+'/temp/'+expansion+'_'+str(train_ratio)+'_'+categories[0]+'_'+categories[1]+'_'+str(sparsity_b1)+'_'+str(num_words)
# myfig.savefig(fig_name+'_alpha.png')
