In [1]:
import numpy as np
from scipy.io import savemat
import data

In [2]:
# Binary classification setup: one list of newsgroup category names per class.
class_0 = ['rec.motorcycles']
class_1 = ['rec.sport.hockey']
# Alphabetically ordered union of both sides, handed to the dataset loader.
categories = sorted([*class_0, *class_1])

# Vocabulary-pruning settings.
# Per the progress message printed before `remove_frequent_words`, both
# sparsity values are fractions of documents: b1 is the upper bound and
# b2 the lower bound on how many documents a word may appear in.
num_words = 100
sparsity_b1 = 0.1
sparsity_b2 = 0.002

In [None]:
# Preprocessing pipeline: load the selected newsgroups, clean and vectorize
# the text, prune documents and vocabulary, then compute tf-idf weights.
# The return values of the `dataset` methods are not used, so each step
# presumably updates `dataset` in place; keep the steps in this order.

print('--- Load the dataset. ---')
dataset = data.Text20News(subset='all', categories=categories, remove=(), shuffle=True, random_state=42)

print('\n--- Transform text to a-z (lowercase) and (single) whitespace. ---')
dataset.clean_text(num='substitute')

print('\n--- Count words. ---')
dataset.vectorize(stop_words='english')

print('\n--- Remove documents containing less than 20 words. ---')
dataset.remove_short_documents(nwords=20, vocab='full')

print('\n--- Remove documents containing images. ---')
dataset.remove_encoded_images()

print('\n--- Remove words appearing in more than {} percent and less than {} percent documents. ---'.format(sparsity_b1*100, sparsity_b2*100))
dataset.remove_frequent_words(sparsity_b1=sparsity_b1, sparsity_b2=sparsity_b2)

print('\n--- Keep top {} frequent words. ---'.format(num_words))
# NOTE(review): the meaning of the second positional argument (10) is not
# visible in this notebook -- confirm against data.Text20News.keep_top_words.
dataset.keep_top_words(num_words, 10)

print('\n--- Remove documents containing less than 5 (selected) words. ---')
dataset.remove_short_documents(nwords=5, vocab='selected')

print('\n--- Compute tf-idf. ---')
dataset.compute_tfidf()

dataset.data_info(show_classes=True)

# Densify the tf-idf matrix and transpose it so that rows are words and
# columns are documents.
tfidf = dataset.tfidf.astype(float).T.toarray()  # size: (num of words) x (num of documents)

In [None]:
# Collapse the dataset's category labels into a binary target vector:
# 0 when the document's category name is listed in class_0, 1 otherwise
# (treated as class_1).
index2class = dict(enumerate(dataset.class_names))
true_classes = np.array(
    [0 if index2class[label] in class_0 else 1 for label in dataset.labels]
)

In [None]:
# Bundle the tf-idf matrix ('R') and the binary labels ('y') and write them
# to a MATLAB .mat file in the current working directory.
data_mat = dict(R=tfidf, y=true_classes)
savemat('20news_10.mat', data_mat)