In [38]:
from tpbg import TPBG
from sklearn.datasets import fetch_20newsgroups
from util import SimplePreprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import numpy as np

In [39]:
# loading 20 newsgroup dataset
categories = None
remove = ('headers', 'footers', 'quotes')
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)
data_test = fetch_20newsgroups(subset='test', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)

pp = SimplePreprocessing()
x_train = pp.transform(data_train.data)
x_test = pp.transform(data_test.data)
vectorize = TfidfVectorizer()
x_train = vectorize.fit_transform(x_train)
x_test = vectorize.transform(x_test)
y_train = data_train.target
y_test = data_test.target

In [40]:
# create unlabeled set
n = len(y_train)
n_unlabeled = int(n * 0.7)
indices = np.arange(n)
np.random.shuffle(indices)
unlabeled_set = indices[n-n_unlabeled:]
print(f" {len(unlabeled_set)} unlabeled docs from {n} total docs")
# -1 indicate that doc is unlabeled
y_train_real = y_train.copy()
y_train[unlabeled_set] = -1

 3395 unlabeled docs from 11314 total docs


In [41]:
def eval(self):        
    self.create_transduction()    
    y_predicted = self.transduction_[unlabeled_set]    
    y_real = y_train_real[unlabeled_set]    
    print(classification_report(y_predicted, y_real))

In [44]:
k=20
tpbg = TPBG(k, alpha=0.05, beta=0.0001, local_max_itr=5,
                 global_max_itr=5, local_threshold=1e-6, global_threshold=1e-6,
                 save_interval=-1, feature_names=vectorize.get_feature_names(), target_name=data_train.target_names, 
                 silence=False, eval_func=eval)

In [45]:
tpbg.fit(x_train,y_train)

initialing.[]:   : 100%|##########| 59181/59181 [00:41<00:00, 1409.25it/s]
docs processed (itr 0): 100%|##########| 11314/11314 [00:27<00:00, 411.97it/s]


topic 0 [alt.atheism] people, think, like, know, atheist, could, religion, atheism, time, thing
topic 1 [comp.graphics] graphic, file, image, thanks, know, program, anyone, format, like, color
topic 2 [comp.os.ms-windows.misc] window, file, driver, problem, know, thanks, program, anyone, font, card
topic 3 [comp.sys.ibm.pc.hardware] drive, card, scsi, controller, system, monitor, thanks, know, disk, problem
topic 4 [comp.sys.mac.hardware] apple, drive, problem, card, know, simms, thanks, like, monitor, anyone
topic 5 [comp.windows.x] window, server, widget, motif, application, thanks, file, program, display, know
topic 6 [misc.forsale] offer, sale, shipping, please, drive, price, email, condition, interested, like
topic 7 [rec.autos] car, like, know, good, think, people, also, engine, anyone, time
topic 8 [rec.motorcycles] bike, motorcycle, like, know, ride, think, good, time, helmet, well
topic 9 [rec.sport.baseball] game, year, team, player, baseball, think, run, like, pitcher, know


docs processed (itr 1):   0%|          | 32/11314 [00:00<00:35, 314.80it/s]

              precision    recall  f1-score   support

           0       0.25      0.24      0.25       165
           1       0.29      0.27      0.28       166
           2       0.30      0.31      0.30       166
           3       0.39      0.29      0.33       234
           4       0.27      0.24      0.25       181
           5       0.42      0.39      0.41       194
           6       0.36      0.32      0.34       186
           7       0.22      0.27      0.24       152
           8       0.36      0.41      0.38       175
           9       0.47      0.43      0.45       173
          10       0.50      0.49      0.50       196
          11       0.36      0.42      0.39       172
          12       0.26      0.27      0.27       175
          13       0.28      0.36      0.32       132
          14       0.28      0.31      0.29       147
          15       0.31      0.28      0.30       193
          16       0.25      0.26      0.26       165
          17       0.46    

docs processed (itr 1): 100%|##########| 11314/11314 [00:25<00:00, 442.57it/s]
docs processed (itr 2):   0%|          | 0/11314 [00:00<?, ?it/s]

topic 0 [alt.atheism] people, atheist, religion, think, atheism, could, islam, thing, argument, moral
topic 1 [comp.graphics] graphic, file, image, thanks, program, format, know, color, anyone, looking
topic 2 [comp.os.ms-windows.misc] window, file, driver, problem, font, program, thanks, know, anyone, version
topic 3 [comp.sys.ibm.pc.hardware] drive, card, scsi, controller, monitor, disk, system, thanks, problem, port
topic 4 [comp.sys.mac.hardware] apple, drive, problem, simms, card, monitor, thanks, quadra, know, anyone
topic 5 [comp.windows.x] window, server, motif, widget, application, display, thanks, program, file, client
topic 6 [misc.forsale] sale, offer, shipping, price, condition, email, please, interested, asking, sell
topic 7 [rec.autos] car, engine, like, auto, dealer, ford, good, know, think, price
topic 8 [rec.motorcycles] bike, motorcycle, ride, helmet, like, riding, rider, know, good, well
topic 9 [rec.sport.baseball] game, year, team, player, baseball, run, last, thi

docs processed (itr 2): 100%|##########| 11314/11314 [00:23<00:00, 479.01it/s]
docs processed (itr 3):   0%|          | 0/11314 [00:00<?, ?it/s]

topic 0 [alt.atheism] people, atheist, religion, think, atheism, argument, could, moral, thing, islam
topic 1 [comp.graphics] graphic, file, image, thanks, program, format, know, looking, color, anyone
topic 2 [comp.os.ms-windows.misc] window, file, driver, problem, program, thanks, font, version, anyone, using
topic 3 [comp.sys.ibm.pc.hardware] drive, card, scsi, controller, disk, monitor, system, thanks, board, port
topic 4 [comp.sys.mac.hardware] apple, drive, problem, simms, monitor, card, thanks, know, anyone, quadra
topic 5 [comp.windows.x] window, server, motif, widget, application, display, thanks, program, using, client
topic 6 [misc.forsale] sale, offer, shipping, price, condition, email, please, sell, interested, asking
topic 7 [rec.autos] car, engine, like, auto, dealer, ford, good, model, price, know
topic 8 [rec.motorcycles] bike, motorcycle, ride, helmet, riding, like, rider, road, good, know
topic 9 [rec.sport.baseball] game, year, team, player, baseball, run, last, thi

docs processed (itr 3): 100%|##########| 11314/11314 [00:25<00:00, 446.76it/s]


topic 0 [alt.atheism] people, think, religion, atheist, atheism, argument, could, moral, thing, said
topic 1 [comp.graphics] graphic, image, file, thanks, program, format, looking, know, color, anyone
topic 2 [comp.os.ms-windows.misc] window, file, driver, problem, program, thanks, version, font, using, anyone
topic 3 [comp.sys.ibm.pc.hardware] drive, card, scsi, controller, disk, monitor, system, thanks, board, port
topic 4 [comp.sys.mac.hardware] apple, drive, problem, monitor, simms, card, thanks, anyone, know, machine
topic 5 [comp.windows.x] window, server, motif, application, widget, display, thanks, program, using, client
topic 6 [misc.forsale] sale, offer, shipping, price, condition, please, email, interested, sell, asking
topic 7 [rec.autos] car, engine, like, dealer, auto, ford, good, model, price, look
topic 8 [rec.motorcycles] bike, motorcycle, ride, helmet, like, riding, rider, road, right, good
topic 9 [rec.sport.baseball] year, game, team, player, baseball, last, run, th

docs processed (itr 4):   0%|          | 33/11314 [00:00<00:34, 328.59it/s]

              precision    recall  f1-score   support

           0       0.60      0.63      0.61       153
           1       0.66      0.68      0.67       149
           2       0.60      0.63      0.61       163
           3       0.69      0.59      0.64       200
           4       0.63      0.68      0.65       150
           5       0.81      0.76      0.79       191
           6       0.79      0.68      0.73       192
           7       0.67      0.76      0.71       164
           8       0.72      0.81      0.76       175
           9       0.85      0.81      0.83       166
          10       0.86      0.84      0.85       197
          11       0.74      0.88      0.80       171
          12       0.70      0.75      0.72       175
          13       0.85      0.86      0.86       168
          14       0.78      0.80      0.79       156
          15       0.80      0.62      0.70       224
          16       0.79      0.70      0.74       192
          17       0.84    

docs processed (itr 4): 100%|##########| 11314/11314 [00:24<00:00, 453.39it/s]


topic 0 [alt.atheism] people, think, religion, atheist, argument, atheism, could, thing, statement, moral
topic 1 [comp.graphics] graphic, image, file, thanks, program, format, looking, know, anyone, color
topic 2 [comp.os.ms-windows.misc] window, file, driver, problem, version, program, thanks, font, using, anyone
topic 3 [comp.sys.ibm.pc.hardware] drive, card, scsi, controller, disk, system, monitor, board, thanks, port
topic 4 [comp.sys.mac.hardware] apple, problem, drive, monitor, simms, card, anyone, thanks, know, machine
topic 5 [comp.windows.x] window, server, motif, application, widget, display, thanks, program, using, running
topic 6 [misc.forsale] sale, offer, shipping, price, please, condition, email, interested, sell, asking
topic 7 [rec.autos] car, engine, like, dealer, auto, good, ford, model, look, price
topic 8 [rec.motorcycles] bike, motorcycle, ride, helmet, like, riding, rider, road, right, front
topic 9 [rec.sport.baseball] year, game, team, player, baseball, last, 



TPBG(eval_func=<function eval at 0x7f3de7926680>,
     feature_names=['aaaaaaaaaaaa',
                    'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaauuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuugggggggggggggggg',
                    'aaah', 'aaahh', 'aaahhhh', 'aaai', 'aacc', 'aachen',
                    'aacvkc', 'aaef', 'aalac', 'aalm', 'aalternate', 'aamazing',
                    'aamir', 'aammmaaaazzzzzziinnnnggggg', 'aamrl', 'aanbieden',
                    'aanerud', 'aangeboden', 'aangegev...
     target_name=['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
                  'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
                  'comp.windows.x', 'misc.forsale', 'rec.autos',
                  'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey',
                  'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
                  'soc.religion.christian', 'talk.politics.guns',
                  'talk.politics.mideast', 'talk.politics.misc',
                  't