In [18]:
# All code from Tutorial:
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

# Part 1 : gensim LDA based on NLTK & SpaCy

# Run in python console
import nltk; nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tharsen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [20]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])


In [21]:
# Import Dataset
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

# Convert to list
data = df.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

['rec.autos' 'comp.sys.mac.hardware' 'rec.motorcycles' 'misc.forsale'
 'comp.os.ms-windows.misc' 'alt.atheism' 'comp.graphics'
 'rec.sport.baseball' 'rec.sport.hockey' 'sci.electronics' 'sci.space'
 'talk.politics.misc' 'sci.med' 'talk.politics.mideast'
 'soc.religion.christian' 'comp.windows.x' 'comp.sys.ibm.pc.hardware'
 'talk.politics.guns' 'talk.religion.misc' 'sci.crypt']
['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 't

In [22]:
%%time

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])
print('\n')

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


CPU times: user 10.5 s, sys

In [23]:
%%time

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])
print('\n')

['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


CPU times: user 46.3 s, sys: 486 ms, total: 46.8 s
Wa

In [24]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [25]:
%%time
# This cell takes 2-3 minutes to run on my machine.  -j

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])
print('\n')

[['where', 's', 'thing', 'car', 'nntp_post', 'host', 'rac_wam', 'umd', 'organization', 'university', 'maryland_college', 'park', 'line', 'wonder', 'anyone', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'front_bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'whatev', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


CPU times: user 7min 20s, sys: 1min 19s, total: 8min 39s
Wall time: 2min 25s


In [26]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 5), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1)]]


In [27]:
mywords = []
for cp in corpus:
    for id, freq in cp:
        if id2word[id].find("_") != -1:
            mywords.append(id2word[id]) 
print(len(mywords))
print('\n')
pprint(mywords)

70469


['front_bumper',
 'maryland_college',
 'nntp_post',
 'rac_wam',
 'nntp_post',
 'carson_washington',
 'floppy_disk',
 'guy_kuo',
 'heat_sink',
 'oil_leak',
 'richardson_tx',
 'nntp_post',
 'developers_toolkit',
 'distribution_usa',
 'host_magnusug',
 'magnus_ac',
 'ohio_state',
 'delivered_gateway',
 'diamond_ss',
 'latest_driver',
 'mouse_cursor',
 'nntp_post',
 'automatic_transmission',
 'chris_silvester',
 'james_callison',
 'texas_instrument',
 'nntp_post',
 'ohio_state',
 'acs_ohio',
 'asking_shipping',
 'bottom_magnus',
 'nntp_post',
 'evanston_illinois',
 'northwestern_university',
 'bob_beauchaine',
 'bronx_away',
 'death_penalty',
 'innocents_die',
 'queens_could',
 'robert_beauchaine',
 'sank_manhattan',
 'success_rate',
 'nntp_post',
 'distribution_world',
 'frank_odwyer',
 'host_solntze',
 'jon_livesey',
 'wpd_sgi',
 'nntp_post',
 'dsto_gov',
 'tv_station',
 'west_coast',
 'years_ago',
 'bob_gaj',
 'bob_gajarsky',
 'curtis_jackson',
 'flame_war',
 'lotsa_point',
 'mo

 'distribution_world',
 'scott_roby',
 'apr_gmt',
 'news_reader',
 'administrative_computing',
 'batf_fbi',
 'conspiracy_theorie',
 'emmet_gil',
 'involve_complicat',
 'levine_triumph',
 'man_rik',
 'murders_almost',
 'northern_ireland',
 'student_billing',
 'unlikely_scenarios',
 'waco_today',
 'computing_center',
 'human_right',
 'asia_minor',
 'bosnia_serbia',
 'equal_rights',
 'greece_armenia',
 'ither_alway',
 'mr_tamamidis',
 'ottoman_empire',
 'territorial_water',
 'th_century',
 'turkish_minority',
 'visa_requirement',
 'nntp_post',
 'san_jose',
 'ken_snyder',
 'posters_view',
 'rchland_ibm',
 'nntp_post',
 'electrical_engineer',
 'distribution_na',
 'computer_science',
 'electrical_engineer',
 'cd_rom',
 'distribution_usa',
 '_',
 'ak_q',
 'c__c',
 'c__cx',
 'c__sc',
 'c__scx',
 'ck_chz',
 'ck_ri',
 'cscx_sy',
 'csyx_g',
 'cx_c',
 'cx_cx',
 'cx_cx_s',
 'cx_cx_scx',
 'cx_g',
 'cx_gc',
 'cx_gcucs',
 'cx_gcx',
 'cx_gcx_s',
 'cx_gyx_s',
 'cx_ocx',
 'cx_s',
 'cx_sc',
 'cx_sct',
 'c

 'chinese_food',
 'chinese_restaurant',
 'doug_bank',
 'msg_sensitivity',
 'pulled_muscle',
 'double_blind',
 'advisory_panel',
 'anecdotal_evidence',
 'glutamate_sensitivity',
 'land_mobile',
 'msg_monosodium',
 'products_sector',
 'sector_schaumburg',
 'greatly_appreciat',
 'contact_lens',
 'nntp_post',
 'real_estate',
 'bell_northern',
 'amir_rosenblatt',
 'sam_zbib',
 'newsreader_tin',
 'works_fine',
 'bowling_green',
 'george_pavlic',
 'lori_iannamico',
 'radio_station',
 'ted_simmon',
 'purdue_university',
 'jon_ogden',
 'timothy_brent',
 'mathematical_science',
 'ive_seen',
 'jim_frost',
 'dumbest_automotive',
 'access_unix',
 'itc_uva',
 'stuff_delet',
 'distribution_usa',
 'excellent_condition',
 'public_domain',
 'regression_test',
 'nntp_post',
 'ati_ultra',
 'gateway_dx',
 'hard_drive',
 'ati_gup',
 'color_palette',
 'keith_mancus',
 'palo_alto',
 'distribution_usa',
 'chicago_academic',
 'american_express',
 'andy_freeman',
 'carry_conceal',
 'carry_concealed',
 'concealed

 'hard_disk',
 'hard_drive',
 'nasa_ame',
 'maxtor_lxt',
 'opinions_express',
 'georgia_institute',
 'nntp_post',
 'floppy_drive',
 'nntp_post',
 'sgi_com',
 'silicon_graphic',
 'wait_til',
 'jim_lefebvre',
 'richard_casare',
 'nntp_post',
 'les_bartel',
 'urbana_champaign',
 'fast_polygon',
 'lucas_adamski',
 'nntp_post',
 'computer_science',
 'ftp_site',
 'public_domain',
 'content_type',
 'frequently_ask',
 'ms_do',
 'archive_name',
 'sci_crypt',
 'marc_vanheyningen',
 'silver_ucs',
 'fri_apr',
 'getting_access',
 'gnu_emac',
 'mark_riordan',
 'permanent_resident',
 'privacy_enhanc',
 'readme_file',
 'universally_accept',
 'users_guide',
 'via_anonymous',
 'cl_msu',
 'cryptographic_technique',
 'misc_legal',
 'privacy_enhanced',
 'nntp_post',
 'greatly_appreciat',
 'same_screen',
 'workspace_manager',
 'nntp_post',
 'low_level',
 'ether_twist',
 'old_fashion',
 '_',
 'gamma_ray',
 'distribution_usa',
 'bell_lab',
 'ronald_deblock',
 'netcom_online',
 'anywhere_near',
 'mark_singer',

 'united_stat',
 'sexual_behavior',
 'washington_dc',
 'denver_dept',
 'health_care',
 'volume_number',
 'adult_safety',
 'associate_professor',
 'birth_defect',
 'cdc_surveillance',
 'child_restraint',
 'david_dodell',
 'dhhs_publication',
 'gene_pool',
 'health_science',
 'hicnet_medical',
 'ivf_et',
 'medical_newsletter',
 'metropolitan_area',
 'mmwr_date',
 'newsletter_page',
 'safety_belt',
 'tucson_arizona',
 'year_olds',
 'assault_weapon',
 'nntp_post',
 'video_card',
 'works_fine',
 'stand_alone',
 'craig_williamson',
 'apr_gmt',
 'news_reader',
 'bobby_mozumder',
 'mats_andtbacka',
 'postings_uninc',
 'unorganized_usenet',
 'nntp_post',
 'greatly_appreciat',
 'ncar_boulder',
 'donald_boell',
 'nntp_post',
 'distribution_world',
 'plus_shipp',
 'hard_drive',
 'login_gu',
 'best_off',
 'access_unix',
 'tape_backup',
 'ethernet_card',
 'nntp_post',
 'half_hour',
 'andy_woodward',
 'wales_aberystwyth',
 'nntp_post',
 'distribution_inet',
 'instruction_set',
 'computing_centre',
 '

 'regional_studie',
 'turkish_mp',
 'turkish_troop',
 'communist_party',
 'accounts__edit',
 'aristide_caratza',
 'armored_personnel',
 'blue_collar',
 'city_party',
 'emergency_medical',
 'fourth_floor',
 'maternity_home',
 'movie_theater',
 'nagorno_karabakh',
 'police_precinct',
 'samuel_shahmuradian',
 'sk_club',
 'sumgait_tragedy',
 'third_entryway',
 'volume_eyewitness',
 'whats_happening',
 'yelena_bonner',
 'yerevan_reference',
 'nntp_post',
 'new_york',
 'uucp_uunet',
 'horror_storie',
 'government_agencie',
 'ann_arbor',
 'steve_hendrick',
 'health_care',
 'distribution_usa',
 'radio_shack',
 'wisconsin_madison',
 'joel_kolstad',
 'nntp_post',
 'case_western',
 'cleveland_oh',
 'host_hela',
 'ins_cwru',
 'reserve_university',
 'gerard_pinzone',
 'laser_printer',
 'bell_laboratorie',
 'robert_nichol',
 'truetype_font',
 'nntp_post',
 'apr_gmt',
 'case_western',
 'ins_cwru',
 'reserve_university',
 'cleveland_ohio',
 'host_slc',
 'realistic_pro',
 'nntp_post',
 'politically_cor

 'uak_d',
 'ub_q',
 'ucscx_gy',
 'uw_ew',
 'uw_uwt',
 'uw_wt',
 'uwt_ww',
 'uww_uw',
 'v_bqt',
 'wa_uww',
 'wt_ww',
 'ww_uwa',
 'nl_east',
 'tue_apr',
 'ian_derby',
 'virginia_commonwealth',
 'nntp_post',
 'czech_republic',
 'muenchen_germany',
 'staffan_axelsson',
 'holger_ohlwein',
 'good_luck',
 'tape_backup',
 'vram_simms',
 'nntp_post',
 'years_ago',
 'computer_science',
 'ergonomic_keyboard',
 'virginia_tech',
 'dept_blacksburg',
 'csugrad_c',
 'vice_versa',
 'newsreader_tin',
 'scsi_controll',
 'hewlett_packard',
 'works_fine',
 'autoexec_bat',
 'config_sy',
 'hard_disk',
 'bus_master',
 'floppy_drive',
 'scsi_controller',
 'rom_bio',
 'tape_backup',
 'config_sys',
 'sampling_rate',
 'disk_controller',
 'bulletin_board',
 'adaptec_scsi',
 'parity_error',
 'enhanced_mode',
 'aspi_do',
 'double_buffer',
 'double_buffering',
 'floppy_drif',
 'jumbo_tape',
 'mirror_site',
 'progman_exe',
 'scsiha_sy',
 'scsiha_sys',
 'windows_grp',
 'nntp_post',
 'win_nt',
 'dec_alpha',
 'windows_nt

 'computer_science',
 'vancouver_canada',
 'ubc_ca',
 'works_fine',
 'gene_wright',
 'raider_net',
 'nntp_post',
 'newsreader_tin',
 'charles_parr',
 'tom_dietrich',
 'stuff_delet',
 'amherst_college',
 'dod_ama',
 'flame_bait',
 'nntp_post',
 'ken_snyder',
 'suzuki_gs',
 'ama_dod',
 'msg_sensitivity',
 'anecdotal_evidence',
 'clinical_trial',
 'nntp_post',
 'carson_washington',
 'floppy_disk',
 'guy_kuo',
 'power_supply',
 'floppy_drive',
 'hard_drive',
 'simm_slot',
 'ribbon_cable',
 'tower_assembly',
 'nntp_post',
 'new_york',
 'distribution_na',
 'mailing_list',
 'oklahoma_norman',
 'fri_apr',
 'oklahoma_city',
 'boise_idaho',
 'capital_district',
 'ecn_uoknor',
 'internet_uucp',
 'config_sy',
 'dx_eisa',
 'pittsburgh_pa',
 'talk_politic',
 'engineering_carnegie',
 'mellon_pittsburgh',
 'nntp_post',
 'andrew_infante',
 'andy_infante',
 'dod_joan',
 'north_acpub',
 'youve_got',
 'daily_basis',
 'durham_nc',
 'computer_science',
 'distribution_na',
 'deepak_chhabra',
 'dan_kelly',
 '

 'batf_spokesperson',
 'fully_automatic',
 'carrying_revolver',
 'practicing_shoot',
 'regular_patrolman',
 'shooting_range',
 'recall_read',
 'unix_nyc',
 'netcom_online',
 'communications_service',
 'login_gu',
 'dip_switch',
 'switch_setting',
 'wiretap_chip',
 'serial_number',
 'graham_toal',
 'escrow_agencie',
 'steven_bellovin',
 'corporate_acceptance',
 'des_keyseach',
 'shelf_cheap',
 'escrow_agent',
 'bell_laboratorie',
 'fort_meade',
 'sore_thumb',
 'verified_externally',
 'saratoga_spring',
 'skidmore_college',
 'jay_rogoff',
 'robert_hite',
 'cornell_univ',
 'cs_dept',
 'edward_ted',
 'ithaca_ny',
 'distribution_world',
 'paradise_svga',
 'california_polytechnic',
 'san_lui',
 'nntp_post',
 'distribution_world',
 'western_australia',
 'edward_bolson',
 'non_coplanar',
 'exactly_fitt',
 'nntp_post',
 'express_access',
 'online_communication',
 'steinn_sigurdsson',
 'commercial_mining',
 'long_term',
 'san_diego',
 'blind_faith',
 'absolute_truth',
 'msg_sensitivity',
 'brain

 'edward_ted',
 'ithaca_ny',
 'bruce_klopfenstein',
 'simon_fraser',
 'vancouver_canada',
 'virtual_memory',
 'vram_simms',
 'good_luck',
 'county_srsd',
 'nntp_post',
 'distribution_world',
 'vms_vnews',
 'soviet_union',
 'limiting_govt',
 'mr_grinch',
 'steve_hendrick',
 'pure_communism',
 'uh_huh',
 'utopian_dream',
 'nntp_post',
 'imperial_college',
 'makes_sense',
 'border_width',
 'default_visual',
 'vinfo_visual',
 'research_centre',
 'intelligent_machine',
 'der_mouse',
 'computer_science',
 'pcx_bmp',
 'simon_fraser',
 'university_burnaby',
 'distribution_na',
 'evan_pritchard',
 'conf_nam',
 'nntp_post',
 'jay_rogoff',
 'roger_lustig',
 'crux_princeton',
 'nntp_post',
 'magnus_ac',
 'ohio_state',
 'long_term',
 'arnie_skurow',
 'benedikt_rosenau',
 'catholic_church',
 'ray_ingle',
 'toronto_ontario',
 'jody_levine',
 'dod_kv',
 'ontario_hydro',
 'pf_ride',
 'drinks_hour',
 'nntp_post',
 'apr_gmt',
 'mohit_goyal',
 'oh_yeah',
 'trinitron_tube',
 'two_third',
 'nntp_post',
 'di

 'newsreader_tin',
 'emergency_room',
 'distribution_world',
 'hard_disk',
 'floppy_drive',
 'hard_drive',
 'floppy_drif',
 'disk_driv',
 'distribution_usa',
 'nntp_posting',
 'youve_got',
 'good_luck',
 'cellular_infrastructure',
 'motorola_inc',
 'tape_backup',
 'jumbo_tape',
 'backup_unit',
 'nntp_post',
 'san_jose',
 'colorado_boulder',
 'red_wing',
 'spot_colorado',
 'roger_maynard',
 'bay_area',
 'roland_dreier',
 'san_francisco',
 'detroit_r',
 'hat_trick',
 'biased_ref',
 'toronto_maple',
 'united_state',
 'nic_funet',
 'public_domain',
 'pgp_public',
 'soda_berkeley',
 'ef_ad',
 'fc_ac',
 'via_anonymous',
 'capriccioso_newsreader',
 'tin_version',
 'john_perry',
 'wimsey_bc',
 'united_kingdom',
 'doc_ic',
 'derek_atkin',
 'nntp_post',
 'jim_frost',
 'centerline_software',
 'default_colormap',
 'cirrus_logic',
 'elias_davidsson',
 'middle_east',
 'mein_kampf',
 'brendan_mckay',
 'aryan_race',
 'chris_metcalfe',
 'mixed_marriag',
 'mixed_stock',
 'unconventional_proposal',
 'cha

 'makes_sense',
 'recently_bought',
 'parity_error',
 'ami_bio',
 'public_access',
 'research_triangle',
 'chapel_hill',
 'north_carolina',
 'self_defense',
 'clayton_cramer',
 'district_court',
 'optilink_corporation',
 'petaluma_ca',
 'mine_relation',
 'mutual_consent',
 'optilink_cramer',
 'uunet_pyramid',
 'greg_hennessy',
 'inflaming_passion',
 'host_cunixb',
 'nntp_posting',
 'frank_benson',
 'peter_garfiel',
 'new_york',
 'ive_seen',
 'new_jersey',
 'pay_attention',
 'darren_provine',
 'object_orient',
 'clear_implication',
 'computer_science',
 'rest_assur',
 'straw_man',
 'mark_fulk',
 'los_angele',
 'non_profit',
 'vice_president',
 'added_forward',
 'original_sender',
 'space_dig',
 'lunar_surface',
 'rockwell_international',
 'immediate_release',
 'ventura_county',
 'rwtms_munizb',
 'thousand_oak',
 'nntp_post',
 'chris_behanna',
 'riceburner_respect',
 'jesus_christ',
 'religious_war',
 'accepting_jeesus',
 'christians_inject',
 'immature_flame',
 'intended_audience',
 'wi

 'nntp_post',
 'distribution_world',
 'hardy_washington',
 'washington_seattle',
 'greatly_appreciate',
 'emm_exe',
 'nntp_post',
 'new_york',
 'several_hundred',
 '_',
 'court_order',
 'united_state',
 'civil_libertie',
 'tie_breaker',
 'bodily_harm',
 'misc_legal',
 'william_december',
 'atlantic_cement',
 'eminent_domain',
 'apr_gmt',
 'hard_disk',
 'norton_speedisk',
 'nntp_posting',
 'bell_northern',
 'bmerh_reply',
 'workspace_manager',
 'virtual_desktop',
 'cs_dept',
 'hewlett_packard',
 'long_term',
 'original_packag',
 'access_unix',
 'denver_math',
 'nyx_public',
 'laser_printer',
 'perfect_condition',
 'six_month',
 'kirk_peterson',
 'nntp_post',
 'mon_apr',
 'distribution_na',
 'soda_berkeley',
 'austin_texas',
 'san_diego',
 'las_vegas',
 'oklahoma_city',
 'swap_meet',
 'nntp_post',
 'distribution_world',
 'awful_lot',
 'chris_herringshaw',
 'newsgroup_split',
 'proposed_newsgroup',
 'split_personally',
 'charset_iso',
 'content_transfer',
 'content_type',
 'mime_version',

 'hand_held',
 'automobile_accident',
 'paul_reimer',
 'de_arra',
 'nntp_post',
 'evans_sutherland',
 'corp_salt',
 'lake_city',
 'get_rid',
 'brian_kendig',
 'old_testament',
 'laboratory_tucson',
 'lunar_planetary',
 'wishful_think',
 'brian_ceccarelli',
 'nntp_post',
 'distribution_world',
 'host_kelvin',
 'jet_propulsion',
 'lab_telo',
 'ron_baalke',
 'vms_vnews',
 'pasadena_ca',
 'around_jupiter',
 'temporary_orbit',
 'comet_shoemaker',
 'jovian_orbit',
 'aggravation_instead',
 'nntp_post',
 'california_institute',
 'clipper_chip',
 'white_house',
 'law_enforcement',
 'alumni_caltech',
 'strong_cryptography',
 'employ_voice',
 'encryption_threaten',
 'protect_electronic',
 'unbreakable_commercial',
 'hal_finney',
 'false_tension',
 'dorothy_dennings',
 'sophisticated_encryption',
 'harmoniously_balanc',
 'reasoned_balanc',
 'nntp_post',
 'opinions_express',
 'eric_bosco',
 'oracle_corp',
 'oracle_corporation',
 'redwood_shore',
 'unauthenticated_user',
 'ctrl_alt',
 'nntp_post',
 

 'ucalgary_ca',
 'newsreader_tin',
 'registration_fee',
 'nntp_post',
 'carnegie_mellon',
 'pittsburgh_pa',
 'po_andrew',
 'getting_rid',
 'nntp_post',
 'california_institute',
 'keith_allan',
 'political_atheist',
 'keith_ryan',
 'lloyd_caltech',
 'laws_dictat',
 'undesired_behavior',
 'nntp_post',
 'new_york',
 'silver_ucs',
 'eddie_murray',
 'cincinnati_red',
 'distribution_world',
 'newsreader_tin',
 'michael_gerhard',
 'neuss_voice',
 'preussenstrasse_germany',
 'ami_bio',
 'steve_dyer',
 'consulting_cambridge',
 'harvard_rayssd',
 'linus_spdcc',
 'last_night',
 'gerald_olchowy',
 'toronto_chemistry',
 'pat_burn',
 'distribution_usa',
 'imagewriter_ii',
 'civil_right',
 'rodney_k',
 'flame_thrower',
 'police_officer',
 'costa_mesa',
 'nntp_posting',
 'mailing_list',
 'manual_transmission',
 'humor_impair',
 'consumer_report',
 'mitre_corporation',
 'rec_autos',
 'dear_netter',
 'timing_belt',
 'eagle_talon',
 'tsi_awd',
 'nntp_post',
 'distribution_world',
 'computer_science',
 'w

 'capital_punishment',
 'nntp_post',
 'distribution_world',
 'michael_adam',
 'southern_california',
 'los_angele',
 'billion_dollar',
 'moon_resident',
 'developed_ther',
 'gene_wright',
 'successfully_keep',
 'theporch_raid',
 'joy_joy',
 'new_zealand',
 'clayton_cramer',
 'child_molester',
 'optilink_corporation',
 'petaluma_ca',
 'mine_relation',
 'mutual_consent',
 'optilink_cramer',
 'uunet_pyramid',
 'john_daker',
 'germano_caronni',
 'swiss_federal',
 'fifth_amendment',
 'distribution_world',
 'ann_arbor',
 'parallel_port',
 'asad_mb',
 'bis_dial',
 'eff_asp',
 'hal_bb',
 'net_hq',
 'newsgroups_pcboard',
 'serving_ann',
 'uupcb_kmail',
 'robert_desonia',
 'arbor_mi',
 'copy_kmail',
 'hal_ann',
 'rdd_winqwk',
 'unregistered_evaluation',
 'opposing_team',
 'penalty_shot',
 'nntp_post',
 'ive_seen',
 'ubc_ca',
 'monthian_buntan',
 'nntp_post',
 'distribution_world',
 'mathematical_science',
 'johns_hopkin',
 'baltimore_md',
 'homewood_academic',
 'host_jhunix',
 'hcf_jhu',
 'major

 'neo_nazis',
 'rob_lanphi',
 'anti_semitism',
 'hide_behind',
 'host_magnusug',
 'magnus_ac',
 'ohio_state',
 'last_night',
 'nntp_posting',
 'ryan_scharfy',
 'mark_wilson',
 'red_herr',
 'clintons_immunization',
 'free_immunization',
 'health_care',
 'immunization_program',
 'irresponible_parent',
 'jeffrey_linder',
 'kids_immunization',
 'noticed_clintonite',
 'pushing_universal',
 'stimulus_package',
 'melbourne_australia',
 'monash_university',
 'hercules_graphite',
 'melb_australia',
 'tom_haapanen',
 'iskandar_taib',
 'hard_cod',
 'ft_lauderdale',
 'robert_nichol',
 'truetype_font',
 'public_access',
 'south_africa',
 'mark_wilson',
 'child_molester',
 'memory_serf',
 'xavier_gallagh',
 'falkland_island',
 'computer_science',
 'gordon_bank',
 'jxp_skepticism',
 'nntp_post',
 'distribution_usa',
 'california_institute',
 'aaron_ray',
 'bear_arm',
 'host_sandman',
 'recall_correctly',
 'nntp_post',
 'artificial_intelligence',
 'united_state',
 'white_house',
 'long_term',
 'clinto

 'window_manager',
 'escape_sequence',
 'title_bar',
 'ide_vs',
 'megs_sec',
 'wayne_smith',
 'stuff_deleted',
 'low_profile',
 'multitasking_os',
 'nntp_post',
 'case_western',
 'reserve_university',
 'cleveland_ohio',
 'thor_in',
 'waterloo_ontario',
 'mortice_kern',
 'nntp_post',
 'distribution_world',
 'bill_clinton',
 'billion_dollar',
 'mike_cobb',
 'raise_tax',
 'spending_cut',
 'champaign_urbana',
 'distribution_usa',
 'best_offer',
 'hp_laserjet',
 'nntp_post',
 'distribution_world',
 'scott_roby',
 'dale_handheld',
 'bill_clinton',
 'batf_fbi',
 'murders_almost',
 'waco_today',
 'always_rethink',
 'chatham_nov',
 'foreign_troop',
 'havent_rethought',
 'william_pitt',
 'tear_gas',
 'bleeding_heart',
 'heart_bleed',
 'sucking_heartless',
 'undeniable_truth',
 'nntp_posting',
 'go_ahead',
 'albert_einstein',
 'dod_darkman',
 'hospital_youngstown',
 'john_daker',
 'nntp_post',
 'nntp_post',
 'distribution_na',
 'vms_vnew',
 'johns_hopkin',
 'nntp_post',
 'colorado_spring',
 'nntp

 'heat_sink',
 'power_supply',
 'hard_disk',
 'nntp_post',
 'distribution_usa',
 'ford_taurus',
 'stratus_computer',
 'manual_transmission',
 'cruise_control',
 'steering_wheel',
 'fm_cassette',
 'floppy_disk',
 'autoexec_bat',
 'config_sys',
 'top_ten',
 'clayton_cramer',
 'line_communication',
 'services_gu',
 'eric_smith',
 'lincoln_slavery',
 'seriously_doubt',
 'petri_pihko',
 'accepting_jeesus',
 'mutually_exclusive',
 'conclusive_evidence',
 'nntp_post',
 'user_interface',
 'voice_fax',
 'dear_netter',
 'nntp_post',
 'news_gateway',
 'cs_utexa',
 'youve_got',
 'public_access',
 'unix_brookline',
 'white_house',
 'law_enforcement',
 'nntp_post',
 'host_solntze',
 'jon_livesey',
 'wpd_sgi',
 'gregg_jaeger',
 'inimitable_rushdie',
 'human_being',
 'nntp_post',
 'greatly_appreciat',
 'serial_number',
 'get_rid',
 'colorado_boulder',
 'please_respond',
 'core_dump',
 'david_rex',
 'include_xm',
 'null_null',
 'xtappcontext_app',
 'failed_requ',
 'failed_request',
 'major_opcode',
 'x

 'spagthorpe_vik',
 'distribution_usa',
 'nntp_post',
 'host_enterpoop',
 'mailing_list',
 'motif_widget',
 'ronald_van',
 'directory_pub',
 'serial_number',
 'fidonet_node',
 'eric_choi',
 'nntp_post',
 'newsreader_tin',
 'atf_burn',
 'dividian_ranch',
 'electric_oven',
 'metro_area',
 'popular_electric',
 'stoves_outside',
 'wood_stove',
 'brent_irvine',
 'inside_ignite',
 'nntp_post',
 'southern_california',
 'los_angele',
 'los_angel',
 'fogbound_child',
 'jack_schmidl',
 'aludra_usc',
 'defamation_league',
 'nntp_post',
 'distribution_usa',
 'havent_seen',
 'colorado_boulder',
 'spot_colorado',
 'space_station',
 'long_term',
 'orbital_element',
 'san_diego',
 'deg_deg',
 'earth_orbit',
 'satellite_launch',
 'mclean_va',
 'baltimore_md',
 'orbital_debris',
 'upper_atmosphere',
 'mu_radar',
 'nntp_post',
 'bear_arm',
 'well_regulated',
 'clipper_chip',
 'distribution_na',
 'white_house',
 'carry_handgun',
 'assault_rifle',
 'constitutions_guarantee',
 'robert_ward',
 'fremont_ca',


 'clipper_chip',
 'carnegie_mellon',
 'pittsburgh_pa',
 'serial_number',
 'diffie_hellman',
 'session_key',
 'dorothy_denn',
 'law_enforcement',
 'digitized_voice',
 'scrambling_compared',
 'des_mode',
 'weeks_ago',
 'scott_roby',
 'human_stupidity',
 'batf_fbi',
 'murders_almost',
 'waco_today',
 'alan_greig',
 'dundee_institute',
 'janet_dundee',
 'never_underestimate',
 'nntp_post',
 'hardy_washington',
 'jim_lefebvre',
 'richard_casare',
 'atlanta_georgia',
 'gatech_prism',
 'georgia_institute',
 'hplabs_ncar',
 'purdue_rutger',
 'uucp_decvax',
 'gt_gt',
 'kaan_timucin',
 'talk_politic',
 'thomas_parsli',
 'glenn_stone',
 'nntp_post',
 'lawrence_livermore',
 'llnl_gov',
 'jacobs_ladder',
 'makes_sense',
 'vida_morkuna',
 'nntp_post',
 'distribution_usa',
 'nazi_germany',
 'atf_burn',
 'dividian_ranch',
 'tv_coverage',
 'ordo_templi',
 'thirty_year',
 'oriental_templar',
 'thyagi_morgoth',
 'soft_drink',
 'nntp_post',
 'distribution_world',
 'computing_center',
 'virginia_commonweal

 'naftaly_stramer',
 'road_suite',
 'supergas_reply',
 'prime_minister',
 'ahmed_abu',
 'computer_science',
 'gordon_bank',
 'jxp_skepticism',
 'oral_tradition',
 'chuck_forsberg',
 'wa_kgx',
 'weight_rebound',
 'constitute_weight',
 'obesity_researcher',
 'nntp_post',
 'host_enterpoop',
 'mit_shm',
 'considered_lazarus',
 'lazarus_long',
 'patrick_mahan',
 'person_unnecessarily',
 'tgv_window',
 'washer_wak',
 'dec_alpha',
 'nntp_post',
 'ohio_state',
 'acs_ohio',
 'ups_cod',
 'mailing_list',
 'bmw_moa',
 'david_karr',
 'joe_senner',
 'joe_senn',
 'nntp_post',
 'distribution_world',
 'east_coast',
 'mead_data',
 'brian_curran',
 'nntp_post',
 'alan_sepinwall',
 'art_ep',
 'journalism_indiana',
 'riddle_production',
 'nntp_post',
 'last_night',
 'weeks_ago',
 'united_state',
 'white_house',
 'janet_reno',
 'violent_crime',
 'clinton_gore',
 'law_enforcement',
 'depend_upon',
 'new_hampshire',
 'rose_garden',
 'short_term',
 'police_officer',
 'health_care',
 'stimulus_package',
 'attor

 'depend_upon',
 'earth_orbit',
 'thomas_green',
 'ftp_site',
 'automatic_transmission',
 'skidmore_college',
 'nntp_post',
 'distribution_world',
 'stanley_cup',
 'vms_vnew',
 'university_meson',
 'triumf_ca',
 '_',
 'top_ten',
 'united_state',
 'clayton_cramer',
 'th_century',
 'steve_hendrick',
 'optilink_corporation',
 'petaluma_ca',
 'mine_relation',
 'mutual_consent',
 'optilink_cramer',
 'uunet_pyramid',
 'lincoln_slavery',
 'public_access',
 'unix_brookline',
 'teleuse_uim',
 'civil_war',
 'opinions_express',
 'linear_accelerator',
 'panix_public',
 'jesus_christ',
 'darius_lecointe',
 'ten_commandment',
 'christian_extremist',
 'dean_kaflowitz',
 'kills_doctor',
 'spreading_christianity',
 'david_sacco',
 'nntp_post',
 'host_enterpoop',
 'joel_joel',
 'usr_lib',
 'stuff_delet',
 'multi_screen',
 'hi_netter',
 'sunos_patch',
 'nntp_post',
 'jewish_baseball',
 'major_league',
 'roger_lustig',
 'moe_berg',
 'pablo_iglesias',
 'sandy_koufax',
 'yom_kippur',
 'crux_princeton',
 'nn

 'ringing_ear',
 'apartment_microdistrict',
 'sumgait_azerbaijan',
 'nntp_post',
 'northwestern_university',
 'mechanical_engineer',
 'distribution_world',
 'new_york',
 'extermination_ohanus',
 'longer_exist',
 'might_serve',
 'mountain_pass',
 'serdar_argic',
 'single_turkish',
 'soul_sahak',
 'soviet_armenia',
 'human_right',
 'turkish_minority',
 'sadik_ahmet',
 'western_thrace',
 'united_stat',
 'middle_east',
 'religious_persecution',
 'greek_cypriot',
 'weeks_ago',
 'orthodox_church',
 'west_bank',
 'jack_schmidl',
 'america_online',
 'middle_east',
 'bellcore_livingston',
 'israeli_soldier',
 'feigenbaum_benjamin',
 'gaza_strip',
 'highly_recommend',
 'tape_recorder',
 'makes_sense',
 'weeks_ago',
 'candida_albican',
 'immune_compromis',
 'insurance_premium',
 'non_toxic',
 'steve_dyer',
 'jon_noring',
 'line_communication',
 'services_gu',
 'anecdotal_evidence',
 'life_threaten',
 'yeast_connection',
 'incompetence_ripp',
 'anti_fungal',
 'poorly_written',
 'rocket_scientist',

 'ins_cwru',
 'reserve_university',
 'broward_horne',
 'sales_tax',
 'charles_kincy',
 'comp_sy',
 'nntp_post',
 'express_access',
 'online_communication',
 'bell_lab',
 'privately_fund',
 'west_coast',
 'ive_seen',
 'national_capital',
 'maple_leafs',
 'rachel_holme',
 'nntp_post',
 'hundred_eighty',
 'persian_army',
 'persian_troop',
 'seventh_century',
 'zaurak_kamsarakan',
 'nntp_post',
 'california_institute',
 'host_punisher',
 'keith_allan',
 'political_atheist',
 'keith_ryan',
 'founding_father',
 'nntp_post',
 'distribution_world',
 'computer_science',
 'abpsoft_mehl',
 'andre_beck',
 'brain_inside',
 'dresden_de',
 'nntp_post',
 'content_type',
 'mime_version',
 'text_plain',
 'mark_crispin',
 'nntp_post',
 'frank_odwyer',
 'makes_sense',
 'computer_science',
 'christian_morality',
 'simon_clippingdale',
 'simon_simon',
 'evelyn_conlon',
 'hewlett_packard',
 'ftp_site',
 'philadelphia_pm',
 'pittsburg_pm',
 'notre_dame',
 'nntp_post',
 'geoff_sanderson',
 'ericsson_telecom',


 'distribution_usa',
 'years_ago',
 'informix_software',
 'recall_read',
 'dick_king',
 'heavy_water',
 'good_luck',
 'electronic_odometer',
 'nntp_post',
 'ohio_state',
 'acs_ohio',
 'john_kelsey',
 'brute_force',
 'known_plaintext',
 'james_ebright',
 'keyseach_machine',
 'nntp_post',
 'gun_control',
 'paul_prescod',
 'houston_tx',
 'alt_cosuard',
 'bailey_bb',
 'bis_bis',
 'houston_texas',
 'jim_wray',
 'ye_olde',
 'yob_sccsi',
 'waiting_period',
 'cathy_smith',
 'nyah_nyah',
 'background_check',
 'fm_synthesis',
 'stephen_lau',
 'nntp_post',
 'computing_service',
 'hard_core',
 'macquarie_university',
 'john_carr',
 'sydney_australia',
 'reverse_engineer',
 'hackers_spook',
 'henry_spencer',
 'earth_orbit',
 'daniel_burstein',
 'moon_land',
 'lunar_orbit',
 'gravity_assist',
 'nntp_post',
 'distribution_world',
 'years_ago',
 'jim_jone',
 'sun_microsystem',
 'nntp_post',
 'blaine_gardner',
 'evans_sutherland',
 'motorcycle_detail',
 'detailing_tip',
 'nntp_post',
 'earth_lectronic'

 'cs_dept',
 'edward_ted',
 'ithaca_ny',
 'major_league',
 'jiann_ming',
 'rickey_henderson',
 'clear_waiver',
 'nntp_post',
 'mailing_list',
 'digital_equipment',
 'stuff_deleted',
 'stuff_delet',
 'net_etiquette',
 'tom_testagrossa',
 'nntp_post',
 'colorado_boulder',
 'rintintin_colorado',
 'purdue_university',
 'big_brother',
 'jim_jone',
 'john_redelfs',
 'next_mormon',
 'whos_next',
 'herman_rubin',
 'get_rid',
 'ancient_mayan',
 'phil_fraer',
 'southwestern_louisiana',
 'televison_repo',
 'weeks_ago',
 'tank_bag',
 'parking_lot',
 'nntp_post',
 'serial_number',
 'encryption_scheme',
 'illegal_cipher',
 'system_transmit',
 'host_camelot',
 'nntp_post',
 'distribution_world',
 'autoexec_bat',
 'config_sy',
 'recently_purchas',
 'config_sys',
 'toshiba_cdrom',
 'adaptec_scsi',
 'aspi_do',
 'nntp_post',
 'ohio_state',
 'acs_ohio',
 'works_fine',
 'plain_silly',
 'ryan_scharfy',
 'recreational_drug',
 'december_starr',
 'nntp_post',
 'earl_wallace',
 'self_defense',
 'getting_rid',
 

 'nntp_post',
 'grad_student',
 'ecst_csuchico',
 'apr_gmt',
 'best_offer',
 'applied_mathematic',
 'joseph_askew',
 'original_poster',
 'cal_poly',
 'distribution_usa',
 'law_enforcement',
 'wall_street',
 'national_guard',
 'armored_vehicle',
 'nntp_post',
 'po_andrew',
 'engineering_carnegie',
 'mellon_pittsburgh',
 'freshman_electrical',
 'distribution_usa',
 'religious_war',
 'microsoft_corp',
 'gay_bash',
 'bob_sarver',
 'nntp_post',
 'distribution_usa',
 'nntp_post',
 'distribution_world',
 'disclaimers_apply',
 'voice_fax',
 'nec_fg',
 'ati_graphic',
 'nntp_post',
 'host_enterpoop',
 'gerard_odriscoll',
 'subwindow_mode',
 'hard_drive',
 'config_sys',
 'himem_sys',
 'maxtor_lxt',
 'adaptec_scsi',
 'aspi_do',
 'new_york',
 'standard_disclaimer',
 'mvp_surprise',
 'nhl_award',
 'worst_opinion',
 'joseph_stiehm',
 'paying_attention',
 'rangers_messi',
 'nntp_post',
 'theodore_chen',
 'station_wagon',
 'blah_blah',
 'nntp_post',
 'california_berkeley',
 'garnet_berkeley',
 'nntp_po

 'nazi_collaboration',
 'aryan_race',
 'delco_electronic',
 'electronics_kokomo',
 'chinese_restaurant',
 'msg_sensitivity',
 'steve_dyer',
 'original_poster',
 'intellectually_dishonest',
 'consulting_cambridge',
 'harvard_rayssd',
 'linus_spdcc',
 'blah_blah',
 'amino_acid',
 'essential_amino',
 'brain_re',
 'dianne_murray',
 'read_olney',
 'peer_review',
 'mothers_milk',
 'intra_ventricular',
 'peer_reviewed',
 'nntp_post',
 'henry_spencer',
 'alaska_fairbank',
 'michael_adam',
 'launch_pad',
 'moonbase_race',
 'doug_mohney',
 'sounding_rocket',
 'nntp_post',
 'california_institute',
 'dumbest_automotive',
 'nntp_post',
 'public_domain',
 'husc_harvard',
 'newsreader_tin',
 'nntp_post',
 'incredible_hulk',
 'marvel_comic',
 'mint_condition',
 'near_mint',
 'alpha_flight',
 'new_mutant',
 'silver_surfer',
 'tuesday_april',
 'star_trek',
 'appearance_sabretooth',
 'circa_ufl',
 'green_goblin',
 'grey_hulk',
 'hulk_st',
 'keown_art',
 'liefeld_bagged',
 'miller_art',
 'omega_man',
 'ro

 'nntp_post',
 'express_access',
 'online_communication',
 'davis_nicoll',
 'nntp_post',
 'jewish_baseball',
 'yasser_arafat',
 'syrias_expansion',
 'adam_shostack',
 'aiken_computation',
 'john_perry',
 'lab_harvard',
 'sexual_favor',
 'wouldnt_waste',
 'southern_lebanon',
 'gulf_war',
 'jordan_river',
 'syrian_troop',
 'israels_occupation',
 'negotiating_table',
 'nntp_post',
 'nntp_posting',
 'apr_gmt',
 'plus_minus',
 'london_ontario',
 'western_ontario',
 'bob_gainey',
 'uwo_ca',
 'newsgroups_rec',
 'sport_hockey',
 'date_fri',
 'netcom_online',
 'communication_service',
 'nntp_post',
 'opinions_express',
 'space_shuttle',
 'dillon_pyron',
 'dseg_ti',
 'host_skndiv',
 'lewisville_vax',
 'sender_unless',
 'ti_dseg',
 'gestures_padi',
 'home_texan',
 'robin_hood',
 'computer_science',
 'std_disclaimer',
 'cookamunga_tourist',
 'kent_sandvik',
 'alink_ksand',
 'cheers_kent',
 'private_activitie',
 'saving_grace',
 'mohammad_razi',
 'david_weisberger',
 'saratoga_spring',
 'skidmore_c

 'david_sternlight',
 'dsi_uscrpac',
 'omissions_except',
 'wiretap_chip',
 'escrow_agencie',
 'hard_core',
 'hackers_spook',
 'derek_atkin',
 'deposit_box',
 'nazi_germany',
 'distribution_usa',
 'standard_disclaimer',
 'rodney_k',
 'says_moop',
 'ted_frank',
 'police_officer',
 'koppenhoefer_kyle',
 'refused_submit',
 'nntp_post',
 'last_night',
 'carnegie_mellon',
 'pittsburgh_pa',
 'po_andrew',
 'software_engineer',
 'doug_gilmour',
 'cherry_coach',
 'administrative_comput',
 'nntp_post',
 'colorado_boulder',
 'spot_colorado',
 'san_francisco',
 'dod_ama',
 'marco_seirio',
 'nntp_post',
 'distribution_world',
 'top_ten',
 'nntp_post',
 'opinions_express',
 'heat_shock',
 'barbecued_food',
 'health_risk',
 'nntp_post',
 'computer_science',
 'newsreader_tin',
 'cs_dept',
 'press_conference',
 'springfield_indian',
 'math_comp',
 'online_communication',
 'san_diego',
 'high_tech',
 'distribution_world',
 'new_york',
 'non_existent',
 'extermination_ohanus',
 'longer_exist',
 'might_se

In [28]:
id2word[0]

'addition'

In [29]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('addition', 1),
  ('anyone', 2),
  ('body', 1),
  ('bricklin', 1),
  ('bring', 1),
  ('call', 1),
  ('car', 5),
  ('could', 1),
  ('day', 1),
  ('door', 2),
  ('early', 1),
  ('engine', 1),
  ('enlighten', 1),
  ('front_bumper', 1),
  ('funky', 1),
  ('history', 1),
  ('host', 1),
  ('info', 1),
  ('know', 1),
  ('late', 1),
  ('lerxst', 1),
  ('line', 1),
  ('look', 2),
  ('mail', 1),
  ('make', 1),
  ('maryland_college', 1),
  ('model', 1),
  ('name', 1),
  ('neighborhood', 1),
  ('nntp_post', 1),
  ('organization', 1),
  ('park', 1),
  ('production', 1),
  ('rac_wam', 1),
  ('really', 1),
  ('rest', 1),
  ('s', 1),
  ('see', 1),
  ('separate', 1),
  ('small', 1),
  ('spec', 1),
  ('sport', 1),
  ('tellme', 1),
  ('thank', 1),
  ('thing', 1),
  ('umd', 1),
  ('university', 1),
  ('whatev', 1),
  ('where', 1),
  ('wonder', 1),
  ('year', 1)]]

In [30]:
%%time
# This cell takes about 4 minutes to run on my machine.  -j

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=7, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

CPU times: user 2min 47s, sys: 15.5 s, total: 3min 3s
Wall time: 2min


In [31]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.017*"key" + 0.016*"file" + 0.011*"use" + 0.011*"system" + '
  '0.010*"information" + 0.010*"code" + 0.008*"program" + 0.008*"available" + '
  '0.007*"bit" + 0.006*"display"'),
 (1,
  '0.011*"space" + 0.006*"information" + 0.006*"chip" + 0.005*"also" + '
  '0.005*"include" + 0.005*"research" + 0.004*"may" + 0.004*"science" + '
  '0.004*"system" + 0.004*"reference"'),
 (2,
  '0.358*"ax" + 0.026*"max" + 0.014*"window" + 0.009*"card" + 0.005*"color" + '
  '0.005*"graphic" + 0.004*"drive" + 0.004*"version" + 0.004*"board" + '
  '0.004*"server"'),
 (3,
  '0.028*"game" + 0.027*"team" + 0.015*"player" + 0.014*"win" + 0.013*"play" + '
  '0.012*"hockey" + 0.011*"year" + 0.010*"season" + 0.008*"nhl" + '
  '0.007*"goal"'),
 (4,
  '0.015*"say" + 0.013*"not" + 0.011*"people" + 0.010*"would" + 0.007*"think" '
  '+ 0.007*"make" + 0.007*"believe" + 0.007*"christian" + 0.007*"do" + '
  '0.006*"god"'),
 (5,
  '0.013*"armenian" + 0.011*"government" + 0.011*"gun" + 0.008*"people" + '
  '0.008*"st

In [32]:
%%time

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
print('\n')


Perplexity:  -8.666552095303281

Coherence Score:  0.6099308718182977


CPU times: user 14.4 s, sys: 1.98 s, total: 16.4 s
Wall time: 20.7 s


In [33]:
%%time

# Visualize the topics
#
# If you get an error like this: "pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. 
# A future version of pandas will change to not sort by default."
#
# then from the command line do: "pip install pandas==0.21.0"

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)  # sort=False ? sort=True
vis
print('\n')



CPU times: user 7.35 s, sys: 1.74 s, total: 9.09 s
Wall time: 8.9 s


In [34]:
#import pandas as pd
#pd.__version__

vis