# Seeds exploration

The idea is to expand the set of words of interest: not all the suggested words are present in the corpus, and some of them have very low frequencies. This is a problem when trying to guide the topic modelling around the concepts of interest, because these words appear in only a few documents. Expanding the set should help us discover better topics (and, in general, perform a better analysis).

In [1]:
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append("..")

from pprint import pprint
import itertools

from src.dataset import Dataset

import spacy
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner']) # just keep tagger for lemmatization

In [2]:
# Instantiate the project dataset loader and fetch only the "tokens" field,
# restricted to the Illinois Appellate Court. year=None passes no specific
# year (presumably "all years" -- confirm in src.dataset).
dataset = Dataset()
tokens = dataset.load_dataset(
    year=None,
    fields={"tokens"},
    courts={"Illinois Appellate Court"},
)

In [3]:
from collections import defaultdict
def check_corpus_presence(narcotics, weapons, investigation, filtered_tokens, topN=100, notFoundN=100):
    """Report how many documents contain each seed word and which seeds never occur.

    The three seed sets are merged and every document of ``filtered_tokens``
    is scanned once. A word is counted at most once per document, so the
    reported number is a document frequency, not a raw occurrence count.
    The input sets are not modified.

    Args:
        narcotics, weapons, investigation: sets of seed words.
        filtered_tokens: iterable of documents, each an iterable of tokens.
        topN: how many ``(count, word)`` pairs to print, starting from the
            LEAST frequent word; ``None`` prints all of them.
        notFoundN: how many missing words to print (in alphabetical order,
            so the preview is reproducible); ``None`` prints all of them.

    Returns:
        A tuple ``(sorted_words, not_found)``: ``sorted_words`` is the list of
        ``(document_count, word)`` pairs in ascending order for the words that
        occur in at least one document, ``not_found`` is the set of seed words
        that occur in no document.
    """
    candidates = weapons.union(investigation).union(narcotics)

    # Intersecting with the document de-duplicates its tokens, so each word
    # contributes at most 1 per document.
    freq = defaultdict(int)
    for doc in filtered_tokens:
        for word in candidates.intersection(doc):
            freq[word] += 1

    # Seeds that never received a count are absent from the corpus.
    not_found = candidates.difference(freq)
    sorted_words = sorted((count, word) for word, count in freq.items())

    # Compare against None explicitly so that a limit of 0 means "print
    # nothing" instead of being mistaken for "no limit".
    shown = sorted_words if topN is None else sorted_words[:topN]
    print(', '.join(str(pair) for pair in shown))

    print(f"*** Not found ({len(not_found)} words): ***")
    missing = sorted(not_found)  # fixed order: set iteration order varies between runs
    print(', '.join(missing if notFoundN is None else missing[:notFoundN]))

    return sorted_words, not_found

### Initial sets
We keep both the singular and the plural form of some words because the pre-processing phase (lemmatization with spaCy) can produce either form, depending on the context.

In [4]:
# Hand-picked seed words for the three topics of interest. Each literal is a
# set, so every word must appear only once.
narcotics = {'cannabis', 'cocaine', 'methamphetamine', 'drugs', 'drug', 'marijuana',
             'ecstasy', 'lsd', 'ketamine', 'heroin', 'fentanyl', 'overdose', 'blunt'}

weapons = {'gun', 'knife', 'weapon', 'firearm', 'rifle', 'carbine', 'shotgun', 'handgun',
           'revolver', 'musket', 'pistol', 'derringer', 'assault', 'sword'}

investigation = {'gang', 'mafia', 'serial', 'killer', 'rape', 'theft', 'recidivism',
                 'arrest', 'robbery', 'cybercrime', 'cyber', 'crime'}

As we can see, many words appear rarely in the collection, and some do not appear at all; if we want to drive the topic modelling process towards the topics of interest, we need to expand these sets with more frequent words.

In [5]:
# Print, for every initial seed word, the number of documents that contain it,
# followed by the seeds that never occur in the corpus (return value unused).
_ = check_corpus_presence(narcotics, weapons, investigation, tokens)

(2, 'ketamine'), (3, 'musket'), (4, 'cyber'), (5, 'fentanyl'), (9, 'ecstasy'), (40, 'carbine'), (41, 'mafia'), (61, 'derringer'), (114, 'methamphetamine'), (119, 'drugs'), (167, 'recidivism'), (191, 'lsd'), (194, 'overdose'), (253, 'sword'), (384, 'killer'), (495, 'serial'), (556, 'blunt'), (691, 'rifle'), (1165, 'heroin'), (1294, 'shotgun'), (1319, 'handgun'), (1376, 'cannabis'), (1607, 'gang'), (1717, 'marijuana'), (1839, 'revolver'), (1970, 'pistol'), (2080, 'firearm'), (2355, 'cocaine'), (2736, 'rape'), (2772, 'knife'), (4948, 'theft'), (5426, 'assault'), (6223, 'weapon'), (6591, 'robbery'), (7116, 'drug'), (7271, 'gun'), (16487, 'crime'), (17005, 'arrest')
*** Not found (1 words): ***
cybercrime


In [6]:
# NOTE(review): this set is hard-coded. The captured output of the previous
# cell lists only 'cybercrime' as missing ('ecstasy' and 'fentanyl' occur in a
# few documents), and the name is not used again in this notebook -- confirm
# whether it is still needed/accurate.
not_found_in_collection = {'ecstasy', 'cybercrime', 'fentanyl'}

# All seed words of the three topics merged into a single set.
interesting_set = narcotics.union(weapons).union(investigation)

## GoogleNews word embeddings

Using word embeddings, it should be possible to find the words that are used in the same context as our words of interest, so that we can expand the seed set with different words that are used in the same context.

In [7]:
from gensim import models

# Load the GoogleNews word2vec vectors from the local binary file
# (binary=True selects the binary word2vec format).
w = models.KeyedVectors.load_word2vec_format(
    '../data/models/GoogleNews-vectors-negative300.bin', binary=True)

In [8]:
# Quick check: the 5 words most similar to 'cocaine', with their similarity scores.
w.most_similar(positive=['cocaine'], topn=5)

[('heroin', 0.8294118046760559),
 ('crack_cocaine', 0.8008098006248474),
 ('methamphetamine', 0.7232441306114197),
 ('narcotics', 0.707099974155426),
 ('methamphetamines', 0.7007291316986084)]

In [9]:
top_words = 5

# Seeds that are too generic on their own are queried together with context
# words: "blunt" usually does not refer to drugs, "serial" is vague by itself.
disambiguation = {"blunt": ["drug", "marijuana"], "serial": ["killer"]}

for word in interesting_set:
    query = [word] + disambiguation.get(word, [])
    similar_words = w.most_similar(positive=query, topn=top_words)
    # One "<word>, <score rounded to 2 decimals>" entry per neighbour.
    neighbours = ' - '.join(f'{neighbour}, {round(score, 2)}' for neighbour, score in similar_words)
    print(f"*** {word} ***:\n {neighbours}")

*** serial ***:
 ########_##XX, 0.64 - killers, 0.6 - murderer, 0.57 - serial_killer, 0.56 - Serial, 0.55
*** mafia ***:
 mafias, 0.71 - Mafia, 0.68 - gangsters, 0.67 - underworld, 0.66 - mafioso, 0.66
*** cocaine ***:
 heroin, 0.83 - crack_cocaine, 0.8 - methamphetamine, 0.72 - narcotics, 0.71 - methamphetamines, 0.7
*** knife ***:
 kitchen_knife, 0.81 - knives, 0.77 - steak_knife, 0.75 - machete, 0.74 - carving_knife, 0.74
*** killer ***:
 killers, 0.76 - murderer, 0.66 - Killer, 0.64 - serial_killer, 0.6 - rapist, 0.57
*** blunt ***:
 drugs, 0.7 - cocaine, 0.66 - methamphetamine, 0.65 - cannabis, 0.65 - narcotics, 0.64
*** heroin ***:
 cocaine, 0.83 - Heroin, 0.77 - crack_cocaine, 0.74 - narcotics, 0.73 - methamphetamine, 0.72
*** cybercrime ***:
 Cybercrime, 0.76 - cybercrimes, 0.75 - cyber, 0.73 - cyber_criminals, 0.65 - identity_theft, 0.63
*** handgun ***:
 semiautomatic_handgun, 0.82 - .##_caliber_handgun, 0.82 - 9mm_handgun, 0.82 - gun, 0.81 - 9_mm_handgun, 0.81
*** lsd ***:
 

Some of the returned words are compound words: they would normally be separated by whitespace, but the GoogleNews vectors join their parts with an underscore so that each compound is a single vocabulary entry. Since we expand the seed sets using single words only, we need to split these compounds into their individual words.

In [10]:
top_words = 100  # number of similar words requested per seed in the expansion cells


def addWordsToSet(wordSet, similar_words, lemmatize=False):
    """Add the single-word pieces of every similar word to ``wordSet`` in place.

    Compound entries use "_" as separator (e.g. ``crack_cocaine``); each entry
    is split on it and every piece is lower-cased before being added. Empty
    pieces, produced by leading, trailing or doubled underscores, are skipped
    so that the empty string never ends up in the set.

    Args:
        wordSet: set updated in place with the resulting words.
        similar_words: iterable of pairs whose first element is the word
            (the second element, the similarity score, is ignored).
        lemmatize: when True, each piece is processed with the module-level
            spaCy pipeline ``nlp`` and replaced by the lemmas of its tokens
            joined by a single space.

    Returns:
        None.
    """
    for similar_word in similar_words:
        # Split composite words into single ones.
        pieces = [piece.lower() for piece in similar_word[0].split("_") if piece]
        if lemmatize:
            pieces = [" ".join(token.lemma_ for token in nlp(piece)) for piece in pieces]
        wordSet.update(piece for piece in pieces if piece)


### Start expansion process
Get top n words for each word of interest. This is our starting point in order to expand the seeds.

In [11]:
# Seeds that are too generic on their own are queried together with context
# words: "blunt" usually does not refer to drugs, "serial" is vague by itself.
disambiguation = {"blunt": ["drug", "marijuana"], "serial": ["killer"]}

new_narcotics, new_weapons, new_investigation = set(), set(), set()
# (seed set, expanded set) pairs, checked in priority order:
# narcotics first, then weapons, then investigation.
expansion_targets = (
    (narcotics, new_narcotics),
    (weapons, new_weapons),
    (investigation, new_investigation),
)

for word in interesting_set:
    query = [word] + disambiguation.get(word, [])
    similar_words = w.most_similar(positive=query, topn=top_words)
    # The similar words go to the first topic whose seed set contains the word.
    for seed_set, expanded_set in expansion_targets:
        if word in seed_set:
            addWordsToSet(expanded_set, similar_words)
            break

# Show the size of each expanded set and a preview of up to 50 of its words.
narcotics_preview = ', '.join(itertools.islice(new_narcotics, 50))
print(f"*** New narcotics ({len(new_narcotics)} words): ***\n{narcotics_preview}")
weapons_preview = ', '.join(itertools.islice(new_weapons, 50))
print(f"*** New weapons ({len(new_weapons)} words): ***\n{weapons_preview}")
investigation_preview = ', '.join(itertools.islice(new_investigation, 50))
print(f"*** New investigation ({len(new_investigation)} words) ***:\n{investigation_preview}")

*** New narcotics (496 words): ***
nimetazepam, twins, dugs, tranquilizer, buy, yaba, benzodiazepine, injecting, adderal, ya, quat, viagra, klonopin, sildenafil, salvia, gamma, caine, controlled, laced, overdoses, psychostimulant, bzp, sedative, barbiturates, asphyxiation, diflucan, danny, acute, quinoline, rohypnol, barbiturate, ##mg, diamorphine, mrs, atarax, lorazepam, patricia, alliance, amitriptyline, vertex, swallowed, pfizer, nitrite, drugs, cr, wholesaler, decriminalizing, pills, stimulant, smokable
*** New weapons (485 words): ***
knight, unlicenced, guage, ##a#, asault, stabbed, cutter, sword, sticks, tresspass, keyblade, launcher, brawl, scimitar, ar##, guns, firearms, homemade, muzzleloaders, endangerment, abercrombie, nun, filleting, gunpowder, small, dagger, axes, spears, assult, ceremonial, .##, insas, silencer, luger, centerfire, assualts, sandals, submachine, blowgun, grievous, gauge, .9, winchester, dragon, grevious, chrome, brandished, thirddegree, cannonball, airsof

In [12]:
# Check the expanded (non-lemmatized) words against the corpus, keeping both
# the (document count, word) list and the set of words that never occur.
found_google, not_found_google = check_corpus_presence(new_narcotics, new_weapons, new_investigation, tokens)

(1, 'adderal'), (1, 'adderall'), (1, 'amyl'), (1, 'arrests'), (1, 'balls'), (1, 'blowgun'), (1, 'boning'), (1, 'bousfield'), (1, 'boxcutter'), (1, 'bupe'), (1, 'bupropion'), (1, 'burglaries'), (1, 'calabrian'), (1, 'cannibal'), (1, 'cannonade'), (1, 'chicano'), (1, 'chucky'), (1, 'convienence'), (1, 'counterterrorism'), (1, 'crips'), (1, 'darko'), (1, 'defilement'), (1, 'dentention'), (1, 'dextromethorphan'), (1, 'diflucan'), (1, 'dundon'), (1, 'duragesic'), (1, 'enactors'), (1, 'enzymes'), (1, 'escaper'), (1, 'exacto'), (1, 'fondler'), (1, 'gadi'), (1, 'ghb'), (1, 'harasser'), (1, 'hattori'), (1, 'hells'), (1, 'homocide'), (1, 'howitzer'), (1, 'hydromorphone'), (1, 'jailbreaker'), (1, 'jou'), (1, 'katana'), (1, 'khat'), (1, 'killers'), (1, 'killings'), (1, 'larcency'), (1, 'levaquin'), (1, 'longbow'), (1, 'lorcet'), (1, 'lortab'), (1, 'machinegun'), (1, 'mafias'), (1, 'mafioso'), (1, 'malware'), (1, 'matchlock'), (1, 'midazolam'), (1, 'mightier'), (1, 'minie'), (1, 'minimums'), (1, 'm

Not all the GoogleNews words are present in our collection, partly because the pre-processing phase lemmatizes the documents with spaCy. 
Although spaCy is not designed to lemmatize isolated words, lemmatizing the candidates should still match the corpus better than using the raw words without any kind of pre-processing.

In [13]:
# Seeds that are too generic on their own are queried together with context
# words: "blunt" usually does not refer to drugs, "serial" is vague by itself.
disambiguation = {"blunt": ["drug", "marijuana"], "serial": ["killer"]}

new_narcotics_lemma, new_weapons_lemma, new_investigation_lemma = set(), set(), set()
# (seed set, lemmatized expanded set) pairs, checked in priority order:
# narcotics first, then weapons, then investigation.
lemma_targets = (
    (narcotics, new_narcotics_lemma),
    (weapons, new_weapons_lemma),
    (investigation, new_investigation_lemma),
)

for word in interesting_set:
    query = [word] + disambiguation.get(word, [])
    similar_words = w.most_similar(positive=query, topn=top_words)
    # The lemmatized similar words go to the first topic whose seed set contains the word.
    for seed_set, expanded_set in lemma_targets:
        if word in seed_set:
            addWordsToSet(expanded_set, similar_words, True)
            break

# Show the size of each lemmatized set and a preview of up to 50 of its words.
narcotics_preview = ', '.join(itertools.islice(new_narcotics_lemma, 50))
print(f"*** New narcotics ({len(new_narcotics_lemma)} words): ***\n{narcotics_preview}")
weapons_preview = ', '.join(itertools.islice(new_weapons_lemma, 50))
print(f"*** New weapons ({len(new_weapons_lemma)} words): ***\n{weapons_preview}")
investigation_preview = ', '.join(itertools.islice(new_investigation_lemma, 50))
print(f"*** New investigation ({len(new_investigation_lemma)} words) ***:\n{investigation_preview}")


*** New narcotics (462 words): ***
nimetazepam, tranquilizer, buy, contain, yaba, benzodiazepine, salt, meltab, gan, adderal, ya, quat, viagra, lace, klonopin, sildenafil, salvia, gamma, caine, psychostimulant, bzp, sedative, asphyxiation, twin, diflucan, danny, acute, quinoline, barbiturate, rohypnol, diamorphine, medicine, mrs, atarax, lorazepam, patricia, alliance, amitriptyline, vertex, dreampharmaceutical, pfizer, nitrite, cr, wholesaler, stimulant, smokable, delirium, uncut, tablet, pot
*** New weapons (428 words): ***
occasion, volley, knight, guage, # .##mm, asault, bloodstaine, cutter, power, sword, tresspass, keyblade, liberty, launcher, brawl, scimitar, carve, mighty, homemade, endangerment, abercrombie, nun, gunpowder, small, peacock, utter, dagger, assult, use, ceremonial, silencer, luger, centerfire, submachine, blowgun, grievous, threaten, gauge, .9, winchester, dragon, grevious, chrome, thirddegree, cannonball, airsoft, &, enactment, fire, assualt
*** New investigation 

In [14]:
# Same corpus check as for the raw expansion, applied to the lemmatized words.
found_google_lemma, not_found_google_lemma = check_corpus_presence(new_narcotics_lemma, new_weapons_lemma, new_investigation_lemma, tokens)

(1, 'adderal'), (1, 'adderall'), (1, 'amyl'), (1, 'antiretroviral'), (1, 'areste'), (1, 'blowgun'), (1, 'bousfield'), (1, 'boxcutter'), (1, 'bupe'), (1, 'bupropion'), (1, 'calabrian'), (1, 'cannibal'), (1, 'cannonade'), (1, 'chicano'), (1, 'chucky'), (1, 'conscript'), (1, 'convienence'), (1, 'counterterrorism'), (1, 'darko'), (1, 'defilement'), (1, 'dentention'), (1, 'dextromethorphan'), (1, 'diflucan'), (1, 'dundon'), (1, 'duragesic'), (1, 'escaper'), (1, 'exacto'), (1, 'fondler'), (1, 'gadi'), (1, 'gangmember'), (1, 'ghb'), (1, 'harasser'), (1, 'hattori'), (1, 'hells'), (1, 'homocide'), (1, 'housebreaking'), (1, 'howitzer'), (1, 'hydromorphone'), (1, 'insa'), (1, 'jailbreaker'), (1, 'jou'), (1, 'katana'), (1, 'khat'), (1, 'larcency'), (1, 'levaquin'), (1, 'longbow'), (1, 'lorcet'), (1, 'lortab'), (1, 'machinegun'), (1, 'mafioso'), (1, 'malware'), (1, 'matchlock'), (1, 'midazolam'), (1, 'minie'), (1, 'mongol'), (1, 'musketry'), (1, 'naltrexone'), (1, 'neapolitan'), (1, 'nostra'), (1, 

In [15]:
# Candidate words (raw or lemmatized) that do not occur in any document.
never_found = not_found_google | not_found_google_lemma
never_found_preview = ', '.join(itertools.islice(never_found, 100))
print(f"*** Never found ({len(never_found)} words): ***\n{never_found_preview}")

*** Never found (713 words): ***
dons, nimetazepam, snatchings, milly, driveby, rearrests, dugs, # .##mm, breakins, ##a#, sicko, investigtion, asault, yaba, confick, meltab, larcenies, reoffended, keyblade, sildenafil, gangsterism, scimitar, overdoses, ar##, psychostimulant, bzp, barbiturates, muzzleloaders, schiebel, ramrattan, antivirus, tita, salvatrucha, hemraj, quinoline, filleting, ##mg, diamorphine, birthrate, rapper, castlecops, vory, assult, nupur, cyberattacks, pleads, yobbery, matricide, insas, bredel, .##, ecrime, castlecop, centerfire, gangrape, dreampharmaceutical, assualts, sandals, swallowed, conwoman, caseloads, btk, cyberattacker, gratteri, nitrite, .9, cyberthreat, rascals, housebreaks, recidivists, pects, petersilia, grevious, robs, thirddegree, cyberweapon, threatscape, airsoft, &, norte'o, muzzleloading, decriminalizing, zakone, smokable, aarushi, carjacker, cannabinoids, breadknife, hanzo, cnp, cybertheft, ransomware, mandiant, redcoats, palladone, vendita, m1, h

### Combine and filter results
We now have both the raw words and their lemmas: we take their union and remove from the resulting sets the words that never appear in our collection.

In [16]:
# Words that were proposed by either expansion but never occur in the corpus.
absent_from_corpus = not_found_google | not_found_google_lemma

# Original seeds + raw neighbours + lemmatized neighbours, minus the absent words.
expanded_narcotics = (set(narcotics) | new_narcotics | new_narcotics_lemma) - absent_from_corpus
expanded_weapons = (set(weapons) | new_weapons | new_weapons_lemma) - absent_from_corpus
expanded_investigation = (set(investigation) | new_investigation | new_investigation_lemma) - absent_from_corpus

In [17]:
# Show the size of each expanded set and a preview of up to 50 of its words.
narcotics_preview = ', '.join(itertools.islice(expanded_narcotics, 50))
print(f"*** Expanded narcotics ({len(expanded_narcotics)} words): ***\n{narcotics_preview}")
weapons_preview = ', '.join(itertools.islice(expanded_weapons, 50))
print(f"*** Expanded weapons ({len(expanded_weapons)} words): ***\n{weapons_preview}")
investigation_preview = ', '.join(itertools.islice(expanded_investigation, 50))
print(f"*** Expanded investigation ({len(expanded_investigation)} words) ***:\n{investigation_preview}")

*** Expanded narcotics (370 words): ***
twins, tranquilizer, buy, contain, benzodiazepine, injecting, salt, gan, adderal, ya, quat, viagra, lace, klonopin, salvia, gamma, caine, controlled, laced, sedative, asphyxiation, twin, diflucan, danny, acute, rohypnol, barbiturate, medicine, mrs, atarax, lorazepam, patricia, alliance, amitriptyline, vertex, pfizer, drugs, cr, wholesaler, pills, stimulant, delirium, uncut, pot, overdose, tablet, phencyclidine, cardiac, pseudoephedrine, substance
*** Expanded weapons (411 words): ***
occasion, volley, knight, unlicenced, guage, stabbed, bloodstaine, cutter, power, sword, sticks, tresspass, liberty, launcher, brawl, carve, guns, mighty, firearms, homemade, endangerment, abercrombie, nun, gunpowder, small, peacock, utter, dagger, axes, spears, ceremonial, use, silencer, luger, submachine, blowgun, grievous, threaten, gauge, winchester, dragon, chrome, brandished, cannonball, enactment, fire, damocles, assualt, bersa, sexual
*** Expanded investigati

### Cleaning of not related words
Because of the underscore-splitting process (from compound words to single words, e.g. 'new_amphetamine' becomes 'new' and 'amphetamine'; while 'amphetamine' is still a word of interest, 'new' is not), and because not all the GoogleNews words are really meaningful for our topics of interest, a manual cleaning process was necessary: we went through the expanded seeds word by word, checking whether the meaning of each word was related to the corresponding topic of interest.
At the end of this process, 3 sets have been created, containing the words that were found during the expansion phase but are not related to the topics of interest; these sets must be subtracted from the expanded sets in order to get the final seeds.

In [18]:
# Manually curated words judged unrelated to the narcotics topic; they are
# subtracted from expanded_narcotics to obtain the final narcotics seeds.
not_narcotics = {'suspect', 'medicine', 'gans', 'lab', 'dispensary', 'pain', 'sleeping', 'mouse', 'alliance', 'asphyxia', 'cheap', 'allergic', 'mrs', 
                 'arrest', 'sleep', 'drowning', 'vertex', 'contain', 'labs', 'heart', 'acute', 'buy', 'magic', 'pneumonia', 'usa', 'control', 'swallow', 
                 'abstracting', 'residue', 'treat', 'accidental', 'allergy', 'medications', 'dug', 'drown', 'cooks', 'nonfatal', 'quaid', 'grow', 'c', 
                 'salt', 'highly', 'traffic', 'medicinal', 'dope', 'decriminalize', 'medication', 'manufacture', 'reservation', 'precursor', 'plc', 'cook', 
                 'e', 'hepatitis', 'euphoric', 'boot', 'med', 'lace', 'lethal', 'twin', 'accidentally', 'generic', 'coroner', 'malarial', 'restraint', 'ba', 
                 'controlled', 'ultram', 'powerful', 'fit', 'mood', 'danny', 'illicit', 'toxicity', 'effects', 'commonly', 'his', 'quoad', 'n', 'legalize', 
                 'parkinson', 'fatal', 'amy', 'muscle', 'banning', 'sedentary', 'induce', 'electricity', 'patch', 'horse', 'smell', 'reaction', 'sophisticated', 
                 'ingest', 'cardiac', 'mice', 'bath', 'swallowed', 'anti', 'legalizing', 'altering', 'mmj', 'abstract', 'imitation', 'tar', 'unk', 'maker', 
                 'fatally', 'induced', 'medi', 'cocktail', 'surgical', 'grown', 'effect', 'patricia', 'co', 'ban', 'quantity', 'ya', 'clandestine', 'entertainer',
                 'new', 'animal', 'chemicals', 'storefront', 'pseudo', 'cr', 'containing', 'alias', 'newer', 'boots', 'alter', 'delirium', 'asphyxiation', 'gamma',
                 'wholesaler', 'bousfield', 'gastroscopy', 'ingesting', 'lod', 'patches', 'unichem'}
# Manually curated words judged unrelated to the weapons topic; they are
# subtracted from expanded_weapons to obtain the final weapons seeds.
not_weapons = {'addictive', 'bloodstaine', 'grievous', 'model', 'millimeter', 'instrument', 'plate', 'flame', 'shoplift', 'm', 'disorderly', 'kitchen', 
               'pastry', 'blender', 'plated', 'drug', 'pons', 'small', 'rape', 'power', 'battery', 'criminal', 'uttering', 'arrest', 'attempt', 'bread', 
               'bone', 'bar', 'cannabis', 'maggard', 'inch', 'fish', 'bloodstained', 'nonlethal', 'hang', 'blacksmith', 'unregistere', 'chuck', 'reckless',
               'control', 'polydrug', 'grevious', 'revolutionary', 'chucks', 'utter', 'chrome', 'using', 'crystal', 'treat', 'fillet', 'grab', 'tablets', 
               'lewd', 'banta', 'harassment', 'fix', 'heroin', 'sandal', 'opiate', 'unlicensed', 'semi', 'gravity', 'cause', 'article', 'salt', 'carrying',
               'recklessly', 'narcotic', 'highly', 'bodily', 'petit', 'dope', 'medicinal', 'prize', 'decriminalize', 'toothbrush', 'carving', 'resist',
               'tablet', 'threats', 'vicodin', 'pen', 'felonious', 'occasion', 'crow', 'liberties', 'inflict', 'wield', 'affray', 'automatic', 'charge', 
               'machine', 'acto', 'x', 'harm', 'light', 'concealed', 'lace', 'forcible', 'stimulant', 'springfield', 'prosecutor', 'cocaine', 
               'methamphetamine', 'contin', 'custodial', 'powered', 'altercation', 'trespassing', 'mm', 'theft', 'load', 'attempe', 'controlled', 'butcher',
               'attack', 'illicit', 'nun', 'steroid', 'brawl', 'possessing', 'double', 'facsimile', 'threat', 'nosed', 'legalize', 'mischief', 'enactment',
               'officer', 'liberty', 'fantasy', 'caleb', 'paring', 'short', 'meat', 'green', 'crack', 'tactical', 'incident', 'juana', 'fatal', 'unlawful',
               'vicious', 'intoxication', 'on', 'marihuana', 're', 'squad', 'banning', 'carry', 'civil', 'peacock', 'nine', 'peeler', 'inflicting', 'stick',
               'synthetic', 'pon', 'leather', 'drugs', 'trespass', 'conceal', 'ceremonial', 'conduct', 'cannabi', 'encampment', 'cultivation', 'flick', 
               'confinement', 'aggravated', 'homemade', 'cock', 'h', 'epic', 'painkiller', 'bath', 'amphetamines', 'flash', 'vegetable', 'sorcery', 'first', 
               'ornamental', 'shoplifting', 'sexual', 'scissor', 'mallet', 'fife', 'unlicence', 'retractable', 'malicious', 'imitation', 'amphetamine', 
               'wrench', 'substances', 'steak', 'mighty', 'substance', 'burglary', 'centimeter', 'brittany', 'serpent', 'stanley', 'narcotics', 'causing', 
               'endangerment', 'addiction', 'sig', 'unlawfully', 'marijuana', 'bugle', 'ons', 'balls', 'forge', 'assualt', 'domestic', 'felony', 'resisting', 
               'curve', 'ball', 'unlicense', 'pare', 'stimulants', 'methadone', 'caine', 'ban', 'concealable', 'class', 'co', 'deadly', 'meth', 'prescription', 
               'sault', 'murder', 'allegedly', 'unprovoked', 'indecent', 'filet', 'box', 'hallucinogen', 'potent', 'era', 'cobra', 'robbery', 'unregistered', 
               'use', 'war', 'mari', 'molestation', 'salute', 'lascivious', 'incense', 'threatening', 'threaten', 'degree', 'curved', 'phencyclidine', 
               'possess','prescribe', 'warrior', 'abercrombie', 'bowie', 'prescribing', 'misdemeanor', 'fixed', 'carve', 'varmint', 'subds', 'tresspass', 
               'xd', 'unlicenced', 'sticks', 'enactors', 'boning', 'flaming', 'scepter', 'enactor', 'prized', 'insa', 'mightier', 'premediated', 
               'terroristic', 'sawn'}
not_investigation = {'cache', 'drink', 'mask', 'scrap', 'id', 'protective', 'rest', 'surrey', 'prison', 'heart', 'brega', 'firearm', 'bar', 'arousal',
                     'gunman', 'touch', 'accuse', 'handgun', 'embezzlement', 'psychopath', 'utter', 'misdemeanants', 'name', 'shooters', 'pizza', 
                     'truancy', 'mara', 'abortion', 'relapse', 'traffic', 'hate', 'carlo', 'incest', 'concealing', 'jewelry', 'wield', 'sheet', 
                     'joshua', 'organize', 'task', 'dotson', 'prosecutor', 'parental', 'psychopathic', 'daylight', 'banger', 'magistrate', 'offenses',
                     'detaining', 'milly', 'nicola', 'attack', 'wielding', 'santos', 'parolees', 'facilities', 'grand', 'organized', 'foxen', 
                     'postpone', 'hillside', 'in', 'arpaio', 'civilly', 'whereabouts', 'hilton', 'calculating', 'proceedings', 'reoffende', 'illinois',
                     'fedex', 'anton', 'bars', 'wnt', 'markus', 'cobras', 'debit', 'mccarthy', 'attacks', 'behavior', 'reputed', 'tool', 'ralph', 
                     'subway', 'standoff', 'apts', 'bag', 'tamper', 'patronizing', 'ted', 'fake', 'felony', 'oakdale', 'brotherhood', 'ms', 'scheme',
                     'attorney', 'night', 'moat', 'unprotected', 'bully', 'mathew', 'selby', 'rates', 'bail', 'stab', 'indecent', 'cell', 'kiddie',
                     'detention', 'w', 'releasing', 'motorcycle', 'detainer', 'correctional', 'protection', 'angels', 'los', 'shipman', 'activity',
                     'supremacist', 'brother', 'armed', 'pistol', 'grasso', 'jury', 'baseline', 'white', 'mysterious', 'youthful', 'naming', 'caseload',
                     'brazen', 'uninsurance', 'reconviction', 'response', 'tampering', 'knife', 'bali', 'aces', 'botched', 'woode', 'buy', 'sandwich',
                     'solido', 'tools', 'harold', 'facility', 'mentally', 'nonviolent', 'lewd', 'ace', 'plead', 'car', 'parole', 'break', 'petit', 
                     'offense', 'safe', 'marc', 'pantry', 'similar', 'pornography', 'carlton', 'classic', 'richard', 'johnny', 'justice', 'stole', 
                     'spiking', 'spear', 'forcible', 'juvenile', 'knowledge', 'home', 'imprison', 'jigsaw', 'pink', 'repeat', 'mischief', 'bungle', 
                     'norinco', 'sam', 'petty', 'siders', 'expert', 'squad', 'scenery', 'joe', 'artist', 'prolific', 'chew', 'refuse', 'conceal', 
                     'habitual', 'ins', 'trespass', 'queen', 'advanced', 'reincarceration', 'distraction', 'postponed', 'misconduct', 'calabrian', 
                     'cop', 'prisoner', 'ram', 'imperial', 'infection', 'blue', 'jeremy', 'black', 'computer', 'handbag', 'detroit', 'chucky', 
                     'brandish', 'ice', 'incarceration', 'security', 'identity', 'mai', 'antoni', 'prisons', 'prosecution', 'misappropriation',
                     'scare', 'cobra', 'mexican', 'teen', 'notorious', 'tin', 'levi', 'walter', 'sider', 'v', 'crime', 'masked', 'dowler', 'aryan',
                     'misdemeanor', 'torrio', 'emergency', 'handcuff', 'detachment', 'human', 'debs', 'mickey', 'mandatory', 'defilement', 'instrument',
                     'agency', 'hilltop', 'drug', 'deportation', 'zodiac', 'efrain', 'cold', 'hobo', 'naples', 'spousal', 'uttering', 'dui', 'targeted',
                     'attempt', 'detain', 'exploitation', 'incarcerated', 'unregistere', 'neighborhood', 'reconvict', 'moor', 'mortality', 'harbor',
                     'meditate', 'testify', 'scared', 'hiv', 'ee', 'sheets', 'disposition', 'joanna', 'refused', 'virus', 'wedlock', 'ex', 'caller',
                     'reentry', 'statutory', 'lords', 'exposure', 'deputy', 'convict', 'confined', 'money', 'metal', 'trojan', 'operation', 'reduce',
                     'angel', 'collar', 'hospitalization', 'apt', 'rate', 'bust', 'threat', 'straight', 'custody', 'liberty', 'card', 'dr', 'vito', 
                     'purse', 'sicilian', 'guilty', 'doll', 'jail', 'serial', 'fatal', 'revocation', 'alleged', 're', 'spike', 'return', 'recognizance',
                     'variant', 'deuces', 'accused', 'capias', 'nonforcible', 'do', 'nathan', 'pregnancy', 'conviction', 'malicious', 'lover', 
                     'substance', 'misdemeanant', 'psychiatric', 'persistent', 'melbourne', 'welfare', 'syndicate', 'detainment', 'pietro', 'graffiti',
                     'crimes', 'winkler', 'raoul', 'infringement', 'infant', 'doc', 'own', 'victim', 'ar', 'street', 'krishna', 'bundy', 'calculate', 
                     'indecently', 'within', 'rested', 'forcibly', 'lynn', 'net', 'harden', 'ill', 'worm', 'neapolitan', 'path', 'carnal', 'lab', 
                     'langner', 'daytime', 'whereabout', 'tony', 'bamber', 'clear', 'odometer', 'criminal', 'arresting', 'bandit', 'birney', 'reducing',
                     'bandana', 'rascal', 'lonely', 'bredel', 'ira', 'digital', 'tire', 'uniform', 'baron', 'biker', 'oo', 'cleared', 'target', 'touching',
                     'queens', 'indictment', 'pre', 'allege', 'slashing', 'destruction', 'patronize', 'child', 'apprehend', 'crush', 'enzyme', 'chinese',
                     'charge', 'thirst', 'offend', 'policing', 'moh', 'force', 'probation', 'overcrowd', 'teodor', 'custodial', 'pect', 'prevalence',
                     'dropout', 'teenage', 'arraignment', 'gay', 'shoot', 'yorkshire', 'bike', 'u', 'insane', 'victimization', 'tiny', 'store', 'incident',
                     'baby', 'minimum', 'antisocial', 'mass', 'family', 'project', 'offender', 'la', 'capia', 'rearreste', 'waive', 'hardened', 'mad', 
                     'confine', 'converter', 'centers', 'gord', 'sting', 'domain', 'german', 'man', 'deuce', 'deliveryman', 'detection', 'release', 'police',
                     'catalytic', 'proceeding', 'dump', 'nostra', 't', 'surrender', 'apartment', 'center', 'behind', 'meth', 'bullying', 'batterer', 'felon',
                     'willful', 'birth', 'genovese', 'consensual', 'converters', 'wooded', 'overcrowded', 'unregistered', 'reconvicted', 'zeus', 'knifepoint',
                     'apprehension', 'vigilante', 'brutally', 'upstart', 'botch', 'skinhead', 'consentual', 'mongol', 'gadi', 'helburn', 'probationers', 
                     'porn', 'vijay', 'soprano', 'redeploy', 'hells', 'waives', 'brandished', 'viruses', 'worms', 'ipr', 'returned', 'poisoner', 'arrests', 
                     'darko', 'rearresting', 'stolen', 'recidivate', 'reoffending', 'jilt', 'handcuff', 'dundon', 'chews', 'brandishing', 'patriarca', 
                     'neighborhoods', 'txt', 'offenders', 'crossbow', 'saric', 'chicano', 'rostov', 'yob', 'breakin', 'evron', 'schoolgirl', 'testifies', 
                     'jails', 'jilted', 'dentention', 'minimums', 'incarcerating', 'gp', 'bungled', 'jou', 'areste', 'statuatory', 'nets', 'kuan', 'enzymes', 
                     'jailing', 'smokin', 'transnational', 'compounder', 'msg', 'convenience', 'nellessen'}

In [19]:
# Final seeds: the expanded sets without the manually discarded words.
final_narcotics = expanded_narcotics.difference(not_narcotics)
final_weapons = expanded_weapons.difference(not_weapons)
final_investigation = expanded_investigation.difference(not_investigation)

In [20]:
# Show the size of each final set and a preview of up to 50 of its words.
narcotics_preview = ', '.join(itertools.islice(final_narcotics, 50))
print(f"*** Final narcotics ({len(final_narcotics)} words): ***\n{narcotics_preview}")
weapons_preview = ', '.join(itertools.islice(final_weapons, 50))
print(f"*** Final weapons ({len(final_weapons)} words): ***\n{weapons_preview}")
investigation_preview = ', '.join(itertools.islice(final_investigation, 50))
print(f"*** Final investigation ({len(final_investigation)} words) ***:\n{investigation_preview}")

*** Final narcotics (233 words): ***
ganja, twins, oxycodone, digoxin, zyprexa, chara, tranquilizer, possessing, xr, lexapro, propoxyphene, juana, benzodiazepine, alprazolam, injecting, cultivating, crack, pentazocine, gan, herbal, adderal, valium, quat, viagra, klonopin, midazolam, salvia, caine, cultivation, percodan, laced, xanax, hydromorphone, psychotic, sedative, phenobarbital, dextromethorphan, ketamine, seconal, infuse, diflucan, scopolamine, diazepam, pharmaceutical, psychedelic, dosage, painkille, relaxer, rohypnol, barbiturate
*** Final weapons (193 words): ***
volley, knights, knight, blackhawk, guage, chainsaw, nunchuck, minimi, stabbed, warhead, cutter, sword, knives, holstere, lance, gatling, longbow, armored, launcher, thompson, holstered, nunchaku, shank, guns, firearms, loading, kel, gunpowder, breech, mace, dagger, axes, screwdriver, gage, blade, spears, gatle, weap, muzzle, holster, semiautomatic, silencer, loader, luger, hammer, crescent, submachine, razorblade, mu

In [21]:
# Print the document frequency of every final seed; topN=None and
# notFoundN=None remove the limits on the printed lists.
_ = check_corpus_presence(final_narcotics, final_weapons, final_investigation, tokens, topN=None, notFoundN=None)

(1, 'adderal'), (1, 'adderall'), (1, 'amyl'), (1, 'antiretroviral'), (1, 'blowgun'), (1, 'boxcutter'), (1, 'bupe'), (1, 'bupropion'), (1, 'burglaries'), (1, 'cannibal'), (1, 'cannonade'), (1, 'conscript'), (1, 'convienence'), (1, 'counterterrorism'), (1, 'crips'), (1, 'dextromethorphan'), (1, 'diflucan'), (1, 'duragesic'), (1, 'escaper'), (1, 'exacto'), (1, 'fondler'), (1, 'gangmember'), (1, 'ghb'), (1, 'harasser'), (1, 'hattori'), (1, 'homocide'), (1, 'housebreaking'), (1, 'howitzer'), (1, 'hydromorphone'), (1, 'jailbreaker'), (1, 'katana'), (1, 'khat'), (1, 'killers'), (1, 'killings'), (1, 'larcency'), (1, 'levaquin'), (1, 'longbow'), (1, 'lorcet'), (1, 'lortab'), (1, 'machinegun'), (1, 'mafias'), (1, 'mafioso'), (1, 'malware'), (1, 'matchlock'), (1, 'midazolam'), (1, 'minie'), (1, 'murderers'), (1, 'murdering'), (1, 'musketry'), (1, 'naltrexone'), (1, 'nunchaku'), (1, 'nunchuks'), (1, 'occasioning'), (1, 'paraphenalia'), (1, 'pawning'), (1, 'pipes'), (1, 'pistols'), (1, 'plowshare')

In [22]:
# NOTE(review): check_corpus_presence prints the words sorted by ascending
# document count and, with the default topN=100, only the first 100 of them,
# so the most frequent discarded words may not be shown -- consider topN=None.
_ = check_corpus_presence(not_narcotics, not_weapons, not_investigation, tokens)  # Check whether any discarded (borderline) word is frequent enough to be reconsidered as interesting

(1, 'areste'), (1, 'arrests'), (1, 'balls'), (1, 'boning'), (1, 'bousfield'), (1, 'calabrian'), (1, 'chicano'), (1, 'chucky'), (1, 'darko'), (1, 'defilement'), (1, 'dentention'), (1, 'dundon'), (1, 'enactors'), (1, 'enzymes'), (1, 'gadi'), (1, 'hells'), (1, 'insa'), (1, 'jou'), (1, 'mightier'), (1, 'minimums'), (1, 'mongol'), (1, 'neapolitan'), (1, 'nets'), (1, 'nostra'), (1, 'patriarca'), (1, 'poisoner'), (1, 'premediated'), (1, 'rearresting'), (1, 'redeploy'), (1, 'rostov'), (1, 'saric'), (1, 'sawn'), (1, 'schoolgirl'), (1, 'siders'), (1, 'skinhead'), (1, 'smokin'), (1, 'soprano'), (1, 'sorcery'), (1, 'statuatory'), (1, 'threats'), (1, 'transnational'), (1, 'tresspass'), (1, 'txt'), (1, 'varmint'), (1, 'waives'), (1, 'yob'), (2, 'arpaio'), (2, 'bali'), (2, 'bamber'), (2, 'brandished'), (2, 'bungled'), (2, 'chews'), (2, 'cooks'), (2, 'crossbow'), (2, 'enactor'), (2, 'ingesting'), (2, 'ipr'), (2, 'jails'), (2, 'kuan'), (2, 'langner'), (2, 'lod'), (2, 'msg'), (2, 'returned'), (2, 'scept

## Save to local memory

In [23]:
from src.seeds import Seeds

In [24]:
# Project helper (src.seeds) used below to save and reload the seed sets.
seeds = Seeds()

In [25]:
# Save every stage of the expansion: the raw word2vec neighbours, their
# lemmatized variant, the combined sets filtered against the corpus, and the
# manually cleaned final seeds (where and how they are stored is defined in src.seeds).
seeds.save_word2vec_seeds(new_narcotics, new_weapons, new_investigation)
seeds.save_word2vec_lemmatized_seeds(new_narcotics_lemma, new_weapons_lemma, new_investigation_lemma)
seeds.save_word2vec_combined_seeds(expanded_narcotics, expanded_weapons, expanded_investigation)
seeds.save_final_filtered_seeds(final_narcotics, final_weapons, final_investigation)

In [26]:
# Reload the final seeds as a sanity check and print the size of each set.
# The three values are presumably (narcotics, weapons, investigation), matching
# the order used when saving -- the printed sizes should equal the "Final ..."
# counts shown earlier.
# Descriptive names are used on purpose: a short name such as `w` would rebind
# the word2vec model loaded earlier in this notebook and break any cell that
# calls `w.most_similar` when it is run again.
saved_narcotics, saved_weapons, saved_investigation = seeds.get_final_filtered_seeds()
print(len(saved_narcotics), len(saved_weapons), len(saved_investigation))

233 193 270
