# Seeds exploration

The idea is to expand the set of words of interest: not all the suggested words are present in the corpus, and some of them have very low frequencies. This is a problem when trying to guide the topic modelling around the concepts of interest, because these words appear in few documents. Expanding the set should help us discover better topics (and, in general, perform a better analysis).

In [1]:
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append("..")

from pprint import pprint

from src.dataset import Dataset

In [2]:
# Load the tokenised corpus, restricted to the Illinois Supreme Court.
# NOTE(review): year=None presumably means "no year filter" -- confirm in
# Dataset.load_dataset. `tokens` is later passed to check_presence, which
# iterates it document by document.
dataset = Dataset()
tokens = dataset.load_dataset(year=None, 
                              tokens=True, 
                              courts={"Illinois Supreme Court"})

In [3]:
from collections import defaultdict
def check_presence(narcotics, weapons, investigation, filtered_tokens):
    """Report in how many documents each word of interest occurs.

    Args:
        narcotics: Iterable of words (list, set, ...).
        weapons: Iterable of words (list, set, ...).
        investigation: Iterable of words (list, set, ...).
        filtered_tokens: Iterable of documents, each an iterable of tokens.

    Returns:
        A pair ``(sorted_words, not_found)``: ``sorted_words`` is a list of
        ``(document_frequency, word)`` tuples in ascending order, and
        ``not_found`` is the set of words that occur in no document.

    Both results are also printed.
    """
    # union() accepts any iterable, so callers may pass sets as well as lists.
    wanted = set(narcotics).union(weapons, investigation)

    freq = defaultdict(int)
    for doc in filtered_tokens:
        # intersection() de-duplicates the document's tokens, so a word is
        # counted at most once per document (document frequency).
        for word in wanted.intersection(doc):
            freq[word] += 1

    not_found = wanted.difference(freq)

    sorted_words = sorted((count, word) for word, count in freq.items())
    print(', '.join(str(pair) for pair in sorted_words))
    print(f"*** Not found ({len(not_found)} words): ***")
    # Sorted so that the printed list is reproducible across runs.
    print(', '.join(sorted(not_found)))

    return sorted_words, not_found

We keep both the singular and plural forms of words because the pre-processing phase can produce either form, depending on the context (we use spaCy).

In [4]:
# Seed words for the three concepts of interest. Singular and plural forms are
# both listed where relevant (e.g. 'drug' / 'drugs').
narcotics = ['cannabis', 'cocaine', 'methamphetamine', 'drugs', 'drug', 'marijuana',
             'ecstasy', 'lsd', 'ketamine', 'heroin', 'fentanyl', 'overdose']

# NOTE(review): 'blunt' is listed here, but the expansion cells query it together
# with "drug" and "marijuana", so its drug-related neighbours are added to the
# weapons expansion -- confirm which category is intended.
weapons = ['gun', 'knife', 'weapon', 'firearm', 'rifle', 'carbine', 'shotgun', 'handgun',
           'revolver', 'musket', 'pistol', 'derringer', 'assault', 'sword', 'blunt']

investigation = ['gang', 'mafia', 'serial', 'killer', 'rape', 'theft', 'recidivism',
                 'arrest', 'robbery', 'cybercrime', 'cyber', 'crime']

As we can see, many words don't appear very frequently in the collection, and some don't appear at all; if we want to drive the topic modelling process towards the topics of interest, we need to expand these sets with more frequent words.

In [5]:
# Document frequency of every seed word in the corpus; the cell output below is
# the printed report followed by the returned (sorted_words, not_found) pair.
check_presence(narcotics, weapons, investigation, tokens)

(1, 'cyber'), (1, 'ketamine'), (2, 'musket'), (5, 'mafia'), (8, 'derringer'), (14, 'drugs'), (18, 'methamphetamine'), (20, 'carbine'), (20, 'overdose'), (29, 'lsd'), (32, 'recidivism'), (100, 'sword'), (138, 'killer'), (142, 'cannabis'), (158, 'serial'), (203, 'blunt'), (204, 'handgun'), (205, 'rifle'), (265, 'shotgun'), (288, 'marijuana'), (290, 'heroin'), (320, 'cocaine'), (359, 'firearm'), (392, 'gang'), (721, 'pistol'), (773, 'knife'), (923, 'revolver'), (975, 'theft'), (1043, 'rape'), (1475, 'weapon'), (1611, 'drug'), (1941, 'gun'), (2053, 'assault'), (2279, 'robbery'), (5971, 'arrest'), (6662, 'crime')
*** Not found (3 words): ***
cybercrime, fentanyl, ecstasy


([(1, 'cyber'),
  (1, 'ketamine'),
  (2, 'musket'),
  (5, 'mafia'),
  (8, 'derringer'),
  (14, 'drugs'),
  (18, 'methamphetamine'),
  (20, 'carbine'),
  (20, 'overdose'),
  (29, 'lsd'),
  (32, 'recidivism'),
  (100, 'sword'),
  (138, 'killer'),
  (142, 'cannabis'),
  (158, 'serial'),
  (203, 'blunt'),
  (204, 'handgun'),
  (205, 'rifle'),
  (265, 'shotgun'),
  (288, 'marijuana'),
  (290, 'heroin'),
  (320, 'cocaine'),
  (359, 'firearm'),
  (392, 'gang'),
  (721, 'pistol'),
  (773, 'knife'),
  (923, 'revolver'),
  (975, 'theft'),
  (1043, 'rape'),
  (1475, 'weapon'),
  (1611, 'drug'),
  (1941, 'gun'),
  (2053, 'assault'),
  (2279, 'robbery'),
  (5971, 'arrest'),
  (6662, 'crime')],
 {'cybercrime', 'ecstasy', 'fentanyl'})

In [6]:
# Seed words reported as missing from the corpus by check_presence.
not_found_in_collection = {'cybercrime', 'ecstasy', 'fentanyl'}

# Every seed word, regardless of category.
interesting_set = {*narcotics, *weapons, *investigation}

## GoogleNews word embeddings

Using word embeddings, it should be possible to find the words that are used in the same context as our words of interest, so that we can expand the seed set with different words that are used in the same context.

In [7]:
from gensim import models

# Load the pre-trained GoogleNews word2vec vectors (binary format) from disk.
w = models.KeyedVectors.load_word2vec_format(
    '../data/models/GoogleNews-vectors-negative300.bin', binary=True)

In [8]:
# Sanity check: the five nearest neighbours of a single seed word.
w.most_similar(positive=['cocaine'], topn=5)

[('heroin', 0.8294118046760559),
 ('crack_cocaine', 0.8008098006248474),
 ('methamphetamine', 0.7232441306114197),
 ('narcotics', 0.707099974155426),
 ('methamphetamines', 0.7007291316986084)]

In [9]:
top_words = 5

# Ambiguous seeds are queried together with context words that pin down the
# intended sense; every other seed is queried on its own.
disambiguation = {
    "blunt": ["drug", "marijuana"],  # on its own "blunt" rarely refers to drugs
    "serial": ["killer"],            # too generic by itself
}

for word in interesting_set:
    similar_words = w.most_similar(
        positive=[word, *disambiguation.get(word, [])], topn=top_words)
    listing = ' - '.join(
        f'{neighbour}, {round(similarity, 2)}' for neighbour, similarity in similar_words)
    print(f"*** {word} ***:\n {listing}")

*** mafia ***:
 mafias, 0.71 - Mafia, 0.68 - gangsters, 0.67 - underworld, 0.66 - mafioso, 0.66
*** methamphetamine ***:
 meth, 0.9 - methamphetamines, 0.86 - crystal_methamphetamine, 0.8 - Methamphetamine, 0.79 - meth_amphetamine, 0.76
*** serial ***:
 ########_##XX, 0.64 - killers, 0.6 - murderer, 0.57 - serial_killer, 0.56 - Serial, 0.55
*** recidivism ***:
 reoffending, 0.73 - reducing_recidivism, 0.7 - reduce_recidivism, 0.69 - Recidivism, 0.69 - reconviction, 0.64
*** assault ***:
 assaults, 0.72 - assualt, 0.71 - assaulting, 0.65 - asault, 0.65 - aggravated_assault, 0.6
*** rifle ***:
 pistol, 0.84 - rifles, 0.8 - assault_rifle, 0.75 - .##_caliber, 0.73 - .##_caliber_rifle, 0.71
*** ketamine ***:
 Ketamine, 0.78 - MDMA, 0.65 - ketamine_hydrochloride, 0.65 - Ecstasy, 0.61 - horse_tranquillizer, 0.61
*** marijuana ***:
 cannabis, 0.81 - Marijuana, 0.79 - marijauna, 0.76 - marihuana, 0.75 - medicinal_marijuana, 0.74
*** drugs ***:
 drug, 0.85 - prescription_drugs, 0.69 - medication

In [10]:
top_words = 100

def addWordsToSet(wordSet, similar_words, lemmatize=False):
    """Add the lower-cased component words of each similar word to ``wordSet``.

    Args:
        wordSet: Set updated in place.
        similar_words: Iterable of ``(word, score)`` pairs; only the word is
            used. Multi-word entries are joined by underscores
            (e.g. ``crack_cocaine``) and are split into their components.
        lemmatize: When true, each component is run through the module-level
            spaCy pipeline ``nlp`` (which must be defined before the call) and
            replaced by the space-joined lemmas of its tokens.

    Returns:
        None; ``wordSet`` is modified in place.
    """
    for similar_word in similar_words:
        # Skip the empty fragments produced by leading, trailing or doubled
        # underscores, so that '' never ends up in the set.
        parts = [part.lower() for part in similar_word[0].split("_") if part]
        if lemmatize:
            # spaCy may split one fragment into several tokens (e.g. "9mm"),
            # in which case the lemmas are joined back with spaces.
            parts = [" ".join(token.lemma_ for token in nlp(part)) for part in parts]
        wordSet.update(part for part in parts if part)


Get top n words for each word of interest. This is our starting point in order to expand the seeds.

In [11]:
new_narcotics, new_weapons, new_investigation = set(), set(), set()

# Context words added to the query for seeds that are ambiguous on their own.
disambiguation = {
    "blunt": ["drug", "marijuana"],  # on its own "blunt" rarely refers to drugs
    "serial": ["killer"],            # too generic by itself
}

# Checked in this order: a seed feeds the first category that lists it.
expansion_targets = (
    (narcotics, new_narcotics),
    (weapons, new_weapons),
    (investigation, new_investigation),
)

for word in interesting_set:
    similar_words = w.most_similar(
        positive=[word, *disambiguation.get(word, [])], topn=top_words)

    for seeds, expansion in expansion_targets:
        if word in seeds:
            addWordsToSet(expansion, similar_words)
            break

# Show the size of each expansion and a sample of at most 50 words.
print(f"*** New narcotics ({len(new_narcotics)} words): ***",
      ', '.join(list(new_narcotics)[:50]), sep="\n")
print(f"*** New weapons ({len(new_weapons)} words): ***",
      ', '.join(list(new_weapons)[:50]), sep="\n")
print(f"*** New investigation ({len(new_investigation)} words) ***:",
      ', '.join(list(new_investigation)[:50]), sep="\n")

*** New narcotics (489 words): ***
epileptic, divinorum, effects, salts, shabu, pain, pentazocine, xr, effet, analgesic, grown, resin, zugsberger, nimetazepam, methodone, pills, subutex, cytochrome, powerful, juana, citrate, pgi2, toprol, antiretrovirals, deoxy, containing, kwaso, hydrochloride, khat, seroquel, naltrexone, apcalis, zopiclone, labs, illicit, herbal, pipes, accolate, piperazines, poospatuck, cabergoline, prescription, vendita, legalizing, hepatitis, phenobarbital, tussionex, mdma, highly, fatal
*** New weapons (570 words): ***
bersa, divinorum, burlgary, peacocks, kitchen, dragon, salts, kel, swordsmen, endangerment, potent, iiia, ##cm, maces, robbery, forcible, affray, methodone, automatic, ninja, blender, longbows, aggrevated, juana, hattori, crowbar, submachine, breastworks, pistol, scissors, sawed, mm, sniper, wea, weaponry, gravity, samurai, rifles, #.##x##mm, &, unloaded, spear, brandished, sword, illicit, semiauto, herbal, tommy, sks, prescription
*** New investig

In [12]:
# Check which of the expansion candidates (not lemmatized) occur in the corpus.
found_google, not_found_google = check_presence(list(new_narcotics), list(new_weapons), list(new_investigation), tokens)

(1, 'absconder'), (1, 'aces'), (1, 'affiliations'), (1, 'ambien'), (1, 'amitriptyline'), (1, 'amphetamines'), (1, 'antidepression'), (1, 'arrrest'), (1, 'assualt'), (1, 'assult'), (1, 'attacks'), (1, 'bali'), (1, 'balls'), (1, 'boots'), (1, 'botched'), (1, 'bredel'), (1, 'bullying'), (1, 'calabrian'), (1, 'chainsaw'), (1, 'chucks'), (1, 'chucky'), (1, 'cleared'), (1, 'clonazepam'), (1, 'concealed'), (1, 'cooks'), (1, 'criminals'), (1, 'crip'), (1, 'crooks'), (1, 'cyber'), (1, 'dextromethorphan'), (1, 'diazepam'), (1, 'effects'), (1, 'epic'), (1, 'familia'), (1, 'felons'), (1, 'flintlock'), (1, 'foxen'), (1, 'genovese'), (1, 'grasso'), (1, 'harasser'), (1, 'hcl'), (1, 'herion'), (1, 'hilltop'), (1, 'homocide'), (1, 'howitzer'), (1, 'id'), (1, 'indictments'), (1, 'insider'), (1, 'ketamine'), (1, 'kiddie'), (1, 'killings'), (1, 'krishna'), (1, 'langner'), (1, 'larcency'), (1, 'legalizing'), (1, 'lorazepam'), (1, 'manhunt'), (1, 'meow'), (1, 'mice'), (1, 'milly'), (1, 'minie'), (1, 'minimi

Not all the words from GoogleNews are present in our collection, partly because we use spaCy for lemmatization during the pre-processing phase. 
While spaCy isn't designed to lemmatize single words out of context, lemmatizing the candidates should still be better than using them directly without any kind of pre-processing.

In [13]:
import spacy
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner']) # just keep tagger for lemmatization

new_narcotics_lemma, new_weapons_lemma, new_investigation_lemma = set(), set(), set()

# Context words added to the query for seeds that are ambiguous on their own.
disambiguation = {
    "blunt": ["drug", "marijuana"],  # on its own "blunt" rarely refers to drugs
    "serial": ["killer"],            # too generic by itself
}

# Checked in this order: a seed feeds the first category that lists it.
lemma_targets = (
    (narcotics, new_narcotics_lemma),
    (weapons, new_weapons_lemma),
    (investigation, new_investigation_lemma),
)

for word in interesting_set:
    similar_words = w.most_similar(
        positive=[word, *disambiguation.get(word, [])], topn=top_words)

    for seeds, expansion in lemma_targets:
        if word in seeds:
            addWordsToSet(expansion, similar_words, lemmatize=True)
            break

# Show the size of each lemmatized expansion and a sample of at most 50 words.
print(f"*** New narcotics ({len(new_narcotics_lemma)} words): ***",
      ', '.join(list(new_narcotics_lemma)[:50]), sep="\n")
print(f"*** New weapons ({len(new_weapons_lemma)} words): ***",
      ', '.join(list(new_weapons_lemma)[:50]), sep="\n")
print(f"*** New investigation ({len(new_investigation_lemma)} words) ***:",
      ', '.join(list(new_investigation_lemma)[:50]), sep="\n")


*** New narcotics (456 words): ***
epileptic, divinorum, shabu, pain, pentazocine, xr, effet, analgesic, resin, zugsberger, nimetazepam, methodone, subutex, lace, cytochrome, powerful, juana, citrate, pgi2, toprol, deoxy, traffic, kwaso, hydrochloride, khat, seroquel, naltrexone, gan, zopiclone, illicit, herbal, accolate, med, poospatuck, cabergoline, prescription, vendita, hepatitis, phenobarbital, tussionex, mdma, highly, fatal, alia, hydroxybutyrate, hydroxy, pot, drown, legalize, midazolam
*** New weapons (505 words): ***
9 mm, divinorum, burlgary, bersa, kitchen, dragon, kel, endangerment, potent, hang, iiia, robbery, forcible, affray, resist, methodone, automatic, ninja, blender, fire, lace, juana, hattori, crowbar, . # # #, submachine, pistol, mm, sniper, wea, ar # #, weaponry, gravity, samurai, rifleman, &, unloaded, unlicense, spear, sword, illicit, semiauto, herbal, tommy, prescription, keyblade, m9, fix, gauge, pen
*** New investigation (897 words) ***:
burglarize, 9 mm, ext

In [14]:
# Same presence check, this time on the lemmatized expansion candidates.
found_google_lemma, not_found_google_lemma = check_presence(list(new_narcotics_lemma), list(new_weapons_lemma), list(new_investigation_lemma), tokens)

(1, 'absconder'), (1, 'ambien'), (1, 'amitriptyline'), (1, 'antidepression'), (1, 'antiretroviral'), (1, 'arrrest'), (1, 'assualt'), (1, 'assult'), (1, 'bali'), (1, 'batterer'), (1, 'blunderbuss'), (1, 'breastplate'), (1, 'bredel'), (1, 'bugle'), (1, 'bungle'), (1, 'calabrian'), (1, 'chainsaw'), (1, 'chucky'), (1, 'clonazepam'), (1, 'crip'), (1, 'cyber'), (1, 'dextromethorphan'), (1, 'diazepam'), (1, 'dug'), (1, 'epic'), (1, 'familia'), (1, 'flintlock'), (1, 'foxen'), (1, 'gangbanger'), (1, 'gatle'), (1, 'genovese'), (1, 'grasso'), (1, 'harasser'), (1, 'hcl'), (1, 'herion'), (1, 'hilltop'), (1, 'homocide'), (1, 'howitzer'), (1, 'insider'), (1, 'ketamine'), (1, 'kiddie'), (1, 'krishna'), (1, 'langner'), (1, 'larcency'), (1, 'lorazepam'), (1, 'manhunt'), (1, 'meow'), (1, 'milly'), (1, 'minie'), (1, 'minimi'), (1, 'mmj'), (1, 'molestor'), (1, 'neonaticide'), (1, 'nonlethal'), (1, 'norinco'), (1, 'nostra'), (1, 'painkiller'), (1, 'paxil'), (1, 'phenytoin'), (1, 'polydrug'), (1, 'profiler')

In [15]:
# Candidates missing from the corpus in either the raw or the lemmatized run.
never_found = not_found_google | not_found_google_lemma
print(f"*** Never found ({len(never_found)} words): ***", never_found, sep="\n")

*** Never found (903 words): ***
{'bersa', 'scams', 'gangmember', 'swordsmen', 'backpacker', 'phisher', 'rascals', '##cm', 'effet', 'rostov', 'convienence', 'nimetazepam', 'methodone', 'rearresting', 'combating', 'stickups', 'subutex', 'phishe', 'cytochrome', 'supects', 'crips', 'hattori', 'arrrested', 'yob', 'breastworks', 'trafficante', 'antiretrovirals', 'adware', 'wuornos', 'ndrangheta', 'sureños', 'khat', 'naltrexone', 'banff', '#.##x##mm', 'probationers', 'zakone', 'brandished', 'trendlab', 'zopiclone', 'herbal', 'accolate', 'poospatuck', 'cabergoline', 'keyblade', 'm9', 'conscripting', 'cyberfraud', 'birthrate', 'bloods', 'nuestra', 'evonitz', 'housebreakings', 'jailed', 'tussionex', 'pistols', 'waives', 'lucchese', 'riflemen', 'parabellum', 'tita', 'razorblade', 'flints', 'neighborhoods', 'pleads', 'methcathinone', 'benzodiazepine', 'clans', 'cyberdefense', 'patches', 'lightsabres', 'meltabs', 'twins', 'housebreaks', 'automatics', 'ecstasy', 'midmost', 'digoxin', 'camorra', 'pr

We now have both the words and their lemmas: we take their union and subtract from the resulting sets the words that don't appear in our collection.

In [30]:
# Words that never occur in the corpus: expansion candidates that check_presence
# did not find (raw and lemmatized), plus the original seeds already known to be
# missing. Without the latter, an absent seed (e.g. 'fentanyl') would stay in
# the expanded set unless it happened to reappear among the candidates.
absent_words = not_found_google | not_found_google_lemma | not_found_in_collection

expanded_narcotics = (set(narcotics) | new_narcotics | new_narcotics_lemma) - absent_words
expanded_weapons = (set(weapons) | new_weapons | new_weapons_lemma) - absent_words
expanded_investigation = (set(investigation) | new_investigation | new_investigation_lemma) - absent_words

In [32]:
# Print each expanded seed set in full, preceded by its size.
print(f"*** Expanded narcotics ({len(expanded_narcotics)} words): ***",
      ', '.join(expanded_narcotics), sep="\n")
print(f"*** Expanded weapons ({len(expanded_weapons)} words): ***",
      ', '.join(expanded_weapons), sep="\n")
print(f"*** Expanded investigation ({len(expanded_investigation)} words) ***:",
      ', '.join(expanded_investigation), sep="\n")

*** Expanded narcotics (297 words): ***
codeine, cocktail, amitriptyline, epileptic, effects, possess, dextromethorphan, antiretroviral, baggie, lethal, stimulants, acute, addict, pain, marijuana, crack, alter, pipe, steroid, methyl, barbituate, drug, tylenol, coroner, tranquilizer, synthetic, overdose, analgesic, grown, resin, dilaudid, depressant, hypnotic, pills, skunk, xanax, marihuana, stimulant, smoke, cooks, co, arrest, surgical, lace, opium, diazepam, mushroom, hallucinogen, powerful, juana, citrate, generic, control, methadone, newer, diethylamide, pharmaceutical, pill, librium, vicodin, toxicity, mrs, e, inject, maker, antianxiety, unprescribed, methaqualone, containing, traffic, fit, patricia, hydrochloride, cr, muscle, plant, mari, prescribe, ultram, seroquel, cardiac, mescaline, medical, prescribed, gan, ketamine, decriminalize, antidepression, precursor, addiction, hcl, narcotic, seconal, pfizer, suspect, od, effect, labs, illicit, ban, meth, accidentally, pcp, pipes, ver