In [6]:
import pandas as pd
import pickle
import requests
import re

## Load Dataset

In [3]:
# Load the merged court-case table from a local pickle and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code embedded in the file —
# only load pickles that come from a trusted source.
merged_df = pd.read_pickle("saves/merged_df.pkl")
merged_df.head()

Unnamed: 0,id,verdict_date,publication_date,verdict_type,jurisdiction_type,inhoudsindicatie,case text
0,ECLI-NL-RBNNE-2021-5018,2021-01-10,2021-11-23,uitspraak,['Strafrecht'],"Productie synthetische drugs, medeplegen, prod...",\n\nRECHTBANK NOORD-NEDERLAND\nAfdeling strafr...
1,ECLI-NL-RBZUT-2003-AH9598,2003-03-06,2003-09-07,uitspraak,['Strafrecht'],Leveren grondstoffen synthetische drugs en sto...,\n\nRECHTBANK ZUTPHEN\nMeervoudige economische...
2,ECLI-NL-RBZWB-2020-2646,2020-06-23,2020-06-23,uitspraak,['Strafrecht'],plegen van voorbereidingshandelingen ten behoe...,\n\nRECHTBANK ZEELAND-WEST-BRABANT\n\nStrafrec...
3,ECLI-NL-GHAMS-2019-1601,2019-08-05,2019-07-26,uitspraak,['Strafrecht'],medeplegen witwassen - medeplegen voorhanden h...,\n\nafdeling strafrecht\nparketnummer: 23-0017...
4,ECLI-NL-GHAMS-2019-1602,2019-08-05,2019-07-26,uitspraak,['Strafrecht'],medeplegen witwassen - medeplegen voorhanden h...,\n\nafdeling strafrecht\nparketnummer: 23-0017...


## Split into sentences

In [4]:
def split_into_sentences(text):
    """Split ``text`` into sentence-like chunks.

    A boundary is either a newline, or a run of spaces that follows
    ``.``/``?`` (with a non-uppercase character two positions before the
    punctuation mark) and precedes an uppercase ASCII letter. Chunks of
    length 0 or 1 are discarded.

    Args:
        text: The string to split.

    Returns:
        A list of the remaining chunks, in their original order.
    """
    boundary = r'(?<=[^A-Z].[.?]) +(?=[A-Z])|\n'
    kept = []
    for chunk in re.split(boundary, text):
        # Drop empty pieces and single stray characters.
        if len(chunk) <= 1:
            continue
        kept.append(chunk)
    return kept

In [7]:
# Tokenise every case text: first into sentences, then each sentence into
# lower-cased, space-separated words.
sentence_list_by_word = []  # flat: one list of word tokens per sentence
sentence_list = []          # nested: one list of sentence strings per document

# Iterate over the column directly; merged_df.iloc[i]['case text'] would build
# a full row Series for every document just to read a single field.
for case_text in merged_df['case text']:
    doc_sentences = split_into_sentences(case_text)
    sentence_list.append(doc_sentences)
    for sentence in doc_sentences:
        # NOTE: only '.' is removed, so other punctuation stays attached to
        # the token (e.g. 'xtc,' and 'xtc)' are distinct from 'xtc').
        tokens = [tok for tok in sentence.lower().rstrip().replace('.', '').split(' ') if len(tok) > 0]
        sentence_list_by_word.append(tokens)

# Total number of word tokens across all sentences.
wordlen = sum(len(tokens) for tokens in sentence_list_by_word)

print(f"Sentence count: {len(sentence_list_by_word)}")
print(f"Word count: {wordlen}")

Sentence count: 5301875
Word count: 88748423


## Create Word2Vec model

In [8]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

In [9]:
# Train a Word2Vec model on the tokenised sentences and write it to disk.
# Settings: 100-dimensional vectors, context window of 5 tokens, 4 worker
# threads. min_count=1 keeps every token in the vocabulary, however rare.
dutch_word2vec_model = Word2Vec(sentences=sentence_list_by_word, vector_size=100, window=5, min_count=1, workers=4)
dutch_word2vec_model.save("word2vec_dutch_court_cases.model")

In [10]:
# Load the saved model from disk (the same path the training cell writes to).
dutch_word2vec_model = Word2Vec.load("word2vec_dutch_court_cases.model")

In [19]:
# Look up the 100 vocabulary entries most similar to 'xtc' and print only the
# words, dropping the similarity score from each (word, score) pair.
sims = dutch_word2vec_model.wv.most_similar('xtc', topn=100)
print([neighbour for neighbour, _similarity in sims])


['mdma', 'ketamine', 'lsd', 'xtc-pillen', 'speed', 'amfetamine', 'heroïne', 'methadon', 'ghb', 'amfetaminen', 'xtc-tabletten', 'mdma)', 'diazepam', 'morfine', 'ecstasy', 'oxazepam', 'fenacetine', 'amfetamine)', 'marihuana', 'steroïden', 'pillen', 'xtc)', 'metamfetamine', '(mdma)', 'ritalin', 'cocaïne', 'anabole', 'xtc,', 'methamfetamine', 'cannabis', 'xtc/mdma', '2c-b', 'temazepam', 'kamagra', 'paracetamol', 'methylfenidaat', '5793857)', 'lidocaïne', 'crack', 'tabletten', 'pillen)', 'heroïne)', 'a-pvp', 'crystal', '(speed)', 'amfetamine;', 'tabletten)', 'xtc-', '(amfetamine)', 'mdma;', 'mdma-', 'procaïne', 'cocaine', 'pmma', 'cafeïne', 'wiet', '(xtc)', 'harddrugs', '(cocaïne', 'mdma-pillen', 'oxycodon', 'heroïne;', 'coke', 'pep', 'hasj', 'pure', 'cocaïne)', '(met)amfetamine', 'gbl', 'weed', 'mefedron', 'sildenafil', 'mcpp', 'inositol', 'mdma-poeder', 'viagra', 'mdma,', 'versnijdingsmiddel', 'hash', 'heroine', 'speed)', '(speed))', 'capsules', 'opiaten', 'base', '(mdma', 'mdma/xtc', '34

## Get drug names for Google Trends

In [26]:
def create_word2vec_relevant_words(words, matches, model=None, topn=100):
    """Return words that are near neighbours of more than ``matches`` seed words.

    For every seed in ``words`` the ``topn`` most similar vocabulary entries
    are looked up. A candidate is kept when it appears in the neighbour lists
    strictly more than ``matches`` times.

    Args:
        words: Iterable of seed words to look up in the model.
        matches: Threshold; a candidate is kept only if its count is
            greater than (not equal to) this value.
        model: Object exposing ``.wv.most_similar(word, topn=...)``. When
            omitted, the notebook-level ``dutch_word2vec_model`` is used.
        topn: Number of nearest neighbours fetched per seed word.

    Returns:
        A list of unique candidate words, in the order they were first
        encountered (deterministic across runs).

    Raises:
        Propagates whatever ``most_similar`` raises for a seed that is not
        in the vocabulary (gensim raises ``KeyError``).
    """
    if model is None:
        model = dutch_word2vec_model

    # Count occurrences in a single pass instead of calling list.count()
    # once per element, which is quadratic in the number of neighbours.
    neighbour_counts = {}
    for word in words:
        for neighbour, _similarity in model.wv.most_similar(word, topn=topn):
            neighbour_counts[neighbour] = neighbour_counts.get(neighbour, 0) + 1

    # dict preserves insertion order, so the result order is stable, unlike
    # the arbitrary ordering obtained by round-tripping through a set.
    return [neighbour for neighbour, count in neighbour_counts.items() if count > matches]

In [60]:
# Seed terms whose nearest-neighbour lists are combined to find drug names.
list_of_drugs = ['xtc', 'mdma', 'cocaine', 'wiet', 'speed']

# The helper keeps words whose count is strictly greater than the threshold,
# so 4 with five seeds means a word must be a neighbour of all five seeds.
word2vec_drug_list = create_word2vec_relevant_words(list_of_drugs, 4)

print(len(word2vec_drug_list))
print(word2vec_drug_list)

20
['cocaïne', 'hasj', 'methadon', 'pillen', 'mdma-pillen', 'marihuana', 'xtc-tabletten', 'paracetamol', 'hashish', 'heroïne', 'fenacetine', 'xtc-pillen', 'cocaïne;', 'amfetaminen', 'ghb', 'ketamine', 'amfetamine', 'lsd', 'mdma-poeder', 'opium']
