In [None]:
import csv
import json
import os
import unicodedata

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from nltk.corpus import stopwords
# Base URL of the rechtspraak.nl document viewer; it ends in "id=", so
# presumably a case identifier is appended to it -- TODO confirm at the call site.
complete_case_url = 'https://uitspraken.rechtspraak.nl/inziendocument?id='

# Dutch stop-word list from the NLTK corpus; used further down to strip
# stop words from the article bodies.
dutch_stopwords = stopwords.words("dutch")

### Import Data

In [1]:
# Load the NOS article dump and turn its "datetime" column into real
# timestamps so it can be resampled by month later on.
nos_df = (
    pd.read_csv('./NOS articles/dutch-news-articles.csv')
    .assign(datetime=lambda articles: pd.to_datetime(articles["datetime"]))
)

NameError: name 'pd' is not defined

In [None]:
# Distinct values found in the "category" column.
article_categories = nos_df["category"].unique()
print(article_categories)

In [None]:
# Wide default canvas for the figures drawn from here on.
plt.rcParams["figure.figsize"] = (20,5)

# Count articles per timestamp, then sum those counts into month-sized bins.
articles_per_timestamp = nos_df["datetime"].value_counts()
nos_date_plot = articles_per_timestamp.resample("1M").sum()

plt.title("Monthly article count of NOS articles.")
plt.xlabel('Month')
# The final monthly bin is left out of the line.
plt.plot(nos_date_plot.iloc[:-1])
plt.ylabel('Article count')

### Clean Data

In [None]:
# Corpus size before cleaning: number of articles, then the total number of
# whitespace-separated tokens over all article bodies.
article_total = len(nos_df)
print(f"{article_total} articles")

word_count = sum(len(body.split()) for body in nos_df["content"])
print(word_count)

In [None]:
# Testing membership in a list scans the whole list for every token; a
# frozenset gives the same answer with constant-time lookups, which matters
# because this runs once per word of the entire corpus.
stopword_set = frozenset(dutch_stopwords)


def _drop_stopwords(text):
    """Return `text` with every whitespace-separated token that is a stop word removed.

    Tokens are compared exactly as they appear (case-sensitive, punctuation
    attached) and the survivors are re-joined with single spaces.
    NOTE(review): a capitalised stop word is kept unless the stop-word list
    itself contains that capitalised form -- confirm this is intended.
    """
    return ' '.join(token for token in text.split() if token not in stopword_set)


# Work on a copy so the raw articles in `nos_df` stay untouched.
cleaned_nos_df = nos_df.copy()
cleaned_nos_df['content'] = cleaned_nos_df['content'].apply(_drop_stopwords)


In [None]:
# Total number of whitespace-separated tokens left after stop-word removal.
cleaned_word_count = sum(
    len(body.split()) for body in cleaned_nos_df["content"]
)
print(cleaned_word_count)

In [None]:
# Report how many tokens the stop-word filter removed, in absolute numbers
# and as a percentage of the original token count.
removed_words = word_count - cleaned_word_count
print(f"Original count: {word_count} \n Cleaned count: {cleaned_word_count} \n Difference: {removed_words}")

percent_removed = round(removed_words / word_count * 100, 2)
print(f"{percent_removed}% decrease")


### Filter For Drugs 

In [None]:
print(f"Original NOS dataframe contains {len(cleaned_nos_df)} articles.")

# An article is kept when its (stop-word-stripped) body contains any of these
# terms as a case-sensitive substring.
drugs_words = ["drugs", "narcotica", "verdovende middelen"]

# Build one boolean mask and select with it instead of growing a frame row by
# row: DataFrame.append was removed in pandas 2.0, was quadratic, and dropped
# the column dtypes. `.astype(bool)` keeps the mask valid when there are no rows.
mentions_drugs = cleaned_nos_df["content"].apply(
    lambda text: any(term in text for term in drugs_words)
).astype(bool)

# `.copy()` makes the subset an independent frame, so later cells can add
# columns to it without touching `cleaned_nos_df`.
nos_drugs_df = cleaned_nos_df[mentions_drugs].copy()

print(f"Filtered for drugs: {len(nos_drugs_df)} articles")

### Extract Country

In [None]:
# Country names to search for: the names from the CSV plus extra cities,
# regions and adjectives that also identify a country.
countries_df = pd.read_csv('dutch-countries.csv', encoding = "ISO-8859-1")
countries_list = list(countries_df["Countries"])
countries_list.extend(["Londen", "Belgisch", "Rio", "Russische", "Colorado", "Saudi-Arabië", "Manchester", "Groot-Brittannië", "Britse", "Deense", "Mexicaanse", "Braziliaanse", "Surinaamse", "Amerika", "Franse", "Spaanse", "Trinidad", "Amerikaanse", "Los Angeles", "Italiaanse", "Trinidad en Tobago"])

# Search terms that are not themselves the canonical country name, mapped to
# the name that should be recorded. "Britse" is included here: it is in the
# search list above but was missing from the old alias check (which only knew
# "Brits"), so British articles were labelled "Britse".
country_aliases = {
    "Amerika": "Verenigde Staten",
    "Amerikaanse": "Verenigde Staten",
    "Los Angeles": "Verenigde Staten",
    "Colorado": "Verenigde Staten",
    "Franse": "Frankrijk",
    "Spaanse": "Spanje",
    "Italiaanse": "Italië",
    "Trinidad": "Trinidad en Tobago",
    "Mexicaanse": "Mexico",
    "Surinaamse": "Suriname",
    "Londen": "Verenigd Koninkrijk",
    "Brits": "Verenigd Koninkrijk",
    "Britse": "Verenigd Koninkrijk",
    "Groot-Brittannië": "Verenigd Koninkrijk",
    "Manchester": "Verenigd Koninkrijk",
    "Braziliaanse": "Brazilië",
    "Rio": "Brazilië",
    "Deense": "Denemarken",
    "Belgisch": "België",
    "Russische": "Rusland",
}


def _detect_country(content, category):
    """Return the country label for one article.

    Articles outside the "Buitenland" category are labelled "Nederland".
    For "Buitenland" articles the label is the search term that occurs
    earliest in the text (ties broken alphabetically by the tuple
    comparison), translated through `country_aliases`; "unknown" is returned
    when no term occurs at all. Matching is a plain substring test, so a
    short term can also match inside a longer word.
    """
    if category != "Buitenland":
        return "Nederland"

    # NFC-normalise so a decomposed "e" + combining diaeresis compares equal
    # to the precomposed "ë" used in the country names. This generalises the
    # previous single-character replace on "ë".
    text = unicodedata.normalize("NFC", content)

    hits = [(text.find(name), name) for name in countries_list if name in text]
    if not hits:
        return "unknown"

    first_hit = min(hits)[1]
    return country_aliases.get(first_hit, first_hit)


country = [
    _detect_country(article["content"], article["category"])
    for _, article in nos_drugs_df.iterrows()
]

nos_drugs_df["country"] = country

### Filter For Trafficking

In [None]:
print(f"Original article count: {len(nos_df)} \n Filtered for drugs: {len(nos_drugs_df)}")

# A drug article counts as trafficking-related when its body contains any of
# these terms as a case-sensitive substring (so "import" also matches inside
# longer words).
trafficking_words = ["smokkel", "transport", "invoer", "import", "export", "uitvoer"]

# Select with one boolean mask instead of appending rows in a loop:
# DataFrame.append was removed in pandas 2.0, was quadratic, and dropped the
# column dtypes. `.astype(bool)` keeps the mask valid when there are no rows.
mentions_trafficking = nos_drugs_df["content"].apply(
    lambda text: any(term in text for term in trafficking_words)
).astype(bool)

# Independent copy, because a later cell adds a cluster column to this frame.
nos_trafficking_df = nos_drugs_df[mentions_trafficking].copy()

print(f"Filtered for trafficking:  {len(nos_trafficking_df)} articles.")

In [None]:
# Distinct country labels among the trafficking articles, ordered by how many
# articles carry each label (value_counts order).
trafficking_countries = nos_trafficking_df['country'].value_counts().index.tolist()
print(len(trafficking_countries))
trafficking_countries


### TF-IDF / K-Means

Prepare data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.decomposition import PCA

# Number of documents going into the TF-IDF matrix.
n_trafficking_articles = len(nos_trafficking_df)
print(n_trafficking_articles)

documents = nos_trafficking_df["content"]

# Plain TF-IDF with lower-casing. A stricter variant (max_df=0.20, min_df=10,
# Dutch + English stop words) and a float32 cast of X are left disabled.
vectorizer = TfidfVectorizer(lowercase=True, use_idf=True)

# Sparse document-term matrix: one row per article.
X = vectorizer.fit_transform(documents)
print(X)

Elbow curve

In [None]:
# Fit K-means for every k in [2, k_range) and record the inertia (the
# `inertia_` attribute of the fitted model) to look for an elbow.
distorsions = []
k_range = 20
candidate_ks = list(range(2, k_range))

for k in candidate_ks:
    # random_state makes the curve reproducible across kernel restarts;
    # n_init is pinned because its default differs between scikit-learn versions.
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=200, n_init=10, random_state=42)
    # Only the fitted model is needed here, not the per-document labels.
    kmeans.fit(X)
    distorsions.append(kmeans.inertia_)

fig = plt.figure(figsize=(15, 5))
plt.plot(candidate_ks, distorsions)
plt.grid(True)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow curve')

Create K-means plot

In [None]:
# Number of clusters for the final model.
# NOTE(review): presumably chosen from the elbow curve -- confirm.
true_k = 15

# random_state makes the cluster assignment reproducible; n_init is pinned
# because its default differs between scikit-learn versions.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=200, n_init=10, random_state=42)

labels = model.fit_predict(X)
nos_trafficking_df["K means cluster"] = labels

# PCA needs a dense array, so the sparse TF-IDF matrix is densified before
# being projected onto its first two principal components for plotting.
X_array = X.toarray()
pca = PCA(n_components=2).fit(X_array)
XPCA = pca.transform(X_array)

# `np` comes from the notebook's import cell (previously missing, which made
# this cell fail with a NameError on a fresh kernel).
u_labels = np.unique(labels)
for cluster_id in u_labels:
    in_cluster = labels == cluster_id
    plt.scatter(XPCA[in_cluster, 0], XPCA[in_cluster, 1], label=cluster_id)
plt.legend()
plt.title('K-means clusters (PCA projection)')
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.show()

Top Clusters:

In [None]:
print("Top terms per cluster:")

# For every cluster, feature indices sorted from highest to lowest centroid weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# How many of the highest-weighted terms to list per cluster.
top_n_terms = 19

# One column per cluster id, rows ordered from most to least characteristic term.
terms_df = pd.DataFrame({
    cluster_id: [terms[ind] for ind in order_centroids[cluster_id, :top_n_terms]]
    for cluster_id in range(true_k)
})

print(terms_df)

# Number of articles assigned to each cluster.
print(nos_trafficking_df["K means cluster"].value_counts())