# Conspiracy Theories

A sample of texts from `r/conspiracy`

In [None]:
import pandas as pd
import numpy as np
from cytoolz import *
import spacy

pd.set_option('display.max_colwidth', 500)

In [None]:
from sklearn.feature_extraction.text import *
from sklearn.feature_extraction import *
from sklearn.decomposition import *
from sklearn.cluster import *
from sklearn.metrics import *

In [None]:
import os
print(os.listdir('../input'))

In [None]:
# Load spaCy's English pipeline once; it is reused below for tokenization
# (nlp.tokenizer) and for document vectors (nlp(text).vector).
nlp = spacy.load('en')

In [None]:
# Load the posts and keep only those whose body is longer than 250 characters.
# The explicit .copy() makes the filtered frame independent of the raw one, so
# the columns added to it later ('tokens', 'cluster', 'silhouette', 'vec') do
# not trigger pandas' SettingWithCopyWarning.
df = pd.read_csv("../input/clusterdata/clustering_data.csv")
df = df[df['body'].str.len() > 250].copy()
df.head()

In [None]:
df.shape

In [None]:
%%time

def tokenize(text):
    """Split *text* into lower-cased tokens, leaving out URL-like tokens.

    Only spaCy's tokenizer (``nlp.tokenizer``) is run, not the full pipeline.
    """
    tokens = []
    for tok in nlp.tokenizer(text):
        if tok.like_url:
            continue
        tokens.append(tok.lower_)
    return tokens
df['tokens'] = df['body'].apply(tokenize)

## Make document-term matrix

In [None]:
# Build the tf-idf document-term matrix from the pre-tokenized texts.
# analyzer=identity hands each token list to the vectorizer unchanged; a term
# is kept only if it occurs in at least 3 documents (min_df) and in at most
# 25% of all documents (max_df).
X = TfidfVectorizer(
    analyzer=identity,
    min_df=3,
    max_df=0.25,
    norm='l2',
    use_idf=True,
).fit_transform(df['tokens'])

## K Means

Use the **k-means** algorithm to group the texts into 25 clusters and compute **silhouette** coefficients:

In [None]:
%%time

# Cluster the tf-idf rows into 25 groups, then score every text with its
# silhouette coefficient.
# `n_jobs` is deliberately not passed: KMeans deprecated that argument in
# scikit-learn 0.23 and removed it in 1.0, where passing it raises a TypeError.
kmeans = KMeans(n_clusters=25).fit(X)
df['cluster'] = kmeans.labels_
df['silhouette'] = silhouette_samples(X, df['cluster'])

Silhouette scores compare the distances among texts within a cluster to the distances to texts in other clusters.  A 'good' cluster should have a large score:

In [None]:
df.groupby('cluster')['silhouette'].mean().sort_values()

The number of texts in a cluster is also instructive.  Interesting clusters are usually medium-sized. Clusters with only a few texts are picking up noise, and clusters with a large number of texts are probably incoherent.

In [None]:
df.groupby('cluster')['body'].count()

## Keywords

To get some insight into what a text cluster represents, we can find its keywords using PMI:

In [None]:
def keywords(cluster, n=10, min_count=25):
    """Return up to `n` tokens most strongly associated with `cluster`, by PMI.

    For every token the score is
    ``log2((count_in_cluster * total_tokens) / (total_count * tokens_in_cluster))``,
    i.e. log2(P(token | cluster) / P(token)), estimated from raw token counts
    in the module-level ``df`` (columns ``'tokens'`` and ``'cluster'``).

    Parameters
    ----------
    cluster
        Cluster label, compared against ``df['cluster']``.
    n : int
        Maximum number of keywords to return.
    min_count : int
        A token is considered only if it occurs strictly more than this many
        times in the whole corpus; this filters out rare tokens whose PMI is
        unreliable.

    Returns
    -------
    list
        Tokens ordered from highest to lowest PMI. Tokens that never occur in
        the cluster have an undefined (NaN) score and are sorted last.
    """
    # Local import keeps the function independent of wildcard imports.
    from itertools import chain

    def token_counts(token_lists):
        # Series.value_counts replaces the deprecated top-level pd.value_counts.
        return pd.Series(list(chain.from_iterable(token_lists))).value_counts()

    f = pd.DataFrame({'all': token_counts(df['tokens'])})
    # Aligned on the token index: tokens absent from the cluster become NaN.
    f['cl'] = token_counts(df.loc[df['cluster'] == cluster, 'tokens'])
    f['pmi'] = np.log2((f['cl'] * f['all'].sum()) /
                       (f['all'] * f['cl'].sum()))
    frequent = f['pmi'][f['all'] > min_count]
    return list(frequent.sort_values(ascending=False)[:n].index)


In [None]:
for i in range(25):
    print(i,' '.join(keywords(i)))

Looks like cluster 11 has something to do with vaccines, but it's hard to tell what they're saying from keywords alone.  So, we can also find some representative texts that are close to the center of the cluster:

In [None]:
# Distance from every text to each tf-idf cluster centre; display the 20 texts
# lying nearest to the centre of cluster 17.
# NOTE(review): the accompanying text talks about cluster 11 while column 17 is
# inspected here -- confirm which cluster is meant (KMeans is not seeded, so
# cluster numbering may differ from run to run).
dist = kmeans.transform(X)
df['body'].iloc[np.argsort(dist[:, 17])[:20]]

Now let's try the same thing, but using GloVe vectors instead of tf-idf:

In [None]:
def vector(text):
    """Run the spaCy pipeline on *text* and return the resulting document vector."""
    doc = nlp(text)
    return doc.vector
df['vec'] = df['body'].apply(vector)

In [None]:
# Stack the per-text spaCy vectors from df['vec'] into one dense matrix and
# cluster on that instead of the tf-idf features.
# X is rebound on purpose: the silhouette scores below and the later
# kmeansv.transform(X) call must use the same feature space the model was
# fitted on. Re-running the earlier tf-idf cells after this one requires
# rebuilding the tf-idf matrix first.
# `n_jobs` is deliberately not passed: KMeans removed it in scikit-learn 1.0.
X = np.vstack(df['vec'].tolist())
kmeansv = KMeans(n_clusters=25).fit(X)
df['cluster'] = kmeansv.labels_
df['silhouette'] = silhouette_samples(X, df['cluster'])

In [None]:
df.groupby('cluster')['silhouette'].mean().sort_values()

In [None]:
df.groupby('cluster')['body'].count()

In [None]:
for i in range(25):
    print(i,' '.join(keywords(i)))

The moon landings were faked?!?

In [None]:
# Display the 20 texts lying nearest to the centre of cluster 15 of kmeansv.
dist = kmeansv.transform(X)
df['body'].iloc[np.argsort(dist[:, 15])[:20]]