# Conspiracy Theories

A sample of texts from `r/conspiracy`

In [None]:
import pandas as pd
import numpy as np
from cytoolz import *
import spacy
import matplotlib.pyplot  as plt
import multiprocessing as mp

pd.set_option('display.max_colwidth', 500)

In [None]:
from sklearn.pipeline import *
from sklearn.feature_extraction.text import *
from sklearn.feature_extraction import *
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import *
from sklearn.cluster import *
from sklearn.metrics import *

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# Load the English spaCy pipeline used for tokenization and document vectors below.
# NOTE(review): 'en' looks like a spaCy 2.x shortcut link; newer spaCy releases expect a
# full package name such as 'en_core_web_sm' — confirm against the target environment.
nlp = spacy.load('en')

In [None]:
# Load the r/conspiracy sample and keep only texts whose body is longer than 250 characters.
# .copy() makes the filtered frame independent of the full one, so the column assignments
# made in later cells (tokens, cluster, ...) do not raise SettingWithCopyWarning.
df = pd.read_csv("../input/clusterdata/clustering_data.csv")
df = df[df['body'].str.len()>250].copy()

In [None]:
# (rows, columns) remaining after the length filter
df.shape

Unfortunately, the parallel method below does not work on Windows.

In [None]:
%%time

def tokenize(text):
    """Return the lower-cased alphabetic tokens of `text`, skipping URL-like tokens.

    Only the spaCy tokenizer is run, not the full pipeline.
    Stop-word filtering is currently disabled; append `and not (tok.is_stop)`
    to the condition to enable it.
    """
    return [tok.lower_ for tok in nlp.tokenizer(text) if (not tok.like_url) and (tok.is_alpha)]

# Worker processes can only see `tokenize` (defined in this notebook) when they are forked.
# Under the "spawn"/"forkserver" start methods (e.g. on Windows) the workers cannot import it,
# so tokenize serially there instead of failing.
if mp.get_start_method() == 'fork':
    with mp.Pool() as p:
        df['tokens'] = p.map(tokenize, df['body'])
else:
    df['tokens'] = [tokenize(text) for text in df['body']]

By the way, the parentheses after `if` in the cell above are not needed; I added them for easier reading.

## Make document-term matrix and scaling

In [None]:
# Build the TF-IDF document-term matrix from the pre-tokenized documents.
# analyzer=identity: df['tokens'] already holds token lists, so no further tokenization is done.
# min_df=3 / max_df=0.3 drop terms found in fewer than 3 documents or in more than 30% of them.
# Raw counts are weighted by a single TfidfTransformer: TfidfVectorizer already contains one,
# so chaining TfidfVectorizer with TfidfTransformer would apply the IDF weighting twice.
preprocessing = make_pipeline(CountVectorizer(analyzer=identity, min_df=3, max_df=0.3),
                    TfidfTransformer(norm='l2', use_idf=True)
                    # optional extra step: StandardScaler(with_mean=False)
                    )
X = preprocessing.fit_transform(df['tokens'])
X.shape

## K Means 
Actually `scikit-learn` has a number of clustering methods for different needs.  
https://scikit-learn.org/stable/modules/clustering.html  
They provide explanations of which method works in which case and whether it works well on big data sets.

Use the **k-means** algorithm to group the texts into 3 to 10 clusters and compute the **silhouette** coefficient for each clustering:

In [None]:
%%time

clusterN=10
# Fit k-means for every cluster count from 3 to clusterN and record two quality measures:
# WCSS (within-cluster sum of squares, KMeans.inertia_) and the mean silhouette coefficient.
wcss = []
silhoutte_score =[]
for i in range(3, clusterN+1):
    # n_jobs and precompute_distances are intentionally not passed: both were deprecated in
    # scikit-learn 0.23 and removed in 1.0, where passing them raises TypeError.
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter=500, n_init=20, random_state = 0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    silhoutte_score.append(silhouette_score(X, kmeans.labels_))

Making a plot for Silhouette scores

In [None]:
# Silhouette score for each evaluated cluster count.
# The sweep starts at 3 clusters, so the ticks start there too (a tick at 2 would only
# add an empty stretch to the x-axis).
cluster_counts = range(3, clusterN+1)
plt.plot(cluster_counts, silhoutte_score)
plt.xticks(cluster_counts)
plt.title('The Silhouette Score plot')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.show()

Making a plot for Elbow method.

In [None]:
# Elbow plot: WCSS for each evaluated cluster count.
# Ticks cover exactly the evaluated range (3..clusterN); a tick at 2 would only add an
# empty stretch to the x-axis.
cluster_counts = range(3, clusterN+1)
plt.plot(cluster_counts, wcss)
plt.xticks(cluster_counts)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

Alternatively, the elbow can be computed numerically. The first number printed is the preferred number of clusters.

In [None]:
# Numeric version of the elbow method: for every interior point of the WCSS curve, compute
# the cosine of the angle it forms with its two neighbours (the x step between points is 1).
# End points, and points that are not below the midpoint of their neighbours, keep the
# sentinel value -1.
cosines = -1 * np.ones(clusterN-2)
for idx in range(1, clusterN-3):
    to_prev = wcss[idx-1] - wcss[idx]
    to_next = wcss[idx+1] - wcss[idx]
    neighbour_midpoint = (wcss[idx+1] + wcss[idx-1]) / 2
    if wcss[idx] < neighbour_midpoint:
        dot = -1 + to_prev*to_next
        norm = ((1 + to_prev**2) * (1 + to_next**2))**.5
        cosines[idx] = dot / norm

# Candidate cluster counts ordered from largest cosine to smallest;
# +3 converts a position in wcss into its cluster count (the sweep starts at 3).
print(np.flip(np.argsort(cosines))+3)

Looks like 8 is a good number of clusters.

In [None]:
%%time

# Final model with the chosen number of clusters.
# It uses the same settings as the model-selection sweep (init, max_iter, n_init) and a fixed
# random_state, so the cluster ids discussed in the text (e.g. "cluster 2") are reproducible.
# n_jobs is not passed: it was removed from KMeans in scikit-learn 1.0.
kmeans = KMeans(n_clusters=8, init='k-means++', max_iter=500, n_init=20, random_state=0).fit(X)
df['cluster'] = kmeans.labels_

In [None]:
# Number of texts assigned to each cluster
cluster_sizes = df.groupby('cluster')['body'].count()
cluster_sizes

## Keywords

To get some insight into what a text cluster represents, we can find its keywords using PMI:

In [None]:
def keywords(cluster, n=20, label_col='cluster'):
    """Return the `n` tokens most strongly associated with a cluster, ranked by PMI.

    PMI(word, cluster) = log2( P(word | cluster) / P(word) ), estimated from the token
    counts in df['tokens'].

    Parameters
    ----------
    cluster : cluster label to describe.
    n : number of keywords to return.
    label_col : name of the df column holding the cluster labels. Defaults to
        'cluster', so existing calls behave as before.

    Returns
    -------
    list of tokens, highest PMI first. Only tokens that occur more than 25 times in the
    whole corpus are considered.
    """
    # Token frequencies over all texts ...
    f = pd.DataFrame({'all': pd.Series(list(concat(df['tokens']))).value_counts()})
    # ... and over the texts of the requested cluster (NaN for tokens absent from it).
    f['cl'] = pd.Series(list(concat(df[df[label_col]==cluster]['tokens']))).value_counts()
    f['pmi'] = np.log2( (f['cl'] * np.sum(f['all'])) /
                        (f['all'] * np.sum(f['cl'])) )
    return list(f['pmi'][f['all']>25].sort_values(ascending=False)[:n].index)


In [None]:
# Print each cluster id followed by its space-separated keywords.
for cluster_id in range(8):
    cluster_words = keywords(cluster_id)
    print(cluster_id, ' '.join(cluster_words))

Looks like cluster 2 has something to do with vaccines, but it's hard to tell what they're saying from keywords alone.  So, we can also find some representative texts that are close to the center of the cluster.
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [None]:
# Distance of every text to every cluster centre (one column per cluster).
dist = kmeans.transform(X)
# Show the 20 texts closest to the centre of cluster 2.
nearest_rows = dist[:,2].argsort()[:20]
df['body'].iloc[nearest_rows]

For homework, do the same thing using GloVe vectors or fastText instead of tf-idf, as in the previous lesson notebook.

# Begin Homework: Using Glove Method

## Make document-term matrix and scaling:

In [None]:
# Run the spaCy pipeline on the first text and display its document vector.
first_body = df['body'].iloc[0]
glove_doc = nlp(first_body)
glove_doc.vector

In [None]:
# One document vector per text. The text column of this data set is 'body' (the column that
# is loaded and filtered at the top of the notebook); 'review_text' is not referenced
# anywhere else in this notebook.
df['vec'] = df['body'].apply(lambda t: nlp(t).vector)

In [None]:
# NOTE(review): this cell looks like a leftover from a supervised-classification notebook.
# `train`, `test` and `LogisticRegression` are not defined or imported in the visible part
# of this notebook, and no 'wine_variant' column is created here — confirm whether the cell
# should be removed.
baseline = make_pipeline(CountVectorizer(analyzer=identity), LogisticRegression())
baseline.fit(train['tokens'], train['wine_variant'])
baseline.score(test['tokens'], test['wine_variant'])

In [None]:
# NOTE(review): like the baseline cell, this fits a classifier on `train`/`test` frames and a
# 'wine_variant' target that are not defined in the visible part of this notebook;
# `LogisticRegression` is not imported either — confirm whether the cell should be removed.
model = LogisticRegression(C=10)
model.fit(list(train['vec']), train['wine_variant'])
model.score(list(test['vec']),test['wine_variant'] )

## K Means

In [None]:
%%time

# Stack the per-document vectors stored in df['vec'] into one matrix (one row per text).
Y = np.vstack(df['vec'].tolist())
# random_state makes the cluster ids reproducible; n_jobs is not passed because it was
# removed from KMeans in scikit-learn 1.0.
kmeans_glove = KMeans(n_clusters=8, random_state=0).fit(Y)
# Keep the labels in their own column: writing them to df['vec'] would overwrite the vectors,
# and the keywords helper of this section reads df['cluster_glove'].
df['cluster_glove'] = kmeans_glove.labels_

In [None]:
# Number of texts per cluster for the vector-based clustering. Group by the label column
# 'cluster_glove' (the one the keywords helper of this section reads); 'vec' is the column
# of document vectors.
df.groupby('cluster_glove')['body'].count()

## Keywords using PMI

In [None]:
def keywords(cluster, n=20):
    """Return the `n` highest-PMI tokens for a cluster of the vector-based clustering.

    Cluster labels are read from df['cluster_glove'] and token lists from df['tokens'].
    PMI(word, cluster) = log2( P(word | cluster) / P(word) ). Only tokens that occur more
    than 25 times in the whole corpus are considered.
    """
    g = pd.DataFrame({'all': pd.Series(list(concat(df['tokens']))).value_counts()})
    g['cl'] = pd.Series(list(concat(df[df['cluster_glove']==cluster]['tokens']))).value_counts()
    # The totals must come from this function's own table `g`; `f` is not defined in this scope.
    g['pmi'] = np.log2( (g['cl'] * np.sum(g['all'])) /
                        (g['all'] * np.sum(g['cl'])) )
    return list(g['pmi'][g['all']>25].sort_values(ascending=False)[:n].index)

In [None]:
# Keywords of every vector-based cluster, one line per cluster id.
for cluster_id in range(8):
    print(cluster_id, ' '.join(keywords(cluster_id)))

In [None]:
# Distance of every text to every cluster centre of the vector-based model.
# Y has to be the matrix of document vectors that kmeans_glove was fitted on.
dist = kmeans_glove.transform(Y)
# Show the 20 texts with the smallest distance to the centre of cluster 2.
closest_rows = dist[:,2].argsort()[:20]
df['body'].iloc[closest_rows]