# Conspiracy Theories

A sample of texts from `r/conspiracy`

In [None]:
import pandas as pd
import numpy as np
from cytoolz import *
import spacy
import matplotlib.pyplot  as plt
from sklearn.linear_model import *
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import *
from sklearn.decomposition import *
from sklearn.model_selection import *
from sklearn.feature_extraction.text import *
from sklearn.feature_extraction import *
from sklearn.cluster import *
from sklearn.metrics import *
%matplotlib inline

# Let pandas display up to 500 characters per column so long post bodies stay readable.
pd.set_option('display.max_colwidth', 500)

In [None]:
# Load spaCy's English pipeline; it provides the tokenizer and the document vectors used below.
# NOTE(review): the 'en' shortcut name appears to be resolvable only on older spaCy installs —
# confirm which model package it links to in this environment.
nlp = spacy.load('en')

In [None]:
# Read the posts and keep only those whose body is longer than 250 characters.
df = (
    pd.read_csv("../input/clusterdata/clustering_data.csv")
      .loc[lambda posts: posts['body'].str.len() > 250]
)

In [None]:
# (rows, columns) of the frame after the body-length filter.
df.shape

In [None]:
%%time

def tokenize(text):
    """Return the lower-cased tokens of `text`, skipping tokens that look like URLs.

    Only spaCy's tokenizer is invoked here, not the full `nlp` pipeline.
    """
    kept = []
    for tok in nlp.tokenizer(text):
        if tok.like_url:
            continue
        kept.append(tok.lower_)
    return kept


df['tokens'] = df['body'].apply(tokenize)

## Make document-term matrix

In [None]:
# The posts are already tokenised, so the vectorizer takes each token list as-is
# (analyzer=identity). Terms found in fewer than 3 documents, or in more than 25%
# of the documents, are left out of the vocabulary.
tfidf = TfidfVectorizer(
    analyzer=identity,
    min_df=3,
    max_df=0.25,
    norm='l2',
    use_idf=True,
)
X = tfidf.fit_transform(df['tokens'])

## K Means

Use the **k-means** algorithm to group the texts into 3 to 10 clusters, recording the within-cluster sum of squares (WCSS) and the **silhouette** coefficient for each number of clusters:

In [None]:
%%time

clusterN = 10
# Fit k-means for every cluster count from 3 to clusterN (inclusive) and record, per fit,
# the WCSS (inertia) and the mean silhouette coefficient.
wcss = []
silhoutte_score = []
for i in range(3, clusterN + 1):
    # `n_jobs` is intentionally not passed: it was deprecated and later removed from
    # KMeans, so passing it raises TypeError on current scikit-learn releases.
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=500, n_init=20, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    silhoutte_score.append(silhouette_score(X, kmeans.labels_))
    


Making a plot for Silhouette scores

In [None]:
# Silhouette score for each candidate number of clusters (3..clusterN).
plt.plot(range(3, clusterN+1), silhoutte_score)
# Call xticks through `plt`: a bare `xticks` name is not defined yet when the notebook
# is run top to bottom. Ticks are placed on the cluster counts that were actually
# evaluated instead of on 0..clusterN-1.
plt.xticks(range(3, clusterN+1))
plt.title('The Silhoutte Score plot')
plt.xlabel('Number of clusters')
plt.ylabel('silhoutte_scores')
plt.show()

Making a plot for Elbow method.

In [None]:
# Elbow method: WCSS for each candidate number of clusters (3..clusterN).
plt.plot(range(3, clusterN+1), wcss)
# `plt.xticks` is used directly, so no extra import is needed in this cell.
# Ticks are placed on the cluster counts that were actually evaluated instead of
# on 0..clusterN-1.
plt.xticks(range(3, clusterN+1))
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

Looks like the best number of clusters is 5. 

In [None]:
%%time

# Final model with 5 clusters. The seed is fixed so cluster ids are stable across runs;
# the sections below refer to clusters by number, which is meaningless if the labelling
# changes on every execution.
# NOTE(review): after fixing the seed, re-check which cluster id the later text refers to.
# `n_jobs` is not passed because it has been removed from KMeans in current scikit-learn.
kmeans = KMeans(n_clusters=5, random_state=0).fit(X)
df['cluster'] = kmeans.labels_

In [None]:
# Number of posts (non-null bodies) assigned to each cluster.
df.groupby('cluster')['body'].count()

## Keywords

To get some insight into what a text cluster represents, we can find its keywords using PMI:

In [None]:
def keywords(cluster, n=20):
    """Return up to `n` tokens most strongly associated with `cluster`, ranked by PMI.

    For each token the score is log2 of (its share of the cluster's tokens) divided by
    (its share of all tokens). Only tokens occurring more than 25 times overall are
    considered. Tokens that never occur in the cluster score NaN and sort last.

    Reads the notebook-level `df` (columns 'tokens' and 'cluster').
    """
    # Series.value_counts is used instead of the deprecated top-level pd.value_counts.
    all_tokens = pd.Series(list(concat(df['tokens'])))
    cluster_tokens = pd.Series(list(concat(df[df['cluster'] == cluster]['tokens'])))

    f = pd.DataFrame({'all': all_tokens.value_counts()})
    # Assignment aligns on the token index; tokens absent from the cluster become NaN.
    f['cl'] = cluster_tokens.value_counts()
    f['pmi'] = np.log2((f['cl'] * np.sum(f['all'])) /
                       (f['all'] * np.sum(f['cl'])))
    return list(f['pmi'][f['all'] > 25].sort_values(ascending=False)[:n].index)


In [None]:
# Print the top keywords for every cluster id present in the data, instead of a
# hard-coded range(5) that has to be kept in sync with the number of clusters fitted.
for i in sorted(df['cluster'].unique()):
    print(i, ' '.join(keywords(i)))

Looks like cluster 2 has something to do with vaccines, but it's hard to tell what they're saying from keywords alone.  So, we can also find some representative texts that are close to the center of the cluster, using the distances to each cluster center returned by [`KMeans.transform`](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html).

In [None]:
# kmeans.transform gives, for every post, its distance to each cluster centre
# (one column per cluster).
dist = kmeans.transform(X)
# Row positions of the 20 posts closest to the centre of cluster 2.
nearest = np.argsort(dist[:, 2])[:20]
df['body'].iloc[nearest]

For homework, do the same thing using GloVe vectors instead of tf-idf, as in the previous lesson's notebook.

## Using GloVe Vectors for Classification

In [None]:
# Call head() — without the parentheses the cell shows the bound method, not the first rows.
df.head()

In [None]:
# Run the spaCy pipeline on the first post and display its document vector.
first_post = df['body'].iloc[0]
glove = nlp(first_post)
glove.vector

In [None]:
def _doc_vector(text):
    """Return the spaCy document vector for `text`."""
    return nlp(text).vector


# One document vector per post, stored alongside the text.
df['vector'] = df['body'].apply(_doc_vector)

In [None]:
# Baseline: bag-of-words logistic regression predicting the author of a post.
# `test` was never defined in this notebook, so hold out 20% of the posts here and
# fit on the remainder; the score is then measured on posts the model has not seen.
train, test = train_test_split(df, test_size=0.2, random_state=0)
# analyzer=identity expects pre-tokenised documents, so the 'tokens' column is used;
# given the raw 'body' strings it would count single characters instead of words.
base = make_pipeline(CountVectorizer(analyzer=identity), LogisticRegression())
base.fit(train['tokens'], train['author'])
base.score(test['tokens'], test['author'])

In [None]:
# Logistic regression on the spaCy document vectors.
# `train`/`test` were never defined in this notebook; create the hold-out split here
# (fixed seed, so it is reproducible).
train, test = train_test_split(df, test_size=0.2, random_state=0)
model = LogisticRegression(C=10)
# The vectors are stored in the 'vector' column (there is no 'vec' column).
model.fit(list(train['vector']), train['author'])
model.score(list(test['vector']), test['author'])