# Importing Packages, etc.

In [None]:
import nltk
import numpy as np
import pandas as pd
import pylab
import spacy
import matplotlib.pyplot  as plt
import multiprocessing as mp
import os
import gensim
from cytoolz import *
from sklearn.pipeline import *
from sklearn.feature_extraction.text import *
from sklearn.feature_extraction import *
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import *
from sklearn.cluster import *
from sklearn.metrics import *
from spacy import displacy
from spacy.tokens import Token
from spacy.lang.en.stop_words import STOP_WORDS
from gensim.models import CoherenceModel, LdaModel, HdpModel, LsiModel
from gensim.models.phrases import Phrases, Phraser
from gensim.corpora import Dictionary
import pyLDAvis.gensim
# Let displayed frames show up to 500 characters per cell so long text columns stay readable.
pd.set_option('display.max_colwidth', 500)
# Load the spaCy 'en' model with its 'ner' component disabled.
nlp = spacy.load('en', disable=['ner'])

# Importing Data

In [None]:
# Read the hand-coded article spreadsheet; its first row supplies the column names.
articles_path = "../input/articles-final/articles_handcoded.xlsx"
df = pd.read_excel(articles_path, header=0)
df.head()

# Cleaning Text
Stripping non-alphanumeric characters, lowercasing, and splitting each string of text into a list of tokens

In [None]:
# NOTE(review): this rebinds the `nlp` object created in the import cell
# (which was loaded with 'ner' disabled); only its tokenizer is used below.
nlp = spacy.load('en')


def tokenize(text):
    """Return the lower-cased tokens of `text` that are purely alphabetic.

    URL-like tokens and stop words are dropped as well.
    """
    kept = []
    for tok in nlp.tokenizer(text):
        if tok.is_alpha and not (tok.like_url or tok.is_stop):
            kept.append(tok.lower_)
    return kept


# Tokenize every value of the 'Text' column in a pool of worker processes.
with mp.Pool() as pool:
    df['tokens'] = pool.map(tokenize, df['Text'])

# Distribution of Processes
These refer to the type of activity or process occurring in each sentence

## Which processes occur in each article?

In [None]:
# Tally how often each 'Process' value occurs within each source.
process_by_source = df['Process'].groupby(df['Source'])
process_by_source.value_counts()

In [None]:
def processmap(df):
    """Add one boolean column per process type to `df`, in place.

    Each new column is named after a process type and is True on the rows
    whose 'Process' value equals that name. Nothing is returned.
    """
    process_types = ('material', 'mental', 'verbal', 'attributive', 'existential')
    for kind in process_types:
        # Bind the current name as a default so the lambda compares against it.
        df[kind] = df['Process'].map(lambda value, target=kind: value == target)
# Add the per-process boolean columns to the article frame (df is modified in place).
processmap(df)

In [None]:
# Rows of each process type, grouped by the source they belong to.
# The flag columns are already boolean, so they are used directly as row masks.
source_of_row = df['Source']
materials = df[df['material']].groupby(source_of_row)
mentals = df[df['mental']].groupby(source_of_row)
verbals = df[df['verbal']].groupby(source_of_row)
attributives = df[df['attributive']].groupby(source_of_row)
existentials = df[df['existential']].groupby(source_of_row)

In [None]:
# Sum the boolean process flags per source: one row per process type, one column per source.
# The aggregation is named as the string 'sum'; passing the np.sum callable is
# flagged with a FutureWarning by recent pandas releases.
process_flags = ['material', 'mental', 'verbal', 'attributive', 'existential']
df[['Source'] + process_flags].pivot_table(columns=['Source'], aggfunc='sum')

In [None]:
# Plot the per-source totals of each process flag: one line per source across the process types.
# The aggregation is named as the string 'sum'; passing the np.sum callable is
# flagged with a FutureWarning by recent pandas releases.
process_flags = ['material', 'mental', 'verbal', 'attributive', 'existential']
process_totals = df[['Source'] + process_flags].pivot_table(columns=['Source'], aggfunc='sum')
process_totals.plot()
plt.show()

# Nominalizations
These are the different words and phrases used to refer to the victim, Matthew Shepard, and the attackers, Henderson and McKinney

## Nominalizations by article

In [None]:
# Tally how often each 'MS Nom' value occurs within each source.
ms_nom_by_source = df['MS Nom'].groupby(df['Source'])
ms_nom_by_source.value_counts()

In [None]:
def sourcemap(df):
    """Add one boolean membership column per news source to `df`, in place.

    Each new column is True on the rows whose 'Source' value equals the
    source name paired with it below. Nothing is returned.
    """
    # (new column name, 'Source' value that the column flags)
    flag_columns = (
        ('AP', 'AP News'),
        ('BBC', 'BBC'),
        ('Breitbart', 'Breitbart'),
        ('CNN', 'CNN'),
        ('Christian Courrier', 'Christian Courrier'),
        ('Huffington Post', 'Huffington Post'),
        ('NBC', 'NBC'),
        ('New York Times', 'New York Times'),
        ('The Guardian', 'The Guardian'),
        ('Washington Examiner', 'Washington Examiner'),
    )
    for column, source_name in flag_columns:
        # Bind the current name as a default so the lambda compares against it.
        df[column] = df['Source'].map(lambda value, target=source_name: value == target)
# Add the per-source boolean columns to the article frame (df is modified in place).
sourcemap(df)

In [None]:
# NOTE(review): grouping on a boolean flag keeps BOTH groups, so each result
# holds the counts for rows outside the source (False) as well as inside it
# (True) - confirm that this contrast is what the panels are meant to show.
def _ms_nom_counts(flag_column):
    """Count 'MS Nom' values within the False and True groups of `flag_column`."""
    return df['MS Nom'].groupby(df[flag_column]).value_counts()


ap = _ms_nom_counts('AP')
bbc = _ms_nom_counts('BBC')
brei = _ms_nom_counts('Breitbart')
cnn = _ms_nom_counts('CNN')
chri = _ms_nom_counts('Christian Courrier')
huff = _ms_nom_counts('Huffington Post')
nbc = _ms_nom_counts('NBC')
nyt = _ms_nom_counts('New York Times')
guar = _ms_nom_counts('The Guardian')
wash = _ms_nom_counts('Washington Examiner')

In [None]:
# Text sizes and figure size shared by all ten panels.
plt.rc('font', size=35)
plt.rc('axes', titlesize=15)
plt.rc('axes', labelsize=15)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rc('figure', titlesize=40)
plt.rcParams['figure.figsize'] = (25, 25)
fig, axes = plt.subplots(nrows=5, ncols=2)
plt.subplots_adjust(hspace=.75, wspace=.5)
plt.suptitle('Use of Nominal by Article')

# One (panel title, value counts) pair per source, listed in the order the
# 5x2 grid is filled: left to right, then top to bottom.
panels = [
    ('Associated Press', ap),
    ('BBC', bbc),
    ('Breitbart', brei),
    ('CNN', cnn),
    ('Christian Courrier', chri),
    ('Huffington Post', huff),
    ('NBC', nbc),
    ('New York Times', nyt),
    ('The Guardian', guar),
    ('Washington Examiner', wash),
]
for ax, (title, counts) in zip(axes.flat, panels):
    counts.plot(ax=ax, kind='barh')
    plt.sca(ax)
    plt.title(title)
    # Horizontal bars: the counts run along x and the nominals are listed on y.
    plt.xlabel('Use')
    plt.xticks(rotation=45)
    plt.ylabel('Nominal')

plt.show()

In [None]:
# Tally how often each 'HM Nom' value occurs within each source.
hm_nom_by_source = df['HM Nom'].groupby(df['Source'])
hm_nom_by_source.value_counts()

In [None]:
# NOTE(review): grouping on a boolean flag keeps BOTH groups, so each result
# holds the counts for rows outside the source (False) as well as inside it
# (True) - confirm that this contrast is what the panels are meant to show.
def _hm_nom_counts(flag_column):
    """Count 'HM Nom' values within the False and True groups of `flag_column`."""
    return df['HM Nom'].groupby(df[flag_column]).value_counts()


ap_2 = _hm_nom_counts('AP')
bbc_2 = _hm_nom_counts('BBC')
brei_2 = _hm_nom_counts('Breitbart')
cnn_2 = _hm_nom_counts('CNN')
chri_2 = _hm_nom_counts('Christian Courrier')
huff_2 = _hm_nom_counts('Huffington Post')
nbc_2 = _hm_nom_counts('NBC')
nyt_2 = _hm_nom_counts('New York Times')
guar_2 = _hm_nom_counts('The Guardian')
wash_2 = _hm_nom_counts('Washington Examiner')

In [None]:
# Text sizes and figure size shared by all ten panels.
plt.rc('font', size=35)
plt.rc('axes', titlesize=15)
plt.rc('axes', labelsize=15)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rc('figure', titlesize=40)
plt.rcParams['figure.figsize'] = (25, 25)
fig, axes = plt.subplots(nrows=5, ncols=2)
plt.subplots_adjust(hspace=.75, wspace=.5)
plt.suptitle('Use of Nominal by Article')

# One (panel title, value counts) pair per source, listed in the order the
# 5x2 grid is filled: left to right, then top to bottom.
panels = [
    ('Associated Press', ap_2),
    ('BBC', bbc_2),
    ('Breitbart', brei_2),
    ('CNN', cnn_2),
    ('Christian Courrier', chri_2),
    ('Huffington Post', huff_2),
    ('NBC', nbc_2),
    ('New York Times', nyt_2),
    ('The Guardian', guar_2),
    ('Washington Examiner', wash_2),
]
for ax, (title, counts) in zip(axes.flat, panels):
    counts.plot(ax=ax, kind='barh')
    plt.sca(ax)
    plt.title(title)
    # Horizontal bars: the counts run along x and the nominals are listed on y.
    plt.xlabel('Use')
    plt.xticks(rotation=45)
    plt.ylabel('Nominal')

plt.show()

## Use of Passives
### Number of Passives by Article

In [None]:
# Total of the 'Passive' column for each source.
passives = df.groupby('Source')['Passive'].sum()
passives.values

#### Agentless Passives

In [None]:
# Total of the 'Empty Passive' column for each source.
empties = df.groupby('Source')['Empty Passive'].sum()
empties.values

### Percentages

In [None]:
# Per source: 'Empty Passive' total as a percentage of the 'Passive' total.
agentless_share = empties.div(passives)
agentless_share * 100

# Topic Modeling
### First, I need to create the bigrams and then the corpus I will use to generate the topics from the articles

In [None]:
# Start from the token lists stored in the 'tokens' column.
texts = df['tokens']

In [None]:
# Fit a phrase model on the token lists and pass every text through it, then
# build the gensim dictionary and the bag-of-words corpus from the result.
bigram = Phrases(texts)
texts = [bigram[doc] for doc in texts]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(doc) for doc in texts]

### I will use Latent Semantic Indexing, a Hierarchical Dirichlet Process, and Latent Dirichlet Allocation models

In [None]:
# Fit an LSI model with 10 topics on the corpus and display 5 of them.
lsimodel = LsiModel(corpus, id2word=dictionary, num_topics=10)
lsimodel.show_topics(num_topics=5)

In [None]:
# Fit an HDP model on the corpus (no topic count is passed) and display its topics.
hdpmodel = HdpModel(corpus, id2word=dictionary)
hdpmodel.show_topics()

In [None]:
# Fit an LDA model with 10 topics on the corpus and display its topics.
ldamodel = LdaModel(corpus, id2word=dictionary, num_topics=10)
ldamodel.show_topics()

### And now, I can see what topics these models predict

In [None]:
# Render pyLDAvis output inline, then prepare the visualisation of the LDA model.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(topic_model=ldamodel, corpus=corpus, dictionary=dictionary)

# Feature Clustering

### First the text needs to be preprocessed

In [None]:
# Turn the token lists into L2-normalised TF-IDF vectors.
# `identity` as the analyzer makes the vectorizer use each token list as-is;
# min_df=3 and max_df=0.3 drop terms found in fewer than 3 documents or in
# more than 30% of them.
# The counts are TF-IDF weighted exactly once: TfidfVectorizer already applies
# the TF-IDF transform, so following it with a TfidfTransformer weighted the
# features by IDF a second time.
preprocessing = make_pipeline(
    CountVectorizer(analyzer=identity, min_df=3, max_df=0.3),
    TfidfTransformer(norm='l2', use_idf=True),
)
X = preprocessing.fit_transform(df['tokens'])
X.shape

In [None]:
%%time

clusterN=10
wcss = []
siloutte_score =[]
for i in range(5, clusterN+1):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter=500, n_init=20, random_state = 0, n_jobs=4, precompute_distances=True)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    siloutte_score.append(silhouette_score(X, kmeans.labels_))

### Now I determine the number of clusters to use based on three tests

In [None]:
# Plot the collected scores (the `siloutte_score` list, not the sklearn
# `silhouette_score` function) against the k values they were computed for.
# The scores end at k = clusterN, so the first k follows from the list length;
# this keeps x and y the same length.
cluster_counts = range(clusterN - len(siloutte_score) + 1, clusterN + 1)
plt.plot(cluster_counts, siloutte_score)
plt.xticks(cluster_counts, cluster_counts)
plt.title('The Silhouette Score plot')
plt.xlabel('Number of clusters')
plt.ylabel('silouette_scores')
plt.show()

In [None]:
# Plot WCSS against the k values it was computed for. The values end at
# k = clusterN, so the first k follows from the list length; this keeps x and
# y the same length (a fixed start of 4 gave one x value too many).
cluster_counts = range(clusterN - len(wcss) + 1, clusterN + 1)
plt.plot(cluster_counts, wcss)
plt.xticks(cluster_counts, cluster_counts)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Rank the candidate k values by how sharp the bend of the WCSS curve is at
# each of them: for every interior point, take the cosine of the angle between
# the segments to its left and right neighbours (unit step in k).
# wcss[i] belongs to k = first_k + i; the list ends at k = clusterN.
first_k = clusterN - len(wcss) + 1
# One slot per fitted k. -1 (the lowest possible cosine) marks the two end
# points and every point where the curve is not convex.
cosines = -1 * np.ones(len(wcss))
# Interior points only: index 0 has no left neighbour (wcss[-1] would wrap
# around to the last value) and the last index has no right neighbour.
for i in range(1, len(wcss) - 1):
    left_rise = wcss[i-1] - wcss[i]
    right_rise = wcss[i+1] - wcss[i]
    if wcss[i] < (wcss[i+1] + wcss[i-1]) / 2:
        cosines[i] = (-1 + left_rise * right_rise) / \
            ((1 + left_rise**2) * (1 + right_rise**2))**.5

# Candidate k values from the sharpest bend to the flattest.
print(np.flip(np.argsort(cosines)) + first_k)

### Based on these 3 tests, 8 appears to be a good number of clusters

In [None]:
%%time

kmeans = KMeans(7, n_jobs=-1).fit(X)
df['cluster'] = kmeans.labels_
df.groupby('cluster')['Text'].count()

# Keywords from Cluster
### With this I can determine what concepts are indicative of each cluster

In [None]:
def keywords(cluster, n=8, min_count=25, data=None):
    """Return up to `n` tokens most characteristic of `cluster`, ranked by PMI.

    The score of a token is log2 of its relative frequency inside the cluster
    divided by its relative frequency over all rows.

    Parameters:
        cluster: value of the 'cluster' column that selects the rows.
        n: maximum number of tokens to return.
        min_count: a token must occur more than this many times over all rows
            to be considered.
        data: frame with a 'tokens' column (lists of tokens) and a 'cluster'
            column; defaults to the notebook-level `df`.

    Returns:
        A list of tokens, highest score first. Tokens that never occur in the
        cluster are left out, so the list is empty for an unknown cluster.
    """
    if data is None:
        data = df
    in_cluster = data.loc[data['cluster'] == cluster, 'tokens']
    # Flatten the token lists; dtype=object keeps an empty selection well-defined.
    all_counts = pd.Series(
        [tok for toks in data['tokens'] for tok in toks], dtype=object
    ).value_counts()
    cluster_counts = pd.Series(
        [tok for toks in in_cluster for tok in toks], dtype=object
    ).value_counts()
    f = pd.DataFrame({'all': all_counts})
    # Aligns on the token index: tokens absent from the cluster become NaN.
    f['cl'] = cluster_counts
    f['pmi'] = np.log2((f['cl'] * f['all'].sum()) /
                       (f['all'] * f['cl'].sum()))
    ranked = f.loc[f['all'] > min_count, 'pmi'].dropna().sort_values(ascending=False)
    return list(ranked.head(n).index)

In [None]:
# Print the characteristic tokens of every cluster of the fitted model; the
# cluster count is read from the model so the loop cannot drift out of sync with it.
for i in range(kmeans.n_clusters):
    print(i, ' '.join(keywords(i)))
# Distance of every row to every cluster centre; show the token lists of the
# 10 rows closest to the centre of cluster 5.
dist = kmeans.transform(X)
df['tokens'].iloc[dist[:, 5].argsort()[:10]]