In [185]:
from bokeh.io import push_notebook, output_notebook, show
import os
import io
output_notebook()

# VAST Challenge 2014, MC1
### Team members: Zhili Yang, Geng Luo
## What's the difference between midterm and VAST Challenge?
The most important difference between the VAST task and midterm is we focus on different questions. For the midterm, we tried to cluster the lyrics of different songs by different clustering algorithms, and to find the difference between male and female artists by applying **tf-idf transformer**. However, in the VAST Challenge, we want to find the kidnappers and co-conspirators in a criminal event, and identify the structure of an organization called **Protectors of Kronos (POK)**. 
To solve these problems, we applied multiple tools, including **Word2Vec, clustering algorithms and networkx**.

## Import datasets
In the first step, we read the two articles that document the history of organization POK. What's more, we also defined some stop words and characters that need to be removed from the text.

In [186]:
import re

# Punctuation that is filtered out of the token stream.
# The backslashes are now escaped explicitly: "\[" and "\]" are not valid
# escape sequences, so the earlier triple-quoted literal only worked because
# Python keeps the backslash (and newer Python versions warn about it).
# The value is unchanged: backslash, "[", backslash, "]", the punctuation
# marks, a newline and a double quote.
CHARACTERS_TO_SPLIT = "\\[\\].,():;-!?\n\""
def _read_lines(path, encoding=None):
    """Return the stripped contents of ``path`` split on newline characters."""
    with open(path, "r", encoding=encoding) as handle:
        return handle.read().strip().split("\n")


# One stop word per line of the file.
# NOTE(review): STOP_WORDS is not referenced by the code visible in this
# notebook (the tokenization cell filters with nltk's list) - confirm whether
# it is still needed.
STOP_WORDS = set(_read_lines("data/stopwords.txt"))

# The two historical documents, one list entry per line of text.
hist_10_year_doc = _read_lines(
    "data/HistoricalDocuments/10_year_historical_document.txt",
    encoding="ISO-8859-1",
)
hist_5_year_doc = _read_lines(
    "data/HistoricalDocuments/5_year_report_document.txt",
    encoding="ISO-8859-1",
)


In [187]:
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the tokenizer model and the stop-word corpus used by the cells below.
# The captured output shows nltk reporting both packages as already up to date.
nltk.download('punkt')
nltk.download('stopwords')

# English stop words; the tokenization cell drops tokens whose lower-cased
# form appears in this list.
cachedStopWords = stopwords.words('english')

[nltk_data] Downloading package punkt to /Users/zhiliyang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhiliyang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Tokenize and filter the raw text.
In this part, we tokenized each line of the articles, and removed meaningless words and characters.

In [188]:

def tokenize_document(lines):
    """Tokenize every non-empty line and drop punctuation and stop words.

    Args:
        lines: iterable of text lines (strings).

    Returns:
        A list with one entry per non-empty line; each entry is the list of
        tokens that survived filtering. A token is dropped when it occurs in
        CHARACTERS_TO_SPLIT or when its lower-cased form is an English stop
        word.
    """
    # Set membership avoids rescanning the stop-word list for every token.
    stop_words = set(cachedStopWords)
    tokenized = []
    for line in lines:
        if not line:
            continue
        tokenized.append([
            tok for tok in word_tokenize(line)
            if tok not in CHARACTERS_TO_SPLIT and tok.lower() not in stop_words
        ])
    return tokenized


# Both documents go through the same filter. The two hand-copied loops used
# ``tok`` in one and ``tok.lower()`` in the other for the punctuation test;
# CHARACTERS_TO_SPLIT holds no letters, so the unified test keeps the same
# tokens.
hist_10_year_doc_tok = tokenize_document(hist_10_year_doc)
hist_5_year_doc_tok = tokenize_document(hist_5_year_doc)


## Apply Word2Vec
After the two documents have been tokenized, we applied Word2Vec to calculate the similarity among the words in the two documents. What's more, we used scatter plots to display the relationships among those words.

In [189]:
from gensim.models import Word2Vec
# Train one 200-dimensional embedding per document, keeping every word
# (min_count=1). Training is stochastic, so the seed is fixed and training is
# limited to a single worker thread so that reruns give comparable vectors.
# NOTE(review): gensim documents that fully deterministic runs on Python 3
# also need PYTHONHASHSEED to be set - confirm if exact reproducibility matters.
W2V_SEED = 0
wv = Word2Vec(hist_10_year_doc_tok, size=200, min_count=1, seed=W2V_SEED, workers=1)
wv_5 = Word2Vec(hist_5_year_doc_tok, size=200, min_count=1, seed=W2V_SEED, workers=1)


In [190]:
import pandas as pd
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool, LabelSet, ColumnDataSource
from bokeh.plotting import figure, show, output_notebook

# Canvas for the 2-D projection of the 10-year document's word vectors.
plot_tfidf = bp.figure(
    plot_width=700,
    plot_height=600,
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None,
    y_axis_type=None,
    min_border=1,
)

# One vector per vocabulary word of the 10-year model.
word_vectors = [wv[w] for w in wv.wv.vocab.keys()]

# Project the word vectors down to two dimensions with t-SNE.
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

# Collect coordinates, words and per-point styling in one dataframe.
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = wv.wv.vocab.keys()

# Words of interest are drawn as large red points, everything else small and blue.
key_words = ['POK', 'leader', 'Leader', 'Kronos', 'Protectors', 'terrorist', 'member', 'members', 'GAStech', '1997']
colors = ['red' if w in key_words else 'blue' for w in wv.wv.vocab.keys()]
size = [12 if w in key_words else 5 for w in wv.wv.vocab.keys()]
tsne_df['colors'] = colors
tsne_df['size'] = size

# Scatter plot with a text label above every point; hovering shows the word.
plot_tfidf.scatter(x='x', y='y', color='colors', size='size', source=tsne_df)
label_source = ColumnDataSource(data={
    'x': tsne_df['x'],
    'y': tsne_df['y'],
    'words': tsne_df['words'],
})
labels = LabelSet(
    x='x', y='y', text='words', y_offset=8,
    text_font_size="8pt", text_color="#555555",
    source=label_source, text_align='center',
)
plot_tfidf.add_layout(labels)
hover = plot_tfidf.select({'type': HoverTool})
hover.tooltips = {"word": "@words"}
show(plot_tfidf)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 673 / 673
[t-SNE] Mean sigma: 0.004820
[t-SNE] KL divergence after 100 iterations with early exaggeration: 2.093098
[t-SNE] Error after 175 iterations: 2.093098


The plot above displays the result generated from **10 years historical document**, and the red points above represent the key words we are interested in, which are **'POK', 'leader', 'Leader', 'Kronos', 'Protectors', 'terrorist', 'member', 'members', 'GAStech' and '1997'**. There are some interesting things we can find.
For the point **'leader'**, we can see that it overlaps with another point, **'Henk'**, which should refer to Henk Brodogi, who was one of the original leaders of POK.
What's more, for the point **'members'**, we can find a point **'Jeroen'** which should refer to 'Jeroen Karel' who was an original member in the organization.
The point **'1997'** overlaps with **'chemicals'**: 1997 was the founding year of POK, which began as a grassroots group trying to address the contamination of the nearby Tiskele river.
However, since the dataset is not big enough, and many meaningless words still add noise to the final result, we cannot identify the whole structure of the organization.

In [191]:
# Canvas for the 2-D projection of the 5-year report's word vectors.
plot_tfidf_5 = bp.figure(plot_width=700, plot_height=600,
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

# One vector per vocabulary word of the 5-year model.
word_vectors_5 = [wv_5[w] for w in wv_5.wv.vocab.keys()]

# Project down to two dimensions with this cell's own t-SNE model. The
# earlier code created ``tsne_model_5`` but then fitted the previous cell's
# ``tsne_model``, so this cell depended on that cell's leftover state.
tsne_model_5 = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v_5 = tsne_model_5.fit_transform(word_vectors_5)

# Collect coordinates, words and per-point styling in one dataframe.
tsne_df_5 = pd.DataFrame(tsne_w2v_5, columns=['x', 'y'])
tsne_df_5['words'] = wv_5.wv.vocab.keys()

# Words of interest are drawn as large red points, everything else small and blue.
key_words_5 = ['POK', 'leader', 'Leader', 'Kronos', 'Protectors', 'terrorist', 'member', 'members', 'GAStech']
colors_5 = ['red' if w in key_words_5 else 'blue' for w in wv_5.wv.vocab.keys()]
size_5 = [12 if w in key_words_5 else 5 for w in wv_5.wv.vocab.keys()]
tsne_df_5['colors'] = colors_5
tsne_df_5['size'] = size_5

# Scatter plot with a text label above every point; hovering shows the word.
plot_tfidf_5.scatter(x='x', y='y', color = 'colors', size = 'size', source=tsne_df_5)
label_source_5 = ColumnDataSource(data=dict(x = tsne_df_5['x'], y = tsne_df_5['y'],
                                         words = tsne_df_5['words']))
labels_5 = LabelSet(x='x', y='y', text='words', y_offset=8,
                      text_font_size="8pt", text_color="#555555",
                      source=label_source_5, text_align='center')
plot_tfidf_5.add_layout(labels_5)
hover_5 = plot_tfidf_5.select(dict(type=HoverTool))
hover_5.tooltips={"word": "@words"}
show(plot_tfidf_5)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 738 / 738
[t-SNE] Mean sigma: 0.004782
[t-SNE] KL divergence after 100 iterations with early exaggeration: 2.059716
[t-SNE] Error after 175 iterations: 2.059716


The plot above displays the results generated from **5 years historical document**. Same with the previous one, we highlighted some key words **'POK', 'leader', 'Leader', 'Kronos', 'Protectors', 'terrorist', 'member', 'members' and 'GAStech'**.
Similarly, we can find some points around 'member' and 'leader' that identify the names of some people in the organization, but we still have a hard time finding the whole structure of the organization.

## Find the potential relationships among Email records
In the codes below, we applied networkx to display some potential relationships through the records of Emails.
First, we read the employee records from **"EmployeeRecords.xlsx"** and the email headers from **"email_headers.csv"**, and build one graph per date from the sender–receiver relationships.

In [192]:
import networkx as nx
import matplotlib.pyplot as plt
from bokeh.models import GraphRenderer, Circle, HoverTool, TapTool, BoxSelectTool, MultiLine, ColumnDataSource
from bokeh.models.graphs import from_networkx, NodesAndLinkedEdges, EdgesAndLinkedNodes, StaticLayoutProvider
from bokeh.palettes import Spectral4, Blues8
import datetime
import data_parser
#import excel datasets, and convert them into dataframe
# Employee records: the spreadsheet is parsed by the project-local
# ``data_parser`` helpers into a dataframe plus several dict-based views.
employee_data_df = data_parser.parse_excel_data('./data/EmployeeRecords.xlsx', 'Employee Records')
employee_feature_names = data_parser.extract_column_names(employee_data_df)
employee_data_dic = data_parser.build_dic(employee_data_df, employee_feature_names)
employee_dic_list = data_parser.build_row_dic_list(employee_data_df, employee_feature_names)

# Lookup table built from the row dicts using the "EmailAddress" field.
employees_dic = data_parser.convert_dic_list_to_dic(employee_dic_list, "EmailAddress")

# Split the (stripped) email addresses by whether a military service branch
# is recorded for the employee.
military_set = set()
non_military_set = set()
for employee in employee_dic_list:
    has_service_record = not pd.isnull(employee['MilitaryServiceBranch'])
    target_set = military_set if has_service_record else non_military_set
    target_set.add(employee['EmailAddress'].strip())

# Email headers, one dict per row.
email_record_df = data_parser.parse_csv_data('./data/email_headers.csv', 'ISO-8859-1')
email_record_feature_names = data_parser.extract_column_names(email_record_df)
email_record_dic_list = data_parser.build_row_dic_list(email_record_df, email_record_feature_names)

def generate_G(data_list, date, military, military_set, re):
    """Build a multigraph of the emails sent on one day.

    Args:
        data_list: email records (dicts) with 'From', 'To', 'Subject' and
            'Date' entries; 'To' is a comma-separated list of recipients.
        date: the day to keep, in the same text form as the part of
            ``record['Date']`` before the first space.
        military: when truthy, keep only emails whose sender is in
            ``military_set``; when falsy, keep every sender.
        military_set: collection of sender addresses used by the filter above.
        re: when truthy, skip emails whose subject contains "RE". (The name
            shadows the ``re`` module inside this function; it is kept
            because it is part of the signature.)

    Returns:
        An ``nx.MultiGraph`` with one edge per (sender, recipient) pair of
        every email kept. Each edge carries ``rel_type`` and the full record
        in ``details``.
    """
    G = nx.MultiGraph()
    for data in data_list:
        # Compare the date portion exactly. The earlier substring test
        # (``date in data['Date']``) let "1/6/2014" match "11/6/2014 ...".
        if data['Date'].split(" ")[0] != date:
            continue
        if military and data['From'] not in military_set:
            continue
        # The subject is only inspected when the reply filter is on.
        # NOTE(review): "RE" matches anywhere in the subject, so a subject
        # such as "PRESENTATION" is also treated as a reply - confirm intent.
        if re and "RE" in data['Subject']:
            continue
        for to in data['To'].split(','):
            G.add_edge(data['From'], to.strip(), rel_type="send email to", details=data)
    return G
def get_date(data_list):
    """Return the set of distinct days on which the given emails were sent.

    Args:
        data_list: email records (dicts) whose 'Date' entry starts with a
            ``month/day/year`` date followed by a space and the time.

    Returns:
        A set of ``datetime.datetime`` objects, one per distinct day, with
        the time part left at midnight.
    """
    # Imported locally under a private alias: a later cell runs
    # ``from datetime import date, time, datetime``, which rebinds the global
    # name ``datetime`` from the module to the class and would break a
    # ``datetime.datetime.strptime`` lookup here.
    import datetime as _datetime

    return {
        _datetime.datetime.strptime(data['Date'].split(" ")[0], "%m/%d/%Y")
        for data in data_list
    }

# Distinct calendar days in the order they first appear in the email records.
date_list = []
seen_days = set()
for record in email_record_dic_list:
    day = record['Date'].split(" ")[0]
    if day not in seen_days:
        seen_days.add(day)
        date_list.append(day)

# One graph per day: the sender filter is off (military=False) and emails
# whose subject contains "RE" are left out (re=True).
G_list = [
    generate_G(email_record_dic_list, day, False, military_set, True)
    for day in date_list
]




In [193]:
from collections import Counter
import matplotlib as mpl
from matplotlib.cm import ScalarMappable
from ipywidgets import interact, widgets
import warnings
warnings.filterwarnings("ignore")

def _node_profile(node):
    """Return ``(name, title, military, employment type)`` labels for a node.

    Nodes without an entry in ``employees_dic`` get the first
    whitespace-separated chunk of the node text as their name and
    'information not found' for every other field.
    """
    if node not in employees_dic:
        missing = 'information not found'
        return node.split()[0], missing, missing, missing
    record = employees_dic[node]
    full_name = record['LastName'] + ", " + record['FirstName']
    branch = record['MilitaryServiceBranch']
    military = 'none' if pd.isnull(branch) else branch
    return full_name, record['CurrentEmploymentTitle'], military, record['CurrentEmploymentType']


# Per-day hover labels, aligned with the node order of each day's graph.
name_matrix = []
title_matrix = []
military_matrix = []
type_matrix = []
# ``G`` remains bound to the last graph after this loop, exactly as before.
for G in G_list:
    profiles = [_node_profile(node) for node in list(G.nodes())]
    name_matrix.append([profile[0] for profile in profiles])
    title_matrix.append([profile[1] for profile in profiles])
    military_matrix.append([profile[2] for profile in profiles])
    type_matrix.append([profile[3] for profile in profiles])

# Per-day node and edge lists in the same order as ``G_list``.
node_matrix = [list(graph.nodes()) for graph in G_list]
edges_matrix = [list(graph.edges()) for graph in G_list]

def get_color(edges):
    """Map each edge to a hex colour that encodes how often it occurs.

    Args:
        edges: list of ``(u, v)`` pairs; repeated pairs stand for repeated
            emails between the same two nodes.

    Returns:
        A list of ``'#rrggbb'`` strings, one per entry of ``edges`` and in
        the same order. Colours come from the 'YlOrRd' colormap, normalised
        between the smallest and largest pair count in ``edges``. An empty
        input yields an empty list.
    """
    # Without this guard min()/max() below raise ValueError on a day with no edges.
    if not edges:
        return []

    pair_counts = Counter(edges)
    edge_weight_list = [pair_counts[u, v] for u, v in edges]

    cnorm = mpl.colors.Normalize(vmin=min(edge_weight_list), vmax=max(edge_weight_list))
    scalarMap = ScalarMappable(norm=cnorm, cmap='YlOrRd')

    color_list = []
    for weight in edge_weight_list:
        rgba = scalarMap.to_rgba(weight)
        # Channels are floats in the 0-1 range; truncate to 0-255 integers.
        red, green, blue = (int(channel * 255) for channel in rgba[:3])
        color_list.append('#{r:02x}{g:02x}{b:02x}'.format(r=red, g=green, b=blue))
    return color_list

# Per-day edge colours plus the two endpoints of every edge.
color_matrix = [get_color(day_edges) for day_edges in edges_matrix]
edges_start_matrix = [[edge[0] for edge in day_edges] for day_edges in edges_matrix]
edges_end_matrix = [[edge[1] for edge in day_edges] for day_edges in edges_matrix]

# Both data sources start out showing the first day.
node_source = ColumnDataSource(data={
    'index': node_matrix[0],
    'name': name_matrix[0],
    'title': title_matrix[0],
    'military': military_matrix[0],
    'emp_type': type_matrix[0],
})
edge_source = ColumnDataSource(data={
    'start': edges_start_matrix[0],
    'end': edges_end_matrix[0],
    'color': color_matrix[0],
})

# Group the email fields by day for the detail table: entry k of each matrix
# holds the values for ``date_list[k]``.
#
# Every email is filed under the day given by the date part of its 'Date'
# field. The earlier running-index version advanced to the next day in an
# ``else`` branch without storing the email that triggered the switch, so the
# first email of every later day was dropped, and it relied on the records
# being sorted by date.
day_position = {day: position for position, day in enumerate(date_list)}
sub_matrix = [[] for _ in date_list]
datetime_matrix = [[] for _ in date_list]
from_matrix = [[] for _ in date_list]
to_matrix = [[] for _ in date_list]
for email in email_record_dic_list:
    position = day_position[email["Date"].split(" ")[0]]
    sub_matrix[position].append(email["Subject"])
    datetime_matrix[position].append(email['Date'])
    from_matrix[position].append(email['From'])
    to_matrix[position].append(email['To'])

from bokeh.models.widgets import DataTable, TableColumn
from bokeh.layouts import column, widgetbox, row

# Figure that hosts the email graph; hovering a node shows the employee details.
plot = figure(title="Networkx Integration Demonstration", x_range=(-1.1,1.1), y_range=(-1.1,1.1),
              tools="pan,wheel_zoom,box_zoom,reset,previewsave", output_backend="webgl")
plot.add_tools(HoverTool(tooltips=[("Name","@name"), ("Title", "@title"), ("Military", "@military"),
                                  ("Type", "@emp_type")]), TapTool(), BoxSelectTool())

graph_renderer = GraphRenderer()
graph_renderer.node_renderer.data_source.data = node_source.data
graph_renderer.edge_renderer.data_source.data = edge_source.data

# Compute node positions from the union of every day's graph so that each
# node of each day has a position. The earlier code laid out ``G``, a loop
# variable left over from a previous loop (i.e. only the last day's graph),
# although the first day is shown initially and any day can be selected.
graph_layout = nx.spring_layout(nx.compose_all(G_list), scale=2, center=(0, 0))
graph_renderer.layout_provider = StaticLayoutProvider(graph_layout=graph_layout)

# Node styling: default, selected and hovered.
graph_renderer.node_renderer.glyph = Circle(size=10, fill_color=Blues8[3])
graph_renderer.node_renderer.selection_glyph = Circle(size=15, fill_color=Spectral4[2])
graph_renderer.node_renderer.hover_glyph = Circle(size=15, fill_color=Spectral4[1])

# Edge styling: the default glyph takes its colour from the "color" column.
graph_renderer.edge_renderer.glyph = MultiLine(line_color="color", line_alpha=0.8, line_width=2)
graph_renderer.edge_renderer.selection_glyph = MultiLine(line_color="blue", line_width=2)
graph_renderer.edge_renderer.hover_glyph = MultiLine(line_color=Spectral4[3], line_width=2)

# Selecting a node also selects the edges linked to it.
graph_renderer.selection_policy = NodesAndLinkedEdges()

plot.renderers.append(graph_renderer)



# Detail table below the graph; it starts out showing the first day's emails.
table_source = ColumnDataSource(data={
    'from_email': from_matrix[0],
    'to_email': to_matrix[0],
    'datetime': datetime_matrix[0],
    'subject': sub_matrix[0],
})

columns = [
    TableColumn(field=field_name, title=heading, width=pixel_width)
    for field_name, heading, pixel_width in (
        ("from_email", "From", 30),
        ("to_email", "To", 240),
        ("datetime", "Date", 30),
        ("subject", "Subject", 120),
    )
]
email_table_widget = DataTable(source=table_source, columns=columns, height=280)
data_table = widgetbox(email_table_widget)

# Graph on top, table underneath; the notebook handle allows later pushes.
layout = column(plot, data_table)
show(layout, notebook_handle=True)

# Dropdown listing every day found in the email records.
date_dropdown = widgets.Dropdown(options=date_list, value=date_list[0],
                                 description='Date', disabled=False)


def update_graph(date):
    """Switch the graph and the email table to the day ``date``.

    ``date`` must be an entry of ``date_list``. Each column of the node, edge
    and table data sources is replaced with that day's values, then the
    change is pushed to the notebook output.
    """
    i = date_list.index(date)

    node_fields = (('index', node_matrix), ('name', name_matrix), ('title', title_matrix),
                   ('military', military_matrix), ('emp_type', type_matrix))
    for field_name, per_day in node_fields:
        node_source.data[field_name] = per_day[i]

    edge_fields = (('start', edges_start_matrix), ('end', edges_end_matrix),
                   ('color', color_matrix))
    for field_name, per_day in edge_fields:
        edge_source.data[field_name] = per_day[i]

    table_fields = (('from_email', from_matrix), ('to_email', to_matrix),
                    ('datetime', datetime_matrix), ('subject', sub_matrix))
    for field_name, per_day in table_fields:
        table_source.data[field_name] = per_day[i]

    push_notebook()


interact(update_graph, date = date_dropdown)

E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: color [renderer: GlyphRenderer(id='c9610539-3023-4845-ba9b-56729507dd9d', ...)]


A Jupyter Widget

<function __main__.update_graph>

The graph above displays the relationships among different email addresses on each day, and the table displays the detailed information of the emails for the selected date; you can change the date with the dropdown widget. **However, there is a problem with the interaction: it takes a relatively long time to render the graph when you select a specific date**.
Each node in the graph represents an email address, you can find the corresponding information of each email address through hover. The edges represent the relationships of senders and receivers, the color of edges represents the frequency of emails sent between two nodes, **higher frequency with deeper color**. **What's more, when you select a node, the plot would display all related nodes and edges in the graph**.
We tried to use the graph to find the co-conspirators in the company, but without success.

In [105]:
import io, os, re
from datetime import date, time, datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

from bokeh.io import push_notebook, output_notebook, show
from bokeh.plotting import figure
from bokeh.models import HoverTool, ColumnDataSource, CustomJS
from bokeh.models.widgets import DataTable, TableColumn, Paragraph, PreText, TextInput
from bokeh.layouts import column, widgetbox, row
from bokeh.palettes import d3

from ipywidgets import interact, widgets

import pandas as pd

# Article Analysis
In following parts, we will analyze those articles and try to find the answers to Mini Challenge 1.

## 1. Text Structure Analyzing and Partition 
First, we need to know what the articles look like and how to transform them into a common structure so that they can be visualized.
By scanning the raw text of several articles, we find that almost every article contains these 4 elements: Publisher, Title, Date, Content. Some articles (actually a lot) also contain the reporting time of the article, which is necessary to help us answer MC1.2. So we should extract those 5 elements from the articles.
Using regular expressions to extract DATE and TIME is the major job in this part.
**NOTICE: Several articles with irregular format was edited by hand!**

In [106]:
file_names = os.listdir("data/articles/")
file_names.sort(key = lambda x: int(x.split(".")[0]))
article_idx = []
article_pub = []
article_title = []
article_date = []
article_time = []
article_content = []

pat1 = re.compile(r"\d{4}/\d{1,2}/\d{1,2}")                 # date as year/month/day
pat2 = re.compile(r"\d{1,2}\s\w{3,9}\s\d{4}")               # date as day month-name year
pat3 = re.compile(r"([1-9]|1[0-2]):([0-5][0-9])\s(\wM)")    # 12-hour time, e.g. "9:30 AM"
pat4 = re.compile(r"([0-1][0-9]|2[0-3])([0-5][0-9])")       # 24-hour time written as HHMM
month = ["January", "February", "March", "April", "May", "June",
         "July", "August", "September", "October", "November", "December"]


def _locate_date_line(lines):
    """Return ``(line_index, is_slash_format)`` of the date line, or None.

    Lines 2 and 3 are tried with the year/month/day pattern first and then
    with the "day month-name year" pattern, the same priority as before.
    """
    for pattern, is_slash_format in ((pat1, True), (pat2, False)):
        for line_index in (2, 3):
            if line_index < len(lines) and pattern.search(lines[line_index]):
                return line_index, is_slash_format
    return None


def _parse_article_date(line, is_slash_format):
    """Convert a date line in either supported format into a ``date``."""
    if is_slash_format:
        parts = line.split('/')
        return date(int(parts[0]), int(parts[1]), int(parts[2]))
    parts = line.split(' ')
    return date(int(parts[2]), month.index(parts[1]) + 1, int(parts[0]))


def _extract_report_time(content):
    """Return the first reporting time found in ``content``, else 06:00."""
    found_12h = pat3.findall(content)
    if found_12h:
        hour_text, minute_text, meridiem = found_12h[0]
        hour = int(hour_text)
        if meridiem == 'PM' and hour < 12:
            hour += 12
        elif meridiem == 'AM' and hour == 12:
            # 12:xx AM is just after midnight.
            hour = 0
        return time(hour, int(minute_text))
    # NOTE(review): pat4 matches any four digits that form a valid HHMM, so a
    # year such as "2014" in the text is read as 20:14 - confirm whether the
    # pattern should be tightened.
    found_24h = pat4.findall(content)
    if found_24h:
        # Group 0 is the hour and group 1 the minutes (the minutes used to be
        # read from group 0 as well).
        return time(int(found_24h[0][0]), int(found_24h[0][1]))
    # No time in the text: fall back to the 06:00 default.
    return time(6, 0)


for fname in file_names:
    with io.open("data/articles/" + fname, 'r', encoding = "ISO-8859-1") as f:
        lines = [line for line in f.read().split('\n') if line != '']

    located = _locate_date_line(lines)
    if located is None:
        # Previously this either raised a NameError or silently reused the
        # position found for the previous article.
        raise ValueError("no date line found in article file " + fname)
    date_line_index, is_slash_format = located

    # Everything after the date line is the article body.
    content = "".join(" " + line for line in lines[date_line_index + 1:])

    # Append to all lists together so they stay aligned.
    # The index comes from the file name up to the first dot, as in the sort key.
    article_idx.append(int(fname.split(".")[0]))
    article_pub.append(lines[0])
    article_title.append(lines[1])
    article_content.append(content)
    article_date.append(_parse_article_date(lines[date_line_index], is_slash_format))
    article_time.append(_extract_report_time(content))

## TF-IDF Calculating 
We use TF-IDF to cluster those articles and to find the most important words of each cluster after clustering. Here we go.

In [107]:
def get_tfidf(article_content):
    """Compute TF-IDF weights for a collection of documents.

    Args:
        article_content: iterable of document strings.

    Returns:
        ``(feature_names, tfidf_matrix)``: the vocabulary reported by the
        count vectorizer (English stop words removed) and the TF-IDF matrix
        produced from the term counts.
    """
    vectorizer = CountVectorizer(stop_words='english', analyzer='word')
    term_counts = vectorizer.fit_transform(article_content)
    vocabulary = vectorizer.get_feature_names()
    weighter = TfidfTransformer().fit(term_counts)
    return vocabulary, weighter.transform(term_counts)

# TF-IDF of the individual articles, used as clustering input. Unlike
# get_tfidf above, the vectorizer is given max_df=0.2 and min_df=0.01 to
# prune the vocabulary by document frequency.
# NOTE(review): presumably this drops terms found in more than 20% or fewer
# than 1% of the articles - confirm against the scikit-learn documentation.
count_vect = CountVectorizer(stop_words = 'english', analyzer = 'word', max_df=0.2, min_df=0.01)
article_content_tf = count_vect.fit_transform(article_content)
feat_name = count_vect.get_feature_names()
tf_transformer = TfidfTransformer().fit(article_content_tf)
article_content_tfidf = tf_transformer.transform(article_content_tf)


## Clustering Algorithm
Use machine learning methods from sklearn library including **KMeans**, **Spectral**, **AgglomerativeClustering**.

In [108]:
num_clusters = 10
# Fixed seed for the randomly initialised algorithms, so that cluster labels
# (and the plots and tables derived from them) are stable across reruns.
CLUSTER_SEED = 0

from sklearn.cluster import KMeans
km = KMeans(n_clusters = num_clusters, random_state = CLUSTER_SEED)
km.fit(article_content_tfidf)

from sklearn.cluster import SpectralClustering
sc = SpectralClustering(n_clusters = num_clusters, random_state = CLUSTER_SEED)
sc.fit(article_content_tfidf)

# Agglomerative clustering is given a dense array rather than the sparse matrix.
from sklearn.cluster import AgglomerativeClustering
ac = AgglomerativeClustering(n_clusters = num_clusters)
ac.fit(article_content_tfidf.toarray())

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward',
            memory=Memory(cachedir=None), n_clusters=10,
            pooling_func=<function mean at 0x110bf9d90>)

## Cluster Merging 
Merge the words of all articles in each cluster and extract the most important words based on TF-IDF.

In [109]:
def merge_cluster_contents(model, song_list):
    """Concatenate the documents of each cluster and count them.

    Args:
        model: fitted clustering model exposing ``labels_``, one label per
            document.
        song_list: the documents (strings), in the same order as the labels.

    Returns:
        ``(cluster_size, dic_model)``: two lists of length ``num_clusters``
        (a notebook-level variable) holding, per cluster label, the number of
        documents and their concatenated text.
    """
    cluster_size = [0] * num_clusters
    dic_model = [""] * num_clusters
    for label, text in zip(model.labels_, song_list):
        cluster_size[label] += 1
        dic_model[label] = dic_model[label] + text
    return cluster_size, dic_model

# Merge the article texts per cluster for each of the three algorithms.
km_cluster_size, km_content = merge_cluster_contents(km, article_content)
sc_cluster_size, sc_content = merge_cluster_contents(sc, article_content)
ac_cluster_size, ac_content = merge_cluster_contents(ac, article_content)

# Cluster sizes in the order KMeans, spectral, agglomerative.
cluster_size_list = [km_cluster_size, sc_cluster_size, ac_cluster_size]

# TF-IDF over the merged texts: each cluster is treated as one document.
km_feat_name, km_content_tfidf = get_tfidf(km_content)
sc_feat_name, sc_content_tfidf = get_tfidf(sc_content)
ac_feat_name, ac_content_tfidf = get_tfidf(ac_content)


def get_top_words(feat_name, article_content_tfidf, top):
    """Return the highest-weighted words of every row as a joined string.

    Args:
        feat_name: vocabulary, aligned with the matrix columns.
        article_content_tfidf: matrix supporting ``.todense().tolist()``,
            one row per cluster.
        top: number of words to keep per row.

    Returns:
        A list with one ``", "``-joined string per row, words ordered from
        highest to lowest weight.
    """
    def _best_words(weights):
        ranked = sorted(zip(feat_name, weights), key=lambda pair: pair[1], reverse=True)
        return ", ".join(word for word, _ in ranked[:top])

    rows = article_content_tfidf.todense().tolist()
    return [_best_words(weights) for weights in rows]

# Number of words shown per cluster.
top = 10
km_top_words = get_top_words(km_feat_name, km_content_tfidf, top)
sc_top_words = get_top_words(sc_feat_name, sc_content_tfidf, top)
ac_top_words = get_top_words(ac_feat_name, ac_content_tfidf, top)

# Same order as the cluster sizes: KMeans, spectral, agglomerative.
top_words_list = [km_top_words, sc_top_words, ac_top_words]

## Dimensionality Reduction
Use t-distributed stochastic neighbor embedding in sklearn library to reduce the dimension. Reduce the dimension at the number of words to 2 dimension for better plotting.

In [110]:
# t-SNE settings for projecting the articles onto the plane.
tsne_init = 'pca'
tsne_perplexity = 20.0
tsne_early_exaggeration = 4.0
tsne_learning_rate = 1000
random_state = 1
model = TSNE(
    n_components=2,
    random_state=random_state,
    init=tsne_init,
    perplexity=tsne_perplexity,
    early_exaggeration=tsne_early_exaggeration,
    learning_rate=tsne_learning_rate,
)

# Pairwise cosine distance between the article TF-IDF vectors.
dist = 1 - cosine_similarity(article_content_tfidf)
# NOTE(review): the distance matrix is handed to t-SNE as if its rows were
# feature vectors (no metric argument is passed) - confirm this is intended.
transformed_tfidf = model.fit_transform(dist)

# Plot coordinates, one entry per article.
xs, ys = transformed_tfidf[:, 0], transformed_tfidf[:, 1]

## Plotting the Clustering Result
### 1. Prepare for DataSource

In [111]:
#print(artist_list)
# Column data for the cluster scatter plot, one entry per article.
data_cluster = {
    'x': xs,
    'y': ys,
    'index': article_idx,
    'publisher': article_pub,
    'title': article_title,
    'date': article_date,
    'time': article_time,
    'content': article_content,
}
color_list = d3['Category10'][num_clusters]

# Per-algorithm point colours and legend labels, in the order
# KMeans, spectral, agglomerative.
color_list_algo = [
    [color_list[label] for label in fitted.labels_]
    for fitted in (km, sc, ac)
]
legend_list_algo = [
    ["Cluster " + str(label) for label in fitted.labels_]
    for fitted in (km, sc, ac)
]

# Start with the KMeans assignment.
data_cluster['color'] = color_list_algo[0]
data_cluster['legend'] = legend_list_algo[0]

source = ColumnDataSource(data=data_cluster)

## Plotting the Clustering Result
### 2. Plotting and Interaction
From the clustering plot we can see that our dimensionality reduction works well, since all small clusters are made up of articles with high similarity (almost the same title!). We think agglomerative clustering gives the better result, because it puts almost all the articles from Jan. 20 and 21 in one cluster, and those articles tend to be more similar since they all concern the GAStech incident.

In [112]:
# Hover tool showing file index, paper, title and date of an article.
# This cell uses its own names (``cluster_*``): the email-graph cell's
# ``update_graph`` callback reads the global ``table_source``, and reusing
# ``table_source`` / ``columns`` / ``data_table`` / ``layout`` / ``hover``
# here redirected that callback to this cell's table.
cluster_hover = HoverTool(tooltips=[('File','@index'),('Paper','@publisher'),('Title','@title'),("Date","@date{%F}")], formatters={'date':'datetime'})

plot1 = figure(plot_width=900, plot_height=600, tools = [cluster_hover,"pan,wheel_zoom,box_zoom,reset,previewsave"])
plot1.background_fill_color = "lightgrey"
plot1.background_fill_alpha = 0.2
plot1.circle('x' ,'y' ,source=source, size=8, color="color", alpha=0.5, legend="legend")

# Table of the most important words per cluster, starting with KMeans.
cluster_table_data = dict(cluster=["Cluster " + str(i) for i in range(num_clusters)],
                          topwords=top_words_list[0])
cluster_table_source = ColumnDataSource(cluster_table_data)
cluster_columns = [ TableColumn(field="cluster", title="Cluster", width=50),
                    TableColumn(field="topwords", title="Most important words", width=300)]
cluster_table = widgetbox(DataTable(source=cluster_table_source, columns=cluster_columns, width=850, height = 280))

cluster_layout = column(column(plot1, cluster_table))
show(cluster_layout, notebook_handle = True)

# Order matches color_list_algo, legend_list_algo and top_words_list.
algorithm_list = ["KMeans", "Spectral Clustering", "Agglomerative Clustering"]

algorithms_dropdown = widgets.Dropdown(
    options = algorithm_list,
    value = algorithm_list[0],
    description = 'Algorithm',
    disabled=False,
)

def update_plot(algo):
    """Recolour the scatter plot and refresh the top-words table for ``algo``.

    ``algo`` must be an entry of ``algorithm_list``.
    """
    index = algorithm_list.index(algo)
    source.data['color'] = color_list_algo[index]
    source.data['legend'] = legend_list_algo[index]
    cluster_table_source.data['topwords'] = top_words_list[index]
    push_notebook()

interact(update_plot, algo = algorithms_dropdown)

A Jupyter Widget

<function __main__.update_plot>

## Plotting the Sequence of Article Publishing Time
### 1. Prepare for DataSource
That's for Mini Challenge 1.2, to get detailed knowledge of what happened in selected days (Jan. 20, Jan. 21).

In [113]:
# Use the agglomerative clustering result for the timeline.
data_cluster["label"] = ac.labels_
data_cluster['color'] = color_list_algo[2]
data_cluster['legend'] = legend_list_algo[2]

# Keep only the articles dated strictly between Jan. 19 and Jan. 22, 2014,
# ordered by date.
df_givendays = pd.DataFrame(data=data_cluster).sort_values('date')
in_window = (df_givendays.date > date(2014,1,19)) & (df_givendays.date < date(2014,1,22))
df_givendays = df_givendays[in_window]

# Combine each article's date and time into one timestamp for the x axis.
data_givendays = df_givendays.to_dict('list')
dtime = [
    datetime.combine(day, clock)
    for day, clock in zip(data_givendays["date"], data_givendays["time"])
]
data_givendays["dtime"] = dtime
source2 = ColumnDataSource(data=data_givendays)

## Plotting the Sequence of Article Publishing Time
### 2. Plotting and Interaction
We can get the title and index of the articles along the time series using the hover tool, and then enter the index of the article we want to read in the article index input field to load it.

In [114]:
# add hover tool
# Hover tool showing file index, publisher and title of an article.
hover = HoverTool(tooltips=[('File','@index'),('Publisher', '@publisher'),('Title', '@title')])

# Timeline: x is the publication timestamp, y is the cluster label.
plot2 = figure(
    plot_width=900,
    plot_height=400,
    x_axis_type='datetime',
    y_range=[0,3],
    tools = [hover,"pan,wheel_zoom,box_zoom,reset,previewsave"],
)
plot2.background_fill_color = "lightgrey"
plot2.background_fill_alpha = 0.2
plot2.yaxis.visible = False
plot2.circle('dtime' ,'label' ,source=source2, size=8, color="color", alpha=0.5, legend="legend")

# One faint purple ray per timestamp, starting below the visible range and
# pointing straight up (90 degrees).
# NOTE(review): length=0 presumably makes bokeh draw the ray without an end - confirm.
timestamp_count = len(dtime)
plot2.ray(x=dtime, y=[-10] * timestamp_count, length=0, angle=[90] * timestamp_count,
          color="purple", angle_units="deg", line_width=1, alpha=0.2)
show((plot2), notebook_handle = True)

# Lookup from article file index to article text, used by the index widget.
data_text = dict(zip(data_cluster['index'], data_cluster['content']))

def update_text(num):
    """Print the content of the article whose file index is ``num``.

    The index widget accepts any integer in its range, including values that
    have no article; those print a short notice instead of raising KeyError.
    """
    content = data_text.get(num)
    if content is None:
        print("No article with index {}".format(num))
        return
    print(content)

# Integer input for choosing which article to print; update_text runs on every change.
# NOTE(review): the initial value (682) and the bounds (0-844) are hard-coded -
# presumably they match the article file indices; verify against data_text.
num_input = widgets.BoundedIntText(value=682, min=0, max=844, step=1, description='Article Index:', disabled=False)
interact(update_text, num=num_input)

A Jupyter Widget

<function __main__.update_text>