In [30]:
import io, os, re
from datetime import date, time, datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

from bokeh.io import push_notebook, output_notebook, show
from bokeh.plotting import figure
from bokeh.models import HoverTool, ColumnDataSource, CustomJS
from bokeh.models.widgets import DataTable, TableColumn, Paragraph, PreText, TextInput
from bokeh.layouts import column, widgetbox, row
from bokeh.palettes import d3

from ipywidgets import interact, widgets

import pandas as pd

import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by
# the libraries imported above are hidden as well - confirm that is intended.
warnings.filterwarnings('ignore')

In [31]:
# Configure Bokeh to render plots inline in the notebook; must run before any show().
output_notebook()

# MC1.2: Article Analysis
In the following parts, we analyze the articles and try to find the answers to Mini Challenge 1.

## 1. Text Structure Analyzing and Partition 
First, we need to understand what the articles look like and how to transform them into a common structure so that they can be visualized.
By scanning the raw text of several articles, we find that almost every article contains these 4 elements: Publisher, Title, Date, Content. Some articles (actually a lot) also contain the reporting time of the article, which is necessary to help us answer MC1.2. So we should extract those 5 elements from the articles.
Using regular expressions to extract DATE and TIME is the major job for this part.
**NOTICE: Several articles with an irregular format were edited by hand!**

In [32]:
# Directory holding one plain-text article per file, named "<integer>.txt".
ARTICLE_DIR = "data/articles/"

# Keep only entries whose stem is an integer so that stray directory entries
# (hidden files, checkpoint folders, ...) cannot break the numeric sort below.
file_names = [name for name in os.listdir(ARTICLE_DIR) if name.split(".")[0].isdigit()]
file_names.sort(key = lambda x: int(x.split(".")[0]))

# Parallel lists: position k in every list describes the same article.
article_idx = []
article_pub = []
article_title = []
article_date = []
article_time = []
article_content = []

pat1 = re.compile(r"\d{4}/\d{1,2}/\d{1,2}")                  # date such as 2014/01/20
pat2 = re.compile(r"\d{1,2}\s\w{3,9}\s\d{4}")                # date such as 20 January 2014
pat3 = re.compile(r"([1-9]|1[0-2]):([0-5][0-9])\s(\wM)")     # 12-hour time such as 9:45 PM
# 24-hour time such as 1830.
# NOTE(review): this also matches other 4-digit runs (e.g. the year "2014"
# parses as 20:14) - confirm whether the pattern should be tightened.
pat4 = re.compile(r"([0-1][0-9]|2[0-3])([0-5][0-9])")
month = ["January", "February", "March", "April", "May", "June", 
         "July", "August", "September", "October", "November", "December"]


def _parse_article_date(lines):
    """Find and parse the publication date on the 3rd or 4th non-empty line.

    The numeric form (pat1) is tried on both lines before the textual form
    (pat2), which is the same priority the date lines were probed in before.

    Returns:
        (position, date) where position is the index of the date line in
        `lines`, or None when neither line contains a recognizable date.

    Raises:
        ValueError: if the matched text is not a valid calendar date or the
            month name is not one of the entries of `month`.
    """
    for pattern, is_numeric in ((pat1, True), (pat2, False)):
        for pos in (2, 3):
            if pos >= len(lines):
                continue
            found = pattern.search(lines[pos])
            if not found:
                continue
            # Parse the matched text only, so surrounding text on the same
            # line cannot break the integer conversions.
            if is_numeric:
                year, month_no, day = (int(part) for part in found.group(0).split('/'))
            else:
                day_text, month_name, year_text = found.group(0).split()
                year, month_no, day = int(year_text), month.index(month_name) + 1, int(day_text)
            return pos, date(year, month_no, day)
    return None


def _parse_report_time(content, default=time(6, 0)):
    """Return the first reporting time mentioned in `content`.

    A 12-hour time (pat3) takes precedence over a 24-hour time (pat4);
    `default` is returned when neither pattern matches.
    """
    twelve_hour = pat3.search(content)
    if twelve_hour:
        hour, minute = int(twelve_hour.group(1)), int(twelve_hour.group(2))
        meridiem = twelve_hour.group(3)
        if meridiem == 'PM' and hour < 12:
            hour += 12
        elif meridiem == 'AM' and hour == 12:
            hour = 0  # 12:xx AM is just after midnight
        return time(hour, minute)
    military = pat4.search(content)
    if military:
        # Group 1 is the hour and group 2 the minute.
        return time(int(military.group(1)), int(military.group(2)))
    return default


for fname in file_names:
    # Read the whole file first so the handle is closed before parsing.
    with io.open(os.path.join(ARTICLE_DIR, fname), 'r', encoding = "ISO-8859-1") as f:
        raw_text = f.read()
    article = [line for line in raw_text.split('\n') if line != '']
    if len(article) < 2:
        raise ValueError("article %s has no publisher/title lines" % fname)

    try:
        located = _parse_article_date(article)
    except ValueError as err:
        raise ValueError("cannot parse the date of article %s: %s" % (fname, err))
    if located is None:
        # Fail loudly instead of silently reusing the previous article's layout.
        raise ValueError("no recognizable date on line 3 or 4 of article %s" % fname)
    date_pos, published = located

    # Everything after the date line is the body; every line is prefixed
    # with a single space.
    content = "".join(" " + line for line in article[date_pos + 1:])

    # Append only after all parsing succeeded so the lists stay aligned.
    article_idx.append(int(fname.split(".")[0]))
    article_pub.append(article[0])
    article_title.append(article[1])
    article_content.append(content)
    article_date.append(published)
    article_time.append(_parse_report_time(content))

## TF-IDF Calculating 
We use TF-IDF to cluster the articles and, after clustering, to find the most important words for each cluster. Here we go.

In [33]:
def get_tfidf(article_content):
    """Compute a TF-IDF matrix for a collection of documents.

    Args:
        article_content: iterable of document strings.

    Returns:
        (feat_name, article_content_tfidf): the vocabulary as a list of
        words, and the TF-IDF matrix produced by TfidfTransformer with one
        row per document and one column per vocabulary word.
    """
    count_vect = CountVectorizer(stop_words = 'english', analyzer = 'word')
    article_content_tf = count_vect.fit_transform(article_content)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when present and keep returning a plain list either way.
    if hasattr(count_vect, "get_feature_names_out"):
        feat_name = count_vect.get_feature_names_out().tolist()
    else:
        feat_name = count_vect.get_feature_names()
    article_content_tfidf = TfidfTransformer().fit_transform(article_content_tf)
    return feat_name, article_content_tfidf

# TF-IDF of the individual articles, used for clustering and for t-SNE.
# max_df/min_df restrict the vocabulary by document frequency: words in more
# than 20% or in fewer than 1% of the articles are dropped.
count_vect = CountVectorizer(stop_words = 'english', analyzer = 'word', max_df=0.2, min_df=0.01)
article_content_tf = count_vect.fit_transform(article_content)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when present and keep feat_name a plain list either way.
if hasattr(count_vect, "get_feature_names_out"):
    feat_name = count_vect.get_feature_names_out().tolist()
else:
    feat_name = count_vect.get_feature_names()
tf_transformer = TfidfTransformer().fit(article_content_tf)
article_content_tfidf = tf_transformer.transform(article_content_tf)
#print(article_content_tfidf.shape)
#print(feat_name)

## Clustering Algorithm
Use machine learning methods from sklearn library including **KMeans**, **Spectral**, **AgglomerativeClustering**.

In [34]:
num_clusters = 10
# KMeans and SpectralClustering are stochastic; a fixed seed makes the cluster
# ids (and therefore colors, legends and top words) repeatable between runs.
cluster_random_state = 1

from sklearn.cluster import KMeans
km = KMeans(n_clusters = num_clusters, random_state = cluster_random_state)
km.fit(article_content_tfidf)

from sklearn.cluster import SpectralClustering
sc = SpectralClustering(n_clusters = num_clusters, random_state = cluster_random_state)
sc.fit(article_content_tfidf)

from sklearn.cluster import AgglomerativeClustering
ac = AgglomerativeClustering(n_clusters = num_clusters)
# The TF-IDF matrix is converted to a dense array before fitting this model.
ac.fit(article_content_tfidf.toarray())

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward', memory=None, n_clusters=10,
            pooling_func=<function mean at 0x107761a60>)

## Cluster Merging 
Merge the words of all articles in one cluster and extract the most important words based on TF-IDF.

In [35]:
def merge_cluster_contents(model, song_list):
    """Concatenate the documents of every cluster into one string per cluster.

    Args:
        model: fitted clustering model exposing `labels_` (one integer label
            per document). Its `n_clusters` attribute determines how many
            clusters are reported; the module-level `num_clusters` is used
            only when the model has no such attribute.
        song_list: document strings, in the same order as `model.labels_`
            (here: the article contents).

    Returns:
        (cluster_size, dic_model): the number of documents per cluster and
        the concatenated text per cluster, both indexed by cluster label.
    """
    n_clusters = getattr(model, "n_clusters", None)
    if n_clusters is None:
        n_clusters = num_clusters

    cluster_size = [0] * n_clusters
    # Collect the pieces and join once instead of growing strings with +=.
    pieces = [[] for _ in range(n_clusters)]
    for song, label in zip(song_list, model.labels_):
        pieces[label].append(song)
        cluster_size[label] += 1
    dic_model = ["".join(chunk) for chunk in pieces]
    return cluster_size, dic_model

# Merge the article texts cluster by cluster for KMeans, spectral and
# agglomerative clustering, in that order.
(km_cluster_size, km_content), (sc_cluster_size, sc_content), (ac_cluster_size, ac_content) = (
    merge_cluster_contents(fitted, article_content) for fitted in (km, sc, ac)
)
# One entry per algorithm, in the same order as above.
cluster_size_list = [km_cluster_size, sc_cluster_size, ac_cluster_size]

# TF-IDF over the merged texts: each cluster is treated as one document.
(km_feat_name, km_content_tfidf), (sc_feat_name, sc_content_tfidf), (ac_feat_name, ac_content_tfidf) = (
    get_tfidf(merged) for merged in (km_content, sc_content, ac_content)
)


def get_top_words(feat_name, article_content_tfidf, top):
    """Return, for every row of a TF-IDF matrix, its highest-weighted words.

    Args:
        feat_name: words, aligned with the matrix columns.
        article_content_tfidf: matrix supporting `.todense().tolist()`,
            one row per cluster.
        top: maximum number of words to keep per row.

    Returns:
        A list with one string per row: the words with the largest weights,
        in descending weight order, joined by ", ".
    """
    def weight_of(pair):
        return pair[1]

    summaries = []
    for weights in article_content_tfidf.todense().tolist():
        ranked = list(zip(feat_name, weights))
        # Stable sort: words with equal weight keep their column order.
        ranked.sort(key=weight_of, reverse=True)
        summaries.append(", ".join(word for word, _ in ranked[:top]))
    return summaries

# Number of words shown per cluster.
top = 10
# Top words per cluster for KMeans, spectral and agglomerative clustering.
km_top_words, sc_top_words, ac_top_words = (
    get_top_words(names, weights, top)
    for names, weights in (
        (km_feat_name, km_content_tfidf),
        (sc_feat_name, sc_content_tfidf),
        (ac_feat_name, ac_content_tfidf),
    )
)
# One entry per algorithm, in the same order as above.
top_words_list = [km_top_words, sc_top_words, ac_top_words]

## Dimensionality Reduction
Use t-distributed stochastic neighbor embedding (t-SNE) from the sklearn library for dimensionality reduction: reduce the dimension from the number of words down to 2 dimensions for easier plotting.

In [36]:
# t-SNE hyper-parameters.
tsne_init = 'pca'
tsne_perplexity = 20.0
tsne_early_exaggeration = 4.0
tsne_learning_rate = 1000
random_state = 1

# Pairwise cosine distance between the article TF-IDF vectors.
dist = 1 - cosine_similarity(article_content_tfidf)

# NOTE(review): no metric argument is given, so the rows of `dist` are
# embedded as ordinary feature vectors - confirm this is intended.
model = TSNE(
    n_components=2,
    random_state=random_state,
    init=tsne_init,
    perplexity=tsne_perplexity,
    early_exaggeration=tsne_early_exaggeration,
    learning_rate=tsne_learning_rate,
)
transformed_tfidf = model.fit_transform(dist)

# 2-D coordinates of every article.
xs, ys = transformed_tfidf[:, 0], transformed_tfidf[:, 1]

## Plotting the Clustering Result
### 1. Prepare for DataSource

In [37]:
# Per-article columns of the scatter plot, aligned by position.
data_cluster = {
    'x': xs,
    'y': ys,
    'index': article_idx,
    'publisher': article_pub,
    'title': article_title,
    'date': article_date,
    'time': article_time,
    'content': article_content,
}
color_list = d3['Category10'][num_clusters]

# One color list and one legend list per algorithm, in the order
# KMeans, spectral, agglomerative.
color_list_algo = [
    [color_list[label] for label in fitted.labels_]
    for fitted in (km, sc, ac)
]
legend_list_algo = [
    ["Cluster " + str(label) for label in fitted.labels_]
    for fitted in (km, sc, ac)
]

# Start with the KMeans coloring.
data_cluster['color'] = color_list_algo[0]
data_cluster['legend'] = legend_list_algo[0]

source = ColumnDataSource(data=data_cluster)

## Plotting the Clustering Result
### 2. Plotting and Interaction
From the clustering plot we can see that our dimensionality reduction works well, since every small cluster is made up of articles with high similarity (almost the same title!). I think agglomerative clustering gives the better result because it covers almost all the articles from Jan. 20 and 21 in one cluster, and those articles tend to have higher similarity since their topics concern the GAStech incident.

In [44]:
# Hover tool: file index, publisher, title and the formatted date of a point.
hover = HoverTool(
    tooltips=[
        ('File', '@index'),
        ('Paper', '@publisher'),
        ('Title', '@title'),
        ("Date", "@date{%F}"),
    ],
    formatters={'date': 'datetime'},
)

# Scatter plot of the 2-D embedding, colored by the current cluster labels.
plot1 = figure(
    plot_width=900,
    plot_height=600,
    tools=[hover, "pan,wheel_zoom,box_zoom,reset,previewsave"],
)
plot1.background_fill_color = "lightgrey"
plot1.background_fill_alpha = 0.2
plot1.circle('x', 'y', source=source, size=8, color="color", alpha=0.5, legend="legend")

# Table listing the most important words of every cluster (KMeans initially).
data = dict(
    cluster=["Cluster {}".format(i) for i in range(num_clusters)],
    topwords=top_words_list[0],
)
table_source_1 = ColumnDataSource(data)
columns = [
    TableColumn(field="cluster", title="Cluster", width=50),
    TableColumn(field="topwords", title="Most important words", width=300),
]
data_table_1 = widgetbox(
    DataTable(source=table_source_1, columns=columns, width=850, height=280)
)

layout = column(column(plot1, data_table_1))
# notebook_handle=True lets a later push_notebook() refresh this output.
show(layout, notebook_handle=True)

# Dropdown order matches color_list_algo / legend_list_algo / top_words_list.
algorithm_list = ["KMeans", "Spectral Clustering", "Agglomerative Clustering"]

algorithms_dropdown = widgets.Dropdown(
    options=algorithm_list,
    value=algorithm_list[0],
    description='Algorithm',
    disabled=False,
)

def update_plot(algo):
    """Switch the cluster plot and the word table to the chosen algorithm.

    Args:
        algo: one of the entries of `algorithm_list`.
    """
    chosen = algorithm_list.index(algo)
    updates = (
        (source, 'color', color_list_algo),
        (source, 'legend', legend_list_algo),
        (table_source_1, 'topwords', top_words_list),
    )
    for data_source, column_name, per_algorithm in updates:
        data_source.data[column_name] = per_algorithm[chosen]
    # Send the changed columns to the plot that is already displayed.
    push_notebook()

# Re-run update_plot whenever the dropdown selection changes.
interact(update_plot, algo=algorithms_dropdown)

<function __main__.update_plot>

## Plotting the Sequence of Article Publishing Time
### 1. Prepare for DataSource
This is for Mini Challenge 1.2: to get detailed knowledge of what happened on the selected days (Jan. 20 and Jan. 21).

In [39]:
# Use the agglomerative-clustering result for the timeline.
# Note: this updates data_cluster in place.
data_cluster["label"] = ac.labels_
data_cluster['color'] = color_list_algo[2]
data_cluster['legend'] = legend_list_algo[2]

# Keep only the articles dated Jan. 20 and Jan. 21, 2014 (sorted by date only).
df_givendays = pd.DataFrame(data=data_cluster).sort_values('date')
in_window = (df_givendays.date > date(2014, 1, 19)) & (df_givendays.date < date(2014, 1, 22))
df_givendays = df_givendays[in_window]

data_givendays = df_givendays.to_dict('list')
# Combine each article's date and reporting time into one datetime.
dtime = [
    datetime.combine(day, clock)
    for day, clock in zip(data_givendays["date"], data_givendays["time"])
]
data_givendays["dtime"] = dtime
source2 = ColumnDataSource(data=data_givendays)

## Plotting the Sequence of Article Publishing Time
### 2. Plotting and Interaction
We can get the title and index of the articles published over time using the hover tool, and then enter the index of the article we want to view in the article index input field to load that article.

In [45]:
# Hover tool: file index, publisher and title of an article.
hover = HoverTool(
    tooltips=[
        ('File', '@index'),
        ('Publisher', '@publisher'),
        ('Title', '@title'),
    ]
)

# Timeline: x is the publication datetime, y is the cluster label.
# NOTE(review): y_range is fixed to [0, 3] while 'label' holds ac.labels_;
# points with a larger label start outside the visible range - confirm.
plot2 = figure(
    plot_width=900,
    plot_height=400,
    x_axis_type='datetime',
    y_range=[0, 3],
    tools=[hover, "pan,wheel_zoom,box_zoom,reset,previewsave"],
)
plot2.background_fill_color = "lightgrey"
plot2.background_fill_alpha = 0.2
plot2.yaxis.visible = False
plot2.circle('dtime', 'label', source=source2, size=8, color="color", alpha=0.5, legend="legend")
# One faint ray per article, starting at y=-10 and pointing up (90 degrees),
# to mark its publication time.
plot2.ray(
    x=dtime,
    y=[-10] * len(dtime),
    length=0,
    angle=[90] * len(dtime),
    color="purple",
    angle_units="deg",
    line_width=1,
    alpha=0.2,
)
show(plot2, notebook_handle=True)

# Article file index -> article content.
data_text = dict(zip(data_cluster['index'], data_cluster['content']))

def update_text(num):
    """Print the content of the article whose file index is `num`.

    Prints a short notice instead of raising KeyError when no article with
    that index was loaded.
    """
    text = data_text.get(num)
    if text is None:
        print("No article with index {}.".format(num))
    else:
        print(text)

# NOTE(review): the bounds 0..844 and the default 682 are hard-coded; indices
# inside that range without a loaded article now produce the notice above.
num_input = widgets.BoundedIntText(value=682, min=0, max=844, step=1, description='Article Index:', disabled=False)
interact(update_text, num=num_input)

<function __main__.update_text>

# MC2.1 Routine Analysis


In [41]:
def getSource(df, idx, day):
    """Build the plot data of one car's GPS track for one day of January 2014.

    Args:
        df: DataFrame with the columns 'Timestamp' (datetime), 'id', 'lat'
            and 'long'.
        idx: car id to select.
        day: day of the month (January 2014).

    Returns:
        ColumnDataSource with the columns 'Timestamp', 'id', 'lat' and
        'long'. The track is thinned so that consecutive kept samples are at
        least 30 seconds apart, and lat/long are linearly rescaled. All
        columns are empty when the car has no samples on that day.
    """
    min_gap = pd.Timedelta(seconds=30)
    # Build the day window from numbers rather than from a formatted string,
    # which produced malformed dates for two-digit days.
    day_start = pd.Timestamp(year=2014, month=1, day=int(day))
    day_end = day_start + pd.Timedelta(days=1)
    in_day = (df['Timestamp'] >= day_start) & (df['Timestamp'] < day_end)
    track = df.loc[in_day & (df['id'] == idx)]

    column_names = ['Timestamp', 'id', 'lat', 'long']
    if len(track) == 0:
        return ColumnDataSource({name: [] for name in column_names})

    # Always keep the first sample, then every sample that is at least
    # `min_gap` later than the most recently kept one.
    stamps = track['Timestamp'].tolist()
    kept = [0]
    last_kept = stamps[0]
    for pos, stamp in enumerate(stamps):
        if stamp - last_kept >= min_gap:
            kept.append(pos)
            last_kept = stamp
    sampled = track.iloc[kept]

    # Linear rescaling of the coordinates.
    # NOTE(review): presumably maps degrees onto the pixel grid of the map
    # image used for plotting - confirm the origin of these constants.
    lat = [6106.8595736361885*v-220122.2248109374 for v in sampled['lat'].tolist()]
    long = [6302.5075554252035*v-156452.8192016015 for v in sampled['long'].tolist()]
    source = dict(zip(column_names,
                      [sampled['Timestamp'].tolist(), sampled['id'].tolist(), lat, long]))
    return ColumnDataSource(source)

In [42]:
# Load the GPS log and parse its timestamps (month/day/year hour:minute:second).
df = pd.read_csv('data2/gps.csv')
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%m/%d/%Y %H:%M:%S')

car_list = [str(k) for k in range(1,16)]   # car ids 1..15
day_list = [str(k) for k in range(6,20)]   # days 6..19

# Pre-compute one data source per (car, day) pair, keyed "<car>-<day>".
allSource = {
    car + '-' + day: getSource(df, int(car), int(day))
    for car in car_list
    for day in day_list
}

In [46]:
# Map image as background; the track is drawn on top of it.
plot3 = figure(plot_width=900, plot_height=505, x_range=(0, 548), y_range=(0, 320))
plot3.image_url(
    url=['data2/MC2-tourist.jpg'],
    w=548,
    h=307,
    x=0,
    y=0,
    anchor="bottom_left",
)

# Selectable car ids (1..15) and days (6..19), as strings for the dropdowns.
car_list = list(map(str, range(1, 16)))
day_list = list(map(str, range(6, 20)))

# Start with car 1 on day 6; the source is updated in place on selection.
mc2_source = ColumnDataSource(allSource['1-6'].data)

plot3.circle('long', 'lat', source=mc2_source, color='red')
plot3.line('long', 'lat', source=mc2_source, color='red')
show(plot3, notebook_handle=True)

car_dropdown = widgets.Dropdown(
    options=car_list,
    value=car_list[0],
    description='Car ID',
    disabled=False,
)

day_dropdown = widgets.Dropdown(
    options=day_list,
    value=day_list[0],
    description='Date',
    disabled=False,
)

def update_plot2(car, day):
    """Show the pre-computed track of the selected car and day.

    Args:
        car: car id as a string (an entry of `car_list`).
        day: day of the month as a string (an entry of `day_list`).
    """
    key = car + '-' + day
    for field in ('Timestamp', 'id', 'lat', 'long'):
        mc2_source.data[field] = allSource[key].data[field]
    # Send the changed columns to the plot that is already displayed.
    push_notebook()

# Re-run update_plot2 whenever either dropdown changes.
interact(update_plot2, car=car_dropdown, day=day_dropdown)

<function __main__.update_plot2>