In [32]:
# http://stevenloria.com/finding-important-words-in-a-document-using-tf-idf/
from __future__ import division, unicode_literals
import math
from textblob import TextBlob as tb

def tf(word, blob):
    """Return the term frequency of ``word`` within ``blob``.

    The frequency is the number of occurrences reported by
    ``blob.words.count(word)`` divided by the total number of words in
    ``blob``.

    Args:
        word: The term to look up.
        blob: Any object exposing a ``words`` sequence (a TextBlob in this
            notebook).

    Returns:
        A float in ``[0, 1]``; ``0.0`` when ``blob`` has no words at all, so
        an empty document does not raise ``ZeroDivisionError``.
    """
    # NOTE(review): TextBlob's WordList.count appears to match
    # case-insensitively by default, whereas the membership test used for the
    # document frequency is case-sensitive -- confirm against the TextBlob docs.
    words = blob.words
    if not words:
        return 0.0
    return words.count(word) / len(words)

def n_containing(word, bloblist):
    """Count how many documents in ``bloblist`` contain ``word``.

    A document counts once, however often the word occurs in it; membership
    is tested with ``word in blob.words``.
    """
    hits = 0
    for document in bloblist:
        if word in document.words:
            hits += 1
    return hits

def idf(word, bloblist):
    """Return the inverse document frequency of ``word`` over ``bloblist``.

    Computed as ``log(N / (1 + n))`` where ``N`` is the number of documents
    and ``n`` the number of documents containing ``word``.  The ``1 +`` keeps
    the denominator non-zero for words that occur in no document; as a
    consequence the result is negative for a word present in every document.
    """
    total_documents = len(bloblist)
    documents_with_word = n_containing(word, bloblist)
    return math.log(total_documents / (1 + documents_with_word))

def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of ``word`` for ``blob`` within ``bloblist``.

    The score is the product of the term frequency of ``word`` in ``blob``
    and its inverse document frequency across ``bloblist``.
    """
    term_frequency = tf(word, blob)
    inverse_document_frequency = idf(word, bloblist)
    return term_frequency * inverse_document_frequency

# SchemeLens: A Content-Aware Vector-Based Fisheye Technique for Navigating Large Systems Diagrams
# First sample document of the corpus, wrapped in a TextBlob (`tb`) so the
# TF-IDF helpers can read its `.words`.  The heading above is presumably the
# title of the paper this abstract text belongs to.
document1 = tb("""System schematics, such as those used for electrical or
hydraulic systems, can be large and complex. Fisheye techniques can help 
navigate such large documents by maintaining the context around a focus 
region, but the distortion introduced by traditional fisheye techniques 
can impair the readability of the diagram. We present SchemeLens, a 
vector-based, topology-aware fisheye technique which aims to maintain 
the readability of the diagram. Vector-based scaling reduces distortion 
to components, but distorts layout. We present several strategies to 
reduce this distortion by using the structure of the topology, including 
orthogonality and alignment, and a model of user intention to foster 
smooth and predictable navigation. We evaluate this approach through two 
user studies: Results show that (1) SchemeLens is 16-27% faster than both 
round and rectangular flat-top fisheye lenses at finding and identifying 
a target along one or several paths in a network diagram; (2) augmenting 
SchemeLens with a model of user intentions aids in learning the network 
topology.""")

# A Visual Voting Framework for Weather Forecast Calibration
# Second sample document of the corpus, wrapped in a TextBlob (`tb`) so the
# TF-IDF helpers can read its `.words`.  The heading above is presumably the
# title of the paper this abstract text belongs to.
document2 = tb("""Numerical weather predictions have been widely used for
weather forecasting. Many large meteorological centers are producing 
highly accurate ensemble forecasts routinely to provide effective weather 
forecast services. However, biases frequently exist in forecast products 
because of various reasons, such as the imperfection of the weather 
forecast models. Failure to identify and neutralize the biases would 
result in unreliable forecast products that might mislead analysts; 
consequently, unreliable weather predictions are produced. The analog 
method has been commonly used to overcome the biases. Nevertheless, this 
method has some serious limitations including the difficulties in finding 
effective similar past forecasts, the large search space for proper 
parameters and the lack of support for interactive, real-time analysis. 
In this study, we develop a visual analytics system based on a novel 
voting framework to circumvent the problems. The framework adopts the 
idea of majority voting to combine judiciously the different variants of 
analog methods towards effective retrieval of the proper analogs for 
calibration. The system seamlessly integrates the analog methods into an 
interactive visualization pipeline with a set of coordinated views that 
characterizes the different methods. Instant visual hints are provided 
in the views to guide users in finding and refining analogs. We have 
worked closely with the domain experts in the meteorological research to 
develop the system. The effectiveness of the system is demonstrated using 
two case studies. An informal evaluation with the experts proves the 
usability and usefulness of the system.""")

# MobilityGraphs: Visual Analysis of Mass Mobility Dynamics via Spatio-Temporal Graphs and Clustering
# Third sample document of the corpus, wrapped in a TextBlob (`tb`) so the
# TF-IDF helpers can read its `.words`.  The heading above is presumably the
# title of the paper this abstract text belongs to.
document3 = tb("""Learning more about people mobility is an important 
task for official decision makers and urban planners. Mobility data sets 
characterize the variation of the presence of people in different places 
over time as well as movements (or flows) of people between the places. 
The analysis of mobility data is challenging due to the need to analyze 
and compare spatial situations (i.e., presence and flows of people at 
certain time moments) and to gain an understanding of the spatio-temporal 
changes (variations of situations over time). Traditional flow 
visualizations usually fail due to massive clutter. Modern approaches 
offer limited support for investigating the complex variation of the 
movements over longer time periods. We propose a visual analytics 
methodology that solves these issues by combined spatial and temporal 
simplifications. We have developed a graph-based method, called 
MobilityGraphs, which reveals movement patterns that were occluded in 
flow maps. Our method enables the visual representation of the 
spatio-temporal variation of movements for long time series of spatial 
situations originally containing a large number of intersecting flows. 
The interactive system supports data exploration from various 
perspectives and at various levels of detail by interactive setting of 
clustering parameters. The feasibility of our approach was tested on 
aggregated mobility data derived from a set of geolocated Twitter posts 
within the Greater London city area and mobile phone call data records 
in Abidjan, Ivory Coast. We could show that MobilityGraphs support the 
identification of regular daily and weekly movement patterns of resident 
population.""")

# The corpus: all three documents take part in the IDF computation.
bloblist = [document1, document2, document3]

# Only the first document is ranked here.  To report every document instead,
# wrap the steps below in
#     for i, blob in enumerate(bloblist):
#         print("Top words in document {}".format(i + 1))
# and use `blob` in place of `bloblist[0]`.
scores = {}
for word in bloblist[0].words:
    # Repeated words simply overwrite their entry with the same score.
    scores[word] = tfidf(word, bloblist[0], bloblist)

# Highest TF-IDF first; print the three best-scoring words.
sorted_words = list(scores.items())
sorted_words.sort(key=lambda pair: pair[1], reverse=True)
for word, score in sorted_words[:3]:
    print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

	Word: fisheye, TF-IDF: 0.01026
	Word: Fisheye, TF-IDF: 0.01026
	Word: distortion, TF-IDF: 0.0077


In [38]:
import pymysql.cursors
from collections import Counter
import pprint
import io  # io.open accepts `encoding` on both Python 2 and Python 3

# Connect to the local `ieeevis` MySQL database; DictCursor makes every row a
# dict keyed by column name.
# NOTE(review): credentials are hard-coded here -- consider reading them from
# configuration or the environment.
connection = pymysql.connect(host='127.0.0.1',
                             user='ieeevis',
                             password='ieeevis',
                             db='ieeevis',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)

# Fetch every row of the `keyword` table and wrap each keyword string in a
# TextBlob; each row is treated as one document of the corpus.
with connection.cursor() as cursor:
    sql = "SELECT `keyword` FROM `keyword`"
    cursor.execute(sql)
    results = [tb(row['keyword']) for row in cursor.fetchall()]
# NOTE(review): `connection` is left open, as before, in case later cells
# reuse it; call `connection.close()` once it is no longer needed.

# Write one line per keyword row:
#     <word>,<word>,...,<TAB><score>,<score>,...,<NEWLINE>
# with words ordered by descending TF-IDF and scores rounded to 5 decimals.
# Every item is followed by a comma (including the last one), and a row with
# no words produces just "<TAB><NEWLINE>".
# The `with` block guarantees the file is closed even if scoring raises, and
# the explicit UTF-8 encoding keeps non-ASCII keywords (the connection uses
# utf8mb4) from depending on the platform default encoding.
# NOTE(review): each tfidf() call scans all of `results`, so this loop is
# quadratic in the number of rows.
with io.open('./tfidf_keywords', 'w', encoding='utf-8') as out_file:
    for blob in results:
        scores = {word: tfidf(word, blob, results) for word in blob.words}
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)

        keywords = "".join(word + "," for word, _ in sorted_words)
        tf_score = "".join(str(round(score, 5)) + "," for _, score in sorted_words)

        out_file.write(keywords + "\t" + tf_score + "\n")