In [1]:
import pandas as pd
import numpy as np
import altair as alt
import nltk
import re
import string

In [2]:
from sklearn.cluster import KMeans
import gensim.corpora as corpora
from gensim.sklearn_api import TfIdfTransformer
from gensim.matutils import corpus2dense, corpus2csc

In [3]:
df = pd.read_csv('./IEEE VIS papers 1990-2018 - Main dataset.csv')

# Data Preprocessing and Cleaning

In [4]:
# Remove missing values

df = df.dropna(subset=['Abstract'])

In [5]:
# Turn text into lowercase

df['Abstract'] = [abstract.lower() for abstract in df['Abstract']]
df['Abstract'] = [abstract[:-19] for abstract in df['Abstract']]
df.iloc[0]['Abstract']

'the use of critical point analysis to generate representations of the vector field topology of numerical flow data sets is discussed. critical points are located and characterized in a two-dimensional domain, which may be either a two-dimensional flow field or the tangential velocity field near a three-dimensional body. tangent curves are then integrated out along the principal directions of certain classes of critical points. the points and curves are linked to form a skeleton representing the two-dimensional vector field topology. when generated from the tangential velocity field near a body in a three-dimensional flow, the skeleton includes the critical points and curves which provide a basis for analyzing the three-dimensional structure of the flow separation.'

In [6]:
# Remove punctuation

def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

df['Abstract'] = df['Abstract'].apply(lambda x: remove_punct(x))
df.iloc[0]['Abstract']

'the use of critical point analysis to generate representations of the vector field topology of numerical flow data sets is discussed critical points are located and characterized in a twodimensional domain which may be either a twodimensional flow field or the tangential velocity field near a threedimensional body tangent curves are then integrated out along the principal directions of certain classes of critical points the points and curves are linked to form a skeleton representing the twodimensional vector field topology when generated from the tangential velocity field near a body in a threedimensional flow the skeleton includes the critical points and curves which provide a basis for analyzing the threedimensional structure of the flow separation'

In [7]:
# Tokenize text so that the words in text are separated by commas

def tokenize(text):
    tokens = re.split("\W+", text)
    return tokens

df['Abstract'] = df['Abstract'].apply(lambda x: tokenize(x))
df.iloc[0]['Abstract']

['the',
 'use',
 'of',
 'critical',
 'point',
 'analysis',
 'to',
 'generate',
 'representations',
 'of',
 'the',
 'vector',
 'field',
 'topology',
 'of',
 'numerical',
 'flow',
 'data',
 'sets',
 'is',
 'discussed',
 'critical',
 'points',
 'are',
 'located',
 'and',
 'characterized',
 'in',
 'a',
 'twodimensional',
 'domain',
 'which',
 'may',
 'be',
 'either',
 'a',
 'twodimensional',
 'flow',
 'field',
 'or',
 'the',
 'tangential',
 'velocity',
 'field',
 'near',
 'a',
 'threedimensional',
 'body',
 'tangent',
 'curves',
 'are',
 'then',
 'integrated',
 'out',
 'along',
 'the',
 'principal',
 'directions',
 'of',
 'certain',
 'classes',
 'of',
 'critical',
 'points',
 'the',
 'points',
 'and',
 'curves',
 'are',
 'linked',
 'to',
 'form',
 'a',
 'skeleton',
 'representing',
 'the',
 'twodimensional',
 'vector',
 'field',
 'topology',
 'when',
 'generated',
 'from',
 'the',
 'tangential',
 'velocity',
 'field',
 'near',
 'a',
 'body',
 'in',
 'a',
 'threedimensional',
 'flow',
 'the

In [8]:
# Remove stopwords (words that add little value to sentence)

stopwords = nltk.corpus.stopwords.words('english')

def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopwords]
    return text

df['Abstract'] = df['Abstract'].apply(lambda x: remove_stopwords(x))
df.iloc[0]['Abstract']

['use',
 'critical',
 'point',
 'analysis',
 'generate',
 'representations',
 'vector',
 'field',
 'topology',
 'numerical',
 'flow',
 'data',
 'sets',
 'discussed',
 'critical',
 'points',
 'located',
 'characterized',
 'twodimensional',
 'domain',
 'may',
 'either',
 'twodimensional',
 'flow',
 'field',
 'tangential',
 'velocity',
 'field',
 'near',
 'threedimensional',
 'body',
 'tangent',
 'curves',
 'integrated',
 'along',
 'principal',
 'directions',
 'certain',
 'classes',
 'critical',
 'points',
 'points',
 'curves',
 'linked',
 'form',
 'skeleton',
 'representing',
 'twodimensional',
 'vector',
 'field',
 'topology',
 'generated',
 'tangential',
 'velocity',
 'field',
 'near',
 'body',
 'threedimensional',
 'flow',
 'skeleton',
 'includes',
 'critical',
 'points',
 'curves',
 'provide',
 'basis',
 'analyzing',
 'threedimensional',
 'structure',
 'flow',
 'separation']

In [9]:
# Lemmatize words

wn = nltk.WordNetLemmatizer()

def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

df['Abstract'] = df['Abstract'].apply(lambda x: lemmatizing(x))
df.iloc[0]['Abstract'] 

['use',
 'critical',
 'point',
 'analysis',
 'generate',
 'representation',
 'vector',
 'field',
 'topology',
 'numerical',
 'flow',
 'data',
 'set',
 'discussed',
 'critical',
 'point',
 'located',
 'characterized',
 'twodimensional',
 'domain',
 'may',
 'either',
 'twodimensional',
 'flow',
 'field',
 'tangential',
 'velocity',
 'field',
 'near',
 'threedimensional',
 'body',
 'tangent',
 'curve',
 'integrated',
 'along',
 'principal',
 'direction',
 'certain',
 'class',
 'critical',
 'point',
 'point',
 'curve',
 'linked',
 'form',
 'skeleton',
 'representing',
 'twodimensional',
 'vector',
 'field',
 'topology',
 'generated',
 'tangential',
 'velocity',
 'field',
 'near',
 'body',
 'threedimensional',
 'flow',
 'skeleton',
 'includes',
 'critical',
 'point',
 'curve',
 'provide',
 'basis',
 'analyzing',
 'threedimensional',
 'structure',
 'flow',
 'separation']

In [10]:
# Stem words

ps = nltk.PorterStemmer()

def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

df['Abstract'] = df['Abstract'].apply(lambda x: stemming(x))
df.iloc[0]['Abstract']

['use',
 'critic',
 'point',
 'analysi',
 'gener',
 'represent',
 'vector',
 'field',
 'topolog',
 'numer',
 'flow',
 'data',
 'set',
 'discuss',
 'critic',
 'point',
 'locat',
 'character',
 'twodimension',
 'domain',
 'may',
 'either',
 'twodimension',
 'flow',
 'field',
 'tangenti',
 'veloc',
 'field',
 'near',
 'threedimension',
 'bodi',
 'tangent',
 'curv',
 'integr',
 'along',
 'princip',
 'direct',
 'certain',
 'class',
 'critic',
 'point',
 'point',
 'curv',
 'link',
 'form',
 'skeleton',
 'repres',
 'twodimension',
 'vector',
 'field',
 'topolog',
 'gener',
 'tangenti',
 'veloc',
 'field',
 'near',
 'bodi',
 'threedimension',
 'flow',
 'skeleton',
 'includ',
 'critic',
 'point',
 'curv',
 'provid',
 'basi',
 'analyz',
 'threedimension',
 'structur',
 'flow',
 'separ']

# Exploratory Data Analysis (EDA)

### How many abstracts were presented each year?

In [11]:
to_plot1 = df.groupby('Year').count().reset_index()
alt.Chart(to_plot1).mark_line().encode(
    x = "Year:Q",
    y = "Title:Q"
)

### How many total papers are in each track?

In [12]:
to_plot2 = df.groupby('Conference').count().reset_index()

alt.Chart(to_plot2).mark_bar().encode(
    x = "Conference:N",
    y = "Abstract:Q",
)

### How many abstracts were presented by different tracks each year?

In [13]:
to_plot3 = df.groupby(['Conference', 'Year']).count().reset_index()
alt.Chart(to_plot3,height=150).mark_line().encode(
    x = "Year:Q",
    y = "Title:Q",
    color = 'Conference:N'
)

# TF-IDF Values

We generate tf-idf values for all the words in the abstract to evaluate how relevant a word is to the document.

In [14]:
def get_tfidf(input_data):
    # Create Dictionary
    id2word = corpora.Dictionary(input_data)
    model = TfIdfTransformer(dictionary=id2word)
    
    # Create Corpus: Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in input_data]
    num_docs = id2word.num_docs
    num_terms = len(id2word.keys())
    
    # Get tfidf matrix
    tfidf_corpus = model.fit_transform(corpus)
    corpus_tfidf_dense = corpus2dense(tfidf_corpus, num_terms, num_docs)
    
    # Map index back to word
    words = []
    for i in id2word.keys():
        words.append(id2word[i])
    mat = pd.DataFrame(data=corpus_tfidf_dense, index=words)
    wordtfidf = pd.DataFrame(data=mat.values.T, columns=mat.index)
    return wordtfidf

# Clustering

We cluster the words based on their tf-idf values to group words into four different clusters.

In [15]:
tfidf_df = pd.DataFrame(get_tfidf(df['Abstract']))

In [16]:
cluster_model = KMeans(n_clusters=4)
cluster = cluster_model.fit(tfidf_df)
tfidf_cluster_df = pd.DataFrame(tfidf_df)

In [17]:
# top 10 words for each cluster

group_key_words = []
for label in range(4):
    # get the sum tf-idf for each word, do sum() across rows for each column
    group_df = tfidf_cluster_df[cluster.labels_ == label].mean(axis=0)
    # sort the tf-idf values
    to_sort = [{'freq': group_df[x], 'word': x} for x in group_df.index]
    to_sort = sorted(to_sort, key=lambda d: d['freq'], reverse=True)
    # add the words to the list
    for i in range(10):
        group_key_words.append([label, to_sort[i]['word'], to_sort[i]['freq']])
keyword_df = pd.DataFrame(data=group_key_words, columns=['label', 'keyword', 'tfidf'])

In [18]:
keyword_df

Unnamed: 0,label,keyword,tfidf
0,0,volum,0.054995
1,0,render,0.053144
2,0,surfac,0.050781
3,0,mesh,0.03843
4,0,algorithm,0.033684
5,0,isosurfac,0.029684
6,0,function,0.026991
7,0,imag,0.025695
8,0,method,0.024374
9,0,textur,0.023339


In [19]:
chart = alt.hconcat()

for label in range(4):
    chart |= alt.Chart(keyword_df[keyword_df['label']==label]).mark_bar().encode(
        x=alt.X('tfidf:Q', scale=alt.Scale(domain=[0,0.12]),title='cluster'+str(label)),
        y=alt.Y('keyword:N', sort='-x')
    ).properties(
        width = 50
    )

In [20]:
chart

# Track Topics

We create four documents for each track and add all the words from papers in the track to each document. After, we get tf-idf values for all the words.

In [21]:
# Create four documents for each track and add all words from papers in the track to each document

track_key_words = []
tracks = ['Vis', 'InfoVis', 'VAST', 'SciVis']

for track in tracks:
    document = []
    track_index = df[df['Conference']==track].index.values.astype(int)
    for i in track_index:
        document.extend(df['Abstract'][i])
    track_key_words.append(document)

In [22]:
track_key_words

[['use',
  'critic',
  'point',
  'analysi',
  'gener',
  'represent',
  'vector',
  'field',
  'topolog',
  'numer',
  'flow',
  'data',
  'set',
  'discuss',
  'critic',
  'point',
  'locat',
  'character',
  'twodimension',
  'domain',
  'may',
  'either',
  'twodimension',
  'flow',
  'field',
  'tangenti',
  'veloc',
  'field',
  'near',
  'threedimension',
  'bodi',
  'tangent',
  'curv',
  'integr',
  'along',
  'princip',
  'direct',
  'certain',
  'class',
  'critic',
  'point',
  'point',
  'curv',
  'link',
  'form',
  'skeleton',
  'repres',
  'twodimension',
  'vector',
  'field',
  'topolog',
  'gener',
  'tangenti',
  'veloc',
  'field',
  'near',
  'bodi',
  'threedimension',
  'flow',
  'skeleton',
  'includ',
  'critic',
  'point',
  'curv',
  'provid',
  'basi',
  'analyz',
  'threedimension',
  'structur',
  'flow',
  'separ',
  'author',
  'discu',
  'fast',
  'flow',
  'analysi',
  'softwar',
  'toolkit',
  'implement',
  'softwar',
  'system',
  'fluid',
  'mecha

In [23]:
track_tfidf = get_tfidf(track_key_words)

In [24]:
track_tfidf

Unnamed: 0,Unnamed: 1,0,02,043spl,05,055,1,10,100,1000,...,wwwinviwoorg,wysiwyp,xct,xlinkhref24tvcg01jallepalli2744058ieq1sourcetif,xlinkhref24tvcg01zhang2744038ieq1sourcetif,xlinkhref24tvcg01zhang2744038ieq2sourcetif,xmlnsxlinkhttpwwww3org1999xlinkalternativesinlineformulacontinu,xmlnsxlinkhttpwwww3org1999xlinkalternativesinlineformulato1,xmlnsxlinkhttpwwww3org1999xlinkalternativesinlineformulaway,ϵsimplif
0,0.0,0.003713,0.003713,0.003713,0.001856,0.003713,0.0,0.0,0.0,0.001541,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.004464,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007412,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.004205,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005765,...,0.013891,0.027783,0.055566,0.013891,0.013891,0.013891,0.013891,0.013891,0.013891,0.013891


In [25]:
track_tfidf['Track'] = tracks

# Visualization

After grouping the words into the different tracks and retrieving their tf-idf values, we find the top ten words in each track by the words with the highest tf-idf values in each track.

In [26]:
# Top 10 words for each track

group_key_words = []
for track in tracks:
    # get the sum tf-idf for each word, do sum() across rows for each column
    group_df = track_tfidf[track_tfidf['Track'] == track].mean(axis=0)
    # sort the tf-idf values
    to_sort = [{'freq': group_df[x], 'word': x} for x in group_df.index]
    to_sort = sorted(to_sort, key=lambda d: d['freq'], reverse=True)
    # add the words to the list
    for i in range(10):
        group_key_words.append([track, to_sort[i]['word'], to_sort[i]['freq']])
keyword_df = pd.DataFrame(data=group_key_words, columns=['track', 'keyword', 'tfidf'])

  group_df = track_tfidf[track_tfidf['Track'] == track].mean(axis=0)


In [27]:
keyword_df

Unnamed: 0,track,keyword,tfidf
0,Vis,isosurfac,0.517898
1,Vis,textur,0.242682
2,Vis,ray,0.232033
3,Vis,vortex,0.180058
4,Vis,spl,0.170777
5,Vis,tetrahedr,0.126226
6,Vis,projector,0.120657
7,Vis,lod,0.107663
8,Vis,unsteadi,0.096526
9,Vis,viewdepend,0.096526


In [28]:
chart = alt.hconcat()

for track in tracks:
    chart |= alt.Chart(keyword_df[keyword_df['track']== track], height=150).mark_bar().encode(
    x=alt.X('tfidf:Q', scale=alt.Scale(domain=[0,0.53]),title=track),
    y=alt.Y('keyword:N', sort='-x')
).properties(
    width = 50
)

In [29]:
chart