## Select auto vis papers

In [6]:
import pandas as pd
from typing import List

def filter_by_keywords(df: pd.DataFrame, keywords: List[str]) -> pd.DataFrame:
    """
    Filter dataframe rows by checking if any keyword appears in title, keywords or abstract
    
    Args:
        df: Input dataframe
        keywords: List of keywords to search for
        
    Returns:
        Filtered dataframe containing only rows where any of title, keywords or abstract contains any of the keywords
    """
    # Convert values and keywords to lowercase for case-insensitive matching
    mask = (
        df['Title'].str.lower().apply(lambda x: any(keyword.lower() in x for keyword in keywords)) |
        df['AuthorKeywords'].str.lower().apply(lambda x: any(keyword.lower() in str(x) for keyword in keywords)) |
        df['Abstract'].str.lower().apply(lambda x: any(keyword.lower() in str(x) for keyword in keywords))
    )
    return df[mask]

# Example usage:
df = pd.read_csv("../dataset.csv")
keywords = [
    "automated vis", "automatic vis", 
    "mixed initiative", "mixed-initiative",
    "visualization generation", "vis generation",
    "visualization recommendation",
    "agent"
    # "guidance",
]  
filtered_df = filter_by_keywords(df, keywords)
filtered_df.to_csv("filtered_dataset.csv", index=False)
print(f"Found {len(filtered_df)} papers containing keywords {keywords}")


Found 83 papers containing keywords ['automated vis', 'automatic vis', 'mixed initiative', 'mixed-initiative', 'visualization generation', 'vis generation', 'visualization recommendation', 'agent']


## Examine authors

In [7]:
def get_top_authors(csv_path: str, top_n: int = 10) -> pd.DataFrame:
    """
    Calculate the most important authors based on publication count and citation metrics
    
    Args:
        csv_path: Path to the filtered CSV dataset
        top_n: Number of top authors to return (default 10)
        
    Returns:
        DataFrame with author names and their impact metrics, sorted by overall impact score
    """
    # Read the filtered dataset
    df = pd.read_csv(csv_path)
    
    # Create author-paper mapping
    author_papers = []
    for _, row in df.iterrows():
        authors = row['AuthorNames-Deduped'].split(';')
        authors = [a.strip() for a in authors]
        
        # Get citation counts, defaulting to 0 if missing
        aminer_cites = row['AminerCitationCount'] if pd.notna(row['AminerCitationCount']) else 0
        crossref_cites = row['CitationCount_CrossRef'] if pd.notna(row['CitationCount_CrossRef']) else 0
        downloads = row['Downloads_Xplore'] if pd.notna(row['Downloads_Xplore']) else 0
        
        for author in authors:
            author_papers.append({
                'Author': author,
                'AminerCites': aminer_cites,
                'CrossRefCites': crossref_cites, 
                'Downloads': downloads
            })
    
    # Convert to DataFrame and group by author
    author_stats = pd.DataFrame(author_papers).groupby('Author').agg({
        'AminerCites': ['count', 'sum', 'mean'],
        'CrossRefCites': ['sum', 'mean'],
        'Downloads': ['sum', 'mean']
    })
    
    # Flatten column names
    author_stats.columns = ['PaperCount', 'TotalAminerCites', 'AvgAminerCites',
                          'TotalCrossRefCites', 'AvgCrossRefCites',
                          'TotalDownloads', 'AvgDownloads']
    
    # Calculate impact score (weighted combination of metrics)
    author_stats['ImpactScore'] = (
        0.4 * author_stats['TotalAminerCites'] + 
        0.3 * author_stats['TotalCrossRefCites'] +
        0.2 * author_stats['PaperCount'] +
        0.1 * author_stats['TotalDownloads'] / 1000  # Normalize downloads
    )
    
    # Sort by impact score and get top authors
    top_authors = author_stats.sort_values('ImpactScore', ascending=False).head(top_n)
    top_authors = top_authors.round(2).reset_index()
    
    return top_authors

# Example usage:
top_authors = get_top_authors("filtered_dataset.csv", 30)
print("\nTop authors in automated visualization research (by overall impact):")
print(top_authors)



Top authors in automated visualization research (by overall impact):
                  Author  PaperCount  TotalAminerCites  AvgAminerCites  \
0           Jeffrey Heer           5             841.0          168.20   
1         Dominik Moritz           4             718.0          179.50   
2              Bill Howe           2             712.0          356.00   
3            Alex Endert           7             519.0           74.14   
4      Jock D. Mackinlay           1             487.0          487.00   
5   Kanit Wongsuphasawat           1             487.0          487.00   
6          Anushka Anand           1             487.0          487.00   
7           Eli T. Brown           3             228.0           76.00   
8         Chenglong Wang           2             225.0          112.50   
9             Halden Lin           1             225.0          225.00   
10        Greg L. Nelson           1             225.0          225.00   
11    Adam M. Smith 0001           1      

## Temporal distribution

In [32]:
import pandas as pd
import plotly.express as px

# Read the dataset
df = pd.read_csv("filtered_dataset.csv")

# Create year distribution chart using bar chart instead of histogram
year_counts = df.groupby('Year').size().reset_index(name='count')
year_dist = px.bar(
    year_counts,
    x='Year',
    y='count',
    title='Distribution of Papers by Year',
    labels={'Year': 'Publication Year', 'count': 'Number of Papers'},
    width=600,
    height=400
)

# Update layout
year_dist.update_layout(
    title_x=0.5,
    title_font_size=16,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14,
    xaxis_tickfont_size=12,
    yaxis_tickfont_size=12
)

year_dist


In [31]:
# Create stacked bar chart showing paper count by conference and year
conf_year_dist = df.groupby(['Year', 'Conference']).size().reset_index(name='count')

# Create stacked bar chart using plotly
fig = px.bar(
    conf_year_dist,
    x='Year',
    y='count',
    color='Conference',
    title='Distribution of Papers by Conference and Year',
    labels={
        'Year': 'Publication Year',
        'count': 'Number of Papers',
        'Conference': 'Conference'
    },
    width=800,
    height=500
)

# Update layout
fig.update_layout(
    title_x=0.5,
    title_font_size=16,
    xaxis_title_font_size=14, 
    yaxis_title_font_size=14,
    xaxis_tickfont_size=12,
    yaxis_tickfont_size=12,
    legend_title_font_size=12,
    barmode='stack'
)

fig


## Keyword analysis

In [10]:
from collections import Counter
import numpy as np
import pandas as pd

def load_and_process_keywords(filepath):
    """Load dataset and process keywords into a list"""
    df = pd.read_csv(filepath)
    df['AuthorKeywords'] = df['AuthorKeywords'].fillna('')
    
    keywords_list = []
    for keywords in df['AuthorKeywords']:
        if not keywords:
            continue
        keywords = keywords.strip('[]')
        for keyword in keywords.split(','):
            keyword = keyword.strip().strip('"\'').lower()
            if keyword:
                keywords_list.append(keyword)
                
    return keywords_list

def get_keyword_frequencies(keywords_list):
    """Calculate frequency distribution of keywords"""
    keyword_freq = Counter(keywords_list)
    keywords = list(keyword_freq.keys())
    frequencies = list(keyword_freq.values())
    return keywords, frequencies

def calculate_text_sizes(frequencies, min_size=12, max_size=42):
    """Scale text sizes based on frequencies"""
    min_freq = min(frequencies)
    max_freq = max(frequencies)
    text_sizes = [min_size + ((max_size-min_size) * (f - min_freq)/(max_freq - min_freq)) 
                 for f in frequencies]
    return text_sizes

def create_word_cloud_plot(keywords, frequencies, text_sizes):
    """Create scatter plot word cloud visualization"""
    # Generate random positions
    np.random.seed(42)
    x_pos = np.random.uniform(-1, 1, len(keywords))
    y_pos = np.random.uniform(-1, 1, len(keywords))
    
    # Create scatter plot
    fig = px.scatter(
        x=x_pos,
        y=y_pos,
        size=[0]*len(keywords),
        text=keywords,
        title='Keyword Distribution',
        width=800,
        height=800
    )

    # Update styling
    fig.update_traces(
        textfont=dict(size=text_sizes),
        marker=dict(opacity=0),
        textposition='middle center'
    )
    
    fig.update_layout(
        title_x=0.5,
        title_font_size=16,
        showlegend=False,
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
    )
    
    return fig

# Main execution
keywords_list = load_and_process_keywords("filtered_dataset.csv")
print(keywords_list)

keywords, frequencies = get_keyword_frequencies(keywords_list)
text_sizes = calculate_text_sizes(frequencies)
fig = create_word_cloud_plot(keywords, frequencies, text_sizes)
fig.show()


['user interfaces', 'information visualization', 'exploratory analysis', 'visualization recommendation', 'mixed-initiative systems', 'automated visualization design', 'perceptual effectiveness', 'constraints', 'knowledge bases', 'answer set programming', 'task taxonomy', 'design space', 'climate impact research', 'visualization recommendation', 'natural language generation', 'mixed-initiative interaction', 'visualization recommendation', 'data-driven communication', 'machine learning fairness', 'visual analytics', 'intersectional bias', 'subgroup discovery', 'user interactions', 'analytic provenance', 'visualization', 'applied machine learning', 'information graphics;visualization;design tools;2d graphics', 'deep q-network (dqn)', 'reinforcement learning', 'model interpretation', 'visual analytics', 'information visualization', 'visual storytelling', 'data story', 'visualization', 'design', 'encoding', 'perception', 'model', 'crowdsourcing', 'automated visualization', 'visual embedding

In [4]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# preprocessing

def run_topic_modeling(text_column: str, dataset_url: str = None):
    if dataset_url is None:
        dataset_url = "https://raw.githubusercontent.com/demoPlz/mini-template/main/studio/dataset.csv"

    df = pd.read_csv(dataset_url)
    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found in dataset.")

    df_clean = df[[text_column]].copy()
    df_clean[text_column] = df_clean[text_column].astype(str).str.strip()
    df_clean = df_clean[df_clean[text_column].str.len() > 10]
    df_clean = df_clean.dropna()

    texts = df_clean[text_column].tolist()

    vectorizer_model = CountVectorizer(stop_words="english")
    topic_model = BERTopic(vectorizer_model=vectorizer_model, min_topic_size=5)
    topics, probs = topic_model.fit_transform(texts)

    return topic_model, df_clean

# Run topic modeling on abstracts from filtered dataset
# topic_model, df_clean, html_str = run_topic_modeling("Abstract", "./filtered_dataset.csv")
topic_model, df_clean = run_topic_modeling("AuthorKeywords", "./filtered_dataset.csv")


print("\nTopic modeling results:")
print(topic_model.get_topic_info())


Topic modeling results:
   Topic  Count                                               Name  \
0     -1     16    -1_visualization_data_motif_visualizationvisual   
1      0     32             0_visualization_data_visual_provenance   
2      1     21   1_initiative_visual_humanmachine_mixedinitiative   
3      2      8  2_language_embedding_visualizationlarge_percep...   

                                      Representation  \
0  [visualization, data, motif, visualizationvisu...   
1  [visualization, data, visual, provenance, grap...   
2  [initiative, visual, humanmachine, mixedinitia...   
3  [language, embedding, visualizationlarge, perc...   

                                 Representative_Docs  
0  [Accessible data visualization,refreshable tac...  
1  [Visualization Tools,Visualization Recommendat...  
2  [cognitive bias,visual analytics,human-in-the-...  
3  [Visualization,Large Language Models,,,Visuali...  


## Author network

In [27]:
import math
import plotly.graph_objects as go
import networkx as nx
from itertools import combinations

# Read the dataset
df = pd.read_csv("./filtered_dataset.csv")

# Create graph
G = nx.Graph()

# Process each paper's authors
for authors in df["AuthorNames-Deduped"].str.split(";"):
    if isinstance(authors, list):
        # Add edges between all pairs of authors
        for author1, author2 in combinations(authors, 2):
            if G.has_edge(author1, author2):
                G[author1][author2]['weight'] += 1
            else:
                G.add_edge(author1, author2, weight=1)

# Calculate node positions using force-directed layout
pos = nx.spring_layout(G)

# Create edge trace
edge_x = []
edge_y = []
edge_weights = []
for edge in G.edges(data=True):
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])
    edge_weights.append(edge[2]['weight'])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#333'),
    hoverinfo='none',
    mode='lines')

# Create node trace
node_x = []
node_y = []
node_text = []
node_size = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_text.append(node)
    node_size.append(math.sqrt(G.degree(node)*5))  # Size based on degree

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    hoverinfo='text',
    text=node_text,
    textposition="top center",
    marker=dict(
        showscale=True,
        colorscale='YlGnBu',
        size=node_size,
        color=[G.degree(node) for node in G.nodes()],
        line_width=2))

# Create figure
fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title='Author Collaboration Network',
                    # titlefont_size=16,
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20,l=5,r=5,t=40),
                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                )
fig.update_traces(text=None)
fig.show()
