# Paper Data Visualization

This notebook helps you explore and visualize the collected paper data from the database. Follow the sections below to load, inspect, and visualize the paper information.

In [1]:
# Import Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import sqlite3

ModuleNotFoundError: No module named 'matplotlib'

## Load Paper Data

Connect to the SQLite database and load the paper data into a pandas DataFrame.

In [None]:
# Connect to the SQLite database and list the tables it contains.
db_path = '../papers.db'

# Open through a SQLite URI with mode=rw. A plain sqlite3.connect(db_path)
# silently creates a new, empty database file when the path is wrong, and the
# mistake only surfaces later as a confusing "no such table" error. With
# mode=rw the file must already exist; otherwise sqlite3.OperationalError is
# raised right here.
# NOTE: db_path is embedded in a URI, so a path containing '?', '#' or '%'
# would need percent-encoding first.
conn = sqlite3.connect(f'file:{db_path}?mode=rw', uri=True)

# List tables to help user know what is available
query_tables = "SELECT name FROM sqlite_master WHERE type='table';"
tables = pd.read_sql(query_tables, conn)
tables

In [None]:
# Load the main paper table into a DataFrame.
# Replace 'papers' with your actual table name if it differs.
paper_table = 'papers'  # Change this if your table has a different name

# Start from "nothing loaded"; the value stays None if the query below fails.
papers_df = None
try:
    papers_df = pd.read_sql(f'SELECT * FROM {paper_table}', conn)
except Exception as load_error:
    # Best-effort load: report the problem and carry on with papers_df = None.
    print(f"Error loading table: {load_error}")

# Last expression of the cell: preview the rows when the load succeeded.
None if papers_df is None else papers_df.head()

## Display Basic Paper Information

Preview the first few rows and show basic statistics about the paper data.

In [None]:
# Preview the data and print summary statistics, or report that nothing was loaded.
if papers_df is None:
    print("No paper data loaded.")
else:
    # First rows, then per-column statistics covering every dtype.
    display(papers_df.head())
    display(papers_df.describe(include='all'))
    paper_count = len(papers_df)
    print(f"Total papers: {paper_count}")

## Visualize Paper Publication Years

Create a histogram to visualize the distribution of publication years among the papers.

In [None]:
# Plot publication year distribution (replace 'year' with your actual column name)
if papers_df is None:
    # Report the real problem (nothing loaded) instead of blaming a missing column.
    print("No paper data loaded.")
elif 'year' not in papers_df.columns:
    print("Column 'year' not found in data.")
else:
    # The column may hold years as text; coerce to numbers and drop anything
    # unparseable so the histogram is numeric rather than categorical.
    years = pd.to_numeric(papers_df['year'], errors='coerce').dropna()
    if years.empty:
        print("No valid publication years to plot.")
    else:
        fig, ax = plt.subplots(figsize=(10, 5))
        # discrete=True draws one bar per calendar year. A fixed bins=20 over
        # integer years makes bins cover unequal numbers of years, which shows
        # up as artificial spikes and dips in the chart.
        sns.histplot(years, discrete=True, kde=False, ax=ax)
        ax.set_title('Distribution of Publication Years')
        ax.set_xlabel('Year')
        ax.set_ylabel('Number of Papers')
        plt.show()

## Visualize Paper Authors

Generate a bar chart and a word cloud showing the most frequent authors in the dataset.

In [None]:
# Visualize most frequent authors (replace 'authors' with your actual column name)
if papers_df is None:
    # Report the real problem (nothing loaded) instead of blaming a missing column.
    print("No paper data loaded.")
elif 'authors' not in papers_df.columns:
    print("Column 'authors' not found in data.")
else:
    # NOTE(review): Counter is not used in this cell; kept in case another
    # cell relies on the name — confirm and remove.
    from collections import Counter

    # One row per author: split each entry on ';' or ',', then trim whitespace.
    # NOTE(review): assumes individual names contain no commas
    # ("Last, First" entries would be split in two) — confirm the data format.
    author_list = papers_df['authors'].dropna().str.split(';|,').explode().str.strip()
    # Trailing or doubled separators leave empty strings behind; without this
    # filter they would be counted as an author named ''.
    author_list = author_list.dropna()
    author_list = author_list[author_list != '']
    author_counts = author_list.value_counts()

    if author_counts.empty:
        # WordCloud raises ValueError when given nothing to draw.
        print("No author names to plot.")
    else:
        top_authors = author_counts.head(20)
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(y=top_authors.index, x=top_authors.values, orient='h', ax=ax)
        ax.set_title('Top 20 Most Frequent Authors')
        ax.set_xlabel('Number of Papers')
        ax.set_ylabel('Author')
        plt.show()

        # Word cloud sized by how often each full name occurs. Feeding
        # space-joined text to generate() would tear multi-word names apart
        # and size the individual words instead of the authors.
        wordcloud = WordCloud(
            width=800, height=400, background_color='white'
        ).generate_from_frequencies(author_counts.to_dict())
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title('Author Word Cloud')
        plt.show()

## Visualize Paper Keywords or Topics

Create a bar plot and a word cloud of the most common keywords or topics in the papers.

In [None]:
# Visualize most common keywords/topics (replace 'keywords' with your actual column name)
if papers_df is None:
    # Report the real problem (nothing loaded) instead of blaming a missing column.
    print("No paper data loaded.")
elif 'keywords' not in papers_df.columns:
    print("Column 'keywords' not found in data.")
else:
    # One row per keyword: split each entry on ';' or ',', then trim whitespace.
    keyword_list = papers_df['keywords'].dropna().str.split(';|,').explode().str.strip()
    # Trailing or doubled separators leave empty strings behind; without this
    # filter they would be counted as a keyword named ''.
    keyword_list = keyword_list.dropna()
    keyword_list = keyword_list[keyword_list != '']
    keyword_counts = keyword_list.value_counts()

    if keyword_counts.empty:
        # WordCloud raises ValueError when given nothing to draw.
        print("No keywords to plot.")
    else:
        top_keywords = keyword_counts.head(20)
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(y=top_keywords.index, x=top_keywords.values, orient='h', ax=ax)
        ax.set_title('Top 20 Most Common Keywords/Topics')
        ax.set_xlabel('Number of Papers')
        ax.set_ylabel('Keyword/Topic')
        plt.show()

        # Word cloud sized by how often each whole keyword occurs. Feeding
        # space-joined text to generate() would break multi-word keywords
        # into separate words and size those instead.
        wordcloud = WordCloud(
            width=800, height=400, background_color='white'
        ).generate_from_frequencies(keyword_counts.to_dict())
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title('Keyword/Topic Word Cloud')
        plt.show()