In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
from datetime import datetime
import streamlit as st

# Set style for visualizations
# Applies a Matplotlib style sheet globally, so every figure created in later
# cells picks it up.
# NOTE(review): the 'seaborn-v0_8-*' style names are only available in newer
# Matplotlib releases — confirm the environment's Matplotlib version.
plt.style.use('seaborn-v0_8-whitegrid')
# Make seaborn's "husl" palette the default colour cycle for subsequent plots.
sns.set_palette("husl")

Matplotlib is building the font cache; this may take a moment.


ModuleNotFoundError: No module named 'wordcloud'

In [None]:
# Read the raw metadata table from disk; `df` is kept untouched so that later
# cells can always rebuild their derived frames from it.
df = pd.read_csv(filepath_or_buffer='data/metadata.csv')

In [None]:
# Quick structural overview of the raw frame: size, columns, dtypes and
# per-column null counts (most incomplete columns first).
print("Dataset shape:", df.shape)

missing_by_column = df.isnull().sum().sort_values(ascending=False)
for heading, summary in (
    ("\nColumn names:", df.columns.tolist()),
    ("\nData types:", df.dtypes),
    ("\nMissing values:", missing_by_column),
):
    print(heading)
    print(summary)

In [None]:
# Build the cleaned frame used by every plot below.
#
# Steps:
#   1. Copy the raw frame so `df` is never mutated and this cell is idempotent.
#   2. Parse `publish_time`; unparseable values become NaT (errors='coerce').
#   3. Derive `year` from the parsed date.
#   4. Keep only papers that have both a title and an abstract.
#   5. Keep only papers published inside the [YEAR_MIN, YEAR_MAX] window.
#   6. Store `year` as an integer and add `abstract_word_count`.

# Inclusive publication-year window considered COVID-19 relevant.
YEAR_MIN, YEAR_MAX = 2019, 2022

df_clean = df.copy()

# Parse on the full column, before any rows are dropped, so the parsing input
# is exactly the raw column.
df_clean['publish_time'] = pd.to_datetime(df_clean['publish_time'], errors='coerce')

# `.dt.year` yields NaN for NaT rows, which makes the column float-typed here.
df_clean['year'] = df_clean['publish_time'].dt.year

# For this analysis, we focus on papers with titles and abstracts.
df_clean = df_clean.dropna(subset=['title', 'abstract'])

# NaN years never satisfy `between`, so rows with an unparseable date are
# removed by this filter as well.
in_window = df_clean['year'].between(YEAR_MIN, YEAR_MAX)

# `.assign` returns a new frame (no chained-assignment warnings).
# - year: no NaN remains after the filter, so the cast to int is safe; it stops
#   downstream axes/labels from showing values such as "2019.0".
# - abstract_word_count: whitespace-delimited token count, vectorised instead
#   of a row-wise Python lambda. Every remaining abstract is non-null.
df_clean = df_clean.loc[in_window].assign(
    year=lambda d: d['year'].astype('int64'),
    abstract_word_count=lambda d: d['abstract'].astype(str).str.split().str.len(),
)

print(f"Cleaned dataset shape: {df_clean.shape}")

In [None]:
# 1. Publications by year
# Count papers per publication year, in chronological order.
year_counts = df_clean['year'].value_counts().sort_index()

# `year` is derived with `.dt.year` from a column that may contain NaT, so it
# can be float-typed. `value_counts` drops NaN by default, so casting the index
# to int is safe and makes the tick labels read "2020" instead of "2020.0".
year_counts.index = year_counts.index.astype(int)

fig, ax = plt.subplots(figsize=(10, 6))
year_counts.plot(kind='bar', ax=ax)
ax.set_title('Number of Publications by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# 2. Top journals
# The ten journals with the most papers, most frequent first.
top_journals = df_clean['journal'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 6))
top_journals.plot(kind='barh', ax=ax)

# A horizontal bar chart places the first row at the bottom of the axis, which
# would show the top-ranked journal last. Flip the axis so the ranking reads
# top-down, matching the order of `top_journals`.
ax.invert_yaxis()

ax.set_title('Top 10 Journals by Publication Count')
ax.set_xlabel('Count')
ax.set_ylabel('Journal')
fig.tight_layout()
plt.show()

In [None]:
# 3. Word cloud of titles
# Concatenate every title into one space-separated string for the generator.
title_text = ' '.join(df_clean['title'].dropna().astype(str))

# Word placement is randomised; a fixed `random_state` makes the figure
# identical on every Restart & Run All.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    random_state=42,
).generate(title_text)

fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')  # the cloud is an image; axis ticks carry no meaning
ax.set_title('Word Cloud of Paper Titles')
fig.tight_layout()
plt.show()

In [None]:
# 4. Abstract length distribution
# Histogram of per-paper abstract word counts across the cleaned frame.
word_counts = df_clean['abstract_word_count']

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(word_counts, bins=50, edgecolor='black')
ax.set_title('Distribution of Abstract Word Counts')
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()