In [None]:
import pandas as pd

# Read the cleaned metadata file (data/cleaned_metadata.csv) into a DataFrame
metadata_path = 'data/cleaned_metadata.csv'
df = pd.read_csv(metadata_path)

# Preview the first five rows as the cell's rich output
df.head(5)


In [None]:
# Display shape and info
print("Data Shape:", df.shape)
print("\nData Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()

# Check missing values (count of nulls per column)
print("\nMissing Values by Column:")
print(df.isnull().sum())

# Display basic stats for numerical columns
print("\nNumerical Statistics:")
print(df.describe())


In [None]:
# Convert publish_time to datetime; values that cannot be parsed become NaT
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')

# Extract year (missing wherever publish_time is NaT)
df['year'] = df['publish_time'].dt.year

# Create abstract word count column.
# Missing abstracts are replaced with '' before counting: str(NaN) is the
# literal string 'nan', which would otherwise be counted as a one-word abstract.
df['abstract_word_count'] = (
    df['abstract']
    .fillna('')
    .astype(str)   # keep the original str(x) handling of any non-string values
    .str.split()   # split on runs of whitespace, like str.split() with no args
    .str.len()
)

# Check cleaned data
df.info()


In [None]:
import matplotlib.pyplot as plt

# Number of papers per publication year, ordered chronologically
year_counts = df['year'].value_counts().sort_index()

# Bar chart of the yearly totals, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(year_counts.index, year_counts.values, color='skyblue')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Publications')
ax.set_title('Publications by Year')
plt.show()


In [None]:
# The ten journals with the most papers (value_counts sorts descending)
top_journals = df['journal'].value_counts().head(10)

# Bar chart of those journals, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 5))
top_journals.plot.bar(
    ax=ax,
    color='orange',
    title='Top Journals Publishing COVID-19 Research',
)
ax.set_ylabel('Number of Publications')
plt.show()


In [None]:
from collections import Counter
import re

# Merge every non-missing title into one space-separated string
all_titles = ' '.join(df['title'].dropna())

# Lowercase the text and extract each run of word characters
words = re.findall(r'\w+', all_titles.lower())

# Tally the tokens and keep the 20 most frequent (word, count) pairs
word_freq = Counter(words).most_common(20)

print("Top 20 Words in Titles:", word_freq, sep="\n")


In [None]:
from wordcloud import WordCloud

# Build a word cloud image from the combined title text
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(all_titles)

# Show the image on an explicit Axes with the axis lines and ticks hidden
fig, ax = plt.subplots(figsize=(15, 7))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title("Word Cloud of Paper Titles")
plt.show()


In [None]:
# Key insights (each value is held in the variable named after the colon):
# - Total number of papers: df.shape[0]
# - Journals with the most publications: top_journals
# - Publication trend by year: year_counts
# - Most common words in titles: word_freq
#
# Challenges:
# - Handling missing data
# - The dataset is large, so processing can be slow
# - Cleaning the abstract text
