# In [None]:
import pandas as pd
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns


# ---------------------------------------------------------------------------
# Exploratory analysis of the CORD-19 metadata file
# ---------------------------------------------------------------------------

# Load data
# NOTE(review): csv_url is not used anywhere below -- the data is read from a
# local "metadata.csv". The URL appears to be the dataset's web page rather
# than a direct CSV link, so it is kept for reference only.
csv_url = "https://www.kaggle.com/datasets/allen-institute-for-ai/CORD-19-research-challenge?select=metadata.csv"

df = pd.read_csv("metadata.csv")

# Explore first rows
print(df.head())
print(df.shape)
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would emit a stray "None" line after the report.
df.info()

# Missing values per column, worst offenders first
print(df.isnull().sum().sort_values(ascending=False))

# Convert dates; unparseable values become NaT instead of raising
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')

# Extract year (float dtype whenever some dates are NaT)
df['year'] = df['publish_time'].dt.year

# Handle missing abstracts: fill with empty string so they count as 0 words
df['abstract'] = df['abstract'].fillna("")

# Create abstract word count (whitespace-separated tokens)
df['abstract_word_count'] = df['abstract'].apply(lambda x: len(str(x).split()))

# Publications per year
# Drop missing years and cast to int so tick labels read "2019", not "2019.0".
year_counts = df['year'].dropna().astype(int).value_counts().sort_index()
plt.figure(figsize=(10,5))
sns.barplot(x=year_counts.index, y=year_counts.values)
plt.title("Publications by Year")
plt.xticks(rotation=45)
plt.show()

# Top journals
top_journals = df['journal'].value_counts().head(10)
top_journals.plot(kind="bar", figsize=(10,5), title="Top Journals")
plt.tight_layout()
# Show this figure explicitly instead of relying on a later plt.show() call.
plt.show()

# Word Cloud from titles
titles = " ".join(df['title'].dropna().astype(str).values)
wc = WordCloud(width=800, height=400, background_color="white").generate(titles)
plt.figure(figsize=(12,6))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("Word Cloud of Paper Titles")
plt.show()

# Distribution by source
df['source_x'].value_counts().plot(kind="barh", figsize=(8,5), title="Publications by Source")
plt.tight_layout()
# Without this call the last figure is never displayed when run as a script.
plt.show()


# Load data
@st.cache_data
def load_data(path: str = "data/metadata.csv") -> pd.DataFrame:
    """Read the metadata CSV into a DataFrame.

    The function is wrapped in ``st.cache_data`` so Streamlit can reuse the
    parsed result across script reruns instead of re-reading the file.

    Args:
        path: Location of the CSV file. Defaults to ``"data/metadata.csv"``,
            the path this function originally had hard-coded.

    Returns:
        The raw table exactly as parsed by ``pandas.read_csv``; no columns
        are converted or added here.
    """
    return pd.read_csv(path)

# ---------------------------------------------------------------------------
# Streamlit dashboard
# ---------------------------------------------------------------------------
def _show_and_close(figure):
    """Render *figure* in the app, then close it.

    Figures created through pyplot stay registered until closed; since the
    script is re-executed on every widget interaction, leaving them open
    would make them pile up in memory.
    """
    st.pyplot(figure)
    plt.close(figure)


df = load_data()
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')
df['year'] = df['publish_time'].dt.year

# Title
st.title("CORD-19 Research Papers Explorer")
st.write("Interactive dashboard to explore COVID-19 research metadata.")

# Sidebar filters
# If no date could be parsed, min()/max() are NaN and int(NaN) would raise,
# so bail out with a readable message instead.
valid_years = df['year'].dropna()
if valid_years.empty:
    st.error("No valid publication dates found in the dataset.")
    st.stop()

min_year, max_year = int(valid_years.min()), int(valid_years.max())
# Keep the preferred 2019-2021 default window inside the available range.
default_range = (
    min(max(2019, min_year), max_year),
    min(max(2021, min_year), max_year),
)
if min_year < max_year:
    year_filter = st.sidebar.slider("Select Year", min_year, max_year, default_range)
else:
    # Only one year is present: a range slider has nothing to select.
    year_filter = (min_year, max_year)

# between() is inclusive on both ends; rows with a missing year are excluded.
filtered_df = df[df['year'].between(year_filter[0], year_filter[1])]

if filtered_df.empty:
    # Nothing to chart; the plots below cannot handle empty input.
    st.warning("No publications found for the selected years.")
    st.stop()

# Show data sample
st.subheader("Sample Data")
st.write(filtered_df.head())

# Publications per year
st.subheader("Publications by Year")
# filtered_df has no missing years, so the int cast is safe and gives
# tick labels like "2019" rather than "2019.0".
year_counts = filtered_df['year'].astype(int).value_counts().sort_index()
fig, ax = plt.subplots()
sns.barplot(x=year_counts.index, y=year_counts.values, ax=ax)
ax.set_title("Publications by Year")
_show_and_close(fig)

# Top Journals
st.subheader("Top Journals")
top_journals = filtered_df['journal'].value_counts().head(10)
if top_journals.empty:
    st.info("No journal information available for the selected years.")
else:
    fig, ax = plt.subplots()
    sns.barplot(y=top_journals.index, x=top_journals.values, ax=ax)
    ax.set_title("Top 10 Journals")
    _show_and_close(fig)

# Word Cloud
st.subheader("Word Cloud of Titles")
titles = " ".join(filtered_df['title'].dropna().astype(str).values)
try:
    wc = WordCloud(width=800, height=400, background_color="white").generate(titles)
except ValueError:
    # WordCloud raises ValueError when the text yields no words to draw.
    wc = None
    st.info("No title text available for the selected years.")
else:
    fig, ax = plt.subplots(figsize=(10,5))
    ax.imshow(wc, interpolation="bilinear")
    ax.axis("off")
    _show_and_close(fig)

# Source distribution
st.subheader("Publications by Source")
source_counts = filtered_df['source_x'].value_counts().head(10)
if source_counts.empty:
    st.info("No source information available for the selected years.")
else:
    fig, ax = plt.subplots()
    source_counts.plot(kind="bar", ax=ax)
    ax.set_title("Top Sources")
    _show_and_close(fig)

