In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Load the paper metadata table from the working directory.
df = pd.read_csv("metadata.csv")

# Quick look at the raw table: first rows, dimensions, and column summary.
print(df.head())
print("Shape:", df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called directly; wrapping it in print() emits a stray "None" line.
df.info()

# Keep only rows that have both a title and a publish time.
df = df.dropna(subset=["title", "publish_time"])

# Parse publish_time to datetime; values that cannot be parsed become NaT
# (errors='coerce') and are dropped right after.
df["publish_time"] = pd.to_datetime(df["publish_time"], errors='coerce')
df = df.dropna(subset=["publish_time"])
df["year"] = df["publish_time"].dt.year

# Whitespace-separated token count per abstract; missing abstracts count as 0.
df["abstract_word_count"] = df["abstract"].fillna("").apply(lambda x: len(x.split()))

# Number of papers per publication year, in chronological order.
year_counts = df["year"].value_counts().sort_index()

# Draw on an explicit figure/axes pair rather than pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=year_counts.index, y=year_counts.values, color="skyblue", ax=ax)
ax.set(
    title="Publications by Year",
    xlabel="Year",
    ylabel="Number of Publications",
)
ax.tick_params(axis="x", labelrotation=45)  # tilt the year labels
fig.tight_layout()
fig.savefig("pubs_by_year.png")  # saved next to the notebook before display
plt.show()

# The ten journals with the most papers (value_counts sorts descending).
top_journals = df["journal"].value_counts().head(10)

# Horizontal bars: journal names on the y axis, paper counts on the x axis.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=top_journals.values, y=top_journals.index, color="salmon", ax=ax)
ax.set(
    title="Top 10 Journals",
    xlabel="Number of Papers",
    ylabel="Journal",
)
fig.tight_layout()
fig.savefig("top_journals.png")
plt.show()

# Concatenate every title into one space-separated string for the word cloud.
titles = " ".join(df["title"].astype(str))
wc = WordCloud(width=800, height=400, background_color="white")
wc.generate(titles)  # lays out the cloud on this same object

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wc, interpolation="bilinear")
ax.axis("off")
ax.set_title("Most Frequent Words in Titles")
fig.tight_layout()
fig.savefig("title_wordcloud.png")
plt.show()

# The ten most frequent values of the source_x column.
source_counts = df["source_x"].value_counts().head(10)

# Horizontal bars: source names on the y axis, paper counts on the x axis.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=source_counts.values, y=source_counts.index, color="lightgreen", ax=ax)
ax.set(
    title="Top 10 Sources",
    xlabel="Number of Papers",
    ylabel="Source",
)
fig.tight_layout()
fig.savefig("top_sources.png")
plt.show()


In [None]:
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Page header: the app title followed by a one-line description.
st.title("CORD-19 Data Explorer")
st.write("Simple exploration of COVID-19 research papers")

@st.cache_data
def load_data():
    """Read ``metadata.csv`` and return the cleaned paper table.

    Cleaning steps:
      * drop rows with a missing ``title`` or ``publish_time``;
      * parse ``publish_time`` to datetime and drop rows that fail to parse;
      * add ``year`` (calendar year of ``publish_time``);
      * add ``abstract_word_count`` (whitespace-separated tokens in
        ``abstract``; a missing abstract counts as 0).

    Returns:
        pandas.DataFrame: the cleaned table with the two derived columns
        appended. The function is wrapped in ``st.cache_data`` so Streamlit
        can reuse the result instead of re-reading the CSV on every rerun.
    """
    papers = pd.read_csv("metadata.csv").dropna(subset=["title", "publish_time"])

    # Unparseable dates become NaT (errors="coerce") and are removed below.
    parsed_dates = pd.to_datetime(papers["publish_time"], errors="coerce")
    papers = papers.assign(publish_time=parsed_dates).dropna(subset=["publish_time"])

    abstracts = papers["abstract"].fillna("")
    return papers.assign(
        year=papers["publish_time"].dt.year,
        abstract_word_count=abstracts.apply(lambda text: len(text.split())),
    )

# Cleaned paper table (see load_data).
df = load_data()

# With no rows, min()/max() of "year" are NaN and int() would raise, so stop
# the script early with a readable message instead of a traceback.
if df.empty:
    st.warning("No papers with a title and a valid publish date were found.")
    st.stop()

# Year-range slider spanning every year present in the data; defaults to the
# full range.
year_min, year_max = int(df['year'].min()), int(df['year'].max())
selected_years = st.slider("Select year range", year_min, year_max, (year_min, year_max))

# Inclusive filter on both ends of the selected range.
filtered = df[(df["year"] >= selected_years[0]) & (df["year"] <= selected_years[1])]

# Years can be sparse, so a sub-range may match no papers at all. Nothing
# below is meaningful on an empty selection, so report it and halt this run.
if filtered.empty:
    st.info("No papers were published in the selected year range.")
    st.stop()

st.subheader("Sample of Data")
st.dataframe(filtered[["title","journal","publish_time"]].head(10))

st.subheader("Publications by Year")
# Papers per year within the current selection, in chronological order.
year_counts = filtered["year"].value_counts().sort_index()

if year_counts.empty:
    # Nothing to plot for this selection; say so instead of drawing a blank chart.
    st.info("No publications to chart for the current selection.")
else:
    fig, ax = plt.subplots()
    sns.barplot(x=year_counts.index, y=year_counts.values, ax=ax, color="skyblue")
    ax.set_xlabel("Year")
    ax.set_ylabel("Number of Publications")
    st.pyplot(fig)
    # Figures made with plt.subplots() stay registered with pyplot until they
    # are closed; the script reruns on every interaction, so release each one
    # after rendering to avoid accumulating open figures.
    plt.close(fig)

st.subheader("Top 10 Journals")
# Ten most frequent journals in the current selection; value_counts skips
# missing journal values, so this can be empty even when rows exist.
top_journals = filtered["journal"].value_counts().head(10)

if top_journals.empty:
    # No journal names available; say so instead of drawing a blank chart.
    st.info("No journal information available for the current selection.")
else:
    fig, ax = plt.subplots()
    sns.barplot(y=top_journals.index, x=top_journals.values, ax=ax, color="salmon")
    ax.set_xlabel("Number of Papers")
    ax.set_ylabel("Journal")
    st.pyplot(fig)
    # Close the figure once rendered: the script reruns on every interaction
    # and unclosed figures would otherwise pile up in pyplot's registry.
    plt.close(fig)

st.subheader("Word Cloud of Titles")
# Concatenate the selected titles into one space-separated string.
titles = " ".join(filtered["title"].astype(str).tolist())

try:
    wc = WordCloud(width=800, height=400, background_color="white").generate(titles)
except ValueError:
    # WordCloud raises ValueError when the text yields no usable words (for
    # example an empty selection); show a message instead of a traceback.
    st.info("Not enough title text to build a word cloud for the current selection.")
else:
    fig, ax = plt.subplots(figsize=(8,4))
    ax.imshow(wc, interpolation="bilinear")
    ax.axis("off")
    st.pyplot(fig)
    # Close the figure once rendered so reruns do not accumulate open figures.
    plt.close(fig)
