# CORD-19 Metadata Exploration
This notebook walks through the assignment: loading `metadata.csv`, cleaning, basic analysis, visualizations, and a short reflection.

Place `metadata.csv` in the same folder as this notebook or update the path in the loader cell.

In [None]:
# Imports and plotting setup
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from data_analysis import load_data, clean_data, publications_by_year, top_journals, title_word_frequency, plot_publications_by_year, plot_top_journals, plot_title_wordcloud, source_counts, plot_source_distribution

sns.set(style="whitegrid")
pd.options.display.max_columns = 50
%matplotlib inline

In [None]:
# Read a capped sample of the metadata file (raise or remove the cap via `nrows`
# if the machine has enough memory for the full file).
path = 'metadata.csv'
nrows = 2000  # set to None to read all rows

try:
    df = load_data(path, nrows=nrows)
except FileNotFoundError:
    # Leave a None sentinel so the later cells can skip themselves instead of crashing.
    df = None
    print(f'File not found: {path}. Please place metadata.csv in the notebook folder or update `path`.')
else:
    print('Loaded', len(df), 'rows from', path)

In [None]:
# Quick inspection: shape, first rows, column dtypes, and the DataFrame.info() summary.
# The whole cell is skipped when the loader cell could not find the file (df is None).
if df is not None:
    print('Shape:', df.shape)
    display(df.head(3))
    # The '\n' escape gives a blank separator line while keeping the string literal
    # on one source line (a raw line break inside '...' is a SyntaxError).
    print('\nColumn dtypes:')
    display(df.dtypes)
    print('\nBasic info:')
    df.info()  # prints its report directly, so it is not wrapped in display()

In [None]:
# Rank columns by how many missing entries they contain and show the 30 worst offenders
if df is not None:
    missing_counts = df.isna().sum().sort_values(ascending=False)
    display(missing_counts.iloc[:30])

In [None]:
# Clean the data using the helper and compare sizes.
# `clean_data` is imported from the local `data_analysis` module; what it drops or
# rewrites is not visible from this notebook.
if df is not None:  # df is None when the loader cell could not find the file
    before = len(df)  # row count captured before the helper runs
    df_clean = clean_data(df)
    after = len(df_clean)
    print(f'Rows before: {before:,}  after cleaning: {after:,}')
    display(df_clean.head(3))  # preview the first three cleaned rows

In [None]:
# Tabulate publications per year with the helper, then chart the same counts
if df is not None:
    yearly_counts = publications_by_year(df_clean)
    display(yearly_counts)
    year_fig = plot_publications_by_year(yearly_counts)
    display(year_fig)

In [None]:
# Show the 20 most frequent journals as a table, followed by the matching chart
if df is not None:
    journal_ranking = top_journals(df_clean, top_n=20)
    display(journal_ranking)
    journal_fig = plot_top_journals(journal_ranking)
    display(journal_fig)

In [None]:
# Most common title words (top 50 computed, first 20 shown) plus an optional word cloud
if df is not None:
    word_freq = title_word_frequency(df_clean, top_n=50)
    display(word_freq[:20])
    # The word cloud is best-effort: any failure while building or showing it is
    # reported as a message instead of stopping the notebook.
    try:
        cloud_fig = plot_title_wordcloud(word_freq)
        display(cloud_fig)
    except Exception as err:
        print('Word cloud not available:', err)

In [None]:
# Count papers per source, list the first 20 entries, and chart the distribution
if df is not None:
    per_source = source_counts(df_clean)
    display(per_source.head(20))
    source_fig = plot_source_distribution(per_source)
    display(source_fig)

## Reflection and next steps
- This notebook loaded and cleaned the CORD-19 `metadata.csv` and produced simple visualizations.
- Next steps: expand stopwords, use a larger sample or all rows, add interactive widgets using Streamlit (see `app.py`), and explore abstracts with NLP.
- Optional: include a small `metadata_sample.csv` (50 rows) so the notebook and Streamlit app can be run immediately.