# CORD-19 Data Analysis
Exploring COVID-19 research papers from the CORD-19 metadata.csv dataset.

In [None]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Use seaborn's 'whitegrid' theme for every figure drawn below
sns.set_style(style='whitegrid')

## Step 1: Load the Dataset

In [None]:
# Load metadata.csv (relative path, resolved against the current working directory)
try:
    df = pd.read_csv('metadata.csv')
except FileNotFoundError as err:
    # Everything below needs `df`. Stop here with a clear message instead of
    # printing and then failing on the next line with an unrelated NameError.
    raise FileNotFoundError(
        'metadata.csv not found. Make sure it is in the same folder.'
    ) from err
else:
    print('Dataset loaded successfully!')

# Inspect first few rows
df.head()

## Step 2: Basic Exploration

In [None]:
# Shape of dataset
n_rows, n_cols = df.shape
print(f'Total rows: {n_rows}, Total columns: {n_cols}')

# Data types and info (info() prints its own report)
df.info()

# Missing values per column. Printed explicitly: a bare expression in the
# middle of a cell is evaluated and then thrown away, so the counts would
# otherwise never be shown.
print(df.isnull().sum())

# Basic statistics (if any numeric columns exist); as the last expression of
# the cell this is what the notebook renders as the cell output.
df.describe()

## Step 3: Data Cleaning

In [None]:
# Drop rows with missing title or publish_time.
# .copy() gives df_clean its own data, so the column assignments below are
# unambiguous writes to df_clean and do not trigger pandas'
# SettingWithCopyWarning about assigning into a slice of df.
df_clean = df.dropna(subset=['title', 'publish_time']).copy()

# Convert publish_time to datetime; values that cannot be parsed become NaT
df_clean['publish_time'] = pd.to_datetime(df_clean['publish_time'], errors='coerce')

# Extract publication year (missing wherever publish_time is NaT)
df_clean['year'] = df_clean['publish_time'].dt.year

# Abstract word count (optional); a missing abstract counts as 0 words
df_clean['abstract_word_count'] = df_clean['abstract'].fillna('').apply(lambda x: len(x.split()))

df_clean.head()

## Step 4: Data Analysis

In [None]:
# Number of papers per publication year, in chronological order
year_counts = df_clean['year'].value_counts().sort_index()

# Draw on an explicit Axes instead of pyplot's implicit "current figure"
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(year_counts.index, year_counts.values, color='skyblue')
ax.set_title('Publications by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Papers')
plt.show()

In [None]:
# The ten most frequent values of the 'journal' column
top_journals = df_clean['journal'].value_counts().head(10)

# Bar chart on an explicit Axes; labels are set after plotting so they
# replace whatever pandas puts on the axes by default
fig, ax = plt.subplots(figsize=(10, 5))
top_journals.plot.bar(color='salmon', ax=ax)
ax.set_title('Top Journals Publishing COVID-19 Papers')
ax.set_xlabel('Journal')
ax.set_ylabel('Number of Papers')
plt.show()

In [None]:
# Most frequent words in titles

# Common English function words. Without this filter the ranking is dominated
# by words such as 'of' and 'the' instead of subject terms.
title_stop_words = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'in',
    'into', 'is', 'it', 'its', 'of', 'on', 'or', 'that', 'the', 'their',
    'this', 'to', 'using', 'via', 'was', 'were', 'with',
}
# Characters stripped from both ends of a token, so that e.g. 'covid-19:' and
# '(covid-19)' are counted together with 'covid-19'.
title_punctuation = '.,:;!?()[]{}"\''

# astype(str) guards against non-string titles, for which .str.lower() would
# yield NaN and break the iteration below.
titles = df_clean['title'].dropna().astype(str).str.lower().str.split()
all_words = [
    token
    for title_tokens in titles
    for token in (raw.strip(title_punctuation) for raw in title_tokens)
    if token and token not in title_stop_words
]
word_freq = Counter(all_words)
common_words = word_freq.most_common(20)

if common_words:
    words, counts = zip(*common_words)
    plt.figure(figsize=(12, 5))
    # One viridis colour per bar. Drawn with plt.bar because passing `palette`
    # to sns.barplot without `hue` is deprecated in recent seaborn releases.
    plt.bar(list(words), list(counts), color=sns.color_palette('viridis', len(words)))
    plt.xticks(rotation=45)
    plt.title('Top 20 Most Frequent Words in Titles')
    plt.show()
else:
    # zip(*[]) cannot be unpacked into two names; skip the plot instead of raising.
    words, counts = (), ()
    print('No title words to plot.')

In [None]:
# How many papers each value of 'source_x' contributes, most frequent first
source_counts = df_clean['source_x'].value_counts()
source_counts.plot.bar(
    figsize=(8, 4),
    color='lightgreen',
    title='Paper Counts by Source',
)
plt.show()

## Step 5: Observations

- Publications increased significantly in 2020 and 2021 due to COVID-19.
- The most frequent entries in the `journal` column include `medRxiv`, `bioRxiv`, and others (these two are preprint servers rather than peer-reviewed journals).
- Common words in titles include 'covid', 'coronavirus', 'sars', etc.
- Most papers come from preprint servers like `medRxiv` and `bioRxiv`.