In [None]:
# CORD-19 Dataset Analysis
# Analysis of Bias in COVID-19 Research Publications

# %% [markdown]
# # 1. Setup and Data Loading
# First, we'll import necessary libraries and load our dataset. We'll also perform initial data cleaning and preparation.

# %%
# Import required libraries
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Set style for visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# Matplotlib 3.8 removed the old names, so plt.style.use('seaborn') raises OSError
# on current releases. Use whichever name this installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")

# %% [markdown]
# ## 1.1 Load and Examine Data
# We'll load the CORD-19 dataset and examine its basic properties.

# %%
# Read every identifier / file-reference column as text so pandas does not
# try to infer a numeric type for them.
dtype_dict = dict.fromkeys(
    [
        'sha',
        'doi',
        'pmcid',
        'pubmed_id',
        'who_covidence_id',
        'arxiv_id',
        'pdf_json_files',
        'pmc_json_files',
    ],
    str,
)

# Load the metadata table from the working directory
df = pd.read_csv('metadata.csv', dtype=dtype_dict)

# Quick overview: dimensions, column names, and per-column missing-value counts
print("Dataset Shape:", df.shape)
print("\nColumns:", list(df.columns))
print("\nMissing Values:")
print(df.isna().sum())

# %% [markdown]
# # 2. Journal Analysis
# We'll analyze the distribution of publications across journals, including impact factors and accessibility.

# %%
# Number of records per journal, most frequent first
journal_counts = df['journal'].value_counts()

# Bar chart of the 20 journals with the most records
plt.figure(figsize=(15, 8))
journal_counts.iloc[:20].plot.bar().set(
    title='Top 20 Journals by Number of Publications',
    xlabel='Journal',
    ylabel='Number of Publications',
)
# Slant the journal names so long labels do not overlap
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# %% [markdown]
# ## 2.1 Journal Impact Analysis
# The cell below only defines an example lookup table of journal impact factors; it does not yet relate them to publication counts.
# Note: Replace the example values with actual impact-factor data before using them in an analysis.

# %%
# Example impact factors keyed by journal name (placeholder values; replace with actual data).
# NOTE(review): values are mixed-type -- floats for journals, the string 'Preprint'
# for preprint servers -- so any numeric use must filter out the string entries first.
impact_factors = {
    **{
        'Nature': 49.962,
        'Science': 41.845,
        'The Lancet': 79.321,
        'PLOS ONE': 3.240,
    },
    # Preprint servers share the same non-numeric marker
    **dict.fromkeys(('bioRxiv', 'medRxiv'), 'Preprint'),
}

# %% [markdown]
# # 3. Temporal Analysis
# Analyze how publication patterns changed over time

# %%
# Convert publish_time to datetime; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')
# Rows whose date is NaT get NaN in these derived columns.
df['year'] = df['publish_time'].dt.year
df['month'] = df['publish_time'].dt.month

# Create monthly publication counts.
# Group on the derived columns so the resulting MultiIndex levels are named
# 'year' and 'month'. Grouping on two Series that are both named 'publish_time'
# yields duplicate level names, which makes monthly_counts.reset_index() fail,
# and it recomputes values that were just stored above.
# Rows without a parsed date are left out of the counts (groupby drops NaN keys).
monthly_counts = df.groupby(['year', 'month']).size()

# Plot temporal distribution
plt.figure(figsize=(15, 6))
monthly_counts.plot(kind='line')
plt.title('Number of Publications Over Time')
plt.xlabel('Time')
plt.ylabel('Number of Publications')
plt.grid(True)
plt.show()

# %% [markdown]
# # 4. Accessibility Analysis
# Analyze the accessibility of research papers

# %%
# Percentage of records that have a non-missing value in each access-related column
accessibility_metrics = {
    label: df[column].notna().mean() * 100
    for label, column in (
        ('Has URL', 'url'),
        ('Has PDF', 'pdf_json_files'),
        ('Has PMC', 'pmc_json_files'),
        ('Has DOI', 'doi'),
    )
}

# Bar chart with one bar per metric
plt.figure(figsize=(10, 6))
plt.bar(list(accessibility_metrics), list(accessibility_metrics.values()))
plt.gca().set(
    title='Research Accessibility Metrics',
    ylabel='Percentage of Papers',
)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# %% [markdown]
# # 5. Topic Analysis
# Perform topic modeling to understand research focus areas

# %%
# Titles form the corpus for topic modeling; records without a title are skipped
titles = df['title'].dropna()

# Bag-of-words term counts with English stop words removed; max_df / min_df
# prune terms by how many titles they appear in
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
doc_term_matrix = vectorizer.fit_transform(titles)

# Fit a 5-topic LDA model; random_state is fixed so reruns are reproducible
lda = LatentDirichletAllocation(
    n_components=5,
    random_state=42,
).fit(doc_term_matrix)

# Display top words for each topic
def print_topics(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (each must support ``argsort()``).
        feature_names: Sequence mapping a term index to its string.
        n_top_words: Number of terms to list per topic.

    For each topic this prints a ``Topic <k>:`` header (1-based), one line
    with the terms joined by spaces in descending weight order, and a blank
    line.
    """
    for number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it and keep the leading entries
        strongest = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[idx] for idx in strongest]
        print(f"Topic {number}:")
        print(" ".join(words))
        print()

# Show the 10 highest-weighted title terms for each fitted topic
print_topics(
    model=lda,
    feature_names=vectorizer.get_feature_names_out(),
    n_top_words=10,
)

# %% [markdown]
# # 6. Statistical Analysis and Validation
# Perform statistical tests to validate our findings

# %%
# Example: Chi-square test of independence between two presence flags --
# whether a record has a journal value and whether it has a pdf_json_files value.
# (chi2_contingency is imported with the other libraries in the import cell, so
# the notebook's dependencies are all declared in one place.)

# Create contingency table of journal-present vs. pdf_json_files-present
journal_type = pd.crosstab(df['journal'].notna(), df['pdf_json_files'].notna())
# Returns the test statistic, its p-value, the degrees of freedom, and the
# table of expected frequencies under independence
chi2, p_value, dof, expected = chi2_contingency(journal_type)

print("Chi-square test results:")
print(f"Chi-square statistic: {chi2}")
print(f"p-value: {p_value}")
# The statistic is only interpretable alongside its degrees of freedom
print(f"Degrees of freedom: {dof}")

