# DBLP Anomaly Detection

**Team Member:** Julio Amaya

This notebook detects citation outliers, off-topic papers within venues, and atypical co-authorship cliques in the DBLP dataset.

## 1. Setup and Imports

In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import warnings

# Notebook-wide configuration: suppress all warnings and set plotting defaults.
warnings.filterwarnings(action='ignore')
plt.rc('figure', figsize=(12, 6))
sns.set_style('whitegrid')

# Resolve the working directory and put it first on the module search path.
project_root = Path('.').resolve()
sys.path.insert(0, str(project_root))

print('✓ Imports loaded')

## 2. Data Loading

In [None]:
# Parquet inputs are read from data/parquet; derived outputs go to data/derived,
# which is created here if it does not exist yet.
DATA_DIR = project_root.joinpath('data', 'parquet')
OUTPUT_DIR = project_root.joinpath('data', 'derived')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Load the four datasets used by the rest of the notebook.
papers = pd.read_parquet(DATA_DIR / 'papers')
citations_df = pd.read_parquet(DATA_DIR / 'citations')
coauthorships_df = pd.read_parquet(DATA_DIR / 'coauthorships')
authorships = pd.read_parquet(DATA_DIR / 'authorships')

# Report row counts, with labels padded to a common width so the numbers align.
for label, frame in (
    ('Papers:', papers),
    ('Citations:', citations_df),
    ('Coauthorships:', coauthorships_df),
    ('Authorships:', authorships),
):
    print(f'{label:<16}{len(frame):>10,} rows')

## 3. Citation Outliers: Extreme Citation Patterns

In [None]:
# Citation outliers: flag papers whose n_citation value or whose number of
# incoming citation edges lies more than 3 standard deviations from the mean.

# Number of citation edges pointing at each paper (keyed by dst_id).
in_degree = citations_df.groupby('dst_id').size().reset_index(name='in_degree')

citation_stats = papers[['id', 'title', 'year', 'venue', 'n_citation']].copy()
citation_stats = citation_stats.merge(in_degree, left_on='id', right_on='dst_id', how='left')

# Fill only the numeric count columns. A frame-wide fillna(0) would also turn
# missing title/year/venue values into the literal 0, which would then leak
# into the displayed table and the exported CSV.
citation_stats['in_degree'] = citation_stats['in_degree'].fillna(0).astype(int)
citation_stats['n_citation'] = citation_stats['n_citation'].fillna(0)

# +1 in the denominator avoids division by zero for papers with no incoming edges.
citation_stats['citation_ratio'] = citation_stats['n_citation'] / (citation_stats['in_degree'] + 1)

# Absolute z-scores; the scaler is refit for each column. fit_transform
# returns an (n, 1) array, so flatten it before storing it as a column.
scaler = StandardScaler()
citation_stats['z_citation'] = np.abs(scaler.fit_transform(citation_stats[['n_citation']])).ravel()
citation_stats['z_indegree'] = np.abs(scaler.fit_transform(citation_stats[['in_degree']])).ravel()

# A paper is an outlier if either z-score exceeds 3.
citation_outliers = citation_stats[(citation_stats['z_citation'] > 3) | (citation_stats['z_indegree'] > 3)].copy()

print(f'✓ Citation Outliers: {len(citation_outliers):,}')
print('\nTop 10 Citation Outliers:')
if len(citation_outliers) > 0:
    display(citation_outliers.nlargest(10, 'n_citation')[['title', 'year', 'venue', 'n_citation', 'in_degree']])

## 4. Venue Anomalies: Off-Topic Papers

In [None]:
# Venue anomalies: within each of the 20 most frequent venues, flag papers
# whose title TF-IDF vector is isolated by an IsolationForest.

# Reset the index after dropping rows so that index labels equal row positions.
# The TF-IDF matrix is indexed positionally, so stale labels from `papers`
# would select the wrong rows (or raise IndexError) once any row is dropped.
papers_clean = papers.dropna(subset=['title', 'venue']).reset_index(drop=True)
print(f'Papers with title and venue: {len(papers_clean):,}')

# One TF-IDF row per paper in papers_clean, in the same order.
vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(papers_clean['title'])

venue_anomalies = []
for venue in papers_clean['venue'].value_counts().head(20).index:
    venue_papers = papers_clean[papers_clean['venue'] == venue]
    if len(venue_papers) < 10:
        # Too few papers to fit a per-venue model.
        continue
    # After reset_index the labels are valid positional row indices.
    venue_tfidf = tfidf_matrix[venue_papers.index.to_numpy()]
    iso_forest = IsolationForest(contamination=0.05, random_state=42)
    scores = iso_forest.fit_predict(venue_tfidf.toarray())
    # fit_predict labels anomalies as -1; keep the ids of those papers.
    flagged_ids = venue_papers.loc[scores == -1, 'id']
    venue_anomalies.extend({'venue': venue, 'paper_id': paper_id} for paper_id in flagged_ids)

# Explicit columns keep the frame's schema stable even when nothing was flagged.
venue_anomalies_df = pd.DataFrame(venue_anomalies, columns=['venue', 'paper_id'])
print(f'✓ Venue Anomalies: {len(venue_anomalies_df):,}')

## 5. Co-authorship Cliques: Atypical Collaborations

In [None]:
# Co-authorship cliques: build an undirected author graph, enumerate the
# maximal cliques of its largest connected component, and record small ones.
print('Building co-authorship graph...')
G = nx.Graph()
edges = list(zip(coauthorships_df['author1_norm'], coauthorships_df['author2_norm']))
G.add_edges_from(edges)

print(f'✓ Graph: {G.number_of_nodes():,} authors, {G.number_of_edges():,} collaborations')

print('Computing cliques...')
# default=set() keeps this cell from raising ValueError when the graph has no
# nodes (e.g. an empty coauthorships table); the empty subgraph then simply
# yields no cliques.
largest_cc = max(nx.connected_components(G), key=len, default=set())
G_sub = G.subgraph(largest_cc).copy()

# All maximal cliques, largest first.
cliques = sorted(nx.find_cliques(G_sub), key=len, reverse=True)
print(f'✓ Total cliques: {len(cliques):,}')
if cliques:
    print(f'  Max clique size: {len(cliques[0])}')

# NOTE(review): only the 1000 largest cliques are inspected, and the size
# filter (3..8) is applied afterwards; if the 1000 largest are all bigger
# than 8 nothing is recorded. Confirm this ordering is intended.
# NOTE(review): a clique is a complete subgraph, so its density is at least
# 1.0 and the `> 0.7` test never rejects anything. Confirm the intended criterion.
anomaly_cliques = []
for clique in cliques[:1000]:
    if 3 <= len(clique) <= 8:
        sg = G_sub.subgraph(clique)
        density = nx.density(sg)
        if density > 0.7:
            anomaly_cliques.append({'size': len(clique), 'density': round(density, 3)})

if anomaly_cliques:
    df_cliques = pd.DataFrame(anomaly_cliques)
    print(f'✓ Dense cliques (density > 0.7): {len(df_cliques)}')
else:
    print('✓ No highly anomalous cliques detected')

## 6. Visualization

In [None]:
# Two side-by-side panels: the n_citation histogram (left) and in_degree vs
# n_citation with the flagged outliers highlighted (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, scatter_ax = axes

# Left panel: distribution of n_citation with a logarithmic frequency axis.
hist_ax.hist(citation_stats['n_citation'], bins=50, alpha=0.7, color='steelblue', edgecolor='black')
hist_ax.set(
    xlabel='Citation Count',
    ylabel='Frequency (log scale)',
    title='Citation Count Distribution',
)
hist_ax.set_yscale('log')
hist_ax.grid(alpha=0.3)

# Right panel: every paper as a small marker, outliers overlaid as large red ones.
scatter_ax.scatter(citation_stats['in_degree'], citation_stats['n_citation'], alpha=0.5, s=10, label='All papers')
if len(citation_outliers) > 0:
    outlier_style = {'color': 'red', 's': 100, 'alpha': 0.7, 'label': 'Outliers', 'edgecolor': 'darkred'}
    scatter_ax.scatter(citation_outliers['in_degree'], citation_outliers['n_citation'], **outlier_style)
scatter_ax.set(
    xlabel='In-Degree (Citation Edges)',
    ylabel='n_citation (Self-Reported)',
    title='Citation Patterns: In-Degree vs n_citation',
)
# Both axes are logarithmic.
scatter_ax.set_xscale('log')
scatter_ax.set_yscale('log')
scatter_ax.legend()
scatter_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Export Results

In [None]:
# Summary table: one row per anomaly type with the number of findings.
anomaly_counts = {
    'Citation Outliers': len(citation_outliers),
    'Venue Anomalies': len(venue_anomalies_df),
    'Co-authorship Cliques': len(anomaly_cliques),
}
summary_df = pd.DataFrame({
    'Type': list(anomaly_counts.keys()),
    'Count': list(anomaly_counts.values()),
})

banner = '=' * 60
print('\n' + banner)
print('ANOMALY DETECTION SUMMARY')
print(banner)
print(summary_df.to_string(index=False))
print(banner)

# Write the CSV outputs into OUTPUT_DIR.
outlier_columns = ['id', 'title', 'year', 'venue', 'n_citation', 'in_degree']
citation_outliers[outlier_columns].to_csv(OUTPUT_DIR / 'anomalies_citation_outliers.csv', index=False)
print(f'\n✓ Exported: anomalies_citation_outliers.csv ({len(citation_outliers)} rows)')

# The venue file is only written when at least one anomaly was found.
if len(venue_anomalies_df) > 0:
    venue_anomalies_df.to_csv(OUTPUT_DIR / 'anomalies_venue_offTopic.csv', index=False)
    print(f'✓ Exported: anomalies_venue_offTopic.csv ({len(venue_anomalies_df)} rows)')

summary_df.to_csv(OUTPUT_DIR / 'anomaly_detection_summary.csv', index=False)
print('✓ Exported: anomaly_detection_summary.csv')
print('\n✓ All results saved to data/derived/')