# Combine data

In [1]:
import pandas as pd

In [3]:
# The scrape was written out as four CSV shards; load each one.
shard_paths = (
    '../results_0.csv',
    '../results_1.csv',
    '../results_2.csv',
    '../results_3.csv',
)
results_0, results_1, results_2, results_3 = [
    pd.read_csv(shard_path) for shard_path in shard_paths
]

# Stack the shards, key the rows by package name and order them alphabetically.
combined = pd.concat([results_0, results_1, results_2, results_3])
results = combined.set_index('Package').sort_index()

# Row count before removing incomplete rows.
print(len(results))

# Keep only rows in which every column has a value.
results = results.dropna()

# Row count after removing incomplete rows.
print(len(results))

25789
1081


In [4]:
# Persist the combined table; the 'Package' index is written out as the first column.
results.to_csv(path_or_buf='../results.csv', index=True)

# Make graph

In [5]:
# Reload the saved table from disk (rows get a fresh default integer index,
# with 'Package' as an ordinary column) and preview the first five rows.
results = pd.read_csv(filepath_or_buffer='../results.csv')
results.head(n=5)

Unnamed: 0,Package,DOI,Description,Version,In Bioconductor since,Imports,Suggests,biocViews,URL,Authors,Maintainer,Unique_IPs_2024,Downloads_2024
0,ACE,10.18129/B9.bioc.ACE,Uses segmented copy number data to estimate tu...,1.24.0,BioC 3.8 (R-3.5) (6.5 years),"Biobase , QDNAseq , ggplot2 , grid, stats, uti...","knitr , rmarkdown , BiocStyle","CopyNumberVariation , Coverage , DNASeq , Sequ...",https://github.com/tgac-vumc/ACE,Jos B PoellMaintainer:Jos B Poell <j.poell at ...,Jos B Poell <j.poell at amsterdamumc.nl>,1221.0,4713.0
1,ADAMgui,10.18129/B9.bioc.ADAMgui,ADAMgui is a Graphical User Interface for the ...,1.22.0,BioC 3.9 (R-3.6) (6 years),"GO.db (>= 3.5.0), dplyr (>= 0.7.6), shiny (>= ...","markdown , BiocStyle","GeneSetEnrichment , KEGG , Pathways , Software",TBA,"Giordano Bruno Sanches Seco [aut], AndrÃ© Luiz...",Jose Luiz Rybarczyk Filho <jose.luiz at unesp.br>,1209.0,4478.0
2,ADaCGH2,10.18129/B9.bioc.ADaCGH2,Analysis and plotting of array CGH data. Allow...,2.46.0,BioC 2.7 (R-2.12) (14.5 years),"bit , DNAcopy , tilingArray , waveslim , clust...","CGHregions , Cairo , limma","CopyNumberVariants , Microarray , Software",https://github.com/rdiaz02/adacgh2,Ramon Diaz-Uriarte <rdiaz02 at gmail.com> and ...,Ramon Diaz-Uriarte <rdiaz02 at gmail.com>,1294.0,5147.0
3,AHMassBank,10.18129/B9.bioc.AHMassBank,Supplies AnnotationHub with MassBank metabolit...,1.6.1,BioC 3.17 (R-4.3) (2 years),AnnotationHubData (>= 1.5.24),"BiocStyle , knitr , AnnotationHub (>= 2.7.13),...","AnnotationHubSoftware , MassSpectrometry , Sof...",https://github.com/jorainer/AHMassBank,Johannes Rainer [cre]Maintainer:Johannes Raine...,Johannes Rainer <Johannes.Rainer at eurac.edu>,772.0,2376.0
4,ALDEx2,10.18129/B9.bioc.ALDEx2,A differential abundance analysis for the comp...,1.38.0,BioC 3.0 (R-3.1) (10.5 years),"Rfast , BiocParallel , GenomicRanges , IRanges...","testthat , BiocStyle , knitr , rmarkdown , pur...","Bayesian , ChIPSeq , DNASeq , DifferentialExpr...",https://github.com/ggloor/ALDEx_bioc,"Greg Gloor, Andrew Fernandes, Jean Macklaim, A...",Greg Gloor <ggloor at uwo.ca>,12034.0,24743.0


In [7]:
import networkx as nx

# Set to True to also draw an edge for every entry in the 'Suggests' column.
INCLUDE_SUGGESTS = False
OUTPUT_NAME = '../biocgraph.gexf'


def _dependency_names(field):
    """Return the bare package names listed in an Imports/Suggests field.

    Entries are comma-separated and may carry a version constraint, e.g.
    ``'GO.db (>= 3.5.0), dplyr (>= 0.7.6)'``; only the name part is kept.
    A non-string field (such as NaN) and empty entries yield no names.
    """
    if not isinstance(field, str):
        return []
    names = []
    for entry in field.split(','):
        # Drop any '(>= x.y)' constraint, then keep the first whitespace token.
        tokens = entry.split('(')[0].split()
        if tokens:
            names.append(tokens[0])
    return names


def _split_bioc_since(text):
    """Split an 'In Bioconductor since' value into its first and third parts.

    ``'BioC 3.8 (R-3.5) (6.5 years)'`` -> ``('BioC 3.8', '6.5 years')``.
    The middle (R version) part is not used. If the trailing parenthesised
    part is absent, an empty string is returned in its place instead of
    raising.
    """
    parts = [part.strip().rstrip(')').strip() for part in str(text).split('(')]
    bioc_version = parts[0]
    bioc_year = parts[2] if len(parts) >= 3 else ''
    return bioc_version, bioc_year


# Build a directed graph with one edge package -> dependency.
G = nx.DiGraph()

for _, row in results.iterrows():
    # Read the name from the row itself: the label yielded by iterrows() is an
    # index label, not a position, so it must not be fed to .iloc.
    package = row['Package']

    bioc_version, bioc_year = _split_bioc_since(row['In Bioconductor since'])

    # Create (or update) the node explicitly so its attributes are stored even
    # when the package ends up with no outgoing edges.
    G.add_node(
        package,
        Version=row['Version'],
        BioC_version=bioc_version,
        BioC_year=bioc_year,
        Description=row['Description'],
        Unique_IPs_2024=row['Unique_IPs_2024'],
        biocViews=row['biocViews'],
        URL=row['URL'],
        DOI=row['DOI'],
        Authors=row['Authors'],
    )

    for dependency in _dependency_names(row['Imports']):
        G.add_edge(package, dependency)

    if INCLUDE_SUGGESTS:
        for suggestion in _dependency_names(row['Suggests']):
            G.add_edge(package, suggestion)

# Save the graph in GEXF format.
nx.write_gexf(G, OUTPUT_NAME)