## Load RDFLib 4.2.2

In [1]:
from rdflib import Graph

In [2]:
!pip show rdflib

Name: rdflib
Version: 4.2.2
Summary: RDFLib is a Python library for working with RDF, a simple yet powerful language for representing information.
Home-page: https://github.com/RDFLib/rdflib
Author: RDFLib Team
Author-email: rdflib-dev@google.com
License: https://raw.github.com/RDFLib/rdflib/master/LICENSE
Location: /Users/wdduncan/opt/anaconda3/lib/python3.7/site-packages
Requires: isodate, pyparsing
Required-by: -iolinkml, SPARQLWrapper, sparql-slurper, rdflib-jsonld, PyShExC, PyShEx, CFGraph, biolinkml


## Parse RDF/XML version of GO
#### go.owl is saved locally
#### takes ~2 min 8 sec on my machine 

In [3]:
graph1 = Graph()

In [4]:
%time graph1.parse("go.owl")

CPU times: user 2min 28s, sys: 1.31 s, total: 2min 29s
Wall time: 2min 30s


<Graph identifier=N4c8f06bc8a3a44a897b29ba129c17831 (<class 'rdflib.graph.Graph'>)>

## Parse turtle version of GO
#### go.ttl is saved locally
#### takes ~1 min 42 sec on my machine 

In [5]:
graph2 = Graph()

In [6]:
%time graph2.parse("go.ttl", format="turtle")

CPU times: user 1min 56s, sys: 596 ms, total: 1min 56s
Wall time: 1min 56s


<Graph identifier=N9125062b679741989b52b5de987ab78e (<class 'rdflib.graph.Graph'>)>

## Save version of GO as N-triples

In [7]:
%time graph2.serialize("go.nt", format="nt")

CPU times: user 10.4 s, sys: 430 ms, total: 10.8 s
Wall time: 10.8 s


## Parse N-triples version of GO
#### go.nt is saved locally
#### takes ~1 min 8 sec on my machine 

In [8]:
graph3 = Graph()

In [9]:
%time graph3.parse("go.nt", format="nt")

CPU times: user 1min 20s, sys: 1.09 s, total: 1min 21s
Wall time: 1min 21s


<Graph identifier=N7a0a9dec97624834be683cd95ecf0042 (<class 'rdflib.graph.Graph'>)>

## Parse RDF/XML version of GO-Plus
#### go-plus.owl is saved locally
#### takes ~3 min 18 sec on my machine 

In [10]:
graph4 = Graph()

In [11]:
%time graph4.parse("go-plus.owl")

CPU times: user 4min 4s, sys: 2.5 s, total: 4min 7s
Wall time: 4min 7s


<Graph identifier=Nce91dc9dabc84739bebd0d1cd299fba0 (<class 'rdflib.graph.Graph'>)>

## Save version of GO-Plus as N-triples

In [12]:
%time graph4.serialize("go-plus.nt", format="nt")

CPU times: user 15.7 s, sys: 604 ms, total: 16.3 s
Wall time: 16.4 s


## Parse N-triples version of GO-Plus
#### go-plus.nt is saved locally
#### takes ~1 min 50 sec on my machine 

In [13]:
graph5 = Graph()

In [14]:
%time graph5.parse("go-plus.nt", format="nt")

CPU times: user 1min 50s, sys: 1.57 s, total: 1min 51s
Wall time: 1min 51s


<Graph identifier=Nae87276a360f4cb7a61ce8af64856f8c (<class 'rdflib.graph.Graph'>)>

## Split GO-Plus in into multiple files

In [15]:
from fsplit.filesplit import FileSplit

In [16]:
!pip show filesplit

Name: filesplit
Version: 2.0.0
Summary: Module to split file of any size into multiple chunks
Home-page: https://github.com/ram-jayapalan/filesplit
Author: Ram Prakash Jayapalan
Author-email: ramp16888@gmail.com
License: UNKNOWN
Location: /Users/wdduncan/opt/anaconda3/lib/python3.7/site-packages
Requires: 
Required-by: 


In [17]:
fs = FileSplit(file="go-plus.nt", splitsize=50000000, output_dir="./go-plus-files/") # split into 50M chuncks
fs.split()

In [18]:
!ls -l ./go-plus-files/

total 676888
-rw-r--r--  1 wdduncan  staff  49999978 Mar  3 17:09 go-plus_1.nt
-rw-r--r--  1 wdduncan  staff  49999984 Mar  3 17:09 go-plus_2.nt
-rw-r--r--  1 wdduncan  staff  49999883 Mar  3 17:09 go-plus_3.nt
-rw-r--r--  1 wdduncan  staff  49999841 Mar  3 17:09 go-plus_4.nt
-rw-r--r--  1 wdduncan  staff  49999990 Mar  3 17:09 go-plus_5.nt
-rw-r--r--  1 wdduncan  staff  49999177 Mar  3 17:09 go-plus_6.nt
-rw-r--r--  1 wdduncan  staff  46550922 Mar  3 17:09 go-plus_7.nt


## Parse each GO-Plus split and combine

In [19]:
import os

In [20]:
go_plus_files = [f"go-plus-files/{f}" for f in os.listdir("./go-plus-files/")]
go_plus_files

['go-plus-files/go-plus_4.nt',
 'go-plus-files/go-plus_1.nt',
 'go-plus-files/go-plus_5.nt',
 'go-plus-files/go-plus_2.nt',
 'go-plus-files/go-plus_6.nt',
 'go-plus-files/go-plus_7.nt',
 'go-plus-files/go-plus_3.nt']

In [21]:
graph6 = Graph()

In [22]:
%%time
temp_graph = Graph()
for go_plus_file in [f"go-plus-files/{f}" for f in os.listdir("./go-plus-files/")]:
    temp_graph.parse(go_plus_file, format="nt")
    graph6 += temp_graph

CPU times: user 5min 16s, sys: 2.12 s, total: 5min 18s
Wall time: 5min 19s


In [23]:
%%time
graph6_1 = Graph()
graph6_2 = Graph()
graph6_3 = Graph()
graph6_4 = Graph()
graph6_5 = Graph()
graph6_6 = Graph()
graph6_7 = Graph()

graph6_1.parse("go-plus-files/go-plus_1.nt", format="nt")
graph6_2.parse("go-plus-files/go-plus_2.nt", format="nt")
graph6_3.parse("go-plus-files/go-plus_3.nt", format="nt")
graph6_4.parse("go-plus-files/go-plus_4.nt", format="nt")
graph6_5.parse("go-plus-files/go-plus_5.nt", format="nt")
graph6_6.parse("go-plus-files/go-plus_6.nt", format="nt")
graph6_7.parse("go-plus-files/go-plus_7.nt", format="nt")

graph6 = graph6_1 + graph6_2 + graph6_3 + graph6_4 + graph6_5 + graph6_6 + graph6_7 

CPU times: user 5min 29s, sys: 2.43 s, total: 5min 31s
Wall time: 5min 32s


## Try asyncio

In [91]:
import asyncio, datetime

In [25]:
!pip show asyncio 

Name: asyncio
Version: 3.4.3
Summary: reference implementation of PEP 3156
Home-page: http://www.python.org/dev/peps/pep-3156/
Author: UNKNOWN
Author-email: UNKNOWN
License: UNKNOWN
Location: /Users/wdduncan/opt/anaconda3/lib/python3.7/site-packages
Requires: 
Required-by: 


In [85]:
async def make_graph(file_path:str, format="nt"):
    g = Graph()
    return g.parse(file_path, format=format)

In [103]:
start_time = datetime.datetime.now()

g1, g2, g3, g4, g5, g6, g7 = await asyncio.gather(make_graph("go-plus-files/go-plus_1.nt"),
                                                  make_graph("go-plus-files/go-plus_2.nt"),
                                                  make_graph("go-plus-files/go-plus_3.nt"),
                                                  make_graph("go-plus-files/go-plus_4.nt"),
                                                  make_graph("go-plus-files/go-plus_5.nt"),
                                                  make_graph("go-plus-files/go-plus_6.nt"),
                                                  make_graph("go-plus-files/go-plus_7.nt"))
graph_union = g1 + g2 + g3 + g4 + g5 + g6 + g7

end_time = datetime.datetime.now()
delta = end_time - start_time
milli_sec = int(delta.total_seconds() * 1000) # milliseconds

print("milliseconds: ", milli_sec)
print("seconds: ", milli_sec/1000)

milliseconds:  333424
seconds:  333.424


In [104]:
start_time = datetime.datetime.now()

g1 = await make_graph("go-plus-files/go-plus_1.nt")
g2 = await make_graph("go-plus-files/go-plus_2.nt")
g3 = await make_graph("go-plus-files/go-plus_3.nt")
g4 = await make_graph("go-plus-files/go-plus_4.nt")
g5 = await make_graph("go-plus-files/go-plus_5.nt")
g6 = await make_graph("go-plus-files/go-plus_6.nt")
g7 = await make_graph("go-plus-files/go-plus_7.nt")

graph_union = g1 + g2 + g3 + g4 + g5 + g6 + g7

end_time = datetime.datetime.now()
delta = end_time - start_time
milli_sec = int(delta.total_seconds() * 1000) # milliseconds

print("milliseconds: ", milli_sec)
print("seconds: ", milli_sec/1000)

milliseconds:  305464
seconds:  305.464


In [105]:
start_time = datetime.datetime.now()

g1 = await make_graph("go-plus-files/go-plus_1.nt")
g2 = await make_graph("go-plus-files/go-plus_2.nt")
g3 = await make_graph("go-plus-files/go-plus_3.nt")
g4 = await make_graph("go-plus-files/go-plus_4.nt")
g5 = await make_graph("go-plus-files/go-plus_5.nt")
g6 = await make_graph("go-plus-files/go-plus_6.nt")
g7 = await make_graph("go-plus-files/go-plus_7.nt")

# graph_union = g1 + g2 + g3 + g4 + g5 + g6 + g7

end_time = datetime.datetime.now()
delta = end_time - start_time
milli_sec = int(delta.total_seconds() * 1000) # milliseconds

print("milliseconds: ", milli_sec)
print("seconds: ", milli_sec/1000)

milliseconds:  141830
seconds:  141.83
