# Generating statistics for subset of Wikidata

This notebook illustrates how to generate statistics for a subset of Wikidata. \
As an example, we use the subset for https://www.wikidata.org/wiki/Q11173 (chemical compound).

Example dataset (Wikidata subset used as input): https://drive.google.com/drive/u/1/folders/1KjNwV5M2G3JzCrPgqk_TSx8wTE49O2Sx \
Example dataset (statistics generated by this notebook): https://drive.google.com/drive/u/0/folders/1_4Mxd0MAo0l9aR3aInv0YMTJrtneh7HW

### Example Invocation command

    papermill /Users/shashanksaurabh/Desktop/MS/ISI/isi/kgtk_shashank73744/kgtk/examples/Example_9-Wikidata_Subset_Statistics_.ipynb \
    /Users/shashanksaurabh/Desktop/MS/ISI/isi/kgtk_shashank73744/kgtk/examples/Example_9_output.ipynb \
    -p wikidata_home '/Users/shashanksaurabh/Desktop/Data_isi' \
    -p wikidata_parts_folder '/Users/shashanksaurabh/Desktop/Data_isi/Chemical' \
    -p cache_folder '/Users/shashanksaurabh/Desktop/Data_isi/Temp' \
    -p output_folder '/Users/shashanksaurabh/Desktop/Data_isi/output' \
    -p delete_database 'yes' \
    -p K \"10\" \
    -p subset_name 'Q11173'


In [3]:
# Notebook parameters. The example papermill invocation above overrides each of
# these by name with `-p`; the literals below are the values used otherwise.

# NOTE(review): wikidata_home is not referenced by any cell visible in this part
# of the notebook -- presumably the root data folder; confirm before removing.
wikidata_home = "/Users/shashanksaurabh/Desktop/Data_isi"

# Path to the folder which contains all files corresponding to the Wikidata subset.
# (For more information on Wikidata subsets please check Example 8.)
wikidata_parts_folder = "/Users/shashanksaurabh/Desktop/Data_isi/Chemical"

# Folder holding the cache created by the notebook (the sqlite graph cache is
# placed inside it). The cache can be deleted after the execution.
cache_folder = "/Users/shashanksaurabh/Desktop/Data_isi/Temp"

# Path to the folder where the output (here: the statistics) is stored.
output_folder = "/Users/shashanksaurabh/Desktop/Data_isi/output"

# For each statistic only the top K results are reported.
# K is a string because it is also exported as an environment variable; the
# cells convert it with int(K) before slicing.
# NOTE(review): the original note says this is implemented with kgtk's --limit
# option; the cells visible here slice the DataFrame instead -- confirm.
K = "5"

# Qnode of the Wikidata subset. It is also the filename prefix of every input
# and output file name derived in the environment setup cell.
subset_name = "Q11173"

In [4]:
import io
import os
import subprocess
import sys

import numpy as np
import pandas as pd

from IPython.display import display, HTML, Markdown, Image

from kgtk.kypher.sqlstore import *

# import altair as alt
# alt.renderers.enable('altair_viewer')

# from IPython.display import display, HTML, Image
# from pandas_profiling import ProfileReport

### Set up environment variables and folders that we need

In [5]:
# Export the notebook parameters as environment variables. The kgtk commands
# built in later cells are executed through the shell and refer to these
# values as $NAME.

# Path to the folder which contains all files corresponding to the Wikidata subset.
# (For more information on Wikidata subsets please check Example 8.)
os.environ['WIKIDATA_PARTS'] = wikidata_parts_folder

# Path to the folder where the output (here: the statistics) is stored.
os.environ['OUTPUT_FOLDER'] = output_folder

# The statistics are stored in two sub folders: an overview folder and a class
# folder. makedirs(..., exist_ok=True) also creates output_folder itself when
# it is missing, which a bare os.mkdir would fail on.
overview_folder = os.path.join(output_folder, "overview_folder")
class_folder = os.path.join(output_folder, "classes")
for _folder in (overview_folder, class_folder):
    os.makedirs(_folder, exist_ok=True)

# Environment variables for the two sub folders where the statistics are stored.
os.environ['OVERVIEW_FOLDER'] = overview_folder
os.environ['CLASS_FOLDER'] = class_folder

# kgtk command to run. Use plain "kgtk" to drop the timing and debug output.
os.environ['kgtk'] = "time kgtk --debug"

# Absolute path of the graph cache database; make sure its folder exists so the
# database file can be created there.
os.makedirs(cache_folder, exist_ok=True)
os.environ['STORE'] = "{}/wikidata.sqlite3.db".format(cache_folder)

# Environment values must be strings; str() keeps this working when K is
# injected as an integer.
os.environ['K'] = str(K)

os.environ['subset_name'] = subset_name

# Input files: one file name per part of the Wikidata subgraph. Every name is
# the subset Qnode followed by a fixed suffix.
_input_file_suffixes = {
    'label': ".label.en.tsv.gz",
    'external_id': ".part.external-id.tsv.gz",
    'time': ".part.time.tsv.gz",
    'wikibase_item': ".part.wikibase-item.tsv.gz",
    'quantity': ".part.quantity.tsv.gz",
    'statistics': ".statistics.tsv.gz",
    'wikibase_form': ".part.wikibase-form.tsv.gz",
    'monolingualtext': ".part.monolingualtext.tsv.gz",
    'math': ".part.math.tsv.gz",
    'commonsMedia': ".part.commonsMedia.tsv.gz",
    'globe_coordinate': ".part.globe-coordinate.tsv.gz",
    'musical_notation': ".part.musical-notation.tsv.gz",
    'geo_shape': ".part.geo-shape.tsv.gz",
    'url': ".part.url.tsv.gz",
    'string': ".part.string.tsv.gz",
    'alias': ".alias.en.tsv.gz",
    'description': ".description.en.tsv.gz",
}

# Output files written to the overview folder (overview part of the statistics).
_overview_file_suffixes = {
    'class_summary': "_class_summary.tsv",
    'stats': "_stats.tsv",
    'all_degree': "_allDegree.tsv",
    'degree': "_degree.tsv",
    'temp': "_temp_tsv.tsv",
}

# Output files holding the per-datatype property summaries.
# The spelling of each suffix (including "externaID") is kept unchanged because
# it determines the names of the generated files.
_property_summary_file_suffixes = {
    'property_summary_external_id': "_property_summary_externaID.tsv",
    'property_summary_time': "_property_summary_time.tsv",
    'property_summary_wikibase_item': "_property_summary_wikibaseItem.tsv",
    'property_summary_quantity': "_property_summary_Quantity.tsv",
    'property_summary_wikibase_form': "_property_summary_wikibaseForm.tsv",
    'property_summary_monolingualtext': "_property_summary_monoLingualText.tsv",
    'property_summary_math': "_property_summary_math.tsv",
    'property_summary_commonsMedia': "_property_summary_commonsMedia.tsv",
    'property_summary_globe_coordinate': "_property_summary_globeCoordinate.tsv",
    'property_summary_musical_notation': "_property_summary_musicalNotation.tsv",
    'property_summary_geo_shape': "_property_summary_geoShape.tsv",
    'property_summary_url': "_property_summary_url.tsv",
    'property_summary_string': "_property_summary_string.tsv",
}

for _suffixes in (_input_file_suffixes, _overview_file_suffixes, _property_summary_file_suffixes):
    for _env_name, _suffix in _suffixes.items():
        os.environ[_env_name] = subset_name + _suffix

In [6]:
# Kept for backward compatibility: cells outside this section may still refer
# to FNULL. run_command itself no longer needs it.
FNULL = open(os.devnull, 'w')


def run_command(cmd, substitution_dictionary=None):
    """Run a templatized shell command.

    Each key of ``substitution_dictionary`` that occurs in ``cmd`` is replaced
    by its value (plain ``str.replace``, applied in dictionary order). The
    resulting string is executed through the shell, so ``$NAME`` references
    are resolved by the shell from the environment.

    Parameters
    ----------
    cmd : str
        Command template.
    substitution_dictionary : dict of str to str, optional
        Placeholder -> replacement text. ``None`` (default) or an empty dict
        leaves ``cmd`` unchanged.

    Returns
    -------
    subprocess.CompletedProcess
        Result of the command. Standard output is discarded unless the local
        ``debug`` flag is switched on; standard error is captured. A non-zero
        exit code is reported with a printed message rather than an exception,
        so callers keep their existing control flow.
    """
    debug = False
    for placeholder, replacement in (substitution_dictionary or {}).items():
        cmd = cmd.replace(placeholder, replacement)

    if debug:
        output = subprocess.run(cmd, shell=True, universal_newlines=True,
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        print(output.stdout)
        print(output.stderr)
    else:
        output = subprocess.run(cmd, shell=True, universal_newlines=True,
                                stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
        if output.returncode != 0:
            # Surface failures instead of silently dropping them; otherwise the
            # first visible symptom is a missing output file further down.
            print("Command failed with exit code {}: {}".format(output.returncode, cmd))
            print(output.stderr)
    return output

In [7]:
def printmd(string, color='black', size='25', fontWeight="bold"):
    """Display ``string`` in the notebook as a styled HTML ``<span>``.

    string: text placed inside the span (inserted as is, without escaping).
    color: CSS colour of the text.
    size: font size in pixels (number or numeric string).
    fontWeight: CSS font-weight of the text.
    """
    # NOTE(review): the two margin declarations use "=" where CSS expects ":".
    # The markup is reproduced unchanged so the rendered output stays the same.
    style = "font-weight: {weight};margin-top=80px;margin-bottom=80px; font-size:{size}; color:{color}".format(
        weight=fontWeight,
        size=str(size) + 'px',
        color=color,
    )
    markup = "<span style='{}'>{}</span>".format(style, string)
    display(HTML(markup))

In [8]:
def generate_link(string):
    """Return the Wikidata URL for a qnode or pnode identifier.

    Identifiers starting with ``P`` map to the property page and identifiers
    starting with ``Q`` map to the item page. Anything else is returned
    unchanged: other strings (for example the ``"NA"`` placeholder), the empty
    string, and non-string values such as a NaN read back from a CSV file.
    """
    if not isinstance(string, str):
        # Nothing to link; indexing such a value would raise.
        return string
    # startswith() is safe on the empty string, unlike string[0].
    if string.startswith('P'):
        return "https://www.wikidata.org/wiki/Property:" + string
    if string.startswith('Q'):
        return "https://www.wikidata.org/wiki/" + string
    return string

In [9]:
# Each entry of `types` is a triple:
#   (environment variable naming the input part file,
#    name of the datatype,
#    environment variable naming the output summary file).
types = [
    ("time", "Time", "property_summary_time"),
    ("wikibase_item", "Wikibase_item", "property_summary_wikibase_item"),
    ("math", "Math", "property_summary_math"),
    ("wikibase_form", "Wikibase-form", "property_summary_wikibase_form"),
    ("quantity", "Quantity", "property_summary_quantity"),
    ("string", "String", "property_summary_string"),
    ("external_id", "External-id", "property_summary_external_id"),
    ("commonsMedia", "CommonsMedia", "property_summary_commonsMedia"),
    ("globe_coordinate", "Globe-coordinate", "property_summary_globe_coordinate"),
    ("monolingualtext", "Monolingualtext", "property_summary_monolingualtext"),
    ("musical_notation", "Musical-notation", "property_summary_musical_notation"),
    ("geo_shape", "Geo-shape", "property_summary_geo_shape"),
    ("url", "Url", "property_summary_url"),
]
try:
    number_of_node = 0
    number_of_edges = 0
    number_of_classes = 0
    number_of_properties = 0

    # Accumulate the counters over the part file of every datatype.
    for type_, name, file in types:
        part_path = os.path.join(os.getenv('WIKIDATA_PARTS'), os.getenv(type_))
        temp = pd.read_csv(part_path, delimiter='\t')
        number_of_properties += temp['label'].nunique()
        number_of_edges += temp['node1'].count()
        # Nodes and classes are derived from the wikibase-item part only.
        if name == "Wikibase_item":
            number_of_node += temp['node1'].nunique()
            number_of_classes = temp.loc[temp['label'] == 'P279', 'node2'].nunique()

    stats_rows = [
        ["Number of Nodes", number_of_node],
        ["Number of Edges", number_of_edges],
        ["Number of Classes", number_of_classes],
        ["Number of Properties", number_of_properties],
    ]
    df_stats = pd.DataFrame(stats_rows, columns=['Statistics', 'Value'])

    printmd("Overview of the Graph", 'blue')
    for stat_label, stat_value in stats_rows:
        printmd(stat_label + " in the Graph: " + str(stat_value), size=15, fontWeight='Light')

    # Save the overview table to the overview folder.
    stats_path = os.path.join(os.getenv('OVERVIEW_FOLDER'), os.getenv('stats'))
    df_stats.to_csv(stats_path, sep='\t')
except Exception as e:
    print(e)

In [10]:
def _format_degree(value):
    """Format a min/max degree as a whole number, or "NA" when it is missing."""
    return "NA" if pd.isna(value) else str(int(value))


try:
    # This query collects the statistics of every node that has an outgoing
    # edge in the wikibase-item file and saves them to the all_degree file.
    cmd = "$kgtk query -i $WIKIDATA_PARTS/$wikibase_item -i $WIKIDATA_PARTS/$statistics --graph-cache $STORE \
    -o $OVERVIEW_FOLDER/$all_degree \
    --match 'item: (n1)-[l]->(n2), statistics:(n1)-[stats_property{label: llab}]->(stats) ' \
    --return 'distinct n1 as qnode, llab as `Property`, stats as Value' "
    # The command contains no placeholder, so no substitution is passed.
    run_command(cmd)
    temp = pd.read_csv(os.path.join(os.getenv('OVERVIEW_FOLDER'), os.getenv('all_degree')), delimiter='\t')

    # Select each degree column once instead of re-filtering for every statistic.
    in_degrees = temp.loc[temp['Property'] == 'vertex_in_degree', 'Value']
    out_degrees = temp.loc[temp['Property'] == 'vertex_out_degree', 'Value']

    indegree_avg = in_degrees.mean()
    indegree_min = in_degrees.min()
    indegree_max = in_degrees.max()
    outdegree_avg = out_degrees.mean()
    outdegree_min = out_degrees.min()
    outdegree_max = out_degrees.max()

    df_degree = pd.DataFrame(
        [["in-degree", indegree_avg, indegree_min, indegree_max],
         ["out-degree", outdegree_avg, outdegree_min, outdegree_max]],
        columns=["Stat", "Average", "Min", "Max"])
    df_degree.to_csv(os.path.join(os.getenv('OVERVIEW_FOLDER'), os.getenv('degree')), sep='\t')

    printmd("Overview of the Graph", 'blue')
    # Min/max are shown as whole numbers for both directions; a missing value
    # is shown as "NA" instead of aborting the cell on int(NaN).
    printmd("Average in-degree of the Graph: " + str(indegree_avg), size=15, fontWeight='Light')
    printmd("Minimum in-degree of the Graph: " + _format_degree(indegree_min), size=15, fontWeight='Light')
    printmd("Maximum in-degree of the Graph: " + _format_degree(indegree_max), size=15, fontWeight='Light')
    printmd("Average out-degree of the Graph: " + str(outdegree_avg), size=15, fontWeight='Light')
    printmd("Minimum out-degree of the Graph: " + _format_degree(outdegree_min), size=15, fontWeight='Light')
    printmd("Maximum out-degree of the Graph: " + _format_degree(outdegree_max), size=15, fontWeight='Light')
except Exception as e:
    print(e)

In [11]:
try:
    # This query lists the classes (targets of P31 edges) together with their
    # English label, number of instances and pagerank, ordered by number of
    # instances (largest first).
    cmd = "$kgtk query -i $WIKIDATA_PARTS/$wikibase_item -i $WIKIDATA_PARTS/$label -i $WIKIDATA_PARTS/$statistics --graph-cache $STORE \
    -o $OVERVIEW_FOLDER/$class_summary \
    --match 'item: (n1)-[l{label:llab}]->(n2), label: (n2)-[:label]->(label_n2),statistics:(n2)-[:vertex_pagerank]->(pagerank) ' \
    --return 'distinct n2 as Link, kgtk_lqstring_text(label_n2) as `Class_Label`, count(distinct n1) as `Number of Instances`, pagerank as Pagerank' \
    --where 'label_n2.kgtk_lqstring_lang_suffix = \"en\" AND (llab IN [\"P31\"])  ' \
    --order-by 'count(distinct n1) desc' "
    run_command(cmd)

    top_k = int(K)
    class_summary_path = os.path.join(os.getenv('OVERVIEW_FOLDER'), os.getenv('class_summary'))

    # Load the Dataframe with the file created by the above query
    df_class_summary = pd.read_csv(class_summary_path, delimiter='\t')

    # Statistics are reported for the top K classes; the instance counts of the
    # remaining classes are summed up in other_instances
    other_instances = df_class_summary[top_k:]["Number of Instances"].sum()

    # Take the top K classes and disregard the rest
    df_class_summary = df_class_summary[:top_k]

    # Put the columns in display order
    df_class_summary = df_class_summary[["Class_Label","Number of Instances","Pagerank","Link"]]

    # Add the "Other Classes" row. DataFrame.append no longer exists in
    # pandas 2.x, so the row is added with pd.concat; an empty summary is
    # replaced by the row directly to avoid concatenating an empty frame.
    other_row = pd.DataFrame([{"Class_Label":"Other Classes","Number of Instances":other_instances,"Pagerank":"NA","Link":"NA"}])
    if df_class_summary.empty:
        df_class_summary = other_row
    else:
        df_class_summary = pd.concat([df_class_summary, other_row], ignore_index=True)

    # Generate hyperlinks from the qnode/pnode
    df_class_summary['Link'] = df_class_summary['Link'].apply(generate_link)

    # Save the dataframe to the output file (this replaces the raw query result)
    df_class_summary.to_csv(class_summary_path, sep='\t')
    printmd("Class Summary of the Subgraph",'blue')
    # Report the configured K instead of a fixed "Five"
    printmd("Below is the list of Top {} Classes of the Subgraph ordered based on number of instances".format(K),size="15",fontWeight="Light")
    display(HTML(df_class_summary.to_html(index=False)))
except Exception as e:
    print(e)

Class_Label,Number of Instances,Pagerank,Link
chemical compound,1063060,0.0627481,https://www.wikidata.org/wiki/Q11173
medication,2507,0.000111443,https://www.wikidata.org/wiki/Q12140
diacylglycerophosphocholine,485,2.45038e-05,https://www.wikidata.org/wiki/Q63436503
carcinogen,479,2.7155e-05,https://www.wikidata.org/wiki/Q187661
wax monoester,410,2.07372e-05,https://www.wikidata.org/wiki/Q63446172
Other Classes,14172,,


In [71]:
try:
    df_property_summary = []

    # Each entry of `types` is a triple: the environment variable naming the
    # input part file, the display name of the datatype, and the environment
    # variable naming the output file.
    types = [
        ("time","Time","property_summary_time"),
        ("wikibase_item","Wikibase Item","property_summary_wikibase_item"),
        ("math","Mathematical Expression","property_summary_math"),
        ("wikibase_form","Wikibase Form","property_summary_wikibase_form"),
        ("quantity","Quantity","property_summary_quantity"),
        ("string","String","property_summary_string"),
        ("external_id","External Id","property_summary_external_id"),
        ("commonsMedia","Common Media","property_summary_commonsMedia"),
        ("globe_coordinate","Globe Coordinate","property_summary_globe_coordinate"),
        ("monolingualtext","Monolingualtext","property_summary_monolingualtext"),
        ("musical_notation","Musical Notation","property_summary_musical_notation"),
        ("geo_shape","Geo Shape","property_summary_geo_shape"),
        ("url","Url","property_summary_url"),
    ]

    # This query finds all the properties ordered based on Number of Statements.
    # TYPE_FILE and output_file are placeholders: run_command replaces them with
    # the names of environment variables, which the shell then expands.
    cmd = "$kgtk query  -i $WIKIDATA_PARTS/$TYPE_FILE -i $WIKIDATA_PARTS/$label --graph-cache $STORE \
    -o $OVERVIEW_FOLDER/$output_file \
    --match 'part: (n1)-[l{label: llab}]->(n2), label: (llab)-[:label]->(label)' \
    --return 'distinct llab as Link, kgtk_lqstring_text(label) as `Property_Label`, count(llab) as `Number_of_Statements`' \
    --where 'label.kgtk_lqstring_lang_suffix = \"en\" ' \
    --order-by 'count(llab) desc '"

    top_k = int(K)

    # Do it for all the Datatypes. The loop variable is called type_ so that the
    # builtin `type` is not shadowed for the rest of the notebook.
    for type_, name, output_file in types:
        run_command(cmd, {"TYPE_FILE": type_, "output_file": output_file})

        summary_path = os.path.join(os.getenv('OVERVIEW_FOLDER'), os.getenv(output_file))

        # Create dataframe object corresponding to the file generated by the above command
        temp = pd.read_csv(summary_path, delimiter='\t')

        # Statistics are reported for the top K properties; the statement counts
        # of the remaining properties are summed up in other_instances
        other_instances = temp[top_k:]["Number_of_Statements"].sum()

        # Take top K properties and disregard the rest. The copy makes the
        # column assignment below act on an independent frame, not on a slice.
        temp = temp[:top_k].copy()

        # Generate hyperlinks from the qnode/pnode
        temp['Link'] = temp['Link'].apply(generate_link)

        # Changing the order of the columns
        temp = temp[["Property_Label","Number_of_Statements","Link"]]

        # Add the "Other Properties" row. DataFrame.append no longer exists in
        # pandas 2.x, so the row is added with pd.concat; an empty summary is
        # replaced by the row directly to avoid concatenating an empty frame.
        other_row = pd.DataFrame([{"Property_Label":"Other Properties","Number_of_Statements":other_instances,"Link":"NA"}])
        if temp.empty:
            temp = other_row
        else:
            temp = pd.concat([temp, other_row], ignore_index=True)

        # Storing the summary to the output file (this replaces the raw query result)
        temp.to_csv(summary_path, sep='\t')

        # Only the "Other Properties" row is present: the Datatype has no property
        if len(temp) == 1:
            printmd("Datatype: "+name, 'blue')
            printmd("No Property is present for Datatype:" + name,size=15, fontWeight='Light')
            printmd("------------------------------------------------------------------------------------------")
            continue
        printmd("Datatype: "+name, 'blue')
        # Report the configured K instead of a fixed "five"
        printmd("Below are the top " + str(K) + " properties of Datatype:" + name +"  ordered based on number of statements",size =15,fontWeight='Light')
        df_property_summary.append(temp)
        display(HTML(df_property_summary[-1].to_html(index=False)))
        printmd("------------------------------------------------------------------------------------------")
except Exception as e:
    print(e)

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:blue'>Datatype: Time</span>

<span style='font-weight: Light;margin-top=80px;margin-bottom=80px; font-size:15px; color:black'>Below are the top five properties of Datatype:Time  ordered based on number of statements</span>

Property_Label,Number_of_Statements,Link
time of discovery or invention,20,https://www.wikidata.org/wiki/Property:P575
discontinued date,1,https://www.wikidata.org/wiki/Property:P2669
inception,1,https://www.wikidata.org/wiki/Property:P571
service entry,1,https://www.wikidata.org/wiki/Property:P729
service retirement,1,https://www.wikidata.org/wiki/Property:P730
Other Properties,0,


<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:black'>------------------------------------------------------------------------------------------</span>

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:blue'>Datatype: Wikibase Item</span>

<span style='font-weight: Light;margin-top=80px;margin-bottom=80px; font-size:15px; color:black'>Below are the top five properties of Datatype:Wikibase Item  ordered based on number of statements</span>

Property_Label,Number_of_Statements,Link
catalog,49434,https://www.wikidata.org/wiki/Property:P972
has part,15973,https://www.wikidata.org/wiki/Property:P527
part of,9430,https://www.wikidata.org/wiki/Property:P361
medical condition treated,6121,https://www.wikidata.org/wiki/Property:P2175
active ingredient in,4178,https://www.wikidata.org/wiki/Property:P3780
Other Properties,15487,


<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:black'>------------------------------------------------------------------------------------------</span>

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:blue'>Datatype: Mathematical Expression</span>

<span style='font-weight: Light;margin-top=80px;margin-bottom=80px; font-size:15px; color:black'>No Property is present for Datatype:Mathematical Expression</span>

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:black'>------------------------------------------------------------------------------------------</span>

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:blue'>Datatype: Wikibase Form</span>

<span style='font-weight: Light;margin-top=80px;margin-bottom=80px; font-size:15px; color:black'>No Property is present for Datatype:Wikibase Form</span>

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:black'>------------------------------------------------------------------------------------------</span>

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:blue'>Datatype: Quantity</span>

<span style='font-weight: Light;margin-top=80px;margin-bottom=80px; font-size:15px; color:black'>Below are the top five properties of Datatype:Quantity  ordered based on number of statements</span>

Property_Label,Number_of_Statements,Link
mass,146546,https://www.wikidata.org/wiki/Property:P2067
melting point,9494,https://www.wikidata.org/wiki/Property:P2101
density,1113,https://www.wikidata.org/wiki/Property:P2054
defined daily dose,908,https://www.wikidata.org/wiki/Property:P4250
boiling point,873,https://www.wikidata.org/wiki/Property:P2102
Other Properties,6085,


<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:black'>------------------------------------------------------------------------------------------</span>

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:blue'>Datatype: String</span>

<span style='font-weight: Light;margin-top=80px;margin-bottom=80px; font-size:15px; color:black'>Below are the top five properties of Datatype:String  ordered based on number of statements</span>

Property_Label,Number_of_Statements,Link
chemical formula,344854,https://www.wikidata.org/wiki/Property:P274
canonical SMILES,199853,https://www.wikidata.org/wiki/Property:P233
isomeric SMILES,141589,https://www.wikidata.org/wiki/Property:P2017
Commons category,3463,https://www.wikidata.org/wiki/Property:P373
NIOSH Pocket Guide ID,615,https://www.wikidata.org/wiki/Property:P1931
Other Properties,1191,


<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:black'>------------------------------------------------------------------------------------------</span>

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:blue'>Datatype: External Id</span>

<span style='font-weight: Light;margin-top=80px;margin-bottom=80px; font-size:15px; color:black'>Below are the top five properties of Datatype:External Id  ordered based on number of statements</span>

Property_Label,Number_of_Statements,Link
InChIKey,997389,https://www.wikidata.org/wiki/Property:P235
InChI,990406,https://www.wikidata.org/wiki/Property:P234
CAS Registry Number,927660,https://www.wikidata.org/wiki/Property:P231
DSSTox substance ID,848585,https://www.wikidata.org/wiki/Property:P3117
PubChem CID,250629,https://www.wikidata.org/wiki/Property:P662
Other Properties,853651,


<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:black'>------------------------------------------------------------------------------------------</span>

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:blue'>Datatype: Common Media</span>

<span style='font-weight: Light;margin-top=80px;margin-bottom=80px; font-size:15px; color:black'>Below are the top five properties of Datatype:Common Media  ordered based on number of statements</span>

Property_Label,Number_of_Statements,Link
chemical structure,12520,https://www.wikidata.org/wiki/Property:P117
image,1992,https://www.wikidata.org/wiki/Property:P18
molecular model,20,https://www.wikidata.org/wiki/Property:P8224
pronunciation audio,17,https://www.wikidata.org/wiki/Property:P443
spoken text audio,6,https://www.wikidata.org/wiki/Property:P989
Other Properties,8,


<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:black'>------------------------------------------------------------------------------------------</span>

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:blue'>Datatype: Globe Coordinate</span>

<span style='font-weight: Light;margin-top=80px;margin-bottom=80px; font-size:15px; color:black'>Below are the top five properties of Datatype:Globe Coordinate  ordered based on number of statements</span>

Property_Label,Number_of_Statements,Link
coordinate location,1,https://www.wikidata.org/wiki/Property:P625
Other Properties,0,


<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:black'>------------------------------------------------------------------------------------------</span>

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:blue'>Datatype: Monolingualtext</span>

<span style='font-weight: Light;margin-top=80px;margin-bottom=80px; font-size:15px; color:black'>Below are the top five properties of Datatype:Monolingualtext  ordered based on number of statements</span>

Property_Label,Number_of_Statements,Link
World Health Organisation International Nonproprietary Name,2414,https://www.wikidata.org/wiki/Property:P2275
native label,13,https://www.wikidata.org/wiki/Property:P1705
short name,9,https://www.wikidata.org/wiki/Property:P1813
name,4,https://www.wikidata.org/wiki/Property:P2561
official name,1,https://www.wikidata.org/wiki/Property:P1448
Other Properties,3,


<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:black'>------------------------------------------------------------------------------------------</span>

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:blue'>Datatype: Musical Notation</span>

<span style='font-weight: Light;margin-top=80px;margin-bottom=80px; font-size:15px; color:black'>No Property is present for Datatype:Musical Notation</span>

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:black'>------------------------------------------------------------------------------------------</span>

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:blue'>Datatype: Geo Shape</span>

<span style='font-weight: Light;margin-top=80px;margin-bottom=80px; font-size:15px; color:black'>No Property is present for Datatype:Geo Shape</span>

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:black'>------------------------------------------------------------------------------------------</span>

<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:blue'>Datatype: Url</span>

<span style='font-weight: Light;margin-top=80px;margin-bottom=80px; font-size:15px; color:black'>Below are the top five properties of Datatype:Url  ordered based on number of statements</span>

Property_Label,Number_of_Statements,Link
exact match,41,https://www.wikidata.org/wiki/Property:P2888
official website,26,https://www.wikidata.org/wiki/Property:P856
WordLift URL,4,https://www.wikidata.org/wiki/Property:P6363
Stack Exchange tag,1,https://www.wikidata.org/wiki/Property:P1482
equivalent class,1,https://www.wikidata.org/wiki/Property:P1709
Other Properties,3,


<span style='font-weight: bold;margin-top=80px;margin-bottom=80px; font-size:25px; color:black'>------------------------------------------------------------------------------------------</span>

In [12]:
def label_node(node, df_label):
    """Return the English label of a qnode/pnode of the subgraph.

    node: node whose label is wanted.
    df_label: label Dataframe; rows are matched on its 'node1' column and the
        language-qualified label strings are read from its 'node2' column.

    Returns the text of the first label whose language is "en", or the empty
    string when the node has no such label.
    """
    candidates = df_label.loc[df_label['node1'] == node, 'node2']
    english_texts = (
        kgtk_lqstring_text(candidate)
        for candidate in candidates
        if kgtk_lqstring_lang(candidate) == "en"
    )
    return next(english_texts, "")
        

In [13]:
# This is a helper function which finds all the superclasses of a class
# returns the list of superclasses
#@parameters
#Class: node for which all the superclasses needs to be returned
#df_wikibase_item: Dataframe corresponding to the wikibase item
#df_label: Dataframe corresponding to the label
def direct_subclasses(Class,df_wikibase_item,df_label):
    result = []
    corr_ele = df_wikibase_item[(df_wikibase_item['label']=='P279') & (df_wikibase_item['node1']==Class)]['node2']
    for ele in corr_ele.values.tolist():
        result.append(label_node(ele,df_label))
    return result

In [14]:
def direct_superclasses(Class, df_wikibase_item, df_label):
    """Return the labels of the nodes that have a P279 edge to ``Class``.

    NOTE(review): despite the function name, the rows selected are those with
    node2 == Class and label == 'P279' (Wikidata "subclass of"), and the labels
    of their node1 values are returned -- i.e. the direct subclasses of
    ``Class``. The behaviour is left as is because callers are not visible here.

    Class: node whose incoming P279 edges are looked up.
    df_wikibase_item: Dataframe corresponding to the wikibase item.
    df_label: Dataframe corresponding to the label.

    Returns a list with one label per matching edge; a node without an English
    label contributes an empty string.
    """
    is_subclass_edge = df_wikibase_item['label'] == 'P279'
    ends_at_class = df_wikibase_item['node2'] == Class
    sources = df_wikibase_item[is_subclass_edge & ends_at_class]['node1']
    return [label_node(source, df_label) for source in sources.values.tolist()]

In [17]:
try:
    # Load the list of classes generated in the 9th cell
    df = pd.read_csv(os.path.join(os.getenv('OVERVIEW_FOLDER'),os.getenv('class_summary')),delimiter='\t')

    # Load the Descriptions of the nodes of the subgraph in a Dataframe
    df_description = pd.read_csv(os.path.join(os.getenv('WIKIDATA_PARTS'),os.getenv('description')),delimiter='\t')

    # Load the Aliases of the nodes of the subgraph in a Dataframe
    df_alias = pd.read_csv(os.path.join(os.getenv('WIKIDATA_PARTS'),os.getenv('alias')),delimiter='\t')

    # Load the Wikibase item edges of the subgraph in a Dataframe
    df_wikibase_item = pd.read_csv(os.path.join(os.getenv('WIKIDATA_PARTS'),os.getenv('wikibase_item')),delimiter='\t')

    # Load the Labels of the nodes of the subgraph in a Dataframe
    df_label = pd.read_csv(os.path.join(os.getenv('WIKIDATA_PARTS'),os.getenv('label')),delimiter='\t')

    # Query template that finds the example instances (P31 edges) of a class.
    # `__output_file` and `__class` are placeholders filled in per class by run_command.
    # NOTE(review): `--order-by 'pagerank'` appears to sort ascending (the sample output
    # lists 1e-06 before 1e-05), so the three rows shown below are the first rows of that
    # ordering -- confirm whether the highest-pagerank instances were intended.
    cmd = "$kgtk query -i $WIKIDATA_PARTS/$wikibase_item  -i $WIKIDATA_PARTS/$label -i $WIKIDATA_PARTS/$statistics  --graph-cache $STORE \
    -o $CLASS_FOLDER/__output_file \
    --match 'item: (n1)-[l{label:llab}]->(n2:__class), label: (n1)-[:label]->(label), statistics:(n1)-[:vertex_pagerank]->(pagerank)' \
    --return ' distinct n1 as Link, kgtk_lqstring_text(label) as `Label_`, pagerank as Pagerank' \
    --where 'label.kgtk_lqstring_lang_suffix = \"en\" AND llab in [\"P31\"]' \
    --order-by 'pagerank' "

    # Do for all the classes generated in the 9th cell
    for index, ele in df.iterrows():

        # Ignore the Other Classes part of the statistics (the last row of the summary)
        if index == len(df) - 1:
            continue

        # Parse the Qnode from the Link
        Current_Qnode = ele["Link"].split("/")[-1]

        # Don't find the examples for the base class
        if Current_Qnode == subset_name:
            continue

        Number_of_instances_of_current_class = ele["Number of Instances"]

        # Heading
        printmd('Class: '+ ele["Class_Label"].capitalize(),'blue')

        # First English description of the class, or "" when there is none
        description_array = [kgtk_lqstring_text(text) for text in df_description[df_description["node1"]==Current_Qnode]['node2'].values.tolist() if kgtk_lqstring_lang(text) =='en']
        description_current = description_array[0] if description_array else ""

        # All English aliases of the class, comma separated
        aliases_current = ", ".join([kgtk_lqstring_text(text) for text in df_alias[df_alias["node1"]==Current_Qnode]['node2'].values.tolist() if kgtk_lqstring_lang(text) =='en'])

        # Labels of the classes this class is a subclass of, i.e. its SUPERCLASSES.
        # (direct_subclasses follows the outgoing P279 edges, despite its name.)
        subclass_current = direct_subclasses(Current_Qnode,df_wikibase_item,df_label)

        # Labels of the classes that are a subclass of this class, i.e. its direct SUBCLASSES.
        # (direct_superclasses follows the incoming P279 edges, despite its name.)
        direct_super_class_current = direct_superclasses(Current_Qnode,df_wikibase_item,df_label)

        # Counts taken from the list that actually matches each heading:
        # the original cell reported them the wrong way round.
        number_of_subclasses = len(direct_super_class_current)
        number_of_superclasses = len(subclass_current)

        # Print statements
        printmd('Description: '+ description_current.capitalize(),size=15, fontWeight='Light')
        printmd('Aliases: '+ aliases_current.capitalize(),size=15,fontWeight='Light')
        printmd('Subclass of: '+ ", ".join(subclass_current).capitalize(),size=15,fontWeight='Light')
        printmd('Direct Subclasses: '+ ", ".join(direct_super_class_current).capitalize(),size=15,fontWeight='Light')
        printmd("Number of Instances: " +str(Number_of_instances_of_current_class),size=15,fontWeight='Light')
        printmd("Number of Subclass: " +str(number_of_subclasses),size=15,fontWeight='Light')
        printmd("Number of Superclass: " +str(number_of_superclasses),size=15,fontWeight='Light')
        printmd("Examples for " + ele["Class_Label"] +" Class",'black',size=18)

        # Create a dataframe corresponding to the above generated statistics
        class_overview = pd.DataFrame(
            [
                ["Description",description_current.capitalize()],
                ["Aliases",aliases_current.capitalize()],
                ["Subclass of",", ".join(subclass_current).capitalize()],
                ["Number of Instances",Number_of_instances_of_current_class],
                ["Number of Subclass",number_of_subclasses],
                ["Number of Superclass",number_of_superclasses],
            ],
            columns=["Stat","Value"],
        )

        # Dynamically create the name of the output file where the example instances would be stored
        output_file = subset_name + "_" + ele["Class_Label"].replace(" ","-")+"_examples"+".tsv"

        # Dynamically create the name of the output file where the overview would be stored
        output_file_overview = subset_name + "_" + ele["Class_Label"].replace(" ","-")+"_overview"+".tsv"

        # Save the overview to the output file.
        # (The original cell also wrote a stray copy to a file literally named
        # "os.path.join(os.getenv('CLASS_FOLDER')" in the working directory; that call is gone.)
        class_overview.to_csv(os.path.join(os.getenv('CLASS_FOLDER'),output_file_overview),sep='\t')

        # Find the example instances of the class
        run_command(cmd, {"__output_file": output_file,"__class": Current_Qnode})

        # Load the examples in a Dataframe
        df_class_example = pd.read_csv(os.path.join(os.getenv('CLASS_FOLDER'),output_file),delimiter='\t')

        # Change the order of the Columns and turn each Qnode into a link
        df_class_example =  df_class_example[["Label_","Pagerank","Link"]]
        df_class_example['Link'] = df_class_example['Link'].apply(generate_link)

        # Overwrite the examples file with the reordered, linked version and show the first three rows
        df_class_example.to_csv(os.path.join(os.getenv('CLASS_FOLDER'),output_file),sep='\t')
        display(HTML(df_class_example[:3].to_html(index=False)))
        printmd("------------------------------------------------------------------------------------------")
except Exception as e:
    # Best effort: show the error in the notebook instead of aborting the run.
    display(e)

Label_,Pagerank,Link
dulaglutide,1e-06,https://www.wikidata.org/wiki/Q21011228
tezacaftor,1e-06,https://www.wikidata.org/wiki/Q27270940
lobeglitazone,1e-06,https://www.wikidata.org/wiki/Q18350076


Label_,Pagerank,Link
"1-hexadecanoyl-2-(9Z,12Z-octadecadienoyl)-sn-glycero-3-phosphocholine",1e-06,https://www.wikidata.org/wiki/Q27105002
1-stearoyl-2-palmitoyl-sn-glycero-3-phosphocholine,1e-06,https://www.wikidata.org/wiki/Q27145082
1-palmitoyl-2-acetyl-sn-glycero-3-phosphocholine,1e-06,https://www.wikidata.org/wiki/Q27145169


Label_,Pagerank,Link
bromodichloroacetic acid,1e-06,https://www.wikidata.org/wiki/Q27289240
glycidaldehyde,1e-06,https://www.wikidata.org/wiki/Q2307855
aniline,1e-05,https://www.wikidata.org/wiki/Q186414


Label_,Pagerank,Link
ethyl-2-nonynoate,1e-06,https://www.wikidata.org/wiki/Q27269581
cetyl myristate,1e-06,https://www.wikidata.org/wiki/Q27268650
2-pentyl butyrate,1e-06,https://www.wikidata.org/wiki/Q27285351


In [18]:
# $kgtk query -i $WIKIDATA_PARTS/$wikibase_item  -i $WIKIDATA_PARTS/$label -i $WIKIDATA_PARTS/$statistics  --graph-cache $STORE \
# -o $CLASS_FOLDER/__output_file \
# --match 'item: (n1)-[l{label:llab}]->(n2:__class), label: (n1)-[:label]->(label), statistics:(n1)-[:vertex_pagerank]->(pagerank)' \
# --return ' distinct n1 as Link, kgtk_lqstring_text(label) as `Label_`, pagerank as Pagerank' \
# --where 'label.kgtk_lqstring_lang_suffix = \"en\" AND llab in [\"P279\"]' \
# --order-by 'pagerank' \
# --limit 3 