In [5]:
import sys
# Add the sibling ../src directory to the import search path; the project
# modules imported in the next cell presumably live there (confirm layout).
# The path is relative, so it depends on the notebook's working directory.
sys.path.append('../src')

In [6]:
from uscode import USCode
from citation_network import CitationNetwork
from search import SearchEngine

In [7]:
def id_to_location(elem_id):
    """Turn a '/'-separated element id into a tuple of location parts.

    The id is split on '/', and the first character of every piece is
    dropped (presumably a one-letter level prefix such as 't' or 's' --
    confirm against the id format used by the corpus).

    Example: 't17/s101' -> ('17', '101').
    """
    parts = []
    for segment in elem_id.split('/'):
        # Slicing never fails: an empty segment simply stays empty.
        parts.append(segment[1:])
    return tuple(parts)

def location_info(loc):
    """Format a location as a fixed-width 'Title ... Section ...' label.

    The first entry of `loc` fills the title field (minimum width 3) and the
    second fills the section field (minimum width 5); any further entries
    are ignored. Fewer than two entries raises IndexError from str.format.
    """
    template = "Title {:3} Section {:5} "
    return template.format(*loc)

In [None]:
# Build the USCode object from the JSON file under ../data (relative to the
# notebook's working directory). Every later cell works from this object.
usc = USCode.from_json('../data/usc201909.json')

In [None]:
# Build the citation network on top of the loaded code.
cn = CitationNetwork(usc)
# Report the size of the network's `sinks` collection (presumably the nodes
# that cite nothing else -- confirm in citation_network).
print(len(cn.sinks))

In [None]:
# Headline size figures for the citation graph.
print("Nodes", len(cn.nodes))
print("Edges", len(cn.edges))

# Each item yielded by edges.data('weight') is a 3-tuple whose last entry is
# the edge's weight; the two endpoints are not needed for the total.
total_weight = sum(edge[2] for edge in cn.edges.data('weight'))
print("Weight", total_weight)

In [None]:
# Rank every node from highest to lowest degree, once per direction.
# The degree accessors are passed directly as sort keys; the sort is stable,
# so nodes with equal degree keep their original relative order.
sorted_indeg = list(cn.nodes)
sorted_indeg.sort(key=cn.in_degree, reverse=True)

sorted_outdeg = list(cn.nodes)
sorted_outdeg.sort(key=cn.out_degree, reverse=True)

def show_node_info(node):
    """Print one line for `node`: its location label, in-degree and out-degree.

    `node` is an element id understood by `id_to_location`; the degrees are
    read from the module-level citation network `cn`.
    """
    label = location_info(id_to_location(node))
    indeg_text = "In: {:>3}".format(cn.in_degree(node))
    outdeg_text = "Out: {:>3}".format(cn.out_degree(node))
    print(label, indeg_text, outdeg_text)

# Show the five best-connected nodes for each direction. The second heading
# carries a leading newline so the two listings are separated by a blank line.
for heading, ranking in (
    ("Highest Indegree:", sorted_indeg),
    ("\nHighest Outdegree:", sorted_outdeg),
):
    print(heading)
    for node in ranking[:5]:
        show_node_info(node)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Degree thresholds in descending order: 1000, 999, ..., 0.
x_range = np.arange(1000, -1, -1)


def _count_at_least(ranked, degree_of, thresholds):
    """Return, for each threshold, how many nodes have degree >= threshold.

    `ranked` must be sorted by descending degree and `thresholds` must be
    descending too: the cursor then only ever moves forward, so the whole
    scan is a single pass over `ranked`.
    """
    counts = []
    cursor = 0
    for threshold in thresholds:
        while cursor < len(ranked) and degree_of(ranked[cursor]) >= threshold:
            cursor += 1
        # `cursor` nodes have been passed, all with degree >= threshold.
        counts.append(cursor)
    return counts


y_indeg = _count_at_least(sorted_indeg, cn.in_degree, x_range)
y_outdeg = _count_at_least(sorted_outdeg, cn.out_degree, x_range)
  
# Move both axes to log10 scale and turn counts into fractions of all nodes.
# Threshold 0 and any zero count produce log10(0) = -inf; only that
# divide-by-zero warning is silenced, other floating-point warnings still show.
node_total = len(cn.nodes)
with np.errstate(divide='ignore'):
    x_range = np.log10(x_range)
    y_indeg = np.log10(np.array(y_indeg) / node_total)
    y_outdeg = np.log10(np.array(y_outdeg) / node_total)

# One scatter panel per direction, side by side. Each point is
# (log10 threshold, log10 fraction of nodes with degree >= threshold).
fig, axs = plt.subplots(1, 2, figsize=(15, 7.5))
panels = (("Indegree", y_indeg), ("Outdegree", y_outdeg))
for ax, (name, y_range) in zip(axs, panels):
    ax.scatter(x_range, y_range, marker='x')

    ax.set_title("Log-Log {} Distribution".format(name))
    ax.set_xlabel("Log {}".format(name))
    ax.set_ylabel("Log Probability")
    # Ticks at whole powers of ten: 10^0..10^3 on x, 10^-5..10^0 on y.
    ax.set_xticks(range(4))
    ax.set_yticks(range(-5, 1))
plt.show()

In [None]:
# Build the search engine over the loaded code, handing it the citation
# network through `network=`. The cells below read `se.pagerank` and rank
# results by 'indegree' and 'pagerank', which presumably rely on that network.
se = SearchEngine(usc, network=cn)

In [None]:
def show_top_5(results):
    """Print a one-line summary for each of the first five entries of `results`.

    Each line shows the result's location label, its occurrence count, the
    in-degree of its node in the citation network `cn`, and its PageRank as
    stored in `se.pagerank`. `results` is used in whatever order it is in.
    """
    for hit in results[:5]:
        node_id = hit.result.id
        columns = (
            location_info(hit.result.location),
            "Occurrence: {:3} ".format(hit.occurrence),
            "Indegree: {:3} ".format(cn.in_degree(node_id)),
            "PageRank: {:.2E} ".format(se.pagerank[node_id]),
        )
        print(*columns)
        
results = se.search('copyright', mode='fulltext')

# Rank the same result list by each signal in turn and show the top five
# after every ranking. Headings after the first start with a newline so the
# listings are separated by a blank line.
for signal, heading in (
    ('occurrence', "Top 5 by Occurrence:"),
    ('indegree', "\nTop 5 by Indegree:"),
    ('pagerank', "\nTop 5 by PageRank:"),
):
    se.rank(results, signal=signal)
    print(heading)
    show_top_5(results)

In [None]:
# NOTE: `show_top_5` is deliberately not redefined in this cell. The identical
# helper defined in the full-text 'copyright' search cell above is still in
# scope and is reused by the calls below; a second copy would only shadow it
# and let the two definitions drift apart.
        
results = se.search('(copyright OR property) AND NOT legal', mode='boolean')

# Rank the same result list by each signal in turn and show the top five
# after every ranking. Headings after the first start with a newline so the
# listings are separated by a blank line.
for signal, heading in (
    ('occurrence', "Top 5 by Occurrence:"),
    ('indegree', "\nTop 5 by Indegree:"),
    ('pagerank', "\nTop 5 by PageRank:"),
):
    se.rank(results, signal=signal)
    print(heading)
    show_top_5(results)