# Prepare data

Importing COVID-19 Data

In [1]:
import json

with open('Resources/dpc-covid19-ita-province.json') as f:
    d = json.load(f)

Importing NetworkX and Pandas

In [2]:
import networkx as nx
import pandas as pd
import numpy as np

Cleaning data

In [3]:
# Create a DataFrame with COVID data, we need just some columns
city_dataframe = pd.DataFrame(d)[['denominazione_provincia', 'lat', 'long']].drop_duplicates()

print("Dataframe contains " + str(city_dataframe.count()[0]) + " rows")

# Remove data having latitude = 0 or longitude = 0 or provincia = "In fase di definizione/aggiornamento"
city_dataframe.drop(city_dataframe[(city_dataframe['lat'] == 0) | \
                                   (city_dataframe['long'] == 0) | \
                                   (city_dataframe['denominazione_provincia'] == 'In fase di definizione/aggiornamento') \
                                  ].index, inplace = True)

city_dataframe.reset_index(drop = True, inplace = True)
                        
print("After removing unusable data, Dataframe contains " + str(city_dataframe.count()[0]) + " rows")

Dataframe contains 108 rows
After removing unusable data, Dataframe contains 107 rows


In [4]:
city_dataframe

Unnamed: 0,denominazione_provincia,lat,long
0,Chieti,42.351032,14.167546
1,L'Aquila,42.351222,13.398438
2,Pescara,42.464584,14.213648
3,Teramo,42.658918,13.704400
4,Matera,40.667512,16.597924
...,...,...,...
102,Rovigo,45.071073,11.790070
103,Treviso,45.667546,12.245074
104,Venezia,45.434905,12.338452
105,Verona,45.438390,10.993527


# Algorithms
We are not explicitly adding nodes to graph $\Rightarrow$ nodes without any edge will not be put in the graph
## 1. Iteration over all couples $\rightarrow$ Cost: $\mathcal{\Theta}\ (\ n^2\ )$

In [5]:
def allCoupleEdges(graph, dataframe, radius):
    # O (n)
    for i in dataframe.index:

        # O (n)
        for j in dataframe.index:
            if i != j and \
               dataframe.iloc[i, 1] - radius <= dataframe.iloc[j, 1] and \
               dataframe.iloc[i, 1] + radius >= dataframe.iloc[j, 1] and \
               dataframe.iloc[i, 2] - radius <= dataframe.iloc[j, 2] and \
               dataframe.iloc[i, 2] + radius >= dataframe.iloc[j, 2]:
                graph.add_edge(dataframe.iloc[i, 0], dataframe.iloc[j, 0])                        

## 2. Binary search on ordered dataframe $\rightarrow$ Cost: $\mathcal{\Theta}\ (\ n \cdot \log{} n\ )$

### Utility function
Given a dataframe with:
 - Column 0 $\rightarrow$ ID
 - Column 1 $\rightarrow$ Position  
 
Returns a set of all ID couples within *radius* distance

In [6]:
def binarySearchSingle(dataframe, radius):
    # Edges between near cities basing on x position
    # Use of dictionary, in this way search of an element costs O(1)
    edges = {}
    
    # Sort dataframe basing on position O (n log n) using quicksort
    # Use of tmpDataframe to leave dataframe as received
    tmpDataframe = dataframe.sort_values(by = dataframe.columns[1])
    tmpDataframe.reset_index(drop = True, inplace = True)
    
    # O(n)
    for i in tmpDataframe.index:        
        # Set pointers to be used in iterative binary search
        first = 0
        # We just check the left half because we do not need double couples (a, b) and (b, a).
        last = i - 1
        found = False

        # O (log n)
        while first <= last and not found:
            midpoint = (first + last) // 2
        
            # Check if element at midpoint position is near enough            
            if tmpDataframe.iloc[i, 1] - radius <= tmpDataframe.iloc[midpoint, 1]:
                
                # If element at midpoint position is the leftmost element within radius distance
                # i.e. element at (midpoint - 1) position is too far
                if midpoint == 0 or tmpDataframe.iloc[i, 1] - radius > tmpDataframe.iloc[midpoint - 1, 1]:
                
                    # We add to edges all couples composed by (element at i position, element at j position)
                    # for all j from midpoint to i (excluded)
                    edges.update([((tmpDataframe.iloc[i, 0], tmpDataframe.iloc[j, 0]), None) 
                                  for j in range(midpoint, i)])
                    found = True
                
                # Otherwise (element at (midpoint - 1) position is near enough)
                # We search in left half
                else:
                    last = midpoint - 1             
                    
            # Otherwise we must search in right half
            else:
                first = midpoint + 1
                
    return edges

In [7]:
def binarySearchEdges(graph, dataframe, radius):
    xEdges = binarySearchSingle(dataframe, radius)
    yEdges = binarySearchSingle(dataframe.iloc[:, 0::2],  radius)
            
    # O(n)
    for k in xEdges.keys():
        # Searching both for (a,b) and (b,a)
        # O(1)
        if k in yEdges or k[::-1] in yEdges:
            graph.add_edge(*k)

# Ex. # 1
Build the graph of provinces P using NetworkX. Each node corresponds to a city and two cities a and b are connected by an edge if the following holds: if x,y is the position of a, then b is in position z,w with z in [x-d,x+d] and w in [y-d, y+d], with d=0.8. The graph is symmetric. Use the latitude and longitude information available in the files to get the position of the cities. This task can be done in several ways. Use the one you think is more efficient.

## Set up variables

In [8]:
P_all_couples = nx.Graph()
P = nx.Graph()
radius = 0.8

## Create Graph using algorithm # 1

In [9]:
%%timeit
P_all_couples.clear()
allCoupleEdges(P_all_couples, city_dataframe, radius)

377 ms ± 55.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Create Graph using algorithm # 2

In [10]:
%%timeit
P.clear()
binarySearchEdges(P, city_dataframe, radius)

56.3 ms ± 5.18 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Checking results of algorithms

In [11]:
if P.nodes != P_all_couples.nodes:
    raise Exception("P.nodes != P_all_couples.nodes")
    
if P.edges != P_all_couples.edges:
    raise Exception("P.edges != P_all_couples.edges")

# Ex. # 2
Generate 2000 pairs of double (x,y) with x in [30,50) and y in [10,20). Repeat the algorithm at step 1, building a graph R using NetworkX where each pair is a node and two nodes are connected with the same rule reported above, still with d=0.08. If the algorithm at step 1 takes too long, repeat step 1. Note that here d=0.08 (and not 0.8 as in the previous item), as in this way the resulting graph is sparser.

## Set up variables

In [12]:
radius = 0.08
R = nx.Graph()
xMin = 30
xMax = 50
yMin = 10
yMax = 20
couples_count = 2000
points_dataframe = pd.DataFrame()
# Generate column x with couples_count rows of elements in [xMin, xMax)
points_dataframe['x'] = np.random.random_sample(couples_count) * (xMax - xMin) + xMin
# Generate column y with couples_count rows of elements in [yMin, yMax)
points_dataframe['y'] = np.random.random_sample(couples_count) * (yMax - yMin) + yMin
# Generate column label with couples_count rows of (x value, y value)
points_dataframe['label'] = "(" + points_dataframe['x'].astype(str) + ", " + points_dataframe['x'].astype(str) + ")"
points_dataframe = points_dataframe[['label', 'x', 'y']]
# Remove duplicates
###################### TODO

In [13]:
points_dataframe

Unnamed: 0,label,x,y
0,"(49.232805792820145, 49.232805792820145)",49.232806,16.928653
1,"(37.28747161119566, 37.28747161119566)",37.287472,11.185964
2,"(45.18925508312194, 45.18925508312194)",45.189255,16.074212
3,"(39.3974922699407, 39.3974922699407)",39.397492,16.434956
4,"(47.93525418445411, 47.93525418445411)",47.935254,13.085357
...,...,...,...
1995,"(33.052153882877285, 33.052153882877285)",33.052154,16.591515
1996,"(39.43621914786846, 39.43621914786846)",39.436219,10.388264
1997,"(33.648460025236794, 33.648460025236794)",33.648460,18.739111
1998,"(42.826471635539264, 42.826471635539264)",42.826472,10.141811


#dLong = dataframe.sort_values(by='long')

#datas = pd.DataFrame(d)

# O(n) Cleaning data
# Removing unusable latitude / longitude
#for i in range(len(d)):
#    if d[i]['lat'] != 0 and d[i]['long'] != 0:
#        dLong.append(d[i])

# O(n log n)
#dLong.sort(key=sortLongFun)

#dLat = dLong.copy()

# O(n log n)
#dLat.sort(key=sortLatFun)  



import pixiedust
import pdb

def sortLatFun(elem):
    return elem['lat']

def sortLongFun(elem):
    return elem['long']

def my_print(d):
    for i in range(len(d)):
        display(d[i]['lat'])
        
'''
def binarySearch(list, item_index):
    first = 0
    last = len(list)-1

    while first <= last:
        midpoint = (first + last)//2
        if item_index
           list[item_index]['lat'] - radius <= list[midpoint]['lat'] and \
           list[item_index]['lat'] + radius >= list[midpoint]['lat'] and \
           list[item_index]['long'] - radius <= list[midpoint]['long'] and \
           list[item_index]['long'] + radius >= list[midpoint]['long']:
            
            
        else:
            if item < list[midpoint]:
                last = midpoint-1
            else:
                first = midpoint+1

    return found
'''
'''
def binarySearch(d, item, item_index):
    if len(d) == 0:
        return
    
    midpoint = len(d) // 2
    HEAD = "len:" + str(len(d)) + ", midpoint: " + str(midpoint) + ", item_lat: " + str(item['lat']) + ", midpoint_lat: " + str(d[midpoint]['lat'])
    my_print(d)

    if item['lat'] - radius <= d[midpoint]['lat'] and \
       item['lat'] + radius >= d[midpoint]['lat']:
        if len(d) >= midpoint and midpoint != 0:
            display(HEAD + " - Calling: d[:" + str(midpoint) + "] ")
            binarySearch(d[:midpoint], item, item_index)
        
        if len(d) > midpoint+1:
            display(HEAD + " - Calling: d[" + str(midpoint+1) + ":] ")
            binarySearch(d[midpoint+1:], item, item_index)
            
        if item['long'] - radius <= d[midpoint]['long'] and \
           item['long'] + radius >= d[midpoint]['long'] and \
           item != d[midpoint]:
            display(HEAD + " - Adding edge")
            P.add_edge(midpoint, item_index)
    else:
        if item['lat'] < d[midpoint]['lat'] and \
           len(d) >= midpoint and \
           midpoint != 0:
            display(HEAD + " - Else Calling: d[:" + str(midpoint) + "] ")
            binarySearch(d[:midpoint], item, item_index)
        elif len(d) > midpoint + 1:
                display(HEAD + " - Else Calling: d[" + str(midpoint+1) + ":] ")
                binarySearch(d[midpoint+1:], item, item_index)
'''
'''
def binarySearch(d, item):
    midpoint = len(d)//2
    HEAD = "len:" + str(len(d)) + ", midpoint " + str(midpoint) + ", " + str(item['lat']) + ", " + str(d[midpoint]['lat'])
    display(HEAD)
    if item['lat'] - radius <= d[midpoint]['lat'] and \
       item['lat'] + radius >= d[midpoint]['lat']:
        if len(d) >= midpoint:
            display(HEAD + " - Calling: d[:" + str(midpoint) + "] ")
            binarySearch(d[:midpoint], item)
        
        if len(d) > midpoint+1:
            display(HEAD + " - Calling: d[" + str(midpoint+1) + ":] ")
            binarySearch(d[midpoint+1:], item)
            
        if item['long'] - radius <= d[midpoint]['long'] and \
           item['long'] + radius >= d[midpoint]['long'] and \
           item != midpoint:
            display(HEAD + " - Adding edge")
            P.add_edge(midpoint, item)
    else:
        if item['lat'] < d[midpoint]['lat'] and len(d) >= midpoint:
            display(HEAD + " - Else Calling: d[:" + str(midpoint) + "] ")
            binarySearch(d[:midpoint], item)
        elif len(d) > midpoint + 1:
                display(HEAD + " - Else Calling: d[" + str(midpoint+1) + ":] ")
                binarySearch(d[midpoint+1:], item)
'''                 

import random
display(len(d))
d = random.sample(d, 1500)