In [1]:
import os
import pdb

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from data import loader

In [2]:
# Render every figure created from here on at a very high resolution.
mpl.rc('figure', dpi=1000)

# Global time series; later cells index it as d1[0] and pass it on as the
# (case, death) pair of data frames.
d1 = loader.get_global_case_and_deaths_time_series_data()

# Continents the loader reports as available and supported, printed for reference.
continent_list = loader.get_available_and_supported_continents()
print(continent_list)

['Africa', 'Asia', 'Europe', 'NorthAmerica', 'SouthAmerica']


In [3]:
# Regions processed by this notebook: peak data is loaded for each entry, and
# plot_sim_scores_for_paper compares its `continent` argument against this list
# to pick the output sub-folder.
availableRegions = ["Africa", "Asia"]

In [4]:
# Get slices for each country
def get_sliced_vectors(country_name, peak_sets, time_series_data, bucket_length):
    """
    Sum a country's daily counts inside a window centred on each of its peaks.

    Input:
        country_name (str): must appear exactly once as an 'Admin0' entry in
            peak_sets and in every frame of time_series_data
        peak_sets (data frame): the set of all peak data; peak positions are
            read from the columns "X1", "X2", ...
        time_series_data (Tuple): case + death time series data, each frame
            holding one "Day <n>" column per day
        bucket_length (int): window width in days; the window for a peak at
            day p is [p - half, p + half) with half = int(bucket_length / 2),
            clipped to [0, number of "Day" columns)
    Returns:
        sliced_case: a processed vector of case count
        sliced_death: a processed vector of death count
    Raises:
        ValueError: if country_name does not match exactly one row of
            peak_sets or of one of the time series frames
    """
    peak_points = peak_sets.loc[peak_sets['Admin0'] == country_name]
    if len(peak_points) != 1:
        raise ValueError(
            "expected exactly one peak row for " + repr(country_name)
            + ", found " + str(len(peak_points))
        )

    # Every column whose name contains "X" is counted as one peak position.
    peak_num = sum("X" in column for column in peak_points.columns)
    half = int(bucket_length / 2)
    # int() keeps range() below working even if the peak days were stored as floats.
    peak_days = [int(peak_points["X" + str(bucket + 1)].item()) for bucket in range(peak_num)]

    sliced_case = np.zeros([peak_num])
    sliced_death = np.zeros([peak_num])
    vectors = (sliced_case, sliced_death)
    for df_idx, df in enumerate(time_series_data):
        total_days = sum("Day" in column for column in df.columns)

        # Filter on the function argument, not on a notebook-level variable.
        country_rows = df.loc[df['Admin0'] == country_name]
        if len(country_rows) != 1:
            raise ValueError(
                "expected exactly one time series row for " + repr(country_name)
                + ", found " + str(len(country_rows))
            )

        for idx, peak_day in enumerate(peak_days):
            # Window centred at the peak, clipped to the available days.
            begin = max(0, peak_day - half)
            end = min(total_days, peak_day + half)
            window = ["Day " + str(day) for day in range(begin, end)]
            # .iloc[0] extracts the scalar so no one-element Series is stored
            # into the array slot.
            vectors[df_idx][idx] = country_rows[window].sum(axis=1).iloc[0]
    return vectors

In [5]:
def compute_similarity(v1, v2, type="cosine"):
    """
    Compute similarity between two vectors, v1 and v2

    v1, v2: 1-D array-likes of equal length
    type: str, can be 'cosine', 'euclidean', 'pearsonCorrelation'
    Returns:
        the similarity score (float) between v1 and v2, or None when `type`
        is not one of the supported names.
        - 'cosine': v1.v2 / (|v1| |v2|); nan when either vector has zero norm
        - 'euclidean': the distance |v1 - v2| (0 means identical vectors)
        - 'pearsonCorrelation': Pearson's r between v1 and v2
    """
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)
    if type == "cosine":
        return v1.dot(v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    if type == "euclidean":
        return np.linalg.norm(v1 - v2)
    if type == "pearsonCorrelation":
        # corrcoef returns the 2x2 correlation matrix; the off-diagonal entry
        # is the coefficient between the two inputs.
        return np.corrcoef(v1, v2)[0, 1]
    return None

In [9]:
# Build data_map: country name -> (case vector, death vector) for every country
# listed in the peak data of each region in availableRegions.
data_map = {}
for region in availableRegions:
    # peak_num=4 is forwarded to the loader — presumably the number of peaks
    # per country; confirm against data.loader.
    peak_data = loader.get_peak_data(region=region, peak_num=4)
    for country in peak_data["Admin0"]:
        # 40 is the bucket_length argument of get_sliced_vectors.
        case_vec, death_vec = get_sliced_vectors(country, peak_data, d1, 40)
        data_map[country] = (case_vec, death_vec)

In [13]:
def plot_sim_scores_for_paper(sim_matrix, terr_names, data_type, similarity_type, continent="Africa"):
    """
    Plot a similarity matrix as a heat map and save it as a PNG under
    "results/similarity scores/". The figure has no title.

    Input:
        sim_matrix: 2-D array of similarity scores; rows and columns are
            labelled with terr_names. Colours are scaled to the range [0, 1].
        terr_names (list of str): tick labels for both axes
        data_type (str): prefix of the output file name (e.g. "case")
        similarity_type (str): name of the sub-folder the figure is saved in
        continent (str): if it is NOT in availableRegions the figure is saved
            to ".../countries/<data_type>_<continent>.png"; otherwise to
            ".../continents/<data_type>_<terr_names[0]>.png"
    Returns:
        None. The output folder is created if it does not exist yet.
    """
    # Smaller tick labels when there are many territories.
    label_size = 5 if len(terr_names) > 35 else 7
    tick_positions = list(range(len(terr_names)))

    # Use a dedicated figure so that a figure the caller already has open is
    # neither drawn on nor cleared.
    fig, ax = plt.subplots()
    image = ax.imshow(sim_matrix, vmin=0, vmax=1)
    fig.colorbar(image, ax=ax)

    ax.set_xticks(tick_positions)
    ax.set_xticklabels(terr_names, rotation=90)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(terr_names)
    ax.tick_params(axis='both', which='major', labelsize=label_size)

    if continent not in availableRegions:
        out_path = "results/similarity scores/" + similarity_type + "/countries/" + data_type + "_" + continent + ".png"
    else:
        out_path = "results/similarity scores/" + similarity_type + "/continents/" + data_type + "_" + terr_names[0] + ".png"

    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    fig.savefig(out_path, transparent=False, facecolor='white')
    # Release the figure; this function is called once per country in a loop.
    plt.close(fig)

In [16]:
# Compute similarity
# For every country in the case time series, compare it with its listed
# neighbours (cosine similarity of the case vectors) and save one heat map.
neighbors = pd.read_csv("data/neighbor_map/neighbors_world.csv")
for base_country in d1[0]["Admin0"]:
    # Look the country up once. Skip it when it has no row in the neighbour
    # map or when its entry is not a string (e.g. a missing value).
    neighbor_cells = neighbors.loc[neighbors["Country or territory"] == base_country]["neighbor list"].tolist()
    if not neighbor_cells or type(neighbor_cells[0]) is not str:
        continue

    # Base country first, then its comma-separated neighbours; strip leading
    # white space and keep only the names we have vectors for.
    candidates = [name.lstrip() for name in [base_country] + neighbor_cells[0].split(",")]
    list_of_neighbors = [name for name in candidates if name in data_map]

    size = len(list_of_neighbors)
    sim_mat = np.zeros([size, size])
    for country_idx, country in enumerate(list_of_neighbors):
        for target_country_idx, target_country in enumerate(list_of_neighbors):
            sim_mat[country_idx, target_country_idx] = compute_similarity(data_map[country][0], data_map[target_country][0])

    if size > 0:
        print(base_country + ": ", sim_mat)
        plot_sim_scores_for_paper(sim_mat, list_of_neighbors, "case", "cosine", base_country)

Afghanistan:  [[1.         0.48402145 0.7899985  0.59416024 0.47056308]
 [0.48402145 1.         0.80819657 0.61137239 0.80607176]
 [0.7899985  0.80819657 1.         0.89881528 0.86280884]
 [0.59416024 0.61137239 0.89881528 1.         0.8584287 ]
 [0.47056308 0.80607176 0.86280884 0.8584287  1.        ]]
Algeria:  [[1.         0.61784322 0.85796812 0.98887214 0.44459368 0.72193377
  0.61355149]
 [0.61784322 1.         0.7963605  0.64552607 0.83125235 0.62655184
  0.96613718]
 [0.85796812 0.7963605  1.         0.88986637 0.63787429 0.69674931
  0.74595472]
 [0.98887214 0.64552607 0.88986637 1.         0.46697323 0.70522446
  0.63973684]
 [0.44459368 0.83125235 0.63787429 0.46697323 1.         0.73349986
  0.90252391]
 [0.72193377 0.62655184 0.69674931 0.70522446 0.73349986 1.
  0.66715729]
 [0.61355149 0.96613718 0.74595472 0.63973684 0.90252391 0.66715729
  1.        ]]
Angola:  [[1.         0.94241743 0.74335055 0.86792156]
 [0.94241743 1.         0.63481076 0.82861114]
 [0.74335055 0.

<Figure size 6000x4000 with 0 Axes>