# In [24]:
import datetime
import pandas as pd
import gzip
import matplotlib.pyplot as plt
import seaborn as sns

def get_cdf(data):
    '''Return the empirical complementary CDF (CCDF) of the values in ``data``.

    Despite the function name, the ``Fraction`` column holds ``1 - CDF``: the
    k-th smallest value (1-based) out of n is paired with ``1 - k / n``, so
    the largest value is always paired with exactly 0.

    Args:
        data: Iterable of sortable numeric values. It is not modified.

    Returns:
        pandas.DataFrame with columns ``'AA'`` (the values in ascending
        order) and ``'Fraction'`` (the share of samples ranked after each
        value). Empty input yields an empty frame with the same columns.
    '''
    # sorted() works on a copy, so the caller's list keeps its order.
    ordered = sorted(data)
    total = len(ordered)
    # Deriving each fraction from its rank avoids the drift of a running
    # float sum, and the comprehension never divides when ``ordered`` is empty.
    points = [
        [value, 1 - rank / total]
        for rank, value in enumerate(ordered, start=1)
    ]
    return pd.DataFrame(points, columns=['AA', 'Fraction'])


def _add_symmetric_contact(contacts, dev_a, dev_b):
    '''Record the pair in both directions of a ``{device: {peer: 1}}`` map.'''
    contacts.setdefault(dev_a, {})[dev_b] = 1
    contacts.setdefault(dev_b, {})[dev_a] = 1


def get_contacts_sets(all_contacts, classified_contacts, min_days=7):
    '''Summarize the contacts that each device had.

    Combines all initial contacts with the one-off contacts that were
    classified by the ML model.

    Args:
        all_contacts: Path to a tab-separated file with one device pair per
            line. Columns 0 and 1 are the integer device ids, column 2 is the
            contact count (presumably the number of days the pair was seen),
            columns 7 and 8 are the visibility of each device and column 9
            their overlap (presumably all in days).
        classified_contacts: Path to a tab-separated file with the device ids
            in columns 0 and 1 and the classifier label in column 2. Label 1
            marks a frequent (known close) contact; any other value marks a
            random contact.
        min_days: Pairs whose column 7, 8 or 9 value is below this threshold
            are ignored. Defaults to 7.

    Returns:
        Tuple ``(frequent, random, users)``. ``frequent`` and ``random`` map
        each device id to a dict ``{peer id: 1}`` and are symmetric.
        ``users`` maps every device id that passed the ``min_days`` filter
        to 1.
    '''
    frequent = {}
    random_contacts = {}
    users = {}

    with open(all_contacts, "r") as pairs_file:
        for line in pairs_file:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in int()
            fields = line.rstrip("\n").split('\t')
            dev_a = int(fields[0])
            dev_b = int(fields[1])
            count = int(fields[2])
            days_a = int(fields[7])
            days_b = int(fields[8])
            overlap = int(fields[9])
            # Remove pairs that were not visible for at least min_days.
            if days_a < min_days or days_b < min_days or overlap < min_days:
                continue
            users[dev_a] = 1
            users[dev_b] = 1
            # Pairs seen more than once are frequent outright. One-off pairs
            # are only categorized through the classified file below.
            if count > 1:
                _add_symmetric_contact(frequent, dev_a, dev_b)

    # Classified contacts are one-off contacts; some are known close contacts
    # (label 1, i.e. frequent) while the others are random. They are not
    # re-checked against the min_days filter.
    with open(classified_contacts, "r") as classified_file:
        for line in classified_file:
            if not line.strip():
                continue
            fields = line.rstrip("\n").split('\t')
            dev_a = int(fields[0])
            dev_b = int(fields[1])
            label = int(fields[2])
            target = frequent if label == 1 else random_contacts
            _add_symmetric_contact(target, dev_a, dev_b)

    return frequent, random_contacts, users

def plot_contacts_split_time_series(input_time_series):
    '''Plot figure 3a: the daily split between frequent and random contacts.

    The input time series holds the number of contacts of each type per day.
    It is generated following the classification of all contacts by the
    approach explained in the Methods and Supplementary Note 5.

    Args:
        input_time_series: Path to a whitespace-separated file with one row
            per day. Column 0 is used verbatim as the x-axis label, column 3
            is the number of frequent contacts and column 4 the number of
            random contacts.

    Raises:
        ZeroDivisionError: If a row reports zero frequent and zero random
            contacts, because the percentages are undefined for that day.
    '''
    dates = []
    pct_frequent = []
    pct_random = []
    with open(input_time_series, "r") as series_file:
        for line in series_file:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines
            freq = int(fields[3])
            rand = int(fields[4])
            share_frequent = 100 * freq / (freq + rand)
            dates.append(fields[0])
            pct_frequent.append(share_frequent)
            pct_random.append(100.0 - share_frequent)

    _, p1 = plt.subplots(figsize=(8, 4))
    # The y-axis is zoomed to 80-100 %, so only the top of the stack shows.
    p1.set_ylim(80, 100)
    p1.set_xlabel("Date", fontname='DejaVu Sans', fontsize=18)
    p1.set_ylabel("Percentage of contacts", fontname='DejaVu Sans', fontsize=18)
    p1.tick_params(labelsize=14)
    p1.stackplot(dates, pct_frequent, pct_random,
                 labels=['Frequent contacts', 'Random contacts'])
    # Show only every fifth date label to keep the x-axis readable.
    for index, label in enumerate(p1.xaxis.get_ticklabels()):
        if index % 5 != 0:
            label.set_visible(False)

    p1.legend(loc='center')


def plot_max_contact_duration(all_links,close_contacts_set,random_contacts_set):
    '''Plot figure 3b: density of the max contact duration per contact type.

    Args:
        all_links: Path to a tab-separated file with one device pair per
            line. Columns 0 and 1 are the integer device ids and column 5 is
            the pair's max contact duration (minutes, per the axis label).
        close_contacts_set: ``{device: {peer: ...}}`` map of close contacts,
            as identified by ``get_contacts_sets``.
        random_contacts_set: Map of random contacts in the same format.

    Raises:
        KeyError: If a pair in either contact set has no duration in
            ``all_links`` in either direction.
    '''
    # Read max durations for all contacts.
    contact_dur = {}
    with open(all_links, "r") as links_file:
        for line in links_file:
            if not line.strip():
                continue  # tolerate blank lines
            fields = line.rstrip("\n").split('\t')
            dev_a = int(fields[0])
            dev_b = int(fields[1])
            dur = int(fields[5])
            contact_dur.setdefault(dev_a, {})[dev_b] = dur
            # The contact sets hold every pair in both directions, so the
            # reverse lookup must succeed even when the file lists a pair
            # once. An explicit b->a row in the file still takes precedence.
            contact_dur.setdefault(dev_b, {}).setdefault(dev_a, dur)

    # Split contact durations for close and random contacts. A symmetric
    # contact set contributes each pair once per direction.
    data_freq = [
        contact_dur[dev_a][dev_b]
        for dev_a in close_contacts_set
        for dev_b in close_contacts_set[dev_a]
    ]
    data_infreq = [
        contact_dur[dev_a][dev_b]
        for dev_a in random_contacts_set
        for dev_b in random_contacts_set[dev_a]
    ]

    _, p1 = plt.subplots(figsize=(8, 4))
    p1.set(xscale="log")
    p1.set_xlabel("Max contact duration (minutes)",fontname='DejaVu Sans',fontsize=18)
    p1.set_ylabel("Density",fontname='DejaVu Sans',fontsize=18)
    p1.tick_params(labelsize=14)
    # NOTE(review): newer seaborn releases deprecate ``shade`` in favour of
    # ``fill``; it is kept here so the installed version keeps working.
    sns.kdeplot(data_freq, shade=True, color="b", label='Close contacts', ax=p1)
    sns.kdeplot(data_infreq, shade=True, color="r", label='Random contacts', ax=p1)
    p1.legend()

def plot_number_of_contacts_density(users, close_contacts_set,random_contacts_set):
    '''Plot figure 3c: density of the number of contacts per user.

    Close contacts are drawn against the left y-axis and random contacts
    against a twin right y-axis, both sharing the same x-axis.

    Args:
        users: Iterable of device ids to include.
        close_contacts_set: ``{device: {peer: ...}}`` map of close contacts,
            as identified by ``get_contacts_sets``.
        random_contacts_set: Map of random contacts in the same format.
    '''
    # A device missing from a contact map counts as having zero contacts.
    close_counts = []
    random_counts = []
    for device in users:
        n_close = len(close_contacts_set[device]) if device in close_contacts_set else 0
        n_random = len(random_contacts_set[device]) if device in random_contacts_set else 0
        random_counts.append(n_random)
        close_counts.append(n_close)

    _, left_ax = plt.subplots(figsize=(8, 4))
    left_ax.set_xlabel("Number of contacts",fontname='DejaVu Sans',fontsize=18)
    left_ax.set_ylabel("Density Close contacts",fontname='DejaVu Sans', color="b",fontsize=18)
    left_ax.tick_params(labelsize=14)
    plt.xlim(0,40)

    # kdeplot draws on the current axes and returns it.
    left_ax = sns.kdeplot(close_counts, shade=True, color="b")

    # Second y-axis sharing the same x-axis; the x-label is already set.
    right_ax = left_ax.twinx()
    right_ax.tick_params(labelsize=14)
    right_color = 'tab:red'
    right_ax.set_ylabel('Density Random contacts',fontname='DejaVu Sans', color=right_color, fontsize=18)
    right_ax = sns.kdeplot(random_counts, shade=True, color="r")
    right_ax.tick_params(axis='y', labelcolor=right_color)
    
def plot_ccdf_random_contacts_percentage(users, close_contacts_set,random_contacts_set):
    '''Plot figure 3d: CCDF of each user's percentage of random contacts.

    One curve is drawn for all users and one for each group of users with
    at least 10, 30 and 50 contacts in total (close plus random).

    Args:
        users: Iterable of device ids to include.
        close_contacts_set: ``{device: {peer: ...}}`` map of close contacts,
            as identified by ``get_contacts_sets``.
        random_contacts_set: Map of random contacts in the same format.
    '''
    overall = []
    at_least_10 = []
    at_least_30 = []
    at_least_50 = []

    for dev in users:
        my_close = len(close_contacts_set[dev]) if dev in close_contacts_set else 0
        my_rand = len(random_contacts_set[dev]) if dev in random_contacts_set else 0
        total = my_rand + my_close
        if total == 0:
            # The percentage is undefined for a user without any classified
            # contact; skip the user instead of dividing by zero.
            continue
        ratio = 100 * my_rand / total
        # Find percentage of random contacts for users with different degrees.
        overall.append(ratio)
        if total >= 10:
            at_least_10.append(ratio)
        if total >= 30:
            at_least_30.append(ratio)
        if total >= 50:
            at_least_50.append(ratio)

    # NOTE(review): the thresholds above are inclusive (>=) while the legend
    # labels read ">". Labels are kept unchanged; confirm which is intended.
    series = [
        (overall, '-bo', 'All users'),
        (at_least_10, '-mo', '> 10 contacts'),
        (at_least_30, '-ro', '> 30 contacts'),
        (at_least_50, '-co', '> 50 contacts'),
    ]

    _, p1 = plt.subplots(figsize=(8, 4))
    p1.set_xlabel("Random contacts (%)",fontname='DejaVu Sans',fontsize=18)
    p1.set_ylabel("CCDF",fontname='DejaVu Sans', fontsize=18)
    p1.tick_params(labelsize=14)
    for values, fmt, label in series:
        if values:
            cdf_df = get_cdf(values)
            xs, ys = cdf_df['AA'], cdf_df['Fraction']
        else:
            # An empty group is drawn as an empty line so that it keeps its
            # legend entry and get_cdf is never called with no data.
            xs, ys = [], []
        p1.plot(xs, ys, fmt, label=label)
    p1.legend()
    p1.grid(True)
    p1.set_xscale('log')
    plt.show()
    
