# Calculating unicity for 2 points

Calculation and pickling of the `results_p2` dictionary (unicity per experiment) and the coarsened `data_by_sender` dictionary of each experiment

In [21]:
import csv
import random
from tqdm import tqdm_notebook as tqdm
import datetime
import pandas as pd
from dict_tools import *
import pickle #For saving dictionaries with results
import time #For calculating timings

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [22]:
#FUNCTIONS
def coarsen_time(timestamp,degree):
    '''Truncate `timestamp` to the resolution selected by `degree`.

    `degree` decides how many leading characters of `timestamp` are kept:
    'd' (day) keeps 10, 'h' (hour) keeps 13, 'm' (minute) keeps 16 and
    's' (second) keeps 19.
    Any other value (for instance False, meaning "exclude the time")
    returns None.
    '''
    # (degree code, number of leading characters to keep)
    prefix_lengths = (('d', 10), ('h', 13), ('m', 16), ('s', 19))
    for code, length in prefix_lengths:
        if degree == code:
            return timestamp[:length]
    return None
        
# Currencies seen by coarsen_amount that are not in any of its strength groups
unknown_curr = set()
def coarsen_amount(amount,degree,currency):
    '''Round `amount` to a precision chosen by `degree` and by the strength
    group that `currency` belongs to.

    `degree` can take:
        False : exclude the amount (returns None)
        'm'   : maximum resolution (returns `amount` unchanged)
        'h'   : high resolution
        'a'   : middle resolution
        'l'   : low resolution
    Any other `degree` raises KeyError when `currency` is in one of the
    known groups.

    Currencies outside the three groups are recorded in the module-level set
    `unknown_curr` and rounded to a precision of 1.

    Note that strength is not checked online, so it may not be accurate if
    the strength of a currency has changed dramatically.

    Note that rounding may not be perfect owing to floating point issues.
    '''
    if degree is False:
        return None
    if degree == 'm':
        return amount

    # Strength grouping of currencies as of 2014 (the original author flagged
    # it for re-checking, XRP in particular). Each entry pairs a group of
    # currencies with its {degree: rounding precision} table.
    precision_tables = (
        (('BTC', 'XAG','XAU', 'XPT'),             {'h': 1e-3, 'a': 1e-2, 'l': 1e-1}),  # strong
        (('CNY','EUR','USD','AUD','GBP','JPY'),  {'h': 1e1, 'a': 1e2, 'l': 1e3}),     # medium
        (('CCK','STR','KRW','MTL','XRP'),        {'h': 1e5, 'a': 1e6, 'l': 1e7}),     # weak
    )

    for group, precisions in precision_tables:
        if currency in group:
            prec = precisions[degree]
            break
    else:
        # Not in any group: remember the currency and fall back to unit precision
        unknown_curr.add(currency)
        prec = 1

    return float(int(round(amount/prec)))*prec

In [23]:
#LOAD DATA
# First and last day (both inclusive) for which a transaction file is listed
start_date = datetime.date(2017,6,1)
end_date = datetime.date(2017,8,29)
date_list = pd.date_range(start_date,end_date,freq="1D")

# One file name per day, built from that day and the day after it
files = [
    "Ripple_transactions_{}_to_{}.csv".format(
        day.strftime("%Y-%m-%d"),
        (day+datetime.timedelta(days=1)).strftime("%Y-%m-%d"),
    )
    for day in date_list
]

def load_data_by_sender(files,a_res,include_c,t_res,include_d):
    '''Create a dictionary from the CSV files named in `files`, with senders
    as keys and, as values, the set of that sender's datapoints. Each
    datapoint is a tuple (Amount, Currency, Time, Destination).

    Parameters:
        files     : iterable of CSV file names; the first row of each file is
                    a header naming the columns 'Sender', 'Currency',
                    'Amount', 'Timestamp' (and 'Destination' if `include_d`)
        a_res     : amount resolution, passed to `coarsen_amount`
        include_c : if falsy, the currency slot of each datapoint is None
                    (the currency is still used to coarsen the amount)
        t_res     : time resolution, passed to `coarsen_time`
        include_d : if falsy, the destination slot of each datapoint is None
                    and the 'Destination' column is not required

    Returns:
        dict mapping sender -> set of datapoint tuples

    Raises:
        ValueError if a required column is missing from a file's header.
    '''
    data_by_sender = {}
    for file in tqdm(files):
        # newline='' is what the csv module asks for, and the `with` block
        # closes each file as soon as it has been read.
        with open(file,'r',newline='') as handle:
            reader = csv.reader(handle)
            header = next(reader,None)
            if header is None:
                # Completely empty file: nothing to load
                continue

            # Resolve column positions once per file instead of once per row
            sender_col = header.index('Sender')
            currency_col = header.index('Currency')
            amount_col = header.index('Amount')
            timestamp_col = header.index('Timestamp')
            destination_col = header.index('Destination') if include_d else None

            for line in reader:
                sender = line[sender_col]

                curr = line[currency_col]
                amount = coarsen_amount(float(line[amount_col]),a_res,curr)   #<- COARSENING
                coarse_time = coarsen_time(line[timestamp_col],t_res)         #<- COARSENING
                destination = line[destination_col] if include_d else None
                if not include_c:
                    # Drop the currency only after it has been used for coarsening
                    curr = None

                datapoint = (amount,curr,coarse_time,destination)
                data_by_sender.setdefault(sender,set()).add(datapoint)
    return data_by_sender

In [24]:
def by_point(data_by_sender):
    """
    Invert a sender -> datapoints mapping.

    Returns a dictionary whose keys are datapoints
    (Amount, Currency, Time, Destination) and whose values are the sets of
    senders that have that datapoint.
    """
    print("CONVERTING DATA BY SENDER TO DATA BY POINT")

    senders_by_point = {}
    for sender in tqdm(data_by_sender):
        for point in data_by_sender[sender]:
            # Create the sender set the first time a point is seen
            senders_by_point.setdefault(point,set()).add(sender)

    return senders_by_point

def get_unicity(dataset,p,npeople = 10000):
    """
    Returns unicity (float) of `dataset` (dict mapping user -> set of
    datapoints).

    `p` (int) is the number of points sampled for each person; people with
    fewer than `p` points are skipped. `npeople` (int or 'all') is the number
    of people drawn at random for the test; it is capped at the number of
    people in `dataset`.

    A person counts as unique when no other person in `dataset` shares all of
    their `p` sampled points. The result is the number of unique people
    divided by the number of sampled people who had at least `p` points, or
    NaN if no sampled person had that many points.
    """
    if npeople == 'all':
        npeople = len(dataset)
    else:
        npeople = int(npeople) #Just to make sure
    # random.sample raises ValueError when asked for more items than exist
    npeople = min(npeople,len(dataset))

    points_to_users = by_point(dataset)

    print("User number: ",len(dataset))
    print("Point number: ",len(points_to_users))

    users = random.sample(list(dataset.keys()),npeople)

    unique_users = set()
    users_included = set() #<- Users with at least p points

    for u in tqdm(users):
        u_data = dataset[u]

        if len(u_data) < p: # <- NOT len(u_data) <= p
            continue

        # random.sample no longer accepts a set (TypeError from Python 3.11),
        # so sample from a list copy of the user's points.
        p_points = random.sample(list(u_data),p)
        users_included.add(u)

        if p_points:
            # Intersect only the (usually small) sets of users sharing each
            # sampled point, rather than starting from a copy of every user.
            similar_people = set.intersection(*(points_to_users[point] for point in p_points))
        else:
            # p == 0: no point constrains the match, so everybody is similar
            similar_people = set(dataset)

        # `u` always matches its own points; anyone else means not unique
        if len(similar_people) <= 1:
            unique_users.add(u)

    print("Users included:",len(users_included))
    if not users_included:
        # Unicity is undefined when nobody had at least p points
        return float('nan')
    return len(unique_users)/len(users_included) #<-COMPARING WITH NUMBER OF USERS WITH AT LEAST p POINTS

In [25]:
# Experiment label -> coarsening settings, in the order
# [amount resolution, include currency, time resolution, include destination]
experiments = dict([
    ("Am,Tsc,C,D", ['m',True,'s',True]),
    ("Am,Tsc,-,D", ['m',False,'s',True]),
    ("Am,Tsc,C,-", ['m',True,'s',False]),
    ("-,Tsc,C,D",  [False,True,'s',True]),
    ("Ah,Tmn,C,D", ['h',True,'m',True]),
    ("Aa,Thr,C,D", ['a',True,'h',True]),
    ("Al,Tdy,C,D", ['l',True,'d',True]),
    ("Am,-,C,D",   ['m',True,False,True]),
    ("Am,-,-,-",   ['m',False,False,False]),
    ("Al,Tdy,-,-", ['l',False,'d',False]),
])

# Smaller selection of the experiments above (presumably for quick test runs);
# each settings list is copied so the two dictionaries do not share lists
experiments_test = {
    label: list(experiments[label])
    for label in ("Am,Tsc,C,D", "Am,Tsc,C,-", "-,Tsc,C,D")
}
def run_experiment(parameters,p,name="unnamed_ripple_experiment",npeople=10000):
    '''Run one experiment using the resolutions defined in `parameters`
    (Amount res, Include Currency, Time res, Include Destination)
    and picking `p` datapoints per person for the unicity test.

    The data is loaded from the module-level list `files`. The coarsened
    dataset is pickled to a file {NAME}_p{P VALUE}-data_by_sender.pkl in the
    current working directory.

    `npeople` is forwarded to `get_unicity` as the number of people sampled.

    Returns the unicity computed by `get_unicity`.
    '''
    print("LOADING DATA")
    data_by_sender = load_data_by_sender(files,*parameters)
    filename = '{}_p{}-data_by_sender.pkl'.format(name,p)
    print("PICKLING DATA to {}".format(filename))
    # The `with` block flushes and closes the pickle file before the (long)
    # unicity calculation starts, instead of leaving the handle open.
    with open(filename,'wb') as pickle_file:
        pickle.dump(data_by_sender,pickle_file)
    print("CALCULATING UNICITY")
    unicity = get_unicity(data_by_sender,p,npeople)
    return unicity

In [30]:
def run(p,npeople=10000):
    '''Run every experiment in `experiments` with `p` points per person.

    Returns a dictionary mapping each experiment label to the unicity
    returned by `run_experiment` for that experiment.
    '''
    results = {}
    for label, settings in experiments.items():
        print("-------------------------------------------\nEXPERIMENT {}: COARSENING: {}".format(label,settings))
        results[label] = run_experiment(settings,p=p,name=label,npeople=npeople)
    return results

# Time the whole p=2 run (wall-clock seconds end up in `time_taken`)
start = time.time()
results_p2 = run(p=2)
end = time.time()
time_taken = end-start

-------------------------------------------
EXPERIMENT Am,Tsc,C,D: COARSENING: ['m', True, 's', True]
LOADING DATA

PICKLING DATA to Am,Tsc,C,D_p2-data_by_sender.pkl
CALCULATING UNICITY
CONVERTING DATA BY SENDER TO DATA BY POINT

User number:  89148
Point number:  2310577

Users included: 5596
-------------------------------------------
EXPERIMENT Am,Tsc,-,D: COARSENING: ['m', False, 's', True]
LOADING DATA

PICKLING DATA to Am,Tsc,-,D_p2-data_by_sender.pkl
CALCULATING UNICITY
CONVERTING DATA BY SENDER TO DATA BY POINT

User number:  89148
Point number:  2310571

Users included: 5669
-------------------------------------------
EXPERIMENT Am,Tsc,C,-: COARSENING: ['m', True, 's', False]
LOADING DATA

PICKLING DATA to Am,Tsc,C,-_p2-data_by_sender.pkl
CALCULATING UNICITY
CONVERTING DATA BY SENDER TO DATA BY POINT

User number:  89148
Point number:  2216491

Users included: 5645
-------------------------------------------
EXPERIMENT -,Tsc,C,D: COARSENING: [False, True, 's', True]
LOADING DA

In [31]:
# Save the p=2 results; the `with` block makes sure the file is flushed and
# closed rather than leaving the handle open.
with open('results_p2.pkl','wb') as results_file:
    pickle.dump(results_p2,results_file)

In [32]:
# Audible signal that the run has finished. `winsound` only exists on
# Windows, so on other platforms the beep is skipped instead of raising
# ImportError.
try:
    import winsound
except ImportError:
    winsound = None

if winsound is not None:
    winsound.MessageBeep(winsound.MB_ICONHAND)