# SnapShot preprocessing:
In this notebook, we process the snapshots to reconstruct the travel diaries.

#### adding required packages

In [26]:
import pandas as pd
from scipy.stats import gaussian_kde
import time
import numpy as np
import pickle
# import concurrent.futures
# from loky import ProcessPoolExecutor
# from threading import Thread
# import logging
# import multiprocessing as mp
# from multiprocess import Pool
from joblib import Parallel, delayed
import datetime as dt

### specifying the saving location 


In [10]:
savingLoc = "Y:/ZahraEftekhar/phase4/"

### Clustering The successive similar records

First, we have to cluster the consecative records that have the same location (i.e., the TAZ).

In [9]:
def cluster_records(interval,address):
    t1= time.time()
    clusterData = {}
    with open("{a}completePLUdata_{b}sec_dict.pickle".format(a=address,b= interval),'rb') as handle:
        snapDataSplit = pickle.load(handle)
    IDs = list(snapDataSplit.keys())
    for i,ID in enumerate(IDs):
        person = snapDataSplit[ID]
        person = person.reset_index(drop=False, inplace=False)
        newD = pd.DataFrame()
        for k,v in person.groupby((person.mzr_id.shift() != person.mzr_id).cumsum()):
            newD = newD.append(v.iloc[0,:])
        d = np.array(newD["TIME"].shift(periods=-1, fill_value=v.iloc[-1,:]["TIME"])) - np.array(newD["TIME"])
        d = [pd.Timedelta(d[i]).total_seconds() for i in np.arange(len(d))]
        newD["duration"] = d
        newD.columns = ['start', 'id', 'x', 'y', 'mzr_id', 'duration']
        clusterData[ID] = newD
    print(interval, ":  ",(time.time()-t1)//60,"minutes")
    with open('{a}clusterData_{b}sec.pickle'.format(a= address,b = interval), 'wb') as handle:
        pickle.dump(clusterData, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
#with this type of parallel processing execution of this cell takes only about 30 minutes for all the intervals! 
# if not using parallel processing it would be more than 2 hours!
intervals = [60,300,600,900,1200,1500,1800,2100,2400,2700,3000,3300,3600,4500,5400,6300,7200]
t2 = time.time()
results = Parallel(n_jobs=6)(delayed(cluster_records)(interval,savingLoc) for interval in intervals)
print(time.time()-t2)   

637.1613354682922


### Location Type Identification

Now, we can use the files from the previous step to reconstruct the travel diaries. This include to first indentify the event type i.e., `stay` or `pass-by`, then identify the acticity type , i.e., `home`, `work` or `other`. 

In [167]:
def locIdentify(interval,address):
    t1= time.time()
    seeds = range(101,126)
    for seed in seeds:
        with open('{a}trainingTrip_seed{s}.pickle'.format(a=address,s=seed), 'rb') as handle:
            trip = pickle.load(handle)
        with open('{a}trainingActivity_seed{s}.pickle'.format(a=address,s=seed), 'rb') as handle:
            activity = pickle.load(handle)
        with open('{a}trainingHome_seed{s}.pickle'.format(a=address,s=seed), 'rb') as handle:
            home = pickle.load(handle)
        with open('{a}trainingWork_seed{s}.pickle'.format(a=address,s=seed), 'rb') as handle:
            work = pickle.load(handle)
        with open('{a}trainingOther_seed{s}.pickle'.format(a=address,s=seed), 'rb') as handle:
            other = pickle.load(handle)
        prior_activity = len(activity)/(len(activity) + len(trip))
        prior_trip = len(trip)//(len(activity) + len(trip))
        prior_home = len(home)/(len(home) + len(work) + len(other))
        prior_work = len(work)/(len(home) + len(work) + len(other))
        prior_other = len(other)/(len(home) + len(work) + len(other))
        with open('{a}clusterData_{b}sec.pickle'.format(a=address,b=interval), 'rb') as handle:
            clusterData = pickle.load(handle)
        identified_clusterData = {}
        for i in clusterData.keys():
            d = np.array(clusterData[i].duration)
            st = np.array(clusterData[i].start.dt.total_seconds())
            zone = clusterData[i].mzr_id
            ps = prior_activity* gaussian_kde(np.array(pd.to_timedelta(activity.duration).dt.total_seconds())).pdf(
                d)* gaussian_kde(np.array(pd.to_timedelta(activity.start).dt.total_seconds())).pdf(st)
            pp = prior_activity* gaussian_kde(np.array(pd.to_timedelta(trip.duration).dt.total_seconds())).pdf(
                d)* gaussian_kde(np.array(pd.to_timedelta(trip.start).dt.total_seconds())).pdf(st)
            logprob_stay = ps/(ps+pp)
            identified_clusterData[i] = pd.DataFrame({"mzr_id":np.array(zone)[np.where(logprob_stay>=0.5)],
                                         'start':np.array(st[np.where(logprob_stay>=0.5)]),
                                         'duration': np.array(d[np.where(logprob_stay >= 0.5)]),
                                         "x":np.array(clusterData[i].x)[np.where(logprob_stay>=0.5)],
                                         "y":np.array(clusterData[i].y)[np.where(logprob_stay>=0.5)],
                                         "id":np.array(clusterData[i].id)[np.where(logprob_stay>=0.5)],
                                         "stayProb":logprob_stay[logprob_stay>=0.5]})
            ph = prior_home * gaussian_kde(np.array(pd.to_timedelta(home.duration).dt.total_seconds())).pdf(
                identified_clusterData[i].duration) * gaussian_kde(np.array((pd.to_timedelta(
                home.start).dt.total_seconds()))).pdf(identified_clusterData[i].start)
            pw = prior_work * gaussian_kde(np.array(pd.to_timedelta(work.duration).dt.total_seconds())).pdf(
                identified_clusterData[i].duration) * gaussian_kde(np.array(pd.to_timedelta(
                work.start).dt.total_seconds())).pdf(
                identified_clusterData[i].start)
            po = prior_other * gaussian_kde(np.array(pd.to_timedelta(other.duration).dt.total_seconds())).pdf(
                identified_clusterData[i].duration) * gaussian_kde(np.array(
                pd.to_timedelta(other.start).dt.total_seconds())).pdf(
                identified_clusterData[i].start)
            identified_clusterData[i]["homeProb"] = ph/(ph+pw+po)
            identified_clusterData[i]["workProb"] = pw / (ph + pw + po)
            identified_clusterData[i]["otherProb"] = po / (ph + pw + po)
            identified_clusterData[i]["activity"] = np.argmax(np.vstack((identified_clusterData[i].homeProb,
                                                                         identified_clusterData[i].workProb,
                                                                         identified_clusterData[i].otherProb)),axis=0)
        print("seed:  ",seed, ",  interval:   ",interval, ",time:   ",(time.time()-t1)//60, " min")
        with open('{a}clusterData_identified_{b}sec_seed{ss}.pickle'.format(a=address,b=interval,ss=seed),
                  'wb') as handle:
            pickle.dump(identified_clusterData, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open('{a}clusterData_identifiedStayOnly_{b}sec_seed{ss}.pickle'.format(a=address,b=interval,ss=seed),
                  'wb') as handle:
            pickle.dump(clusterData, handle, protocol=pickle.HIGHEST_PROTOCOL)
        

In [168]:
intervals = [30,60,300,600,900,1200,1500,1800,2100,2400,2700,3000,3300,3600,4500,5400,6300,7200]
t2 = time.time()
results = Parallel(n_jobs=6)(delayed(locIdentify)(interval,savingLoc) for interval in intervals)
print(time.time()-t2) 

43135.713884830475


In [162]:
t1= time.time()
address = savingLoc
interval=900
seeds = range(101,102)
for seed in seeds:
    with open('{a}trainingTrip_seed{s}.pickle'.format(a=address,s=seed), 'rb') as handle:
        trip = pickle.load(handle)
    with open('{a}trainingActivity_seed{s}.pickle'.format(a=address,s=seed), 'rb') as handle:
        activity = pickle.load(handle)
    with open('{a}trainingHome_seed{s}.pickle'.format(a=address,s=seed), 'rb') as handle:
        home = pickle.load(handle)
    with open('{a}trainingWork_seed{s}.pickle'.format(a=address,s=seed), 'rb') as handle:
        work = pickle.load(handle)
    with open('{a}trainingOther_seed{s}.pickle'.format(a=address,s=seed), 'rb') as handle:
        other = pickle.load(handle)
    prior_activity = len(activity)/(len(activity) + len(trip))
    prior_trip = len(trip)//(len(activity) + len(trip))
    prior_home = len(home)/(len(home) + len(work) + len(other))
    prior_work = len(work)/(len(home) + len(work) + len(other))
    prior_other = len(other)/(len(home) + len(work) + len(other))
    with open('{a}clusterData_{b}sec.pickle'.format(a=address,b=interval), 'rb') as handle:
        clusterData2 = pickle.load(handle)
    
    clusterData = dict((kk, clusterData2[kk]) for kk in IDD)


#         pd.to_timedelta(activity.duration).dt.total_seconds()
    identified_clusterData = {}
    for i in clusterData.keys():
        d = np.array(clusterData[i].duration)
        st = np.array(clusterData[i].start.dt.total_seconds())
        zone = clusterData[i].mzr_id
        ps = prior_activity* gaussian_kde(np.array(pd.to_timedelta(activity.duration).dt.total_seconds())).pdf(
            d)* gaussian_kde(np.array(pd.to_timedelta(activity.start).dt.total_seconds())).pdf(st)
        pp = prior_activity* gaussian_kde(np.array(pd.to_timedelta(trip.duration).dt.total_seconds())).pdf(
            d)* gaussian_kde(np.array(pd.to_timedelta(trip.start).dt.total_seconds())).pdf(st)
        logprob_stay = ps/(ps+pp)
        identified_clusterData[i] = pd.DataFrame({"mzr_id":np.array(zone)[np.where(logprob_stay>=0.5)],
                                     'start':np.array(st[np.where(logprob_stay>=0.5)]),
                                     'duration': np.array(d[np.where(logprob_stay >= 0.5)]),
                                     "x":np.array(clusterData[i].x)[np.where(logprob_stay>=0.5)],
                                     "y":np.array(clusterData[i].y)[np.where(logprob_stay>=0.5)],
                                     "id":np.array(clusterData[i].id)[np.where(logprob_stay>=0.5)],
                                     "stayProb":logprob_stay[logprob_stay>=0.5]})
        ph = prior_home * gaussian_kde(np.array(pd.to_timedelta(home.duration).dt.total_seconds())).pdf(
            identified_clusterData[i].duration) * gaussian_kde(np.array((pd.to_timedelta(
            home.start).dt.total_seconds()))).pdf(identified_clusterData[i].start)
        pw = prior_work * gaussian_kde(np.array(pd.to_timedelta(work.duration).dt.total_seconds())).pdf(
            identified_clusterData[i].duration) * gaussian_kde(np.array(pd.to_timedelta(
            work.start).dt.total_seconds())).pdf(
            identified_clusterData[i].start)
        po = prior_other * gaussian_kde(np.array(pd.to_timedelta(other.duration).dt.total_seconds())).pdf(
            identified_clusterData[i].duration) * gaussian_kde(np.array(
            pd.to_timedelta(other.start).dt.total_seconds())).pdf(
            identified_clusterData[i].start)
        identified_clusterData[i]["homeProb"] = ph/(ph+pw+po)
        identified_clusterData[i]["workProb"] = pw / (ph + pw + po)
        identified_clusterData[i]["otherProb"] = po / (ph + pw + po)
        identified_clusterData[i]["activity"] = np.argmax(np.vstack((identified_clusterData[i].homeProb,
                                                                     identified_clusterData[i].workProb,
                                                                     identified_clusterData[i].otherProb)),axis=0)
print(time.time()-t1)

6.127824068069458
