# Cutting Trajectories

In [1]:
import os
# The notebook reads relative paths (e.g. "outputs/user_gps/") and imports the
# local `src` package, so it must run from the project root.
# The root can be overridden with the MDC_ANALYSIS_ROOT environment variable;
# when unset it falls back to the original checkout location.
project_root = os.environ.get("MDC_ANALYSIS_ROOT", "/home/tales/dev/master/mdc_analysis/")
os.chdir(project_root)
print("working dir", os.getcwd())

working dir /home/tales/dev/master/mdc_analysis


In [2]:
import pandas as pd
import numpy as np
import math
import copy

from bokeh.models import PrintfTickFormatter
from bokeh.io import output_notebook, show

from src.dao import csv_dao, objects_dao
from src.plot.basic_plot import plot_result
from src.similarity.extreme_travelers import sequence_report
from src.data_processment.input_data_version2 import InputDataManager
from src.utils.time_utils import human_time

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
def _format_two_decimals(value):
    """Render a number with exactly two decimal places."""
    return '%.2f' % value


# Display floats in pandas output with two decimals.
pd.set_option('display.float_format', _format_two_decimals)

In [7]:
# Configure Bokeh (imported from bokeh.io above) to render its output inside the notebook.
output_notebook()

## Loading User Data

In [None]:
# users_srg = objects_dao.load_all_stop_region_group_object()
# users_tags = objects_dao.load_users_tags_sequence(sr_stay_time_above_h=0.5)["orignal"]

In [8]:
# Every entry in outputs/user_gps/ contributes one user id: the text before the
# first underscore of the entry name.
users_gps_dir = os.listdir("outputs/user_gps/")
user_ids = [entry_name.split("_")[0] for entry_name in users_gps_dir]

# Map user id -> that user's GPS data as returned by csv_dao.load_user_gps_csv.
users_gps = {user: csv_dao.load_user_gps_csv(user) for user in user_ids}

## Gaps

How many gaps above 06h ? <br> 
How many gaps above 12h ? <br> 
How many gaps above 24h ? <br>
How many gaps above 48h ? <br>
For how long has each user participated?

In [None]:
def gap_missing_values(report):
    """Compute the time difference between consecutive ``local_time`` samples.

    Parameters
    ----------
    report : pandas.DataFrame
        Must contain a ``local_time`` column whose values can be cast to float.
        Rows are used in their existing order (no sorting is done here).

    Returns
    -------
    pandas.Series
        One entry per row of ``report``. The first entry is ``None`` (there is
        no previous sample) and is indexed by the first ``local_time`` as a
        float. Every following entry is ``local_time[i] - local_time[i - 1]``
        and is indexed by ``local_time[i]``. An empty report yields an empty
        Series.
    """
    times = report["local_time"]

    # Nothing to compare: return an empty result instead of failing on iloc[0].
    if len(times) == 0:
        return pd.Series([], dtype=object)

    seconds = times.astype(float).reset_index(drop=True)

    # Placeholder row for the first sample, which has no preceding sample.
    head = pd.Series([None], index=[seconds.iloc[0].item()])
    if len(seconds) == 1:
        return head

    # Align "current" and "previous" samples on a fresh 0..n-2 index before subtracting.
    current = seconds.iloc[1:].reset_index(drop=True)
    previous = seconds.iloc[:-1].reset_index(drop=True)
    gaps = current - previous
    gaps.index = times.iloc[1:]

    # Series.append was removed in pandas 2.0; pd.concat is the equivalent.
    return pd.concat([head, gaps])

def gap_amount(gap, min_time_h):
    """Split the total gap time into "missing" and "present" parts.

    A gap counts as missing data when it is strictly longer than
    ``min_time_h * 3600`` (the threshold is compared directly against the gap
    values); otherwise it counts as present data.

    Returns a dict with the summed gap values under ``"missing_data_sum"`` and
    ``"present_data_sum"``.
    """
    threshold = 60 * 60 * min_time_h
    is_missing = gap > threshold
    is_present = gap <= threshold

    return {
        "missing_data_sum": gap[is_missing].sum(),
        "present_data_sum": gap[is_present].sum(),
    }

def user_time_participation(report):
    """Describe the time span covered by a user's ``local_time`` values.

    The span is the difference between the largest and smallest ``local_time``,
    divided by 86400 to express it in days (this assumes the column holds
    seconds — TODO confirm) and by 365.25 more to express it in years.

    Returns a dict with ``n_days``, ``n_years``, ``datetime_start`` and
    ``datetime_end``; the two datetime entries are the ``"datetime"`` field of
    whatever the project helper ``human_time`` returns for the first and last
    timestamp.
    """
    datetime_format = '%d-%m-%Y %H:%M:%S'
    local_times = report["local_time"]
    first, last = local_times.min(), local_times.max()

    n_days = (last - first) / (60*60*24)

    start_label = human_time(first, datetime_format=datetime_format)["datetime"]
    end_label = human_time(last, datetime_format=datetime_format)["datetime"]

    return {"n_days": n_days,
            "n_years": n_days / 365.25,
            "datetime_start": start_label,
            "datetime_end": end_label}
    
def gaps_summary(gaps, min_time_h):
    """Summarise the gaps strictly longer than ``min_time_h * 3600``.

    Returns a dict with the number of such gaps (``n_gaps``), their median
    (``median_sec``) and the threshold that was used (``min_time_gap``, in the
    same unit as ``min_time_h``).
    """
    threshold = 60 * 60 * min_time_h
    long_gaps = gaps[gaps > threshold]

    summary = {}
    summary["n_gaps"] = len(long_gaps)
    summary["median_sec"] = long_gaps.median()
    summary["min_time_gap"] = min_time_h
    return summary

def merge_dicts(dicts):
    """Merge an iterable of dicts into one new dict.

    Dicts are applied in order, so when a key appears more than once the value
    from the last dict containing it wins. The inputs are not modified.
    """
    merged = {}
    for mapping in dicts:
        merged.update(mapping)
    return merged

def gaps_week(gaps, user_data, min_time_h):
    """Return the gaps longer than ``min_time_h * 3600`` with their week number.

    Each gap is paired positionally with ``user_data["local_time"]`` minus its
    last value, so ``gaps`` must hold exactly ``len(user_data) - 1`` entries.
    The ``week`` column is the number of whole weeks (604800 units of
    ``local_time``) elapsed since the earliest paired ``local_time``.

    Returns a DataFrame with columns ``gaps``, ``local_time`` and ``week``.
    """
    # All timestamps except the last one.
    paired_times = user_data["local_time"][0:-1].tolist()
    gaps_df = pd.DataFrame({"gaps": gaps.tolist(), "local_time": paired_times})

    elapsed = gaps_df["local_time"] - gaps_df["local_time"].min()
    gaps_df["week"] = (elapsed / (60*60*24*7)).apply(math.floor)

    is_long_gap = gaps_df["gaps"] > 60*60*min_time_h
    return gaps_df[is_long_gap]

In [None]:
# Gap thresholds (hours) for which each user gets one summary row.
gap_thresholds_h = [6, 12, 18, 24, 30, 36, 42, 48]

# Column order of the final summary table.
summary_columns = ["user_id", "min_time_gap", "n_gaps", "median_sec", "missing_data_sum", "present_data_sum", "datetime_end", "datetime_start", "n_years", "n_days"]

summary_rows = []

for user_id, user_gps in users_gps.items():
    # Users without any GPS sample have no gaps to measure.
    if len(user_gps) == 0:
        continue

    gaps = gap_missing_values(user_gps)
    user_data = {"user_id": user_id}
    user_time = user_time_participation(user_gps)

    # One row per (user, threshold) combining identity, time span and gap statistics.
    for hour in gap_thresholds_h:
        user_gaps_report = gaps_summary(gaps, hour)
        gap_time_sum = gap_amount(gaps, hour)
        summary_rows.append(merge_dicts([user_data, user_time, user_gaps_report, gap_time_sum]))

users_gaps_summary = pd.DataFrame(summary_rows)[summary_columns]

### Counting number of gaps 
Gaps greater than 6, 12, 18, 24, 30, 36, 42 and 48 hours

In [None]:
# Median number of gaps across users, for each minimum-gap threshold.
median_n_gaps = users_gaps_summary.groupby("min_time_gap")["n_gaps"].median()
median_n_gaps.plot.bar(title="Gaps frequency median by minimum time")

### Counting frequency of gaps per day 
Gaps greater than 6, 12, 18, 24, 30, 36, 42 and 48 hours

In [None]:
# Normalise the gap count by the number of days each user participated.
n_gaps = users_gaps_summary["n_gaps"]
n_days = users_gaps_summary["n_days"]
users_gaps_summary["gaps_per_day"] = n_gaps / n_days

# Median gaps-per-day across users, for each minimum-gap threshold.
median_gaps_per_day = users_gaps_summary.groupby("min_time_gap")["gaps_per_day"].median()
median_gaps_per_day.plot.bar(title="Gaps per day median for each min_time gap")

### What proportion of the total time is taken up by gaps?
Gaps greater than 6, 12, 18, 24, 30, 36, 42 and 48 hours

In [None]:
# NOTE: despite its name, "valid_amount_h" holds a fraction, not hours:
# missing_data_sum / (missing_data_sum + present_data_sum).
missing_time = users_gaps_summary["missing_data_sum"]
total_time = missing_time + users_gaps_summary["present_data_sum"]
users_gaps_summary["valid_amount_h"] = (missing_time / total_time).astype(float)

In [None]:
# Median of the "valid_amount_h" column across users, for each minimum-gap threshold.
median_missing_share = users_gaps_summary.groupby("min_time_gap")["valid_amount_h"].median()
median_missing_share.plot.bar(title="Proportion of missing data among all data")

# Cutting Trajectories

In [None]:
# user_tags_sequence = objects_dao.load_users_tags_sequence()

In [9]:
def cut_traj_in_trips(user_gps_data, gap_tresh_minutes):
    """Find the gaps in a GPS trace that are long enough to split it.

    Parameters
    ----------
    user_gps_data : pandas.DataFrame
        GPS samples of one user; passed unchanged to ``gap_missing_values``.
    gap_tresh_minutes : number
        Minimum gap length in minutes. It is multiplied by 60 before being
        compared with the gap values, and only strictly longer gaps are kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``gap_time`` (length of the gap), ``start`` (timestamp of the
        sample before the gap, ``None`` for the first row) and ``stop``
        (timestamp of the sample after the gap).
    """
    # gap_missing_values yields an unnamed Series indexed by timestamp, so after
    # to_frame/reset_index the columns are called 0 and "index".
    gaps_data = (
        gap_missing_values(user_gps_data)
        .to_frame()
        .reset_index()
        .rename({0: "gap_time", "index": "stop"}, axis=1)
    )

    # A gap starts where the previous row stopped; the first row has no predecessor.
    previous_stops = gaps_data["stop"].iloc[:-1].tolist()
    gaps_data["start"] = [None] + previous_stops
    # pandas may store the leading None as NaN; normalise it back to None.
    # (np.nan is used because the np.NaN alias was removed in NumPy 2.0.)
    gaps_data["start"] = gaps_data["start"].replace({np.nan: None})

    selected_gaps = gaps_data[gaps_data["gap_time"] > gap_tresh_minutes * 60]

    return selected_gaps[["gap_time", "start", "stop"]]
    
# Example: gaps longer than 30 minutes in the GPS trace of user "5937".
example_user_gps = users_gps["5937"]
gaps_times = cut_traj_in_trips(example_user_gps, 30)
gaps_times.head(6)

NameError: name 'gap_missing_values' is not defined

# Applying Trajectories Cut To InputDataManager

In [None]:
# srs = objects_dao.load_all_stop_region_group_object()

In [None]:
# tags = objects_dao.load_users_tags_sequence()

In [None]:
# srs["5954"].sequence_report(enrich_columns=True)[["start_time", "end_time", "stay_time_h"]]

In [None]:
# tags["original"]["5954"]

In [3]:
# Build the project's InputDataManager with use_cache=False
# (presumably forcing the data to be reloaded instead of read from a cache — TODO confirm).
data_manager = InputDataManager(use_cache=False)

Loading Users Sequence Report
Loading user_id: 6015 - 1 out of 163
Loading user_id: 6086 - 2 out of 163
Loading user_id: 6014 - 3 out of 163
Loading user_id: 6057 - 4 out of 163
Loading user_id: 6181 - 5 out of 163
Loading user_id: 6069 - 6 out of 163
Loading user_id: 5985 - 7 out of 163
Loading user_id: 5970 - 8 out of 163
Loading user_id: 6062 - 9 out of 163
Loading user_id: 6042 - 10 out of 163
Loading user_id: 5966 - 11 out of 163
Loading user_id: 6067 - 12 out of 163
Loading user_id: 5949 - 13 out of 163
Loading user_id: 5938 - 14 out of 163
Loading user_id: 5937 - 15 out of 163
Loading user_id: 5980 - 16 out of 163
Loading user_id: 6075 - 17 out of 163
Loading user_id: 6028 - 18 out of 163
Loading user_id: 5968 - 19 out of 163
Loading user_id: 6036 - 20 out of 163
Loading user_id: 6023 - 21 out of 163
Loading user_id: 6078 - 22 out of 163
Loading user_id: 6031 - 23 out of 163
Loading user_id: 5927 - 24 out of 163
Loading user_id: 5942 - 25 out of 163
Loading user_id: 6030 - 26 ou

In [4]:
# Stop-region minimum stay times (minutes) to load for every available version.
sr_min_times = [5, 10, 15, 20, 25, 30]

# input_data[version][sr_min_time] -> result of data_manager.get_input_data(...)
input_data = {}

for version in data_manager.avaliable_versions():
    print("version:", version)
    version_data = input_data[version] = {}

    for sr_min_time in sr_min_times:
        print("sr_min_time:", sr_min_time)
        version_data[sr_min_time] = data_manager.get_input_data(version=version, sr_stay_time_minutes=sr_min_time)


version: markov-0.0
sr_min_time: 5
sr_min_time: 10
sr_min_time: 15
sr_min_time: 20
sr_min_time: 25
sr_min_time: 30
version: 0.0.categ_v1
sr_min_time: 5
sr_min_time: 10
sr_min_time: 15
sr_min_time: 20
sr_min_time: 25
sr_min_time: 30
version: 0.1.categ_v1
sr_min_time: 5
sr_min_time: 10
sr_min_time: 15
sr_min_time: 20
sr_min_time: 25
sr_min_time: 30


In [13]:
# Show the ids of all users whose GPS data was loaded into users_gps.
users_gps.keys()

dict_keys(['5957', '6027', '6029', '5982', '6005', '6014', '5963', '5927', '6178', '6056', '6001', '6051', '5956', '6031', '6016', '5973', '5922', '6037', '5992', '6045', '5990', '6082', '5945', '5448', '5955', '5951', '5950', '5952', '6187', '6024', '5964', '6015', '5970', '6219', '6093', '6061', '6198', '5451', '5921', '6109', '5542', '5943', '6174', '5949', '6190', '5477', '5480', '5959', '5989', '5988', '6167', '6199', '6003', '5954', '6058', '6043', '5518', '6090', '5936', '5972', '6034', '6106', '6179', '5447', '6060', '6181', '6182', '5986', '6072', '6175', '6076', '6041', '5449', '6069', '5484', '6170', '6086', '5958', '6064', '6189', '5979', '6063', '5987', '6084', '6053', '5968', '6166', '6103', '6040', '6030', '6192', '6085', '6194', '6091', '6062', '5479', '5940', '6073', '5942', '6007', '5941', '6171', '6087', '6221', '6026', '6033', '5966', '6183', '5993', '6038', '6002', '5985', '6067', '6042', '6025', '5582', '5946', '6017', '5462', '5978', '6068', '6096', '6169', '5944

In [15]:
# For user "5960", print the size of the input data for each version / minimum stay time.
for version, version_data in input_data.items():
    for sr_min_time, users_input in version_data.items():
        print(version, sr_min_time, len(users_input["5960"]))

markov-0.0 5 138
markov-0.0 10 109
markov-0.0 15 104
markov-0.0 20 99
markov-0.0 25 96
markov-0.0 30 92
0.0.categ_v1 5 19
0.0.categ_v1 10 19
0.0.categ_v1 15 19
0.0.categ_v1 20 19
0.0.categ_v1 25 19
0.0.categ_v1 30 19
0.1.categ_v1 5 1
0.1.categ_v1 10 1
0.1.categ_v1 15 1
0.1.categ_v1 20 1
0.1.categ_v1 25 1
0.1.categ_v1 30 1


In [None]:
# NOTE(review): `sr_min_time` is not set in this cell; it is the leftover loop
# variable from the earlier cells (30 if they were run in order). Consider
# using an explicit value so the cell does not depend on hidden kernel state.
input_data["0.0.categ_v1"][sr_min_time]["5960"]