# Data Cleaning and Preprocessing 

#### adding required packages

In [4]:
# import xml.etree.ElementTree as ET
import time
import pandas as pd
import numpy as np
import xmltodict

### keeping allowed users from the MATSim files

In this step we use the `1.experienced_plans.xml` and `snapShot.CSV` files from the MATSim output folder. We only want to keep the users who travel with the `car` mode. Furthermore, we remove users with zero-duration activities. The set of users we keep is the same in both of the mentioned files. After this data cleaning we end up with about 21,000 users.

#### reading `1.experienced_plans.xml` and converting it to a dataframe:

In [None]:
# we time the process
startTime = time.time()

# reading the MATSim output into a dict; the `with` block closes the file
# handle as soon as parsing has finished instead of leaving it open
with open("D:/ax/gis/input_base/1.experienced_plans.xml", "rb") as plans_file:
    tree = xmltodict.parse(plans_file)
# alternative input location:
# tree = xmltodict.parse(open("/data/zahraeftekhar/research_temporal/input_base/1.experienced_plans.xml","rb"))

# root holds the parsed <person> elements, each carrying that user's plan
# (xmltodict yields a list here when the file contains more than one person)
root = tree['population']['person']

#### keeping only users with the `car` mode and removing users with unacceptable travel diary:

Here, we extract the IDs of the users that need to be removed from our data set. This includes users with non-positive activity durations, users with fewer than three activities (who therefore show no travel), etc. Also, we only consider users whose mode is `car`.

In [None]:
def _as_list(node):
    """Return ``node`` unchanged if it is already a list, else wrap it.

    xmltodict represents a single child element as a dict and repeated child
    elements as a list of dicts; wrapping gives both cases the same shape so
    that ``len`` and iteration operate on elements rather than on dict keys.
    """
    return node if isinstance(node, list) else [node]


def _has_unusable_diary(person):
    """Return True when the travel diary of ``person`` must be discarded.

    ``person`` is one xmltodict-parsed ``<person>`` element. The checks run
    in this order and stop at the first one that fails:

    1. fewer than three activities (the user did not travel),
    2. any leg with a mode other than ``car``,
    3. first and last activity types differ (diary is not a round trip),
    4. any leg whose route type is ``generic``,
    5. any intermediate activity with a zero or negative duration.

    Raises:
        KeyError: if an expected element or attribute is missing; the caller
            treats this as "discard the user".
    """
    plan = person["plan"]
    activities = _as_list(plan["activity"])

    # remove IDs that did not travel
    if len(activities) < 3:
        return True

    legs = _as_list(plan["leg"])

    # remove IDs that used any mode other than `car`
    if not all(leg["@mode"] == "car" for leg in legs):
        return True

    # remove IDs whose first and last activity types differ;
    # this is done to be able to have a round travel diary
    if activities[0]["@type"] != activities[-1]["@type"]:
        return True

    # remove IDs with zero duration activities:
    # 1) removing `generic` legs because they usually lead to zero duration
    #    activities
    if not all(leg["route"]["@type"] != "generic" for leg in legs):
        return True

    # 2) remove the rest of IDs with zero or negative duration activities
    #    (first and last activity are skipped by the [1:-1] slice)
    return not all(
        pd.to_timedelta(activity["@end_time"]).total_seconds()
        - pd.to_timedelta(activity["@start_time"]).total_seconds() > 0
        for activity in activities[1:-1]
    )


# IDs of the users to drop; starts empty so it only ever contains integer IDs
RemoveIDs = []
for child in root:
    try:
        discard = _has_unusable_diary(child)
    except KeyError:
        # a missing element/attribute means the diary is incomplete: drop it
        discard = True
    if discard:
        RemoveIDs.append(int(child['@id']))