In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup #to beautify xml output
from lxml import etree as ET #read and write xml files
pd.options.display.max_columns=50

import os
from os import listdir
from os.path import isfile, join

import time


In [None]:
# Read the trips file from SF-CHAMP output
trip=pd.read_table('SF-CHAMP Outputs/_trip_2.dat',sep='\t',header='infer')

In [None]:
mzone = pd.read_csv('SF-CHAMP Outputs/mzone_short.csv') #MZONE coordinates in EPSG 26910 projection
mzone.drop(columns=['Unnamed: 0'],inplace=True)

In [None]:
trip=trip[(trip['otaz']<1000)|(trip['dtaz']<1000)]

In [None]:
len(trip)

In [None]:
#decoding the columns

def trip_purpose(x): 
    #(0=home, 1=work, 2=school, 3=escort, 4=personal business (& medical), 
    #5=shopping, 6=meal, 7=social (& recreation), 8=recreation (H version only) 
    #9=medical (H version only), 10=change mode at a park and ride lot
    #according to the sf-light scenario population.xml all the following purposes are valid
    #maybe needs activity-intercepts or activity-parameters
    if x==0:
        return 'home'
    elif x==1:
        return 'work'
    elif x==2:
        return 'school'
    elif x==3:
        return 'escort'
    elif x==4:
        return 'business'
    elif x==5:
        return 'shopping'
    elif x==6:
        return "meal"
    else:
        return 'other'
    
def convert_time(x): 
    #ARRTM, DEPTM, ENDTM etc is in minutes after 3 AM
    return time.strftime("%H:%M:%S", time.gmtime(x*60+180))

def mode_type(df):
    #Trip main mode type (1=walk, 2=bike, 3=sov, 4=hov 2, 5=hov 3+, 
    #6=walk to transit, 7=park and ride, 8=school bus, 9=TNC, 10=other – survey only)
    if df['mode']==1:
        return 'walk'
    elif df['mode']==2:
        return 'bike'
    elif df['mode']==3:
        return 'car'
    elif df['mode']==4:
        #return 'car'
        if df['dorp']==1:
            return 'car'
        else:
            return 'passenger'
    elif df['mode']==5:
        #return 'car'
        if df['dorp']==1:
            return 'car'
        else:
            return 'passenger'
    elif df['mode']==6:
        return 'walk_transit'
    elif df['mode']==7:
        return 'drive_transit'
    elif df['mode']==9:
        return 'ride_hail'
    else:
        return 'other'

def trip_type_location(df):
    if df['otaz']<1000 and df['dtaz']<1000:
        return 'int-int'
    elif df['otaz']<1000 and df['dtaz']>1000:
        return 'int-ext'
    elif df['otaz']>1000 and df['dtaz']>1000:
        return 'ext-ext'
    else:
        return 'ext-int'
    

def data_cleanup(trip_test,mzone):
    
    trip_test['hhno']=trip_test['hhno'].astype(str)
    trip_test['pno']=trip_test['pno'].astype(str)

    trip_test['person_id']=trip_test['hhno']+'-'+trip_test['pno']
    
    trip_test['trip_type']=trip_test.apply(trip_type_location,axis=1)
    
    plans=trip_test[[
    'person_id', #format is hh number-person number
    'trip_type',#i-x type
    'opurp',#The purpose at the trip origin (0=home, 1=work, 2=school, 3=escort, 4=personal business (& medical), 5=shopping, 6=meal, 7=social (& recreation), 8=recreation (H version only) 9=medical (H version only), 10=change mode at a park and ride lot
    'dpurp',#the purpose at trip destination
    #'oadtyp',#Trip origin address type (1=home, 2=usual work location, 3=usual school location, 4=other location in region, 5=out of region/missing (survey data only), 6=inserted change mode location for park and ride
    #'dadtyp',#trip dest. address type
    'opcl',#trip origin parcel
    'dpcl',#trip dest parcel
    #'otaz',#origin TAZ
    #'dtaz',#dest TAZ
    'mode',#Trip mode (1=walk, 2=bike, 3=sov, 4=hov 2, 5=hov 3+, 6=walk to transit, 7=park and ride, 8=school bus, 9=TNC, 10=other – survey only) 
    'dorp',#For auto trips, 1=driver, 2=passenger; for transit trips, is set to the total walk access+egress time, in integer minutes 
    'deptm',#The trip departure time, in minutes after midnight (or hours*100+minute for estimation mode)
    'arrtm',#trip arrival time
    'endacttm'#activity end time
    ]]

    plans['trip_origin_purpose']=plans['opurp'].apply(trip_purpose)
    plans['trip_dest_purpose']=plans['dpurp'].apply(trip_purpose)
    plans['departure_time']=plans['deptm'].apply(convert_time)
    plans['arrival_time']=plans['arrtm'].apply(convert_time)
    plans['end_activity_time']=plans['endacttm'].apply(convert_time)
    plans['mode']=plans.apply(mode_type,axis=1)
    
    plans=plans[['person_id', 'trip_type', 'opcl', 'dpcl',
                 'mode', 'dorp', 'trip_origin_purpose','trip_dest_purpose',
                 'departure_time', 'arrival_time', 'end_activity_time']]
    plans.sort_values(['person_id','departure_time'],inplace=True)
    
#     if trip_type=='all internal':
#         plans_df=plans[plans['trip_type']=='int-int']
#     elif trip_type=='all within':
#         plans_df=plans[plans['trip_type']!='ext-ext']
#     else:
#         plans_df=plans
    
    
    plans_maz_orig=pd.merge(plans,mzone,left_on=['opcl'],right_on=['MAZID'],how='left')

    plans_maz_all=pd.merge(plans_maz_orig,mzone,left_on=['dpcl'],right_on=['MAZID'],how='left')

    plans_maz_all=plans_maz_all.rename(columns={'X_COORD_x':'X_ORIG',
                                                'Y_COORD_x':'Y_ORIG',
                                                'X_COORD_y':'X_DEST',
                                                'Y_COORD_y':'Y_DEST'})

    plans_maz_all.drop(columns=['MAZID_x','MAZID_y'],inplace=True)
    
    plans_maz_all=plans_maz_all.drop_duplicates()
    
    trip_test=None
    mzone=None
    return (plans_maz_all)

In [8]:
%%time
plans_df=data_cleanup(trip,mzone)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  plans['trip_origin_purpose']=plans['opurp'].apply(trip_purpose)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  plans['trip_dest_purpose']=plans['dpurp'].apply(trip_purpose)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  plans['departure_time']=plans['deptm'].apply(convert_time)
A value is trying to

Wall time: 4min 15s


In [11]:
plans_df.trip_type.value_counts()

int-int    3106642
int-ext     605500
ext-int     605500
Name: trip_type, dtype: int64

In [14]:
plans_df[plans_df['person_id']=='5000-1']

Unnamed: 0,person_id,trip_type,opcl,dpcl,mode,dorp,trip_origin_purpose,trip_dest_purpose,departure_time,arrival_time,end_activity_time,X_ORIG,Y_ORIG,X_DEST,Y_DEST
3695239,5000-1,int-int,2131,2878,car,1,home,escort,06:41:00,06:44:00,07:13:00,546706.350363,4173686.0,545914.710889,4174559.0
3695240,5000-1,int-ext,2878,115788,car,1,escort,work,07:13:00,07:27:00,20:10:00,545914.710889,4174559.0,552195.869632,4173278.0
3695241,5000-1,ext-int,115788,2131,car,1,work,home,20:10:00,20:22:00,03:02:00,552195.869632,4173278.0,546706.350363,4173686.0


In [15]:
plans_df['mode'].value_counts()

car             1808326
walk_transit     849330
walk             720999
passenger        564835
ride_hail        245454
bike             121940
other              6758
Name: mode, dtype: int64

In [16]:
plans_df[plans_df['person_id']=='1-1']

Unnamed: 0,person_id,trip_type,opcl,dpcl,mode,dorp,trip_origin_purpose,trip_dest_purpose,departure_time,arrival_time,end_activity_time,X_ORIG,Y_ORIG,X_DEST,Y_DEST
0,1-1,int-int,720,3173,walk_transit,3,home,meal,11:10:00,12:01:00,13:11:00,551662.441744,4173739.0,552908.210961,4182886.0
1,1-1,int-int,3173,720,walk_transit,3,meal,home,13:11:00,14:03:00,03:02:00,552908.210961,4182886.0,551662.441744,4173739.0


In [1]:
from lxml import etree as ET
parser = ET.XMLParser(remove_blank_text=True)

template = ET.parse('SF_CHAMP_Converted/sf-within-int-trips-temp.xml',parser)
population_tag = template.getroot()

In [17]:
def create_xml(plans_dataframe,size):
    
    for i in range(0,len(plans_df.person_id.unique())+1,size):
        
        j=i+size

        test_df=plans_dataframe[plans_dataframe['person_id'].isin(plans_dataframe.person_id.unique()[i:j])]

        parser = ET.XMLParser(remove_blank_text=True)

        template = ET.parse('SF_CHAMP_Converted/sf-within-ix-trips.xml',parser)
        population_tag = template.getroot()

        for i in test_df['person_id'].unique():

            person_id=str(i)

            person_tag = ET.SubElement(population_tag, 'person') 
            person_tag.set('id',person_id)


            plan_tag = ET.SubElement(person_tag, 'plan') 
            plan_tag.set('selected', 'yes')

            df=test_df[test_df['person_id']==i]

            act_tag = ET.SubElement(plan_tag, 'activity') 
            act_tag.set('type',df.iloc[0,df.columns.get_loc('trip_origin_purpose')])
            act_tag.set('end_time',df.iloc[0,df.columns.get_loc('departure_time')])#
            act_tag.set('y',str(df.iloc[0,df.columns.get_loc('Y_ORIG')])) 
            act_tag.set('x',str(df.iloc[0,df.columns.get_loc('X_ORIG')]))

            for x in range(0,len(df.values)):

                leg_tag = ET.SubElement(plan_tag, 'leg') 
                leg_tag.set('mode',df.values[x,df.columns.get_loc('mode')])

                act_tag = ET.SubElement(plan_tag, 'activity')
                act_tag.set('type',df.values[x,df.columns.get_loc('trip_dest_purpose')])
                act_tag.set('end_time',df.values[x,df.columns.get_loc('end_activity_time')])
                act_tag.set('y',str(df.values[x,df.columns.get_loc('Y_DEST')]))
                act_tag.set('x',str(df.values[x,df.columns.get_loc('X_DEST')]))

            population_tag.append(person_tag)


        tree=ET.ElementTree(population_tag)
        tree.write('SF_CHAMP_Converted/sf-within-ix-trips.xml', pretty_print=True, xml_declaration=True,   encoding="utf-8")


In [18]:
%%time
create_xml(plans_df,10000)

Wall time: 5h 12min 33s
