In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup #to beautify xml output
from lxml import etree as ET #read and write xml files
pd.options.display.max_columns=50

import time

In [2]:
# Read the trips file from SF-CHAMP output
trip=pd.read_table('../SF-CHAMP Outputs/_trip_2.dat',sep='\t',header='infer')

mzone = pd.read_csv('../SF-CHAMP Outputs/mzone_short.csv') #MZONE coordinates in EPSG 26910 projection
mzone.drop(columns=['Unnamed: 0'],inplace=True)
tour=pd.read_table('../SF-CHAMP Outputs/_tour_2.dat',sep='\t',header='infer')

households=pd.read_table('../SF-CHAMP Outputs/_household_2.dat',sep='\t',header='infer')
households=households[['hhno','hhvehs','hhwkrs','hhincome']]

person=pd.read_table('../SF-CHAMP Outputs/_person_2.dat',sep='\t',header='infer')
person=person[['hhno','pno','pagey','pgend']]



In [3]:
def county(x):
    if x<1191:
        return "San Francisco"
    elif x<1347:
        return "San Mateo"
    elif x<1715:
        return "Santa Clara"
    elif x<2040:
        return "Almeda"
    elif x<2211:
        return "Contra Costa"
    elif x<2291:
        return "Solano"
    elif x<2318:
        return "Napa"
    elif x<2404:
        return 'Sonoma'
    else:
        return "Marin"
    

def trip_type_location(df):
    if df['otaz']<1000 and df['dtaz']<1000:
        return 'int-int'
    elif df['otaz']<1000 and df['dtaz']>1000:
        return 'int-ext'
    elif df['otaz']>1000 and df['dtaz']>1000:
        return 'ext-ext'
    else:
        return 'ext-int'
    
def person_id(df):
    
    if df['trip_type']=='int-int':
        return str(df['hhno'])+'-'+str(df['pno'])
    else:
        return 'ix-'+str(df['hhno'])+'-'+str(df['pno'])
    
def convert_time(x): 
    #ARRTM, DEPTM, ENDTM etc is in minutes after 3 AM
    return time.strftime("%H:%M:%S", time.gmtime(x*60+180))

def trip_origin_purpose(df):
    #(0=home, 1=work, 2=school, 3=escort, 4=personal business (& medical), 
    #5=shopping, 6=meal, 7=social (& recreation), 8=recreation (H version only) 
    #9=medical (H version only), 10=change mode at a park and ride lot
    
    age=df['pagey']
    x=df['opurp']
    
    if x==0:
        return 'home'
    elif x==1:
        return 'work'
    elif x==2 and age<18:
        return 'school'
    elif x==2 and age>18:
        return 'univ'
    elif x==3:
        return 'escort'
    elif x==4:
        return 'othmaint'
    elif x==5:
        return 'shopping'
    elif x==6:
        return "eatout"
    elif x in [7,8]:
        return 'social'
    else:
        return 'othdiscr'
    
def trip_destination_purpose(df):
    #(0=home, 1=work, 2=school, 3=escort, 4=personal business (& medical), 
    #5=shopping, 6=meal, 7=social (& recreation), 8=recreation (H version only) 
    #9=medical (H version only), 10=change mode at a park and ride lot
    
    age=df['pagey']
    x=df['dpurp']
    
    if x==0:
        return 'home'
    elif x==1:
        return 'work'
    elif x==2 and age<18:
        return 'school'
    elif x==2 and age>18:
        return 'univ'
    elif x==3:
        return 'escort'
    elif x==4:
        return 'othmaint'
    elif x==5:
        return 'shopping'
    elif x==6:
        return "eatout"
    elif x in [7,8]:
        return 'social'
    else:
        return 'othdiscr'
    
def tour_purpose(df):
    #(0=home, 1=work, 2=school, 3=escort, 4=personal business (& medical), 
    #5=shopping, 6=meal, 7=social (& recreation), 8=recreation (H version only) 
    #9=medical (H version only), 10=change mode at a park and ride lot
    
    age=df['pagey']
    x=df['pdpurp']
    
    if x==0:
        return 'home'
    elif x==1:
        return 'work'
    elif x==2 and age<18:
        return 'school'
    elif x==2 and age>18:
        return 'univ'
    elif x==3:
        return 'escort'
    elif x==4:
        return 'othmaint'
    elif x==5:
        return 'shopping'
    elif x==6:
        return "eatout"
    elif x in [7,8]:
        return 'social'
    else:
        return 'othdiscr'
    
    
def mode_type(df):
    #Trip main mode type (1=walk, 2=bike, 3=sov, 4=hov 2, 5=hov 3+, 
    #6=walk to transit, 7=park and ride, 8=school bus, 9=TNC, 10=other – survey only)
    if df['mode']==1:
        return 'walk'
    elif df['mode']==2:
        return 'bike'
    elif df['mode']==3:
        return 'car'
    elif df['mode']==4:
        if df['dorp']==1:
            return 'hov2'
        else:
            return 'hov2_teleportation'
    elif df['mode']==5:
        if df['dorp']==1:
            return 'hov3'
        else:
            return 'hov3_teleportation'
#     elif df['mode'] in [4,5]:
#         #return 'car'
#         if df['dorp']==1:
#             return 'car'
#         else:
#             return 'teleportation'
    elif df['mode']==6:
        return 'walk_transit'
    elif df['mode']==7:
        return 'drive_transit'
    elif df['mode']==9:
        return 'ride_hail'
    else:
        return 'other'

    
def exclude_non_car(df):
    if df['trip_type']!='int-int':
        if df['mode']=='car':
            return 1
        else:
            return 0
    else:
        return 1
    
def hh_veh(df):
    if df['hhvehs']==0:
        return 'no_auto'
    elif df['hhvehs']<=df['hhwkrs']:
        return 'auto_deficient'
    else:
        return 'auto_sufficient'
    
    
def data_cleanup(trip,households,person,mzone,tour):
    
    trip['Orig_County']=trip['otaz'].apply(county)
    trip['Dest_County']=trip['dtaz'].apply(county)
    
    sf_orig=trip[(trip['Orig_County']=='San Francisco')|(trip['Dest_County']=='San Francisco')]

    almeda=trip[(trip['Orig_County']=='Almedao')&(trip['Dest_County'].isin(['Marin','San Mateo']))]

    contra=trip[(trip['Orig_County']=='Contra Costa')&(trip['Dest_County'].isin(['San Mateo']))]

    marin=trip[(trip['Orig_County']=='Marin')&(trip['Dest_County'].isin(['Almeda','San Mateo','Santa Clara']))]

    napa=trip[(trip['Orig_County']=='Napa')&(trip['Dest_County'].isin(['San Mateo']))]

    sonoma=trip[(trip['Orig_County']=='Sonoma')&(trip['Dest_County'].isin(['San Mateo','Santa Clara']))]

    sanmateo=trip[(trip['Orig_County']=='San Mateo')&(trip['Dest_County'].isin(['Almeda','Contra Costa','Marin','Solano','Napa','Sonoma']))]

    santaclara=trip[(trip['Orig_County']=='Santa Clara')&(trip['Dest_County'].isin(['Sonoma','Marin']))]

    solano=trip[(trip['Orig_County']=='Solano')&(trip['Dest_County'].isin(['San Mateo']))]


    _trip=pd.concat([sf_orig,almeda,contra,marin,napa,sonoma,sanmateo,santaclara,solano],ignore_index=True)
    
    
    _trip_hh=pd.merge(_trip,households,on=['hhno'],how='left')
    trip_hh_person=pd.merge(_trip_hh,person,on=['hhno','pno'],how='left')
    
    
    trip_hh_person['trip_type']=trip_hh_person.apply(trip_type_location,axis=1)
    trip_hh_person['person_id']=trip_hh_person.apply(person_id,axis=1)

    trip_hh_person['trip_origin_purpose']=trip_hh_person.apply(trip_origin_purpose,axis=1)
    trip_hh_person['trip_dest_purpose']=trip_hh_person.apply(trip_destination_purpose,axis=1)


    trip_hh_person['departure_time']=trip_hh_person['deptm'].apply(convert_time)
    trip_hh_person['arrival_time']=trip_hh_person['arrtm'].apply(convert_time)
    trip_hh_person['end_activity_time']=trip_hh_person['endacttm'].apply(convert_time)
    
    trip_hh_person['mode']=trip_hh_person.apply(mode_type,axis=1)
    
    trip_hh_person['pgend']=np.where(trip_hh_person['pgend']==1,
                                'male','female')
    

    
    trip_hh_person=trip_hh_person[['hhno','pno','person_id','pgend','pagey', 'hhvehs','hhwkrs','hhincome','tour_id','trip_type', 'opcl', 'dpcl',
                 'mode', 'dorp', 'trip_origin_purpose','trip_dest_purpose',
                 'departure_time', 'arrival_time', 'end_activity_time','vot']]
    trip_hh_person['hhsize']=trip_hh_person.groupby('hhno')['pno'].transform('nunique')
    trip_hh_person.sort_values(['person_id','departure_time'],inplace=True)
    
    
    _plans_maz_orig=pd.merge(trip_hh_person,mzone,left_on=['opcl'],right_on=['MAZID'],how='left')

    plans_maz_all=pd.merge(_plans_maz_orig,mzone,left_on=['dpcl'],right_on=['MAZID'],how='left')

    plans_maz_all=plans_maz_all.rename(columns={'X_COORD_x':'X_ORIG',
                                                'Y_COORD_x':'Y_ORIG',
                                                'X_COORD_y':'X_DEST',
                                                'Y_COORD_y':'Y_DEST'})

    plans_maz_all.drop(columns=['MAZID_x','MAZID_y'],inplace=True)
    
    plans_maz_all=plans_maz_all.drop_duplicates()
    
    plans_maz_all['Exclude_Mode']=plans_maz_all.apply(exclude_non_car,axis=1)
    
    plans_maz_all=plans_maz_all[plans_maz_all['Exclude_Mode']==1]
    
    tour=tour[['pdpurp','id']]
    tour=tour.rename(columns={'id':'tour_id'})
    

    plans_df_attribs=pd.merge(plans_maz_all,tour,on=['tour_id'],how='left')
    plans_df_attribs['tour_purpose']=plans_df_attribs.apply(tour_purpose,axis=1)
    
    plans_df_attribs['vehicles']=plans_df_attribs.apply(hh_veh,axis=1)
    
    plans_df_attribs['hhincome']=plans_df_attribs['hhincome'].astype(str)
    plans_df_attribs['vot']=plans_df_attribs['vot']*60/100 #vot is in cents per minute, converting to dollars per hour
    plans_df_attribs['vot']=plans_df_attribs['vot'].astype(str)
    plans_df_attribs['hhsize']=plans_df_attribs['hhsize'].astype(str)
    
    plans_df_attribs['pagey']=plans_df_attribs['pagey'].astype(str)
    
    plans_df_attribs=plans_df_attribs.sort_values(by=['hhno','pno','departure_time'])
    
    return (plans_df_attribs)


In [4]:
%%time
plans_df=data_cleanup(trip,households,person,mzone,tour)

Wall time: 19min 40s


In [5]:
null_modes=pd.read_csv('../../../beam/output_analysis/Temp/null_modes.csv')

In [6]:
test_plans=plans_df[plans_df['person_id'].isin(null_modes.personId.unique())]

In [7]:
test_plans['hhsize']=test_plans['hhsize'].astype(float)
test_plans['hhsize']=test_plans['hhsize'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_plans['hhsize']=test_plans['hhsize'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_plans['hhsize']=test_plans['hhsize'].astype(str)


In [8]:
plans_df['hhsize']=plans_df['hhsize'].astype(float)
plans_df['hhsize']=plans_df['hhsize'].astype(str)

In [18]:
plans_attrib=plans_df[['hhno','person_id','pgend','pagey','hhincome','vehicles','vot','hhvehs']].drop_duplicates()
plans_attrib.head()

Unnamed: 0,hhno,person_id,pgend,pagey,hhincome,vehicles,vot,hhvehs
0,1,1-1,female,55,942,no_auto,1.412549409243468,0
1801025,3,3-1,male,47,8728,no_auto,2.447413071297918,0
1801027,3,3-1,male,47,8728,no_auto,4.273965789216084,0
2466334,4,4-1,female,71,5052,no_auto,1.122798580973298,0
2466336,4,4-1,female,71,5052,no_auto,1.220280580586952,0


In [19]:
test_plans.head()

Unnamed: 0,hhno,pno,person_id,pgend,pagey,hhvehs,hhwkrs,hhincome,tour_id,trip_type,opcl,dpcl,mode,dorp,trip_origin_purpose,trip_dest_purpose,departure_time,arrival_time,end_activity_time,vot,hhsize,X_ORIG,Y_ORIG,X_DEST,Y_DEST,Exclude_Mode,pdpurp,tour_purpose,vehicles,VOT
2764926,63,3,63-3,female,30,3,2,38016,1871,int-int,720,2295,hov3_teleportation,2,home,escort,07:52:00,07:59:00,08:01:00,11.22018667074774,8.0,551662.441744,4173739.0,548946.307247,4174812.0,1,3,escort,auto_sufficient,7.810582
2764927,63,3,63-3,female,30,3,2,38016,1871,int-int,2295,720,hov3,1,escort,home,08:01:00,08:08:00,09:23:00,11.22018667074774,8.0,548946.307247,4174812.0,551662.441744,4173739.0,1,3,escort,auto_sufficient,7.810582
2764928,63,3,63-3,female,30,3,2,38016,1872,int-int,720,3682,hov2_teleportation,2,home,escort,09:23:00,09:28:00,09:29:00,6.65919287466888,8.0,551662.441744,4173739.0,549880.858699,4174499.0,1,3,escort,auto_sufficient,7.810582
2764929,63,3,63-3,female,30,3,2,38016,1872,int-int,3682,720,hov3,1,escort,home,09:29:00,09:34:00,12:20:00,8.8434081375603,8.0,549880.858699,4174499.0,551662.441744,4173739.0,1,3,escort,auto_sufficient,7.810582
2764930,63,3,63-3,female,30,3,2,38016,1874,int-int,720,105,hov3_teleportation,2,home,shopping,12:20:00,12:40:00,12:45:00,8.22985598609322,8.0,551662.441744,4173739.0,553319.370625,4183077.0,1,5,shopping,auto_sufficient,7.810582


In [10]:
test_plans['vot']=test_plans['vot'].astype(float)


test_plans['VOT']=test_plans.groupby(['person_id'])['vot'].transform('mean')

test_plans['vot']=test_plans['vot'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_plans['vot']=test_plans['vot'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_plans['VOT']=test_plans.groupby(['person_id'])['vot'].transform('mean')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_plans['vot']=test_plans['vot'].astype(str)


In [9]:
plans_attrib.head()

Unnamed: 0,hhno,person_id,pgend,pagey,hhincome,vehicles,vot,VOT
0,1,1-1,female,55,942,no_auto,2.354249,2.354249
1,1,1-1,female,55,942,no_auto,2.354249,2.354249
1801025,3,3-1,male,47,8728,no_auto,4.079022,5.601149
1801026,3,3-1,male,47,8728,no_auto,4.079022,5.601149
1801027,3,3-1,male,47,8728,no_auto,7.123276,5.601149


In [11]:
test_plans[['hhno','person_id','pgend','pagey','hhincome','vehicles','VOT']].drop_duplicates(['person_id']).to_csv(
'../SF_CHAMP_Converted/agent_attribs.csv.gz',index=False,compression='gzip')

In [12]:
test_plans['hhno']=test_plans['hhno'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_plans['hhno']=test_plans['hhno'].astype(str)


In [13]:

    
plans_df['hhno']=plans_df['hhno'].astype(str)

In [14]:
def create_xml(plans_dataframe,size):
    
    population_tag = ET.Element('population')
    
    for i in range(0,len(plans_dataframe.person_id.unique())+1,size):
        
        j=i+size

        test_df=plans_dataframe[plans_dataframe['person_id'].isin(plans_dataframe.person_id.unique()[i:j])]

        #parser = ET.XMLParser(remove_blank_text=True)

        #template = ET.parse('../SF_CHAMP_Converted/sf-all-trips.xml',parser)
        
#         population_tag = ET.Element('population')

        for i in test_df['person_id'].unique():
            
            df=test_df[test_df['person_id']==i]

            person_id=str(i)
            

            person_tag = ET.SubElement(population_tag, 'person') 
            person_tag.set('id',person_id)
            
            attributes_tag = ET.SubElement(person_tag,'attributes')
            
            age_tag = ET.SubElement(attributes_tag, 'attribute')
            age_tag.set('name','age')
            age_tag.set('class','java.lang.Integer')
            age_tag.text=df.values[0,df.columns.get_loc('pagey')]

            sex_tag = ET.SubElement(attributes_tag, 'attribute')
            sex_tag.set('name','sex')
            sex_tag.set('class','java.lang.String')
            sex_tag.text=df.values[0,df.columns.get_loc('pgend')]

            hh_tag = ET.SubElement(attributes_tag, 'attribute')
            hh_tag.set('name','household_id')
            hh_tag.set('class','java.lang.String')
            hh_tag.text=df.values[0,df.columns.get_loc('hhno')]
            
            veh_tag = ET.SubElement(attributes_tag, 'attribute')
            veh_tag.set('name','autoWorkRatio')
            veh_tag.set('class','java.lang.String')
            veh_tag.text=df.values[0,df.columns.get_loc('vehicles')]
            
            income_tag = ET.SubElement(attributes_tag, 'attribute')
            income_tag.set('name','hh_income')
            income_tag.set('class','java.lang.Double')
            income_tag.text=df.values[0,df.columns.get_loc('hhincome')]
            
            hhsize_tag = ET.SubElement(attributes_tag, 'attribute')
            hhsize_tag.set('name','hhSize')
            hhsize_tag.set('class','java.lang.Double')
            hhsize_tag.text=df.values[0,df.columns.get_loc('hhsize')]
            
            vot_tag = ET.SubElement(attributes_tag, 'attribute')
            vot_tag.set('name','vot')
            vot_tag.set('class','java.lang.Double')
            vot_tag.text=df.values[0,df.columns.get_loc('vot')]
            

            plan_tag = ET.SubElement(person_tag, 'plan') 
            plan_tag.set('selected', 'yes')

            

            act_tag = ET.SubElement(plan_tag, 'activity') 
            act_tag.set('type',df.iloc[0,df.columns.get_loc('trip_origin_purpose')])
            act_tag.set('end_time',df.iloc[0,df.columns.get_loc('departure_time')])#
            act_tag.set('y',str(df.iloc[0,df.columns.get_loc('Y_ORIG')])) 
            act_tag.set('x',str(df.iloc[0,df.columns.get_loc('X_ORIG')]))
            
            
            act_attr_tag=ET.SubElement(act_tag, 'attributes')
            
            tour_purp_tag=ET.SubElement(act_attr_tag, 'attribute')
            tour_purp_tag.set('name','primary_purpose')
            tour_purp_tag.set('class','java.lang.String') 
            tour_purp_tag.text=df.values[0,df.columns.get_loc('tour_purpose')]
            
            vot_tag = ET.SubElement(act_attr_tag, 'attribute')
            vot_tag.set('name','vot')
            vot_tag.set('class','java.lang.Double')
            vot_tag.text=df.values[0,df.columns.get_loc('vot')]
            
#             leg_tag = ET.SubElement(plan_tag, 'leg')
#             leg_tag.set('mode',df.values[0,df.columns.get_loc('mode')])

            for x in range(0,len(df.values)):

                leg_tag = ET.SubElement(plan_tag, 'leg')
                leg_tag.set('mode',df.values[x,df.columns.get_loc('mode')])

                act_tag = ET.SubElement(plan_tag, 'activity')
                act_tag.set('type',df.values[x,df.columns.get_loc('trip_dest_purpose')])
                act_tag.set('end_time',df.values[x,df.columns.get_loc('end_activity_time')])
                act_tag.set('y',str(df.values[x,df.columns.get_loc('Y_DEST')]))
                act_tag.set('x',str(df.values[x,df.columns.get_loc('X_DEST')]))
                

                act_attr_tag=ET.SubElement(act_tag, 'attributes')
                
                tour_purp_tag=ET.SubElement(act_attr_tag, 'attribute')
                tour_purp_tag.set('name','primary_purpose')
                tour_purp_tag.set('class','java.lang.String') 
                tour_purp_tag.text=df.values[x,df.columns.get_loc('tour_purpose')]
                
                vot_tag = ET.SubElement(act_attr_tag, 'attribute')
                vot_tag.set('name','vot')
                vot_tag.set('class','java.lang.Double')
                vot_tag.text=df.values[x,df.columns.get_loc('vot')]
                
                

            population_tag.append(person_tag)


        tree=ET.ElementTree(population_tag)
        tree.write('../SF_CHAMP_Converted/sf-trips-null_modes.xml', pretty_print=True, xml_declaration=True,
                            doctype='<!DOCTYPE population SYSTEM "http://www.matsim.org/files/dtd/population_v6.dtd">',   
                   encoding="utf-8")

    


In [20]:
%%time
create_xml(test_plans,12000)

Wall time: 2min 7s


In [16]:
%%time
create_xml(plans_df,12000)

Wall time: 4h 19min 55s
