# Semantic location feature extraction

This notebook extracts communication features in the context of semantic locations. Some processing functions pulled from Sohrab's [data analysis repository](https://github.com/sosata/CS120DataAnalysis).

Possible semantic locations:

Home, Work, Another's Home, Arts & Entertainment (Theater, Music Venue, Etc.), Food (Restaurant, Cafe), Nightlife Spot (Bar, Club), Outdoors & Recreation, Gym or Other Exercise, Professional or Medical Office, Spiritual (Church, Temple, Etc.), Shop or Store, Travel or Transport (Airport, Bus Stop, Train Station, Etc.), Vehicle, Other (Not Listed)

In [34]:
import csv
import os
import numpy as np
from df_utils import *
from sys import exit
import shutil

import pandas as pd

In [6]:
#probes = ['act','app','aud','bat','cal','coe','fus','lgt','run','scr','tch','wif','wtr']

# communication-related features
probes = ['act', 'cal', 'coe', 'fus']
coe_cols =  ["timestamp", "contact_name", "contact_number", "comm_type", "comm_direction"]

loc_coe_df = pd.DataFrame()
data_dir = '../CS120/CS120-sensor-csvs/'
weather_data_dir = '/data/CS120Weather/'
out_dir = 'processed_data/'

subjects = os.listdir(data_dir)
#subjects = ['1114936'] # temporary for testing

for subj in subjects:
    filename = data_dir + subj + '/eml.csv'
    if os.path.exists(filename):
        print(filename)
        loc = []
        lat_report = []
        lng_report = []
        t_report = []
        with open(filename) as file_in:
            data = csv.reader(file_in, delimiter='\t')
            eml = []
            for data_row in data:
                if data_row:
                    # reading location category (state)
                    loc_string = data_row[6]
                    loc_string = loc_string[1:len(loc_string)-1]
                    loc_string.split(',')
                    loc.append(loc_string)
                    
                    # reading lat. and long.
                    lat_report.append(float(data_row[2]))
                    lng_report.append(float(data_row[3]))
                    t_report.append(float(data_row[0]))
                    
                    # adding to eml
                    eml.append(data_row)
                    
        file_in.close()
    else:
        print('skipping subject '+subj+' without location report/foursquare data.')
        continue
        
                      
    # looking into data between current and previous report
    filename = data_dir + subj + '/fus.csv'
    if os.path.exists(filename):
        with open(filename) as file_in:
            data_gps = csv.reader(file_in, delimiter='\t')
            t_gps = []
            lat_gps = []
            lng_gps = []
            for row_gps in data_gps:
                if row_gps:
                    t_gps.append(float(row_gps[0]))
                    lat_gps.append(float(row_gps[1]))
                    lng_gps.append(float(row_gps[2]))
        file_in.close()
    else:
        print('skipping subject '+subj+' without location data.')
        continue

    if os.path.exists(out_dir+subj):
       shutil.rmtree(out_dir+subj)
       os.makedirs(out_dir+subj)
    else:
       os.makedirs(out_dir+subj)
    
    t_prev = 0


    for (i,eml_row) in enumerate(eml):

        # finding t_start and t_end from gps data
        t_start, t_end = get_time_from_gps(data_dir+subj, t_report[i], t_prev, lat_report[i], lng_report[i])

        # if there is any clusters found, extract sensor data and put in a separate file
        if len(t_start)>0:
            data = get_data_at_location(data_dir+subj, t_start, t_end, 'coe')
            if len(data)>0:
                df = pd.DataFrame(data, columns=coe_cols)
                df['pid'] = subj
                df['location'] = preprocess_location(eml_row[6]) # location label
                loc_coe_df = loc_coe_df.append(df)
        else:
            print('instance '+str(i)+' skipped')
            
        # continue iteration
        if i<len(t_report)-1:
            if t_report[i]!=t_report[i+1]:
                t_prev = t_report[i]
#         # creating a dir and writing the eml row 
# #         loc_dir = out_dir+subj+'/'+str(i)
# #         if not os.path.exists(loc_dir):
# #            os.makedirs(loc_dir)
# #         with open(loc_dir+'/'+'eml.csv','w') as f:
# #            fwriter = csv.writer(f, delimiter='\t', quotechar='|',quoting=csv.QUOTE_MINIMAL)
# #            fwriter.writerow(eml_row)
# #         f.close()
        
#         # if there is any clusters found, extract sensor data and put in a separate file
#         if len(t_start)>0:
#             for probe in probes:
#                 if probe=='wtr':
#                     data = get_data_at_location(weather_data_dir+subj, t_start, t_end, probe)
#                 else:
#                     data = get_data_at_location(data_dir+subj, t_start, t_end, probe)
#                 if len(data)>0:
#                     with open(loc_dir+'/'+probe+'.csv', 'w') as f:
#                         fwriter = csv.writer(f, delimiter='\t', quotechar='|',quoting=csv.QUOTE_MINIMAL)
#                         for (j,d) in enumerate(data):
#                             fwriter.writerow(d)
#                     f.close()
#         else:
#             print('instance '+str(i)+' skipped')

#         if i<len(t_report)-1:
#             if t_report[i]!=t_report[i+1]:
#                 t_prev = t_report[i]

../CS120/CS120-sensor-csvs/1002060/eml.csv
../CS120/CS120-sensor-csvs/1013558/eml.csv
no data - instance skipped
instance 0 skipped
no data - instance skipped
instance 1 skipped
no data - instance skipped
instance 2 skipped
no data - instance skipped
instance 3 skipped
no data - instance skipped
instance 4 skipped
no data - instance skipped
instance 21 skipped
no data - instance skipped
instance 23 skipped
no data - instance skipped
instance 30 skipped
no data - instance skipped
instance 48 skipped
no data - instance skipped
instance 50 skipped
no data - instance skipped
instance 51 skipped
no data - instance skipped
instance 53 skipped
no data - instance skipped
instance 60 skipped
no data - instance skipped
instance 104 skipped
no data - instance skipped
instance 123 skipped
no data - instance skipped
instance 126 skipped
../CS120/CS120-sensor-csvs/1022235/eml.csv
../CS120/CS120-sensor-csvs/1027472/eml.csv
no data - instance skipped
instance 13 skipped
no data - instance skipped
inst

no data - instance skipped
instance 27 skipped
no data - instance skipped
instance 28 skipped
../CS120/CS120-sensor-csvs/1183252/eml.csv
no data - instance skipped
instance 6 skipped
../CS120/CS120-sensor-csvs/1184498/eml.csv
no data - instance skipped
instance 11 skipped
../CS120/CS120-sensor-csvs/1186533/eml.csv
no data - instance skipped
instance 94 skipped
no data - instance skipped
instance 95 skipped
../CS120/CS120-sensor-csvs/1189725/eml.csv
no data - instance skipped
instance 5 skipped
no data - instance skipped
instance 78 skipped
../CS120/CS120-sensor-csvs/1197009/eml.csv
../CS120/CS120-sensor-csvs/1199841/eml.csv
no data - instance skipped
instance 10 skipped
no data - instance skipped
instance 11 skipped
no data - instance skipped
instance 12 skipped
no data - instance skipped
instance 13 skipped
../CS120/CS120-sensor-csvs/1203725/eml.csv
no data - instance skipped
instance 75 skipped
no data - instance skipped
instance 79 skipped
../CS120/CS120-sensor-csvs/1207041/eml.csv


no data - instance skipped
instance 6 skipped
no data - instance skipped
instance 7 skipped
no data - instance skipped
instance 8 skipped
no data - instance skipped
instance 9 skipped
no data - instance skipped
instance 10 skipped
no data - instance skipped
instance 11 skipped
no data - instance skipped
instance 12 skipped
no data - instance skipped
instance 13 skipped
no data - instance skipped
instance 14 skipped
no data - instance skipped
instance 15 skipped
no data - instance skipped
instance 16 skipped
no data - instance skipped
instance 17 skipped
no data - instance skipped
instance 18 skipped
no data - instance skipped
instance 19 skipped
no data - instance skipped
instance 20 skipped
no data - instance skipped
instance 21 skipped
no data - instance skipped
instance 22 skipped
no data - instance skipped
instance 23 skipped
no data - instance skipped
instance 24 skipped
no data - instance skipped
instance 25 skipped
no data - instance skipped
instance 26 skipped
no data - instanc

../CS120/CS120-sensor-csvs/1428949/eml.csv
../CS120/CS120-sensor-csvs/1433908/eml.csv
no data - instance skipped
instance 12 skipped
../CS120/CS120-sensor-csvs/1439160/eml.csv
../CS120/CS120-sensor-csvs/1444502/eml.csv
../CS120/CS120-sensor-csvs/1464458/eml.csv
../CS120/CS120-sensor-csvs/1479464/eml.csv
no data - instance skipped
instance 0 skipped
no data - instance skipped
instance 27 skipped
no data - instance skipped
instance 30 skipped
no data - instance skipped
instance 38 skipped
no data - instance skipped
instance 41 skipped
no data - instance skipped
instance 119 skipped
../CS120/CS120-sensor-csvs/1483186/eml.csv
../CS120/CS120-sensor-csvs/1495049/eml.csv
no data - instance skipped
instance 1 skipped
no data - instance skipped
instance 9 skipped
no data - instance skipped
instance 11 skipped
../CS120/CS120-sensor-csvs/1495360/eml.csv
skipping subject 1496251 without location report/foursquare data.
../CS120/CS120-sensor-csvs/1497026/eml.csv
no data - instance skipped
instance 

no data - instance skipped
instance 211 skipped
no data - instance skipped
instance 212 skipped
no data - instance skipped
instance 234 skipped
no data - instance skipped
instance 247 skipped
../CS120/CS120-sensor-csvs/25349/eml.csv
no data - instance skipped
instance 110 skipped
no data - instance skipped
instance 112 skipped
../CS120/CS120-sensor-csvs/345921/eml.csv
no data - instance skipped
instance 72 skipped
no data - instance skipped
instance 75 skipped
no data - instance skipped
instance 86 skipped
no data - instance skipped
instance 87 skipped
no data - instance skipped
instance 89 skipped
no data - instance skipped
instance 97 skipped
../CS120/CS120-sensor-csvs/38880/eml.csv
no data - instance skipped
instance 17 skipped
no data - instance skipped
instance 18 skipped
no data - instance skipped
instance 19 skipped
no data - instance skipped
instance 20 skipped
no data - instance skipped
instance 21 skipped
no data - instance skipped
instance 40 skipped
no data - instance skipp

no data - instance skipped
instance 602 skipped
no data - instance skipped
instance 617 skipped
no data - instance skipped
instance 624 skipped
no data - instance skipped
instance 627 skipped
no data - instance skipped
instance 650 skipped
no data - instance skipped
instance 666 skipped
no data - instance skipped
instance 718 skipped
../CS120/CS120-sensor-csvs/964685/eml.csv
no data - instance skipped
instance 18 skipped
no data - instance skipped
instance 52 skipped
../CS120/CS120-sensor-csvs/97397/eml.csv
../CS120/CS120-sensor-csvs/98384/eml.csv
no data - instance skipped
instance 41 skipped
no data - instance skipped
instance 73 skipped
no data - instance skipped
instance 81 skipped
no data - instance skipped
instance 82 skipped
no data - instance skipped
instance 99 skipped
../CS120/CS120-sensor-csvs/984221/eml.csv
no data - instance skipped
instance 124 skipped
no data - instance skipped
instance 127 skipped
../CS120/CS120-sensor-csvs/AC363GY/eml.csv
no data - instance skipped
ins

no data - instance skipped
instance 85 skipped
no data - instance skipped
instance 88 skipped
no data - instance skipped
instance 94 skipped
no data - instance skipped
instance 107 skipped
no data - instance skipped
instance 118 skipped
no data - instance skipped
instance 119 skipped
no data - instance skipped
instance 124 skipped
no data - instance skipped
instance 126 skipped
no data - instance skipped
instance 127 skipped
../CS120/CS120-sensor-csvs/QG620BT/eml.csv
no data - instance skipped
instance 54 skipped
no data - instance skipped
instance 59 skipped
no data - instance skipped
instance 72 skipped
no data - instance skipped
instance 73 skipped
no data - instance skipped
instance 74 skipped
no data - instance skipped
instance 75 skipped
no data - instance skipped
instance 76 skipped
no data - instance skipped
instance 77 skipped
../CS120/CS120-sensor-csvs/SP157RF/eml.csv
no data - instance skipped
instance 1 skipped
no data - instance skipped
instance 2 skipped
../CS120/CS120-se

In [8]:
#import pickle
#pickle.dump(loc_coe_df, open('loc_coe.df', 'wb'), -1)

In [18]:
def extract_loc_coe_data(data_dir, subj):
    """Extracts semantic location data for given communications.
    
    """
    loc_coe_df = pd.DataFrame()
    filename = data_dir + subj + '/eml.csv'
    if os.path.exists(filename):
        print(filename)
        loc = []
        lat_report = []
        lng_report = []
        t_report = []
        with open(filename) as file_in:
            data = csv.reader(file_in, delimiter='\t')
            eml = []
            for data_row in data:
                if data_row:
                    # reading location category (state)
                    loc_string = data_row[6]
                    loc_string = loc_string[1:len(loc_string)-1]
                    loc_string.split(',')
                    loc.append(loc_string)
                    
                    # reading lat. and long.
                    lat_report.append(float(data_row[2]))
                    lng_report.append(float(data_row[3]))
                    t_report.append(float(data_row[0]))
                    
                    # adding to eml
                    eml.append(data_row)
                    
        file_in.close()
    else:
        print('skipping subject '+subj+' without location report/foursquare data.')
        return
        
                      
    # looking into data between current and previous report
    filename = data_dir + subj + '/fus.csv'
    if os.path.exists(filename):
        with open(filename) as file_in:
            data_gps = csv.reader(file_in, delimiter='\t')
            t_gps = []
            lat_gps = []
            lng_gps = []
            for row_gps in data_gps:
                if row_gps:
                    t_gps.append(float(row_gps[0]))
                    lat_gps.append(float(row_gps[1]))
                    lng_gps.append(float(row_gps[2]))
        file_in.close()
    else:
        print('skipping subject '+subj+' without location data.')
        return

    if os.path.exists(out_dir+subj):
       shutil.rmtree(out_dir+subj)
       os.makedirs(out_dir+subj)
    else:
       os.makedirs(out_dir+subj)
    
    t_prev = 0


    for (i,eml_row) in enumerate(eml):

        # finding t_start and t_end from gps data
        t_start, t_end = get_time_from_gps(data_dir+subj, t_report[i], t_prev, lat_report[i], lng_report[i])

        # if there is any clusters found, extract sensor data and put in a separate file
        if len(t_start)>0:
            data = get_data_at_location(data_dir+subj, t_start, t_end, 'coe')
            if len(data)>0:
                df = pd.DataFrame(data, columns=coe_cols)
                df['pid'] = subj
                df['location'] = eml_row[6] # location label(s)
                df['visit_reason'] = eml_row[7] # semantic location visit reason

                loc_coe_df = loc_coe_df.append(df)
        else:
            print('instance '+str(i)+' skipped')
            
        # continue iteration
        if i<len(t_report)-1:
            if t_report[i]!=t_report[i+1]:
                t_prev = t_report[i]
                
    return loc_coe_df

In [14]:
loc_coe_df_old = loc_coe_df.copy()

In [23]:
# testing
import time
t = time.time()
for subject in subjects[:4]:
    test_df = extract_loc_coe_data(data_dir, subject)
print(time.time() - t)

../CS120/CS120-sensor-csvs/1002060/eml.csv
../CS120/CS120-sensor-csvs/1013558/eml.csv
no data - instance skipped
instance 0 skipped
no data - instance skipped
instance 1 skipped
no data - instance skipped
instance 2 skipped
no data - instance skipped
instance 3 skipped
no data - instance skipped
instance 4 skipped
no data - instance skipped
instance 21 skipped
no data - instance skipped
instance 23 skipped
no data - instance skipped
instance 30 skipped
no data - instance skipped
instance 48 skipped
no data - instance skipped
instance 50 skipped
no data - instance skipped
instance 51 skipped
no data - instance skipped
instance 53 skipped
no data - instance skipped
instance 60 skipped
no data - instance skipped
instance 104 skipped
no data - instance skipped
instance 123 skipped
no data - instance skipped
instance 126 skipped
../CS120/CS120-sensor-csvs/1022235/eml.csv
../CS120/CS120-sensor-csvs/1027472/eml.csv
no data - instance skipped
instance 13 skipped
no data - instance skipped
inst

## Location feature extraction

In [162]:
eml_final_df = pickle.load(open('../data/eml_coe_data.df', 'rb'))
top5 = pickle.load(open('../data/top_5_contacts_full.df', 'rb'))
top10 = pickle.load(open('../data/top_10_contacts_full.df', 'rb'))

In [163]:
eml_final_df['timestamp'] = eml_final_df['timestamp'].astype(float)
#top5['timestamp'] = top5['timestamp'].astype(str)
#eml_final_df = eml_final_df.drop_duplicates(subset=['pid', 'contact_name', 'timestamp'])
eml_final_df = eml_final_df.drop_duplicates(subset=['pid', 'timestamp'])

In [164]:
#merge_cols = ['pid', 'contact_name', 'timestamp', 'location', 'visit_reason']
merge_cols = ['pid', 'timestamp', 'location', 'visit_reason']

top5_merged_coe = top5.merge(eml_final_df[merge_cols], on=['pid', 'timestamp'], how='left')
top10_merged_coe = top10.merge(eml_final_df[merge_cols], on=['pid', 'timestamp'], how='left')

In [165]:
print(top5.shape)
print(eml_final_df.shape)
print(top5_merged_coe.shape)

print(top10.shape)
print(eml_final_df.shape)
print(top10_merged_coe.shape)

(296654, 14)
(301618, 8)
(296654, 16)
(337885, 14)
(301618, 8)
(337885, 16)


In [166]:
both_pids = list(set(eml_final_df['pid']).intersection(set(top5['pid'])))
len(both_pids)

189

In [169]:
top5_merged_coe = top5_merged_coe.loc[top5_merged_coe['pid'].isin(both_pids)]
top10_merged_coe = top10_merged_coe.loc[top10_merged_coe['pid'].isin(both_pids)]

In [174]:
print(top5_merged_coe['location'].isna().sum())
print(top10_merged_coe['location'].isna().sum())

153664
177044


In [178]:
# pickle.dump(top5_merged_coe, open('../data/top_5_contacts_loc.df', 'wb'), -1)
# pickle.dump(top10_merged_coe, open('../data/top_10_contacts_loc.df', 'wb'), -1)

### Feature contruction

In [200]:
top5_merged_coe['location']

0            NaN
1            NaN
2            NaN
3            NaN
4            NaN
5            NaN
6            NaN
7            NaN
8            NaN
9            NaN
10           NaN
11           NaN
12           NaN
13           NaN
14           NaN
15           NaN
16           NaN
17           NaN
18           NaN
19           NaN
20           NaN
21           NaN
22           NaN
23           NaN
24           NaN
25           NaN
26           NaN
27           NaN
28           NaN
29           NaN
           ...  
296624       NaN
296625    [Home]
296626       NaN
296627       NaN
296628       NaN
296629       NaN
296630       NaN
296631       NaN
296632       NaN
296633       NaN
296634       NaN
296635       NaN
296636       NaN
296637       NaN
296638       NaN
296639       NaN
296640       NaN
296641       NaN
296642       NaN
296643       NaN
296644       NaN
296645       NaN
296646       NaN
296647       NaN
296648       NaN
296649       NaN
296650       NaN
296651       N

In [201]:
# clean columns
top5_merged_coe['visit_reason'] = top5_merged_coe['visit_reason'].map(lambda x: eval(x), na_action='ignore')
top5_merged_coe['location'] = top5_merged_coe['location'].map(lambda x: eval(x), na_action='ignore')


TypeError: eval() arg 1 must be a string, bytes or code object

In [211]:
locs = set()
visit_reasons = set()
for i, row in top5_merged_coe.iterrows():
    if type(row['location']) is list:
        locs.update(row['location'])
    if type(row['visit_reason']) is list:
        visit_reasons.update(row['visit_reason'])

In [212]:
locs

{'',
 ' craft fair ',
 '816 HOTE',
 '?',
 'Adelanto School',
 'Adelanto community resource center',
 "Another's Home",
 "Another's Work",
 'Apt ',
 'Arts & Entertainment (Theater, Music Venue, Etc.)',
 'Aurora civic center ',
 'Bank',
 'Bank of America',
 'Bath & Body Works',
 'Bbva Compass Bank',
 'Brandon Smith work',
 'Burger King ',
 'Car Dealership',
 'Chester county ',
 'Chestnut hill hospital ',
 "Child's school",
 'College',
 'Community College ',
 'Convalescent Home',
 "Dad's home",
 'Dance studio',
 'Daycare ',
 'Dental surgery',
 'Dog walk',
 'Dollar General ',
 "Don't know",
 'Dr appt',
 'Elementary School',
 'Fire department ',
 'Flea market',
 'Foley Middle School',
 'Food (Restaurant, Cafe)',
 'Game Stop',
 'Gas Station',
 'Gateway Church, North Fort Worth',
 'Gaylor Electric',
 'Gaylord Palms',
 'Greenville county courthouse',
 'Gym or Other Exercise',
 'Harambee main office',
 'High School',
 'Home',
 'Home Goods',
 'Hospital',
 'Hospital ',
 'Hotel',
 'I am at home an

In [213]:
visit_reasons

{'',
 '?',
 "Another's Work",
 'Appointment with Attorney',
 'Apt ',
 'Aurora civic center ',
 'Babysitting',
 'Band Practice',
 'Bible study',
 'Birthday shopping ',
 'Car Dealership',
 "Child's school",
 'Christmas',
 'Christmas Shopping',
 'Christmas parade',
 'Christmas shopping',
 'Church',
 'Church Service, Gateway Church',
 'Community College ',
 'Convalescent Home',
 "Dad's home",
 'Dance studio',
 'Daycare ',
 'Dental Office',
 'Did not visit',
 'Dining',
 'Doctor',
 'Doctor ',
 'Doctor visit',
 'Doctor visits',
 "Doctor's Visit",
 "Doctor's appointment",
 'Doctor/medical',
 'Doctors',
 'Dog walk',
 "Don't know",
 'Dr appointment ',
 'Dr. Appointments ',
 'Dr. Appt.',
 'Elementary School',
 'Entertainment',
 'Errand',
 'Exercise',
 'Foley Middle School',
 'Gas Station',
 'Gocery Shopping',
 'Groceries',
 'Holiday Celebration ',
 'Holiday Shopping',
 'Home',
 'Home Goods',
 'Hospital',
 'Hotel',
 'Hunting',
 "Husband's surgery",
 'I am at home and not at another location',
 "I 

In [214]:
canonical_visit_reasons = ['Entertainment',
                           'Errand',
                           'Home',
                           'Work',
                           'Exercise',
                           'Dining',
                           'Socialize',
                           'Travelling / Traffic'
                          ]

In [217]:
top5_merged_coe.apply(lambda x: pd.Series(x['location']), result_type='expand', axis=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,,,,,,,,,
1,,,,,,,,,
2,,,,,,,,,
3,,,,,,,,,
4,,,,,,,,,
5,,,,,,,,,
6,,,,,,,,,
7,,,,,,,,,
8,,,,,,,,,
9,,,,,,,,,


In [226]:
canonical_locs = ["Home", 
                  "Work", 
                  "Another's Home", 
                  "Arts & Entertainment (Theater, Music Venue, Etc.)",
                  "Food (Restaurant, Cafe)",
                  'Nightlife Spot (Bar, Club)',
                  'Outdoors & Recreation',
                  'Gym or Other Exercise',
                  'Professional or Medical Office',
                  'Spiritual (Church, Temple, Etc.)',
                  'Shop or Store',
                  'Travel or Transport (Airport, Bus Stop, Train Station, Etc.)',
                  'Vehicle'
                 ]

short_name_locs = ["loc:home",
                   "loc:work",
                   "loc:anothers_home",
                   "loc:arts/entertainment",
                   "loc:food",
                   "loc:nightlife",
                   "loc:outdoors/recreation",
                   "loc:gym/exercise",
                   "loc:professional/medical_office",
                   "loc:spiritual",
                   "loc:shop",
                   "loc:travel/transport",
                   "loc:vehicle",
                   "loc:other"
                ]

col_dict = {k:0 for k in short_name_locs}


def map_locations(locations):
    """Takes the locations array as input and expands into a Series.
    
    """
    col_dict = {k:0 for k in short_name_locs}
    
    if type(locations) is float:
        return pd.Series(col_dict)
    
    for loc in locations:
        if loc in canonical_locs:
            col_dict[short_name_locs[canonical_locs.index(loc)]] = 1
        else:
            col_dict['other'] = 1
            
    return pd.Series(col_dict)
            

In [227]:
location_df = top5_merged_coe['location'].apply(map_locations)

In [237]:
canonical_visit_reasons = ['Entertainment',
                           'Errand',
                           'Home',
                           'Work',
                           'Exercise',
                           'Dining',
                           'Socialize',
                           'Travelling / Traffic'
                          ]

short_name_visit_reasons = ['visit_reason:entertainment',
                            'visit_reason:errand',
                            'visit_reason:home',
                            'visit_reason:work',
                            'visit_reason:exercise',
                            'visit_reason:dining',
                            'visit_reason:socialize',
                            'visit_reason:travel/traffic',
                            'visit_reason:other'
                           ]

def map_visit_reasons(visit_reasons):
    """Takes the visit_reasons array as input and expands into a Series.
    
    """
    col_dict = {k:0 for k in short_name_visit_reasons}
    
    if type(visit_reasons) is float:
        return pd.Series(col_dict)
    
    for visit in visit_reasons:
        if visit in canonical_visit_reasons:
            col_dict[short_name_visit_reasons[canonical_visit_reasons.index(visit)]] = 1
        else:
            col_dict['visit_reason:other'] = 1
            
    return pd.Series(col_dict)
            

In [238]:
visit_reason_df = top5_merged_coe['visit_reason'].apply(map_visit_reasons)

In [242]:
top5_final_df = pd.concat([top5_merged_coe, location_df, visit_reason_df], axis=1)

In [243]:
top10_loc_df = top10_merged_coe['location'].apply(map_locations)
top10_visit_df = top10_merged_coe['visit_reason'].apply(map_visit_reasons)

In [244]:
top10_final_df = pd.concat([top10_merged_coe, top10_loc_df, top10_visit_df], axis=1)

In [245]:
pickle.dump(top5_final_df, open('../data/top_5_contacts_loc_final.df', 'wb'), -1)
pickle.dump(top10_final_df, open('../data/top_10_contacts_loc_final.df', 'wb'), -1)

### Final features

In [247]:
call_df = top5_final_df.loc[top5_final_df['comm_type'] == 'PHONE']
sms_df = top5_final_df.loc[top5_final_df['comm_type'] == 'SMS']

In [256]:
call_visit = call_df.groupby(['pid', 'combined_hash'])[short_name_visit_reasons].sum()

call_visit[short_name_visit_reasons] = call_visit[short_name_visit_reasons].divide(call_visit.sum(axis=1), axis='rows')

In [258]:
call_visit = call_visit.add_prefix('call_')
call_visit.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,call_visit_reason:entertainment,call_visit_reason:errand,call_visit_reason:home,call_visit_reason:work,call_visit_reason:exercise,call_visit_reason:dining,call_visit_reason:socialize,call_visit_reason:travel/traffic,call_visit_reason:other
pid,combined_hash,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1002060,100206037bc00d68a24a359c7e5c7fc0c7bf7b8,0.045455,0.045455,0.909091,0.0,0.0,0.0,0.0,0.0,0.0
1002060,10020604dee72583ac5647caf9d876b53ca158c,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1002060,1002060632572ef12203e84583c0cab0295337f,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1002060,10020607fbbe92349588238af4c0417afa1d6d0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1002060,1002060b07d836f246e50ce930bf90a9fe33939,0.0,0.028169,0.971831,0.0,0.0,0.0,0.0,0.0,0.0


In [260]:
final_df = pickle.load(open('../data/testing_loc_train_features.df', 'rb'))

In [264]:
final_df.filter(regex="sms_visit.*")

Unnamed: 0,sms_visit_reason:entertainment,sms_visit_reason:errand,sms_visit_reason:home,sms_visit_reason:work,sms_visit_reason:exercise,sms_visit_reason:dining,sms_visit_reason:socialize,sms_visit_reason:travel/traffic,sms_visit_reason:other,sms_visit_reason:entertainment_nan_indicator,sms_visit_reason:errand_nan_indicator,sms_visit_reason:home_nan_indicator,sms_visit_reason:work_nan_indicator,sms_visit_reason:exercise_nan_indicator,sms_visit_reason:dining_nan_indicator,sms_visit_reason:socialize_nan_indicator,sms_visit_reason:travel/traffic_nan_indicator,sms_visit_reason:other_nan_indicator
0,0.024390,0.024390,0.951220,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0
1,0.000000,0.108696,0.891304,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0
2,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0
3,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0
4,0.000000,0.027027,0.594595,0.351351,0.000000,0.000000,0.027027,0.000000,0.000000,0,0,0,0,0,0,0,0,0
5,0.000000,0.000000,0.820896,0.164179,0.000000,0.000000,0.014925,0.000000,0.000000,0,0,0,0,0,0,0,0,0
6,0.000000,0.007937,0.952381,0.000000,0.000000,0.007937,0.000000,0.007937,0.023810,0,0,0,0,0,0,0,0,0
7,0.000000,0.018081,0.951321,0.000000,0.000000,0.008345,0.021558,0.000000,0.000695,0,0,0,0,0,0,0,0,0
8,0.000000,0.006944,0.986111,0.000000,0.000000,0.000000,0.006944,0.000000,0.000000,0,0,0,0,0,0,0,0,0
9,0.000000,0.024242,0.969697,0.000000,0.000000,0.000000,0.000000,0.000000,0.006061,0,0,0,0,0,0,0,0,0


### TODOs:
1. ~~dump coe.csv data, with added location tag, into df~~
2. ~~add option in main extract_features module to do an outer merge~~
3. process additional features

4. process un-labelled clusters (top 5 places) and label the columns
    - this can be a follow-up: the semantic locations were "easy" features

### visit locations

{Home, Work, Another's Home, Arts & Entertainment (Theater, Music Venue, Etc.), Food (Restaurant, Cafe), Nightlife Spot (Bar, Club), Outdoors & Recreation, Gym or Other Exercise, Professional or Medical Office, Spiritual (Church, Temple, Etc.), Shop or Store, Travel or Transport (Airport, Bus Stop, Train Station, Etc.), Vehicle, Other (Not Listed)}

### visit reasons

{Entertainment, Errand, Home, Work, Exercise, Dining, Socialize, Travelling/Traffic}

### Notes
- 10 total contacts with no associated semantic location