### Raw data

In [5]:
"""Run all functions that load and cache data from remote."""

from build_dataset.workers import load_sensible_data as lsd


tc0 = {'hours': range(24),
    'days': range(7),
    'spans': [
        ("01/01/14", "31/12/14")
    ]
}
tc1 = {  # School weeks spring and fall 2014
    'hours': range(24),
    'days': range(7),
    'spans': [
        ("06/01/14", "24/01/14"),
        ("03/02/14", "13/04/14"),
        ("20/04/14", "16/05/14"),
        ("02/06/14", "20/06/14"),
        ("01/09/14", "13/10/14"),
        ("19/10/14", "05/12/14")
    ]
}
tc1_spring = {  # School weeks spring 2014
    'hours': range(24),
    'days': range(7),
    'spans': [
        ("06/01/14", "24/01/14"),
        ("03/02/14", "13/04/14"),
        ("20/04/14", "16/05/14"),
        ("02/06/14", "20/06/14"),
    ]
}
tc2 = {  # Exam weeks spring and fall 2014
    'hours': range(24),
    'days': range(7),
    'spans': [
        ("17/05/14", "01/06/14"),
        ("06/12/14", "21/12/14")
    ]
}
tc3 = {  # Holiday weeks spring and fall 2014
    'hours': range(24),
    'days': range(7),
    'spans': [
        ("01/01/14", "05/01/14"),
        ("25/01/14", "02/02/14"),
        ("14/04/14", "20/04/14"),
        ("21/06/14", "30/08/14"),
        ("13/10/14", "19/10/14"),
        ("22/12/14", "31/12/14")
    ]
}

for i, tc in enumerate([tc1_spring]): #enumerate([tc0, tc1, tc1_spring, tc2, tc3]):
    print "\n\n## ---------------- ##"
    print "## Building for tc%d ##" % i
    print "## ---------------- ##"

    print "\nLocation Reference..."
    print "\nsuccess!\n"
    print "Timezone Reference..."
    print "\nsuccess!\n"

    print "Iterating over datasets:"
    for dataset in ["calllog", "sms", "screen", "stop_locations", "bluetooth"]:
        print "\tBuilding '%s'...\t\t" % dataset,
        if dataset == "bluetooth":
            lsd.load(tc, dataset, filtering="bt_special", load_cached=True)
        else:
            pass
        print "\n\tsuccess!\n"

 

## ---------------- ##
## Building for tc0 ##
## ---------------- ##

Location Reference...

success!

Timezone Reference...

success!

Iterating over datasets:
	Building 'calllog'...		
	success!

	Building 'sms'...		
	success!

	Building 'screen'...		
	success!

	Building 'stop_locations'...		
	success!

	Building 'bluetooth'...		
	success!



### Records

In [7]:
import numpy as np
import pandas as pd
from datetime import datetime as dt
from collections import Counter
import os
from build_dataset.workers import load_sensible_data as lsd
from build_dataset.analysis import location_reference as locref

tc0 = {
    'hours': range(24),
    'days': range(7),
    'spans': [
        ("01/01/14", "31/12/14")
    ]
}

location_reference_tc0 = locref.Load_location_reference(tc0).location_reference

df_call = lsd.load(tc0, "calllog")
df_text = lsd.load(tc0, "sms")
df_screen = lsd.load(tc0, "screen")
df_stops = lsd.load(tc0, "stop_locations")
df_physical = lsd.load(tc0, "bluetooth")

users = set(list(df_call['user'])) & set(list(df_text['user'])) & set(list(df_screen['user'])) & \
        set(list(df_stops['user'])) & set(list(df_physical['user']))
    
def call_timer(func):
    """Simple decorator to time function calls."""
    def wrapper(*args):
        print func.__name__,
        start = dt.now()
        output = func(*args)
        print "time:", dt.now() - start
        return output
    return wrapper
    
@call_timer
def _filter_call(df_u):
    df_u = df_u.drop('user', 1)
    df_u.loc[:, 'interaction'] = "call"
    df_u.loc[:, 'timestamp'] = [dt.fromtimestamp(t) for t in df_u.loc[:, 'timestamp']]
    df_u.loc[:, 'type'] = ["in" if t == 1 else "out" for t in df_u.loc[:, 'type']]
    df_u.columns = ["duration", "correspondent_id", "datetime", "direction", "interaction"]
    return df_u

@call_timer
def _filter_text(df_u):
    df_u = df_u[df_u.loc[:, 'status'] <= 0]
    df_u = df_u[df_u.loc[:, 'type'] <= 2]
    df_u = df_u.drop(['status','user'], 1)
    df_u.loc[:, 'interaction'] = 'text'
    df_u.loc[:, 'timestamp'] = [dt.fromtimestamp(t) for t in df_u.loc[:, 'timestamp']]
    df_u.loc[:, 'type'] = ["in" if t == 1 else "out" for t in df_u.loc[:, 'type']]
    df_u.columns = ["correspondent_id", "datetime", "direction", "interaction"]
    return df_u

@call_timer
def _filter_physical(u, df_u):
    df_u.loc[:, 'interaction'] = 'physical'
    df_u.loc[:, 'timestamp'] = [dt.fromtimestamp(t) for t in df_u.loc[:, 'timestamp']]
    df_u.loc[:, 'correspondent_id'] = [a if a != u else b 
                                for a,b in zip(df_u.loc[:, 'bt_mac'], df_u.loc[:, 'user'])]
    df_u = df_u.drop(['class','id','bt_mac','rssi', 'user'], 1)
    df_u.columns = ["datetime", "interaction", "correspondent_id"]
    return df_u

@call_timer
def _filter_screen(df_u):
    sessions = []
    i = 0
    for row in df_u.iterrows():
        event, times = row[1]['screen_on'], row[1]['timestamp']
        if i == 0:
            prev_event = event
            prev_times = times
            i+=1; continue
        elif event == 0 and prev_event == 1:
            duration = int(times-prev_times)
            sessions.append({'datetime': dt.fromtimestamp(prev_times), 
                             'duration': duration})
        prev_event = event
        prev_times = times
        i+=1
    df_u = pd.DataFrame(sessions)
    df_u.loc[:, 'interaction'] = 'screen'
    return df_u

@call_timer
def _filter_stops(u, df_u):
    def evaluate_event(r):
        user, label, arrival, departure = \
        r[1]['user'], r[1]['label'], r[1]['arrival'], r[1]['departure']
        
        state = location_reference_tc0[str(user)][str(label)]
        if state['type'] == "home":
            return "home"
        if state['type'] == "campus":
            if state['__friday_bar'] and dt.fromtimestamp(np.mean([arrival,departure])).hour >=17:
                return "friday_bar"
            return "campus"
        return "other"
        
    df_u.loc[:, 'duration'] = df_u.loc[:, 'departure'] - df_u.loc[:, 'arrival']
    df_u.loc[:, 'datetime'] = [dt.fromtimestamp(t) for t in df_u.loc[:, 'arrival']]
    df_u.loc[:, 'position'] = ["%d_%s" % (u, l) for l in df_u.loc[:, 'label']]
    df_u.loc[:, 'event'] = [evaluate_event(r) for r in df_u.iterrows()]
    df_u.loc[:, 'interaction'] = 'stop'
    df_u = df_u.drop(['arrival', 'departure', 'label', 'lat', 'lon', 'timestamp', 'user'], 1)
    return df_u


def processed_users(tc):
    interaction_types = ["call", "text", "physical", "screen", "stop"]
    counter = Counter()
    for t in interaction_types:
        users = [int(f.split(".")[0]) for f in os.listdir("build_dataset/data_cache/records/%s/%s" % (t, tc))]
        counter.update(users)
    return [k for k,v in counter.items() if v == 5]
    

tc = "tc1_spring"
skip_users = processed_users(tc)
for u in users:
    
    if u in skip_users:
        continue
    
    print "\nuser:", u
    try:
        df_call_u = _filter_call(df_call[df_call.loc[:, 'user'] == u])
        df_text_u = _filter_text(df_text[df_text.loc[:, 'user'] == u])
        df_physical_u = _filter_physical(u, df_physical.loc[(df_physical.loc[:, 'user'] == u) | \
                                                            (df_physical.loc[:, 'bt_mac'] == u), :])
        df_screen_u = _filter_screen(df_screen[df_screen.loc[:, 'user'] == u])
        df_stop_u = _filter_stops(u, df_stops[df_stops.loc[:, 'user'] == u])
    
        df_call_u.to_csv("build_dataset/data_cache/records/call/%s/%d.csv" % (tc, u), index=False)
        df_text_u.to_csv("build_dataset/data_cache/records/text/%s/%d.csv" % (tc, u), index=False)
        df_physical_u.to_csv("build_dataset/data_cache/records/physical/%s/%d.csv" % (tc, u), index=False)
        df_screen_u.to_csv("build_dataset/data_cache/records/screen/%s/%d.csv" % (tc, u), index=False)
        df_stop_u.to_csv("build_dataset/data_cache/records/stop/%s/%d.csv" % (tc, u), index=False)
    except:
        print "Failed:", u
    


user: 0
_filter_call time: 0:00:00.027298
_filter_text 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


time: 0:00:00.026823
_filter_physical time: 0:00:00.668389
_filter_screen time: 0:00:02.185128
_filter_stops time: 0:00:01.180554

user: 1
_filter_call time: 0:00:00.002656
_filter_text time: 0:00:00.004740
_filter_physical time: 0:00:00.570920
_filter_screen time: 0:00:00.294297
_filter_stops time: 0:00:01.113185

user: 2
_filter_call time: 0:00:00.002730
_filter_text time: 0:00:00.015357
_filter_physical time: 0:00:00.541450
_filter_screen time: 0:00:00.395491
_filter_stops time: 0:00:01.101134

user: 3
_filter_call time: 0:00:00.005636
_filter_text time: 0:00:00.027341
_filter_physical time: 0:00:00.584631
_filter_screen time: 0:00:01.431171
_filter_stops time: 0:00:01.146566

user: 4
_filter_call time: 0:00:00.007566
_filter_text time: 0:00:00.017125
_filter_physical time: 0:00:00.584015
_filter_screen time: 0:00:00.931094
_filter_stops time: 0:00:01.138041

user: 5
_filter_call time: 0:00:00.005730
_filter_text time: 0:00:00.075940
_filter_physical time: 0:00:00.599614
_filter_scr

# Build dataset for science competition

In [1]:
from build_dataset.workers import load_sensible_data as lsd

In [10]:
tc0 = {'hours': range(24),
    'days': range(7),
    'spans': [
        ("03/02/14", "10/02/14")
    ]
}




In [15]:
dataset = lsd.load(tc0, "bluetooth", filtering="bt_special", load_cached=False)

<2014> feb


In [49]:
ds = dataset.sort_values(by="timestamp")
ds = ds.loc[dataset['bt_mac'] >= 0]
#ds['timestamp'] = ds['timestamp'] - min(ds['timestamp'])
#ds['user1'] = ds['user']
#ds['user2'] = ds['bt_mac']
#ds = ds.drop(['class', 'id', 'rssi', 'bt_mac', 'user'], axis=1)
#ds = ds[['user1', 'user2', 'timestamp']]
#csv = ds.to_csv("data_sample_science_comp_2016.csv", index=False, sep=",")

In [51]:
min(ds['timestamp'])%86400

0.0