In [1]:
import sys
sys.path.append("../../")
import numpy as np
import pandas as pd
from datetime import datetime as dt
from collections import defaultdict
%matplotlib inline
import matplotlib.pylab as plt
import os
from sklearn.preprocessing import scale
import bandicoot_dev as bc_d
from build_dataset.workers import load_sensible_data as lsd
from build_dataset.analysis import location_reference as locref
from build_dataset.analysis import timezone_reference as tzref
#from build_dataset.workers import apply_timezone_offset as ato

In [2]:
tc0 = {
    'hours': range(24),
    'days': range(7),
    'spans': [
        ("06/01/14", "24/01/14"),
        ("03/02/14", "16/05/14"),
        ("01/09/14", "05/12/14"),
        ("02/06/14", "20/06/14")
    ]
}
tc1 = {
    'hours': range(24),
    'days': range(7),
    'spans': [
        ("17/05/14", "01/06/14"),
        ("06/12/14", "21/12/14")
    ]
}
tc2 = {
    'hours': range(24),
    'days': range(7),
    'spans': [
        ("01/01/14", "05/01/14"),
        ("25/01/14", "02/02/14"),
        ("14/04/14", "20/04/14"),
        ("21/06/14", "30/08/14"),
        ("22/12/14", "31/12/14")
    ]
}
tc3 = {
    'hours': range(24),
    'days': range(7),
    'spans': [
        ("01/01/14", "31/12/14")
    ]
}

In [4]:
location_reference_tc0 = locref.Load_location_reference(tc0).location_reference
location_reference_tc1 = locref.Load_location_reference(tc1).location_reference
location_reference_tc2 = locref.Load_location_reference(tc2).location_reference
location_reference_tc3 = locref.Load_location_reference(tc3).location_reference

In [6]:
df_call = lsd.load(tc0, "calllog")
df_sms = lsd.load(tc0, "sms")
#df_screen = ato.apply(lsd.load(tc0, "screen"), tc0)
#df_stop_locations = lsd.load(tc0, "stop_locations")
#df_bt = ato.apply(lsd.load(tc0, "bluetooth", filtering="bt_special"), tc0)

In [7]:
users = set(list(df_call['user'])) & set(list(df_sms['user']))

In [5]:
def _filter_call(df_u):
    df_u = df_u.drop('user', 1)
    df_u['interaction'] = "call"
    df_u['timestamp'] = [dt.fromtimestamp(t) for t in df_u['timestamp']]
    df_u['type'] = ["in" if t == 1 else "out" for t in df_u['type']]
    df_u.columns = ["duration", "correspondent_id", "datetime", "direction", "interaction"]
    return df_u

def _filter_sms(df_u):
    df_u = df_u[df_u['status'] <= 0]
    df_u = df_u[df_u['type'] <= 2]
    df_u = df_u.drop(['status','user'], 1)
    df_u['interaction'] = 'text'
    df_u['timestamp'] = [dt.fromtimestamp(t) for t in df_u['timestamp']]
    df_u['type'] = ["in" if t == 1 else "out" for t in df_u['type']]
    df_u['duration'] = ''
    df_u.columns = ["correspondent_id", "datetime", "direction", "interaction", "duration"]
    return df_u

def _filter_physical(u, df_u):
    df_u['interaction'] = 'physical'
    df_u['direction'] = ''
    df_u['timestamp'] = [dt.fromtimestamp(t) for t in df_u['timestamp']]
    df_u['correspondent_id'] = [a if a != u else b 
                                for a,b in zip(df_u['bt_mac'].values, df_u['user'].values)]
    df_u['duration'] = ""
    df_u = df_u.drop(['class','id','bt_mac','rssi', 'user'], 1)
    df_u.columns = ["datetime", "interaction", "direction", "correspondent_id", "duration"]
    return df_u

def _filter_screen(df_u):
    sessions = []
    i = 0
    for row in df_u.iterrows():
        event, times = row[1]['screen_on'], row[1]['timestamp']
        if i == 0:
            prev_event = event
            prev_times = times
            i+=1; continue
        elif event == 0 and prev_event == 1:
            duration = (times-prev_times)/1000
            sessions.append({'datetime': dt.fromtimestamp(prev_times/1000), 
                             'duration': duration})
        prev_event = event
        prev_times = times
        i+=1
    df_u = pd.DataFrame(sessions)
    df_u['interaction'] = 'screen'
    return df_u

def _filter_stop_locations(u, df_u):
    def evaluate_event(r):
        user, label, arrival, departure = \
        r[1]['user'], r[1]['label'], r[1]['arrival'], r[1]['departure']
        
        state = location_reference_tc0[str(user)]['%.1f'%label]
        if state['type'] == "home":
            return "home"
        if state['type'] == "campus":
            if state['__friday_bar'] and dt.fromtimestamp(np.mean([arrival,departure])).hour >=17:
                return "friday_bar"
            return "campus"
        return "other"
        
    df_u['duration'] = df_u['departure'] - df_u['arrival']
    df_u['datetime'] = [dt.fromtimestamp(t) for t in df_u['arrival']]
    df_u['position'] = ["%d_%s" % (u, l) for l in df_u['label']]
    df_u['event'] = [evaluate_event(r) for r in df_u.iterrows()]
    df_u = df_u.drop(['arrival', 'departure', 'label', 'lat', 'lon', 'timestamp', 'user'], 1)
    return df_u
    

for u in users:
    df_call_u = _filter_call(df_call[df_call['user'] == u])
    df_sms_u = _filter_sms(df_sms[df_sms['user'] == u])
    #df_physical_u = _filter_physical(u, df_physical[(df_physical['user'] == u) | (df_physical['bt_mac'] == u)])
    #df_screen_u = _filter_screen(df_screen[df_screen['user'] == u])
    #df_stop_locations_u = _filter_stop_locations(u, df_stop_locations[df_stop_locations['user'] == u])
    
    df_cellular = pd.concat([df_sms_u, df_call_u]).sort(['datetime'], ascending=1)
    df_cellular.to_csv("../data_cache/records/cellular/%d.csv" % u, 
                          index=False)
    #df_physical.to_csv("physical/%d.csv" % u, index=False)
    #df_screen.to_csv("screen/%d.csv" % u, index=False)
    #df_stop_locations.to_csv(ROOTPATH + \
    #                         "build_dataset/data_cache/records/stop_locations/%d.csv" % u, 
    #                         index=False)
    
    if u%10==0:
        print u,

0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 310 320 330 340 350 360 370 380 390 400 410 420 430 440 450 460 470 480 490 500 510 520 530 550 560 570 580 590 600 610 620 630 640 660 670 680 690 700 710 720 730 760 780 790 800 810 820 830 840




In [79]:
def compute_features(userid, groupby="week", summary="special"):
    
    user = bc_d.read_csv("%d" % userid, 
                         "../data_cache/records/cellular/", 
                         network=False,
                         describe=False)
    
    indicators = bc_d.utils.all(user, 
                                groupby=groupby, 
                                summary=summary, 
                                dist=True, 
                                network=True, 
                                spatial=False)
    
    for ex in ['name', 'reporting']:
        del indicators[ex]
        
    return bc_d.utils.flatten(indicators)

In [80]:
for user in sorted(users):
    
    ds = compute_features(user)
    
    # Initiate
    if int(user) == 0:
        feat_names = []
        M = len(ds) + sum([1 for v in ds.values() if type(v[0]) is dict])
        N = len(ds.values()[0])
        all_X = [np.empty((0,M))] * N
    
    # Extract features for user in matrix format. The matrix feat_vects has
    # a row vector of feature values for each week.
    feat_vects = np.ones((N,M))
    
    c = 0
    for j, (f, vals) in enumerate(ds.items()):
        
        if user == 0:
            if type(vals[0]) is dict:
                feat_names.append(f+"_mean")
                feat_names.append(f+"_std")
            else:
                feat_names.append(f)
            
        for i, v in enumerate(vals):
            if type(v) is dict:
                feat_vects[i,j+c] = v['mean']
                feat_vects[i,j+c+1] = v['std']
                if i == 0: c += 1
            else:
                feat_vects[i, j+c] = v
        
    for i in range(N):
        all_X[i] = np.append(all_X[i], [feat_vects[i, :]], axis=0)



UnboundLocalError: local variable 'bad_records' referenced before assignment

In [122]:
all_X_clean = [X[:,~np.isnan(X).any(0)] for X in all_X]
all_X_scaled = [scale(X) for X in all_X_clean]

In [123]:
for i in range(len(all_X)):
    print all_X_clean[i].shape

(762, 33)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)
(762, 34)


In [130]:
for i in range(all_X[0].shape[0]):
    if np.isnan(all_X[0][i, :]).any():
        print i,
        
print "\n"
        
for j in range(all_X[0].shape[1]):
    if np.isnan(all_X[0][:, j]).any():
        print feat_names[j]

7 9 25 34 54 68 93 101 108 110 132 141 154 179 196 198 246 309 325 410 421 427 431 445 446 471 492 507 522 526 587 596 613 618 622 636 639 658 669 675 683 696 705 711 712 713 724 754 761 

response_delay_text__allweek__allday__callandtext_mean
response_delay_text__allweek__allday__callandtext_std


Convert all_X to a 3D-array instead of a list of arrays, and interpolate nans across weeks of participants (y direction), not across attribute values of participants.

In [161]:
for j in range(all_X[3].shape[1]):
    mask_j = np.isnan(all_X[3][:,j])
    all_X[3][mask_j,j] = np.interp(np.flatnonzero(mask_j), np.flatnonzero(~mask_j), all_X[3][~mask_j,j])

In [163]:
def interpolate_nans(X):
    """Overwrite NaNs with column value interpolations."""
    for j in range(X.shape[1]):
        mask_j = np.isnan(X[:,j])
        X[mask_j,j] = np.interp(np.flatnonzero(mask_j), np.flatnonzero(~mask_j), X[~mask_j,j])
    return X

In [167]:
X_incomplete = np.array([[10,     20,     30    ],
                         [np.nan, 30,     np.nan],
                         [np.nan, np.nan, 50    ],
                         [40,     50,     np.nan    ]])

In [168]:
X_complete = interpolate_nans(X_incomplete)
print X_complete

[[ 10.  20.  30.]
 [ 20.  30.  40.]
 [ 30.  40.  50.]
 [ 40.  50.  50.]]


In [7]:
meta = ['name', 'reporting']
good = ['active_days', 'number_of_contacts', 'duration',
        'percent_initiated_conversations', 'percent_initiated_interactions', 
        'response_delay_text', 'response_rate_text', 'entropy_of_contacts', 
        'interevent_time', 'percent_pareto_interactions', 'percent_pareto_durations', 
        'percent_interactions_out', 'percent_concluded_conversations', 'percent_overlap_conversations']
maybe = ['balance_of_contacts', 'number_of_interactions']
work = ['percent_nocturnal']
drop = ['interactions_per_contact']

for ex in meta:# + good + work + drop + maybe:
    del indicators_d[ex]

indicators_flat = bc_d.utils.flatten(indicators_d)

In [72]:
x = np.array([[9000, 9001, 9002], [9010, 9011, 9012], [9020, 9021, 9022]], np.int32)
y = np.array([[9100, 9101, 9102], [9110, 9111, 9112], [9120, 9121, 9122]], np.int32)
print x
print y

[[9000 9001 9002]
 [9010 9011 9012]
 [9020 9021 9022]]
[[9100 9101 9102]
 [9110 9111 9112]
 [9120 9121 9122]]


In [73]:
Z = np.dstack((x,y))
print Z

[[[9000 9100]
  [9001 9101]
  [9002 9102]]

 [[9010 9110]
  [9011 9111]
  [9012 9112]]

 [[9020 9120]
  [9021 9121]
  [9022 9122]]]


In [76]:
Z[1,0,1]

9110