# Call length feature extraction

In [241]:
import csv
import math
import multiprocessing
import os
import pickle
from sys import exit
import shutil

import numpy as np
import pandas as pd

from df_utils import *


In [253]:
def process_cal(cal_df, pid):
    """
    Collapse a raw call-state log into one record per completed call.

    The log is walked in row order as a small state machine over the
    'Ringing', 'Off-Hook' and 'Idle' values of the ``call_state`` column:

    * 'Ringing' while no call is active marks the start of a ring.
    * 'Off-Hook' while no call is active starts a call; if the previous
      row was 'Ringing' the ring is considered to end at that moment.
    * 'Idle' while a call is active closes the call and emits a record.
    * 'Idle' while no call is active (e.g. a missed call) discards any
      pending ring information.

    Any other combination (for instance 'Ringing' during an active call)
    is ignored apart from being remembered as the previous state.

    Args:
        cal_df: DataFrame with ``timestamp`` and ``call_state`` columns.
        pid: participant identifier copied into every output row.

    Returns:
        DataFrame with columns pid, ring_start, ring_end, ring_duration,
        call_start, call_end, call_duration. Ring fields are NaN for calls
        that were not preceded by ringing. Durations are plain timestamp
        differences, so they share the unit of ``timestamp``.
    """
    cols = ['pid', 'ring_start', 'ring_end', 'ring_duration', 'call_start', 'call_end', 'call_duration']

    records = []
    ring_begin = ring_finish = np.nan
    call_begin = 0
    active = False       # True between 'Off-Hook' and the closing 'Idle'
    rang = False         # True once the pending call has been seen ringing
    last_state = None

    for _, entry in cal_df.iterrows():
        state = entry['call_state']

        if active:
            # Only 'Idle' can terminate an active call.
            if state == 'Idle':
                call_finish = entry['timestamp']
                active = False
                if not rang:
                    # Call without a ring phase: no ring information to report.
                    ring_begin = ring_finish = np.nan
                rang = False
                records.append([
                    pid,
                    ring_begin,
                    ring_finish,
                    ring_finish - ring_begin,
                    call_begin,
                    call_finish,
                    call_finish - call_begin,
                ])
        elif state == 'Ringing':
            rang = True
            ring_begin = entry['timestamp']
        elif state == 'Off-Hook':
            if last_state == 'Ringing':
                # The call was picked up: the ring ends where the call starts.
                ring_finish = entry['timestamp']
            call_begin = entry['timestamp']
            active = True
        elif state == 'Idle':
            # Back to idle without a call in between: drop the ring.
            rang = False
            ring_begin = ring_finish = np.nan

        last_state = state

    return pd.DataFrame(records, columns=cols)
        
    
def match_dates(phone_df, cal_df):
    """Attach ring and call durations from cal_df to the call rows of phone_df.

    Every row of ``phone_df`` whose ``comm_direction`` is 'INCOMING' or
    'OUTGOING' is compared against the calls in ``cal_df`` (in row order).
    A call matches when the row's timestamp, rounded up to an integer, lies
    inside the call's window: from 30 before the ring start (or the call
    start when there was no ring) up to 1 after the call end, both
    inclusive. Only the first matching call is used; rows without a match
    are dropped.

    Args:
        phone_df: DataFrame with ``combined_hash``, ``timestamp`` and
            ``comm_direction`` columns.
        cal_df: DataFrame with ``ring_start``, ``call_start``, ``call_end``,
            ``ring_duration`` and ``call_duration`` columns, as produced by
            ``process_cal``.

    Returns:
        DataFrame with columns combined_hash, timestamp, ring_duration,
        call_duration - one row per matched phone_df row, carrying the
        original (unrounded) timestamp.
    """
    cols = ['combined_hash', 'timestamp', 'ring_duration', 'call_duration']

    # Slack around each call, in timestamp units (presumably seconds - TODO confirm).
    lead_slack = 30
    tail_slack = 1

    # The windows depend only on cal_df, so build them once instead of
    # re-deriving them for every phone row.
    windows = []
    for _, cal_row in cal_df.iterrows():
        ring_start = cal_row['ring_start']
        start = cal_row['call_start'] if np.isnan(ring_start) else ring_start
        windows.append((
            start - lead_slack,
            cal_row['call_end'] + tail_slack,
            cal_row['ring_duration'],
            cal_row['call_duration'],
        ))

    combined_rows = []
    for _, row in phone_df.iterrows():
        # Filter first so timestamps of non-call rows (possibly NaN) are
        # never passed to math.ceil.
        if row['comm_direction'] not in ('INCOMING', 'OUTGOING'):
            continue

        ts = math.ceil(row['timestamp'])
        for start, end, ring_duration, call_duration in windows:
            if start <= ts <= end:
                combined_rows.append(
                    [row['combined_hash'], row['timestamp'], ring_duration, call_duration]
                )
                break  # first matching call wins

    return pd.DataFrame(combined_rows, columns=cols)


def extract_cal_data(data_dir, subj, testing=False):
    """Build the matched call-length table for one subject.

    Reads ``<data_dir><subj>/cal.csv`` (tab-separated, no header row),
    turns it into per-call records with ``process_cal`` and matches those
    against the subject's 'PHONE' rows of the module-level ``coe_df`` via
    ``match_dates``. Column names for the raw log come from the
    module-level ``call_cols``.

    Args:
        data_dir: directory prefix; concatenated directly with ``subj``.
        subj: subject identifier, also used as the ``pid`` filter value.
        testing: accepted but not used by this function.

    Returns:
        The DataFrame produced by ``match_dates``, or None when the
        subject has no cal.csv file.
    """
    cal_path = ''.join((data_dir, subj, '/cal.csv'))
    if not os.path.exists(cal_path):
        # No call log for this subject.
        return None

    with open(cal_path) as handle:
        raw_calls = pd.read_csv(handle, delimiter='\t', header=None, names=call_cols)

    calls = process_cal(raw_calls, subj)
    subject_phone_mask = (coe_df['pid'] == subj) & (coe_df['comm_type'] == 'PHONE')
    return match_dates(coe_df.loc[subject_phone_mask], calls)

In [247]:
# Load the pickled communication-event table. The context manager closes the
# file handle instead of leaving it to the garbage collector.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open('../data/top_5_contacts_loc_final.df', 'rb') as coe_file:
    coe_df = pickle.load(coe_file)

In [256]:
loc_coe_df = pd.DataFrame()
data_dir = '../CS120/CS120-sensor-csvs/'

# Every entry of the data directory is treated as a subject id.
subjects = os.listdir(data_dir)
# Column names for the header-less cal.csv files.
call_cols = ['timestamp', 'call_state']

# Keep one row per (date, pid, combined_hash) before matching.
coe_df = coe_df.drop_duplicates(subset=['date', 'pid', 'combined_hash'])

cal_df = pd.DataFrame(columns=call_cols)

func_args = [(data_dir, subj) for subj in subjects]

# extract_cal_data reads the module-level coe_df and call_cols, so both must
# be defined before the pool starts (assumes fork-style workers that inherit
# notebook globals - confirm on platforms that spawn).
with multiprocessing.Pool(processes=4) as pool:
    results = pool.starmap(extract_cal_data, func_args)

# Subjects without a cal.csv yield None; skip them and concatenate once
# rather than growing the frame with DataFrame.append inside a loop
# (quadratic copying, and removed in pandas 2.0).
frames = [df for df in results if df is not None]
final_df = pd.concat(frames) if frames else pd.DataFrame()


In [257]:
# Sanity check: (rows, columns) of the matched call table.
final_df.shape

(15233, 4)

In [258]:
# Preview the first matched calls.
final_df.head()

Unnamed: 0,combined_hash,timestamp,ring_duration,call_duration
0,1002060b07d836f246e50ce930bf90a9fe33939,1446589000.0,13.0,148.0
1,1002060b07d836f246e50ce930bf90a9fe33939,1446589000.0,6.0,354.0
2,1002060b07d836f246e50ce930bf90a9fe33939,1446592000.0,,50.0
3,1002060b07d836f246e50ce930bf90a9fe33939,1446592000.0,7.0,218.0
4,1002060b07d836f246e50ce930bf90a9fe33939,1446750000.0,17.0,53.0


In [259]:
# Persist the matched call table. The context manager guarantees the file is
# flushed and closed; protocol -1 selects the highest available pickle protocol.
with open("all_cal.df", "wb") as all_cal_file:
    pickle.dump(final_df, all_cal_file, -1)

In [260]:
# Size of the de-duplicated communication-event table before merging.
coe_df.shape

(283624, 40)

In [261]:
# Attach ring/call durations to the event table. The outer join keeps every
# row from both frames; events with no matched call get NaN durations.
coe_cal_df = coe_df.merge(final_df, on=['combined_hash', 'timestamp'], how='outer')

In [267]:
# Number of outgoing phone events in the merged table.
((coe_cal_df['comm_type'] == 'PHONE') & (coe_cal_df['comm_direction'] == 'OUTGOING')).sum()

12089

In [268]:
# Persist the merged table. The context manager guarantees the file is
# flushed and closed; protocol -1 selects the highest available pickle protocol.
with open("top_5_contacts_cal.df", "wb") as coe_cal_file:
    pickle.dump(coe_cal_df, coe_cal_file, -1)

## Min, median, max weekly sandbox

In [337]:
# Bucket events per participant and contact into weekly bins on date_tz.
group_key = ['pid', 'combined_hash', pd.Grouper(key='date_tz', freq='W')]
# count() gives the number of non-null values per column in each weekly bin.
wk_counts = coe_df.groupby(group_key).count()
# Summarise the weekly counts per (pid, combined_hash): median, min and max.
display(wk_counts.groupby(level=[0,1]).median().reset_index().head())
display(wk_counts.groupby(level=[0,1]).min().head())
display(wk_counts.groupby(level=[0,1]).max().head())

#wk_counts.loc[wk_counts.groupby(level=0).idxmin()]

Unnamed: 0,pid,combined_hash,comm_direction,comm_type,contact_name,contact_number,date,timestamp,day,hour,...,other,visit_reason:entertainment,visit_reason:errand,visit_reason:home,visit_reason:work,visit_reason:exercise,visit_reason:dining,visit_reason:socialize,visit_reason:travel/traffic,visit_reason:other
0,1002060,100206037bc00d68a24a359c7e5c7fc0c7bf7b8,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,...,0.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
1,1002060,10020604dee72583ac5647caf9d876b53ca158c,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,0.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
2,1002060,1002060632572ef12203e84583c0cab0295337f,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,...,0.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0
3,1002060,10020607fbbe92349588238af4c0417afa1d6d0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,0.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
4,1002060,1002060b07d836f246e50ce930bf90a9fe33939,18.5,18.5,18.5,18.5,18.5,18.5,18.5,18.5,...,0.0,18.5,18.5,18.5,18.5,18.5,18.5,18.5,18.5,18.5


Unnamed: 0_level_0,Unnamed: 1_level_0,comm_direction,comm_type,contact_name,contact_number,date,timestamp,day,hour,hour_wk,contact_type,...,other,visit_reason:entertainment,visit_reason:errand,visit_reason:home,visit_reason:work,visit_reason:exercise,visit_reason:dining,visit_reason:socialize,visit_reason:travel/traffic,visit_reason:other
pid,combined_hash,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1002060,100206037bc00d68a24a359c7e5c7fc0c7bf7b8,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,1,1
1002060,10020604dee72583ac5647caf9d876b53ca158c,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,1,1
1002060,1002060632572ef12203e84583c0cab0295337f,6,6,6,6,6,6,6,6,6,6,...,0,6,6,6,6,6,6,6,6,6
1002060,10020607fbbe92349588238af4c0417afa1d6d0,2,2,2,2,2,2,2,2,2,2,...,0,2,2,2,2,2,2,2,2,2
1002060,1002060b07d836f246e50ce930bf90a9fe33939,2,2,2,2,2,2,2,2,2,2,...,0,2,2,2,2,2,2,2,2,2


Unnamed: 0_level_0,Unnamed: 1_level_0,comm_direction,comm_type,contact_name,contact_number,date,timestamp,day,hour,hour_wk,contact_type,...,other,visit_reason:entertainment,visit_reason:errand,visit_reason:home,visit_reason:work,visit_reason:exercise,visit_reason:dining,visit_reason:socialize,visit_reason:travel/traffic,visit_reason:other
pid,combined_hash,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1002060,100206037bc00d68a24a359c7e5c7fc0c7bf7b8,28,28,28,28,28,28,28,28,28,28,...,0,28,28,28,28,28,28,28,28,28
1002060,10020604dee72583ac5647caf9d876b53ca158c,39,39,39,39,39,39,39,39,39,39,...,0,39,39,39,39,39,39,39,39,39
1002060,1002060632572ef12203e84583c0cab0295337f,27,27,27,27,27,27,27,27,27,27,...,0,27,27,27,27,27,27,27,27,27
1002060,10020607fbbe92349588238af4c0417afa1d6d0,32,32,32,32,32,32,32,32,32,32,...,0,32,32,32,32,32,32,32,32,32
1002060,1002060b07d836f246e50ce930bf90a9fe33939,39,39,39,39,39,39,39,39,39,39,...,0,39,39,39,39,39,39,39,39,39


In [335]:
# Load the previously computed feature table. The context manager closes the
# file handle instead of leaving it to the garbage collector.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open("../data/top_5_baseline_new_test_train_features.df", "rb") as features_file:
    test_df = pickle.load(features_file)

In [336]:
# Stored features for one contact (column names suggest weekly incoming-SMS
# min / median / max - confirm against the feature-generation code).
test_df.loc[test_df['combined_hash'] == '12037250d8a6e6eb6122d08aefe766b479f0da3'][['min_in_sms', 'med_in_sms', 'max_in_sms']]

Unnamed: 0,min_in_sms,med_in_sms,max_in_sms
220,14.0,137.0,229.0


In [None]:
# Display the full feature table for inspection.
test_df

In [329]:
# Isolate one contact's events for debugging. Take an explicit copy so that
# adding a column below modifies an independent frame and does not trigger
# pandas' SettingWithCopyWarning.
debug_df = coe_df.loc[coe_df['combined_hash'] == '12037250d8a6e6eb6122d08aefe766b479f0da3'].copy()
# Truncate each date_tz value to midnight so events can be binned by day/week.
debug_df['date_days'] = pd.DatetimeIndex(debug_df['date_tz']).normalize()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [332]:
# Keep only the incoming SMS events of the debug contact.
debug_in_sms = debug_df.loc[(debug_df['comm_type'] == 'SMS') & (debug_df['comm_direction'] == 'INCOMING')]

In [333]:
# Weekly bins on the normalized date_days column for the debug contact.
group_key = ['pid', 'combined_hash', pd.Grouper(key='date_days', freq='W')]
# Non-null contact_type entries per week, i.e. incoming SMS per week.
wk_counts = debug_in_sms.groupby(group_key)['contact_type'].count()
display(wk_counts)
# Median, min and max of the weekly counts - presumably for comparison with
# the med_in_sms / min_in_sms / max_in_sms values stored in test_df.
display(wk_counts.groupby(level=[0,1]).median().reset_index().head())
display(wk_counts.groupby(level=[0,1]).min().head())
display(wk_counts.groupby(level=[0,1]).max().head())

pid      combined_hash                            date_days 
1203725  12037250d8a6e6eb6122d08aefe766b479f0da3  2015-11-08     40
                                                  2015-11-15    122
                                                  2015-11-22    135
                                                  2015-11-29    180
                                                  2015-12-06    139
                                                  2015-12-13     14
                                                  2015-12-20    229
                                                  2015-12-27    170
                                                  2016-01-03    165
                                                  2016-01-10     25
Name: contact_type, dtype: int64

Unnamed: 0,pid,combined_hash,contact_type
0,1203725,12037250d8a6e6eb6122d08aefe766b479f0da3,137


pid      combined_hash                          
1203725  12037250d8a6e6eb6122d08aefe766b479f0da3    14
Name: contact_type, dtype: int64

pid      combined_hash                          
1203725  12037250d8a6e6eb6122d08aefe766b479f0da3    229
Name: contact_type, dtype: int64