# Call length feature extraction

In [241]:
import csv
import math
import multiprocessing
import os
import pickle
from sys import exit
import shutil

import numpy as np
import pandas as pd

from df_utils import *


In [253]:
def process_cal(cal_df, pid):
    """
    Collapse a raw phone-state log into one record per completed call.

    The rows of ``cal_df`` are walked in order; each row must provide a
    ``call_state`` ('Ringing', 'Off-Hook' or 'Idle') and a ``timestamp``.
    A call starts at an 'Off-Hook' seen while no call is active and ends at
    the next 'Idle'. If the phone was ringing immediately before the
    'Off-Hook', the ring start/end are recorded as well; otherwise the ring
    fields are NaN. A 'Ringing' followed by 'Idle' without an 'Off-Hook'
    (a missed call) produces no record.

    Args:
        cal_df: DataFrame with ``timestamp`` and ``call_state`` columns.
        pid: identifier copied into the ``pid`` column of every record.

    Returns:
        DataFrame with columns ``pid``, ``ring_start``, ``ring_end``,
        ``ring_duration``, ``call_start``, ``call_end``, ``call_duration``.
    """
    ring_began = np.nan
    ring_stopped = np.nan
    call_began = 0
    call_stopped = 0
    on_call = False
    rang = False
    last_state = None
    records = []

    for _, entry in cal_df.iterrows():
        state = entry['call_state']

        if on_call:
            # While a call is active only a hang-up ('Idle') matters.
            if state == 'Idle':
                on_call = False
                call_stopped = entry['timestamp']
                if rang:
                    rang = False
                else:
                    # No ring preceded this call: blank the ring fields.
                    ring_began = np.nan
                    ring_stopped = np.nan
                records.append([
                    pid,
                    ring_began,
                    ring_stopped,
                    ring_stopped - ring_began,
                    call_began,
                    call_stopped,
                    call_stopped - call_began,
                ])
        elif state == 'Ringing':
            # Phone starts (or keeps) ringing.
            rang = True
            ring_began = entry['timestamp']
        elif state == 'Off-Hook':
            # Call begins; if the phone was just ringing, the ring ends here.
            if last_state == 'Ringing':
                ring_stopped = entry['timestamp']
            call_began = entry['timestamp']
            on_call = True
        elif state == 'Idle':
            # Idle without an active call: a ring that was never answered.
            rang = False
            ring_began = np.nan
            ring_stopped = np.nan

        last_state = state

    return pd.DataFrame(
        records,
        columns=['pid', 'ring_start', 'ring_end', 'ring_duration',
                 'call_start', 'call_end', 'call_duration'],
    )
        
    
def match_dates(phone_df, cal_df):
    """Matches call length from cal_df to the given logs in phone_df.

    For every row of ``phone_df`` whose ``comm_direction`` is 'INCOMING' or
    'OUTGOING', the row's ``timestamp`` is rounded up to an integer and
    compared against a window built from each call record in ``cal_df``:

        [ (ring_start, or call_start when ring_start is NaN) - 30,
          call_end + 1 ]

    The first call record (in ``cal_df`` row order) whose window contains the
    rounded timestamp supplies the durations. Phone rows with no matching
    window are dropped from the result.

    Args:
        phone_df: DataFrame with ``combined_hash``, ``timestamp`` and
            ``comm_direction`` columns.
        cal_df: DataFrame with ``ring_start``, ``call_start``, ``call_end``,
            ``ring_duration`` and ``call_duration`` columns (the shape
            produced by ``process_cal``).

    Returns:
        DataFrame with columns ``combined_hash``, ``timestamp`` (the original,
        un-rounded value), ``ring_duration`` and ``call_duration``.
    """
    cols = ['combined_hash', 'timestamp', 'ring_duration', 'call_duration']

    # The matching windows depend only on cal_df, so build them once instead
    # of re-deriving them row by row for every phone event.
    ring_start = cal_df['ring_start'].values.astype(float)
    call_start = cal_df['call_start'].values.astype(float)
    # Calls without a recorded ring fall back to the call start.
    window_open = np.where(np.isnan(ring_start), call_start, ring_start) - 30
    window_close = cal_df['call_end'].values.astype(float) + 1
    ring_durations = cal_df['ring_duration'].values
    call_durations = cal_df['call_duration'].values

    combined_rows = []
    for _, row in phone_df.iterrows():
        ts = math.ceil(row['timestamp'])
        if row['comm_direction'] not in ('INCOMING', 'OUTGOING'):
            continue

        hits = np.flatnonzero((window_open <= ts) & (window_close >= ts))
        if hits.size:
            # Keep only the earliest matching record, as a linear scan with
            # an early break would.
            first = hits[0]
            combined_rows.append([
                row['combined_hash'],
                row['timestamp'],
                ring_durations[first],
                call_durations[first],
            ])

    return pd.DataFrame(combined_rows, columns=cols)


def extract_cal_data(data_dir, subj, testing=False):
    """Extracts call length information for the given subject.

    Reads ``<data_dir>/<subj>/cal.csv`` (tab-separated, no header row),
    reduces it to per-call records with ``process_cal`` and matches those
    records against the subject's 'PHONE' rows in the module-level ``coe_df``
    via ``match_dates``.

    Relies on two module-level names being defined before it is called:
    ``call_cols`` (column names for the raw log) and ``coe_df``.

    Args:
        data_dir: directory containing one sub-directory per subject.
        subj: subject identifier; used both as the sub-directory name and as
            the value compared against ``coe_df['pid']``.
        testing: unused; kept for interface compatibility.

    Returns:
        The DataFrame produced by ``match_dates``, or ``None`` when the
        subject has no ``cal.csv``.
    """
    # os.path.join works whether or not data_dir ends with a separator.
    filename = os.path.join(data_dir, subj, 'cal.csv')
    if not os.path.exists(filename):
        return None

    # Only the read needs the file handle; release it before processing.
    with open(filename) as file_in:
        raw_df = pd.read_csv(file_in, delimiter='\t', header=None, names=call_cols)

    cal_df = process_cal(raw_df, subj)
    pid_coe = coe_df.loc[coe_df['pid'] == subj]
    phone_df = pid_coe.loc[pid_coe['comm_type'] == 'PHONE']

    return match_dates(phone_df, cal_df)

In [247]:
# Load the pickled communication-event table. The context manager closes the
# file handle once loading finishes.
# NOTE: pickle executes arbitrary code on load; only load files from a trusted source.
with open('../data/top_5_contacts_loc_final.df', 'rb') as coe_file:
    coe_df = pickle.load(coe_file)

In [256]:
# Run the call-length extraction for every subject directory in parallel and
# stack the per-subject results into a single table.
loc_coe_df = pd.DataFrame()
data_dir = '../CS120/CS120-sensor-csvs/'

subjects = os.listdir(data_dir)
call_cols = ['timestamp', 'call_state']

coe_df = coe_df.drop_duplicates(subset=['date', 'pid', 'combined_hash'])

cal_df = pd.DataFrame(columns=call_cols)

func_args = [(data_dir, subj) for subj in subjects]

# NOTE(review): extract_cal_data reads the module-level call_cols and coe_df,
# so the worker processes must be able to see them -- presumably this relies
# on a fork-based start method; confirm on platforms that spawn workers.
with multiprocessing.Pool(processes=4) as pool:
    results = pool.starmap(extract_cal_data, func_args)

# extract_cal_data returns None for subjects without a cal.csv; skip those and
# concatenate once instead of growing the frame with repeated appends.
subject_frames = [df for df in results if df is not None]
final_df = pd.concat(subject_frames) if subject_frames else pd.DataFrame()


In [257]:
# Dimensions (rows, columns) of the stacked per-subject call table.
final_df.shape

(15233, 4)

In [258]:
# Preview the first rows of the stacked call table.
final_df.head()

Unnamed: 0,combined_hash,timestamp,ring_duration,call_duration
0,1002060b07d836f246e50ce930bf90a9fe33939,1446589000.0,13.0,148.0
1,1002060b07d836f246e50ce930bf90a9fe33939,1446589000.0,6.0,354.0
2,1002060b07d836f246e50ce930bf90a9fe33939,1446592000.0,,50.0
3,1002060b07d836f246e50ce930bf90a9fe33939,1446592000.0,7.0,218.0
4,1002060b07d836f246e50ce930bf90a9fe33939,1446750000.0,17.0,53.0


In [259]:
# Persist the matched call durations. The context manager flushes and closes
# the file once the dump completes.
with open("all_cal.df", "wb") as all_cal_file:
    pickle.dump(final_df, all_cal_file, pickle.HIGHEST_PROTOCOL)

In [260]:
# Dimensions (rows, columns) of the de-duplicated communication-event table.
coe_df.shape

(283624, 40)

In [261]:
# Attach ring/call durations to the communication events. Rows are paired on
# (combined_hash, timestamp); the outer join keeps rows from either table that
# have no partner in the other.
coe_cal_df = pd.merge(
    coe_df,
    final_df,
    how='outer',
    on=['combined_hash', 'timestamp'],
)

In [267]:
# Count the merged rows that are outgoing phone calls
# (comm_type 'PHONE' and comm_direction 'OUTGOING').
((coe_cal_df['comm_type'] == 'PHONE') & (coe_cal_df['comm_direction'] == 'OUTGOING')).sum()

12089

In [268]:
# Persist the merged event/call-duration table. The context manager flushes
# and closes the file once the dump completes.
with open("top_5_contacts_cal.df", "wb") as coe_cal_file:
    pickle.dump(coe_cal_df, coe_cal_file, pickle.HIGHEST_PROTOCOL)