In [331]:
%matplotlib inline
from sensible_raw.loaders import loader
import pandas as pd
from collections import defaultdict
import matplotlib.pylab as plt
import seaborn as sns
import subprocess
import os
import json
import datetime
from datetime import datetime as dt
import numpy as np

In [155]:
# Load data
columns_sms, data_sms = loader.load_data("sms", "february_2014")

dict_tmp = {}
for column, array in zip(columns_sms, data_sms):
    dict_tmp[column] = array
df_sms = pd.DataFrame(dict_tmp)

In [156]:
# Filter out non-delivered messages
df_sms = df_sms[df_sms['status'] <= 0]

In [118]:
# Declare typical class hours
typical_class_hours = range(8,10) + range(13,15)

# Feature extraction from sms data

### Features to extract
    
    1. Outward sociability (daily number of sms'es sent)
    2. Inward sociability (daily number of sms'es received)
    3. Outward sociability in typical class hours
    4. Overall responsiveness to messages
    5. Overall received responsiveness to messages
    6. Selectivity in responsiveness
    7. Fractions of conversations started
    8. Non-study outgoing messaging fraction

### 1-3

In [509]:
def compute_outward_sociability(user):
    outgoing_messages = df_sms[df_sms['user']==user][df_sms['type']==2]
    return {'outward_sociability': len(outgoing_messages)}

def compute_inward_sociability(user):
    ingoing_messages = df_sms[df_sms['user']==user][df_sms['type']==1]
    return {'inward_sociability': len(ingoing_messages)}

def compute_outward_sociability_in_typical_class_hours(user):
    outgoing_messages_timestamps = df_sms[df_sms['user']==user][df_sms['type']==2]['timestamp']
    outgoing_messages_timestamps_in_typical_class_hours = filter(
        lambda x: dt.fromtimestamp(x/1000).hour
        in typical_class_hours, outgoing_messages_timestamps)
    
    return {'outward_sociability_in_typical_class_hours': len(outgoing_messages_timestamps_in_typical_class_hours)}

### 4 - 7
#### Strategy:
    1. Break data into to conversations (using critical time seperation)
    2. Treat each conversation as a two-noded graph. Directed links correspond to messages, and their lengths correspond to response time
    3. Compute average response time, and response time experiences on each conversation.
    4. Compute Responsiveness as the average response time and std. of Responsiveness from this.

In [451]:
def compute_conversation_response_times(conversation):
    conversation = conversation.sort(['timestamp'],ascending=[1])
    
    response_times = defaultdict(list)
    
    sender_prev = {}
    sender = {}
    
    for i, message in enumerate(conversation.iterrows()):
        
        if i == 0:
            sender_prev = {'actor': message[1]['type'],
                        'timestamp': message[1]['timestamp']}
            continue
        
        sender = {'actor': message[1]['type'],
                  'timestamp': message[1]['timestamp']}
        
        if sender['actor'] == sender_prev['actor']:
            continue
            
        response_time = (sender['timestamp'] - sender_prev['timestamp'])/1000
        
        response_times[sender['actor']].append(response_time)
        
        sender_prev = {'actor': message[1]['type'],
                       'timestamp': message[1]['timestamp']}

    response_time_user = np.mean(response_times[2])
    response_time_conserver = np.mean(response_times[1])
    
    return response_time_user, response_time_conserver

In [398]:
def partition_dyad_messages_to_conversations(messages, expiration_time):
    """Used inside compute_user_conversations_response_times only"""
    conversations = []
    
    timestamps = sorted(messages['timestamp']/1000)
    
    conversation_breaks = []
    for i, _ in enumerate(timestamps):
        if i == 0:
            continue
        delta_t = timestamps[i] - timestamps[i-1]
        
        if delta_t > expiration_time * 3600:
            start_break = int(timestamps[i-1])
            end_break = int(timestamps[i])
            conversation_breaks.append([start_break, end_break])
            
    conversations = []
    
    if len(conversation_breaks) == 0:
        conversations.append(messages)
        return conversations
    
    for i, _ in enumerate(conversation_breaks):
        if i == 0:
            conv_end = conversation_breaks[i][0]
            conversation = messages[
                messages['timestamp']/1000 <= conv_end]
            conversations.append(conversation)
            continue
        if i == len(conversation_breaks)-1:
            conv_start = conversation_breaks[i][1]
            conversation = messages[
                messages['timestamp']/1000 >= conv_start]
            conversations.append(conversation)
            continue
        
        conv_start = conversation_breaks[i-1][1]
        conv_end = conversation_breaks[i][0]
        conversation = messages[
            messages['timestamp']/1000 >= conv_start][
            messages['timestamp']/1000 <= conv_end]
        
        conversations.append(conversation)
        
    return conversations

In [531]:
def compute_user_conversations(user,expiration_time=6):
    
    user_messages = df_sms[df_sms['user']==user]
    
    # the people that the user texts with
    conservers = set(user_messages['address'])
    
    user_conversations = {}
    
    for c in conservers:
        c_messages = user_messages[user_messages['address'] == c]
        
        c_conversations = partition_dyad_messages_to_conversations(c_messages,expiration_time)
        
        user_conversations[c] = c_conversations
        
    return user_conversations

In [532]:
def compute_features_4_to_7(user):
    user_conversations = compute_user_conversations(user)
    
    converser_response_times = {}
    conversations_started = 0
    conversations_count = 0
    
    i = 0
    for converser, conversations in user_conversations.items():
        
        # compute response times
        response_times = [compute_conversation_response_times(c) for c in conversations] #[(907.0, 36.0), (nan, nan), (239.0, 205.0), (140.0, 50.0)]
        
        average_outward = np.mean([r[0] for r in response_times])
        average_inward = np.mean([r[1] for r in response_times])
        
        converser_response_times[converser] = {'average_outward': average_outward, 'average_inward': average_inward}
        
        conversations_started += len([c for c in conversations if list(c['type'])[0] == 2])
        conversations_count += len(conversations)
        
        #i += 1
        #if i > 5:
        #    return converser_response_times
        
    # compute feature 4
    responsiveness_list = [v['average_outward'] for k,v in converser_response_times.items()
                           if not np.isnan(v['average_outward'])]
    
    overall_responsiveness = np.mean(responsiveness_list)
    
    # compute feature 5
    overall_received_responsiveness = np.mean(
        [v['average_inward'] for k,v in converser_response_times.items() 
         if not np.isnan(v['average_inward'])])
    
    # compute feature 6
    selectivity_in_responsiveness = np.std(responsiveness_list)
    
    # compute feature 7
    fractions_of_conversations_started = conversations_started * 1.0/conversations_count
    
    return {'overall_responsiveness': overall_responsiveness, 
            'overall_received_responsiveness': overall_received_responsiveness, 
            'selectivity_in_responsiveness': selectivity_in_responsiveness, 
            'fractions_of_conversations_started': fractions_of_conversations_started}

compute_features_4_to_7(166)


    

{'fractions_of_conversations_started': 0.49390243902439024,
 'overall_received_responsiveness': 2156.9962962962968,
 'overall_responsiveness': 1480.9791666666667,
 'selectivity_in_responsiveness': 2589.5229294854466}

#### 8

In [521]:
def compute_non_study_outgoing_messaging_fraction(user):
    user_messages = df_sms[df_sms['user']==user]
    outgoing_messages = user_messages[
        user_messages['type'] == 2]
    non_study_outgoing_messages = outgoing_messages[
        user_messages['address'] > max(df_sms['user'])]
    
    return len(non_study_outgoing_messages) * 1.0/len(outgoing_messages)
    

In [527]:
compute_non_study_outgoing_messaging_fraction(4)

0.6592920353982301

In [534]:
tmp = compute_user_conversations(101)

In [543]:
tmp.items()[4][1]

[        address  status      timestamp  type  user
 252558    48431      -1  1393500613000     1   101
 252559    48431      -1  1393509668000     2   101
 252560    48431      -1  1393516294000     1   101
 252561    48431      -1  1393525972000     2   101
 252562    48431      -1  1393534882000     2   101
 252563    48431      -1  1393535084000     1   101
 252564    48431      -1  1393539784000     2   101
 252565    48431      -1  1393540592000     1   101
 252566    48431      -1  1393540966000     2   101
 252567    48431      -1  1393541018000     1   101
 252568    48431      -1  1393541058000     2   101
 252569    48431      -1  1393541103000     1   101
 252570    48431      -1  1393542246000     2   101,
         address  status      timestamp  type  user
 272976    48431      -1  1393608393000     1   101
 272977    48431      -1  1393608530000     2   101
 272978    48431      -1  1393608710000     1   101
 272979    48431      -1  1393608765000     2   101
 272980    