In [2]:
import pandas as pd
import json
import numpy as np
from pandas.io.json import json_normalize
from scipy.spatial.distance import cosine
import csv
import ast
import os

In [3]:
# Load the raw CDR sample and label its five columns.
RAW_COLUMNS = ['index', 'time', 'source', 'dest', 'call']
df = pd.read_csv('../../data/CDR/hash/sample.csv')
df.columns = RAW_COLUMNS
# Key every record by its source id; the 'source' column itself is kept.
df.index = df['source']

In [4]:
# Preview the first five raw records.
df.head(n=5)

Unnamed: 0_level_0,index,time,source,dest,call
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7853,11340721,1383327600000,7853,1005,0.000323
7853,11340722,1383327600000,7853,1012,0.000116
7853,11340723,1383327600000,7853,1103,0.000576
7853,11340724,1383327600000,7853,1117,0.000427
7853,11340725,1383292200000,7853,1131,0.000671


In [5]:

def _load_proportion_table(path, key):
    """Read a headerless two-column CSV into a frame keyed and sorted by `key`.

    Parameters
    ----------
    path : str
        Location of the CSV file; it is read with ``header=None``.
    key : str
        Name given to the first column. The second column is named
        ``'proportions'``.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by `key`, indexed by `key`, with `key` also kept as a
        regular column.
    """
    loaded = pd.read_csv(path, header=None)
    loaded.columns = [key, 'proportions']
    # Sort *before* installing the index: once `key` names both the index and
    # a column, sort_values(key) is ambiguous and recent pandas raises
    # ValueError for it.
    return loaded.sort_values(key).set_index(key, drop=False)


# loading the region-cell data
table = _load_proportion_table('../../data/CDR/hash/intersect.csv', 'region')

# loading the cell-proportion data
prop_table = _load_proportion_table('../../data/CDR/hash/cell_intersect.csv', 'cell')


In [44]:
# Preview the first five rows of the cell-proportion lookup.
prop_table.head(n=5)

Unnamed: 0_level_0,cell,proportions
cell,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,{}
2,2,{}
3,3,{}
4,4,{}
5,5,{}


In [96]:


def get_cells_per_region(table, region_id):
    """Return the keys of the proportions dict stored for one region.

    Parameters
    ----------
    table : pandas.DataFrame
        Lookup frame indexed by region id whose "proportions" column holds
        the text of a Python dict literal.
    region_id
        Index label of the row to read.

    Returns
    -------
    dict_keys
        Keys of the parsed dict (presumably the ids of the cells that
        intersect the region -- verify against the file that feeds `table`).

    Raises
    ------
    KeyError
        If `region_id` is not a label in ``table.index``.
    """
    # Label-based scalar lookup. This replaces DataFrame.get_value, which no
    # longer exists in current pandas, and drops an earlier positional
    # `iloc[region_id]` read whose result was never used and which raised
    # IndexError whenever the label was >= the number of rows.
    proportions = ast.literal_eval(table.at[region_id, "proportions"])
    return proportions.keys()

def get_call_data(source, dest, df):
    """Select the records that go from one region's cells to another's.

    Parameters
    ----------
    source, dest
        Region ids, looked up in the notebook-level `table`.
    df : pandas.DataFrame
        Call records. Its index is matched against the source region's keys
        and its "dest" column against the destination region's keys.

    Returns
    -------
    pandas.DataFrame
        A new frame (safe to add columns to) holding the matching rows,
        indexed by "dest" with the "dest" column kept.
    """
    source_cells = list(get_cells_per_region(table, source))
    dest_cells = list(get_cells_per_region(table, dest))

    # Build one positional boolean mask; `.values` avoids label alignment,
    # which matters because the index holds repeated source ids.
    from_source = df.index.isin(source_cells)
    to_dest = df["dest"].isin(dest_cells).values

    # set_index returns a fresh frame, so callers can assign new columns
    # without writing into (or warning about) a slice of `df`.
    return df[from_source & to_dest].set_index("dest", drop=False)

def calculate_actual_call(s_cell, d_cell, call, s_region, d_region):
    """
        Scale one call value by the source and destination region proportions.

        The "proportions" entries of the notebook-level `prop_table` for
        `s_cell` and `d_cell` are parsed as dict literals keyed by the string
        form of a region id. The result is
        source_prop[str(s_region)] * dest_prop[str(d_region)] * call.

        Returns 0.0 when either cell has no entry for the requested region.
        The fallback is a float on purpose: np.vectorize infers its output
        dtype from the first result, so an int 0 there would truncate every
        later value to an integer.

        Raises KeyError if `s_cell` or `d_cell` is not in `prop_table.index`.
    """
    # .at is the label-based scalar accessor that replaces the removed
    # DataFrame.get_value.
    source_prop = ast.literal_eval(prop_table.at[s_cell, "proportions"])
    dest_prop = ast.literal_eval(prop_table.at[d_cell, "proportions"])

    try:
        final = float(source_prop[str(s_region)] * dest_prop[str(d_region)] * call)
    except KeyError:
        # Only a missing region key means "no share"; other errors propagate.
        final = 0.0

    return final

In [109]:
# Aggregate the cell-level records into a region-to-region network with one
# row per (time, source_region, dest_region), then write it to CSV.
N_REGIONS = 80  # region ids are assumed to run 1..N_REGIONS -- TODO confirm against intersect.csv
NETWORK_COLUMNS = ['time', 'source_region', 'dest_region', 'adjusted_call']
output_filename = '../../data/CDR/generated/region_network.csv'

# Pin the output dtype: without `otypes`, np.vectorize takes the dtype from
# the first result (truncating floats if that result is an int) and refuses
# empty input.
adjust_calls = np.vectorize(calculate_actual_call, otypes=[float])

# Collect the per-pair aggregates and concatenate once at the end;
# DataFrame.append is gone from pandas 2.0 and was quadratic inside a loop.
region_frames = []
for s in range(1, N_REGIONS + 1):
    for d in range(1, N_REGIONS + 1):
        # get a subset of records for the source and dest
        # (copy so the new columns below never write into a slice of `df`)
        subdf = get_call_data(s, d, df).copy()
        if subdf.empty:
            continue
        subdf["source_region"] = s
        subdf["dest_region"] = d

        # create a column with adjusted call values
        try:
            subdf["adjusted_call"] = adjust_calls(
                subdf["source"], subdf["dest"], subdf["call"],
                subdf["source_region"], subdf["dest_region"],
            )
        except (KeyError, ValueError, SyntaxError) as err:
            # Best effort: a cell missing from the proportion lookup or an
            # unparsable proportions string skips this pair, not the whole run.
            print("error in making a column...", (s, d), repr(err))
            continue

        # sum the adjusted calls per timestamp for this region pair
        region_frames.append(
            subdf.groupby("time").agg({
                "source_region": "first",
                "dest_region": "first",
                "adjusted_call": "sum",
            })
        )

if region_frames:
    # The groupby key is the index; reset_index turns it back into 'time'.
    region_network = pd.concat(region_frames).reset_index()[NETWORK_COLUMNS]
else:
    region_network = pd.DataFrame(columns=NETWORK_COLUMNS)

region_network.to_csv(output_filename, encoding='utf-8', index=False)

/Users/myeong/git/DSSG/DSSG2016-SensingTheCensus/src/notebooks/cdr/1.txt
/Users/myeong/git/DSSG/DSSG2016-SensingTheCensus/src/notebooks/cdr/2.txt
