In [1]:
%matplotlib inline

import sys
sys.path.append('../python/')

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import imp
import cdr
import json
import os.path
from scipy.spatial.distance import cosine
from pandas.io.json import json_normalize

imp.reload(cdr)

print ("Libraries loaded")

dfs = pd.DataFrame(columns=["cellId", "time", "countryCode", "smsIn", "smsOut","callIn", "callOut", "internet"])

# Read all the files in Milano
for month in {"11","12"}:
    for day in range(1, 32):
        to_read = '../../data/CDR/sms-call-internet-mi-2013-' + month + '-' +\
                    str(day).zfill(2) + '.txt'
        
        if os.path.isfile(to_read):
            temp = pd.read_csv(to_read, delimiter='\t', header=None)
            temp.columns = ["cellId", "time", "countryCode", "smsIn", "smsOut","callIn", "callOut", "internet"]
            temp = temp[temp['countryCode'] != 0]
            temp = temp.groupby(['cellId','time'], as_index=False).agg({                        
                    'smsIn': 'sum',
                    'smsOut': 'sum',
                    'callIn': 'sum',
                    'callOut': 'sum',
                    'internet': 'sum'
                })
            dfs = dfs.append(temp)
            print ("loaded ", to_read)

print ("Milano files loaded")

# dfs['1101'] = pd.read_csv('../../data/CDR/sms-call-internet-mi-2013-11-01.txt', delimiter='\t', header=None) 
# file = '../../data/GeoJSON/milano-grid.geojson'
# with open(file) as f:
#     grid = pd.read_json(f, typ='Series')

# print ("Grid file loaded")



Libraries loaded
loaded  ../../data/CDR/sms-call-internet-mi-2013-12-01.txt
loaded  ../../data/CDR/sms-call-internet-mi-2013-12-02.txt
loaded  ../../data/CDR/sms-call-internet-mi-2013-12-03.txt
loaded  ../../data/CDR/sms-call-internet-mi-2013-12-04.txt
loaded  ../../data/CDR/sms-call-internet-mi-2013-12-05.txt
loaded  ../../data/CDR/sms-call-internet-mi-2013-12-06.txt
loaded  ../../data/CDR/sms-call-internet-mi-2013-12-07.txt
loaded  ../../data/CDR/sms-call-internet-mi-2013-12-08.txt
loaded  ../../data/CDR/sms-call-internet-mi-2013-12-09.txt
loaded  ../../data/CDR/sms-call-internet-mi-2013-12-10.txt
loaded  ../../data/CDR/sms-call-internet-mi-2013-12-11.txt
loaded  ../../data/CDR/sms-call-internet-mi-2013-12-12.txt
loaded  ../../data/CDR/sms-call-internet-mi-2013-12-13.txt
loaded  ../../data/CDR/sms-call-internet-mi-2013-12-14.txt
loaded  ../../data/CDR/sms-call-internet-mi-2013-12-15.txt
loaded  ../../data/CDR/sms-call-internet-mi-2013-12-16.txt
loaded  ../../data/CDR/sms-call-interne

In [2]:

dfs.index = dfs.cellId
dfs.fillna(0, inplace=True)

In [3]:
# loading the region-cell data
table = pd.read_csv('../../data/CDR/hash/intersect.csv', header = None) 
table.columns = ['region', 'proportions']
table.index = table.region
table.sort_values(['region'], inplace=True)

# loading the cell-proportion data
prop_table = pd.read_csv('../../data/CDR/hash/cell_intersect.csv', header = None) 
prop_table.columns = ['cell', 'proportions']
prop_table.index = prop_table.cell
prop_table.sort_values(['cell'], inplace=True)

def get_cells_per_region(table, region_id):
    ids = table.iloc[region_id].proportions
    ids = ast.literal_eval(table.get_value(region_id, "proportions"))
    return ids.keys()

def get_call_data(region, df):        
    r_dict = get_cells_per_region(table, region)

    subset = df[df.index.isin(r_dict)]
            
    return subset

def calculate_actual_call(cell_id, region_id, call):
    """
        Create another column on the subset DataFrame that is proportional to the regions.
    """
    prop = ast.literal_eval(prop_table.get_value(int(cell_id), "proportions"))

    try:
        final = prop[str(region_id)] * float(call)
    except:
        final = 0
    
    return final

In [4]:
# dfs['1101'].fillna(0, inplace=True)
# dfs['1101'].columns = ["cellId", "time", "countryCode", "smsIn", "smsOut",
#                    "callIn", "callOut", "internet"]

# dfs['1101'] = dfs['1101'][dfs['1101']['countryCode'] != 0]
# dfs['1101'].index = dfs['1101'].cellId

In [5]:
dfs.head()

Unnamed: 0_level_0,callIn,callOut,cellId,countryCode,internet,smsIn,smsOut,time
cellId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.0,0.109202,0.164427,1.0,0,13.674575,0.110989,0.166214,1385852000000.0
1.0,0.030875,0.0273,1.0,0,13.330858,0.165137,0.176399,1385853000000.0
1.0,0.054601,0.0,1.0,0,11.329552,0.186451,0.136588,1385854000000.0
1.0,0.082526,0.135964,1.0,0,13.166163,0.219652,0.381129,1385854000000.0
1.0,0.054601,0.079575,1.0,0,13.347791,0.295114,0.11045,1385855000000.0


In [6]:
tp = dfs

In [7]:
# change miliseconds to datetime
tp.index = pd.to_datetime(tp['time'],unit='ms',utc=True)
tp.index = tp.index.tz_localize('UTC').tz_convert('Europe/Rome')
tp['date'] = tp.index
tp['time_hour'] = tp.index.hour
tp['month'] = tp.index.month
tp['day'] = tp.index.day

In [8]:
tp = tp.groupby(['cellId','month','day','time_hour'], as_index=False).agg({     
            'time': 'min',
            'smsIn': 'sum',
            'smsOut': 'sum',
            'callIn': 'sum',
            'callOut': 'sum',
            'internet': 'sum'
        })
tp.index = tp.cellId

In [9]:
pd.options.mode.chained_assignment = None



In [10]:
region_based = pd.DataFrame(columns=['region_id','time','adjusted_smsIn','adjusted_smsOut','adjusted_callIn','adjusted_callOut','adjusted_internet'])
import ast
import numpy as np
for re in range(1, 86):
    print(re)
    subdf = get_call_data(re, tp)
    subdf["region_id"] = re
    try:
        subdf["adjusted_smsIn"] = np.vectorize(calculate_actual_call)(subdf["cellId"], subdf["region_id"], subdf["smsIn"])
        subdf["adjusted_smsOut"] = np.vectorize(calculate_actual_call)(subdf["cellId"], subdf["region_id"], subdf["smsOut"])
        subdf["adjusted_callIn"] = np.vectorize(calculate_actual_call)(subdf["cellId"], subdf["region_id"], subdf["callIn"])
        subdf["adjusted_callOut"] = np.vectorize(calculate_actual_call)(subdf["cellId"], subdf["region_id"], subdf["callOut"])
        subdf["adjusted_internet"] = np.vectorize(calculate_actual_call)(subdf["cellId"], subdf["region_id"], subdf["internet"])
    except:
        print ("error in making a column...")
        continue
        
    # do aggregation for 
    subdf = subdf.groupby("time").agg({
                'region_id': 'first',
                'adjusted_smsIn': 'sum',
                'adjusted_smsOut': 'sum',
                'adjusted_callIn': 'sum',
                'adjusted_callOut': 'sum',
                'adjusted_internet': 'sum'
            })
    
    region_based = region_based.append(subdf)

region_based.time = region_based.index


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85


IndexError: single positional indexer is out-of-bounds

In [None]:
internet = region_based.pivot(index='region_id', columns='time', values='adjusted_internet')

In [None]:
internet.fillna(0, inplace=True)

In [None]:
internet.head()

In [None]:
x = internet.as_matrix()

In [None]:
mean = np.mean(internet, 0)
mean.count()

In [None]:
x_prime = np.zeros(shape=(85,mean.count()))

In [None]:
for i in range(0, 85):
    x_prime[i] = x[i] - mean


In [None]:
x_prime = np.transpose(x_prime)
len(x_prime)

In [None]:
xpp = pd.DataFrame(x_prime)
corr = xpp.corr('pearson')
corr.index = corr.index + 1
corr.columns = range(1,86)

In [None]:
corr.head()

In [None]:
f = plt.figure(figsize=(12, 20))
gs = plt.GridSpec(15, 1)
map_ax = f.add_subplot(gs[0:-5])
sns.heatmap(corr, ax=map_ax,
            cbar_kws={"orientation": "horizontal"})
map_ax.set(xlabel="Calls");

In [None]:
callIn = region_based.pivot(index='region_id', columns='time', values='adjusted_callIn')
callIn.fillna(0, inplace=True)
x = callIn.as_matrix()
mean = np.mean(callIn, 0)
x_prime = np.zeros(shape=(85, mean.count()))
for i in range(0, 85):
    x_prime[i] = x[i] - mean
x_prime = np.transpose(x_prime)

xpp = pd.DataFrame(x_prime)
corr = xpp.corr('pearson')
corr.index = corr.index + 1
corr.columns = range(1,86)

f = plt.figure(figsize=(12, 20))
gs = plt.GridSpec(15, 1)
map_ax = f.add_subplot(gs[0:-5])
sns.heatmap(corr, ax=map_ax,
            cbar_kws={"orientation": "horizontal"})
map_ax.set(xlabel="Call-In");