In [2]:
import numpy as np
import pandas as pd

In [125]:
import pickle
import scipy.sparse as sp

## Run once: combine phone brand and model into a single `phone_model` column

In [50]:
# One-off preprocessing: merge the brand and model columns of
# data/phone_brand_device_model.csv into a single `phone_model` column and
# write the result to data/phone_model_device.csv.
#
# Both files are managed by one `with` statement so the output handle is
# flushed and closed even if a malformed row raises part-way through. The
# input is opened first, so a missing source file no longer truncates an
# existing output file.
with open('data/phone_brand_device_model.csv') as f, \
        open('data/phone_model_device.csv', 'w') as out:
    out.write('device_id,phone_model\n')
    for line in f:
        # Strip the line terminator up front so it never ends up inside the
        # model name; the row terminator is written explicitly below. This
        # keeps rows separated even when the last input line has no trailing
        # newline or a row carries extra comma-separated fields.
        fields = line.rstrip('\n').split(',')
        if fields[0] == 'device_id' or fields == ['']:
            # Header row or blank line: nothing to copy.
            continue
        device_id, phone, model = fields[0], fields[1], fields[2]
        out.write(device_id + ',' + phone + ' ' + model + '\n')

## Preprocess

In [16]:
# Load the app-level event log. Only the four columns used below are read,
# and the two flag columns are loaded with bool dtype.
app_event = pd.read_csv(
    'data/app_events.csv',
    usecols=['event_id', 'app_id', 'is_installed', 'is_active'],
    dtype={'is_installed': bool, 'is_active': bool},
)

In [17]:
# Preview the first five rows of the app event table.
app_event.head(n=5)

Unnamed: 0,event_id,app_id,is_installed,is_active
0,2,5927333115845830913,True,True
1,2,-5720078949152207372,True,False
2,2,-1633887856876571208,True,False
3,2,-653184325010919369,True,True
4,2,8693964245073640147,True,True


In [33]:
# Load the event table indexed by event_id, parsing `timestamp` as datetimes.
events = pd.read_csv(
    'data/events.csv',
    index_col='event_id',
    parse_dates=['timestamp'],
)

  mask |= (ar1 == a)


In [34]:
# Discard the timestamp and location columns, which are not used below.
# Re-binding instead of mutating in place, together with errors='ignore',
# makes this cell safe to re-run: the in-place drop raised KeyError on a
# second execution because the columns were already gone.
events = events.drop(['timestamp', 'longitude', 'latitude'], axis=1, errors='ignore')

In [35]:
# Preview the first five rows of the trimmed event table.
events.head(n=5)

Unnamed: 0_level_0,device_id
event_id,Unnamed: 1_level_1
1,29182687948017175
2,-6401643145415154744
3,-4833982096941402721
4,-6815121365017318426
5,-5373797595892518570


In [36]:
# Attach the columns of `events` to every app event by looking the row's
# event_id up in the index of `events`. Left join: every app_event row is
# kept; clashing column names from `events` would get an '_r' suffix.
app_device = pd.merge(
    app_event,
    events,
    how='left',
    left_on='event_id',
    right_index=True,
    suffixes=('', '_r'),
)

In [39]:
# Preview the first five rows of the joined app/device table.
app_device.head(n=5)

Unnamed: 0,event_id,app_id,is_installed,is_active,device_id
0,2,5927333115845830913,True,True,-6401643145415154744
1,2,-5720078949152207372,True,False,-6401643145415154744
2,2,-1633887856876571208,True,False,-6401643145415154744
3,2,-653184325010919369,True,True,-6401643145415154744
4,2,8693964245073640147,True,True,-6401643145415154744


## Build the adjacency matrix between device_id and phone model

In [115]:
# Load the combined brand+model table; the first row of the file is the
# header (device_id, phone_model).
phone_brand = pd.read_csv('data/phone_model_device.csv', header=0)

In [112]:
# Distinct device ids that appear in the joined app/device table.
device_id = set(app_device.loc[:, 'device_id'].values)

In [113]:
# Assign each distinct device a consecutive integer index 0..n-1, in the
# iteration order of the `device_id` set.
idx_map = dict(zip(device_id, range(len(device_id))))

In [114]:
# Number of device nodes; used below as the offset at which the indices of
# the non-device nodes (phone models, apps) start.
all_device_start = len(idx_map)

In [116]:
# Give each distinct phone model an index placed directly after the device
# indices (first model gets all_device_start, the next one +1, ...).
phone_map = {
    model_name: node_idx
    for node_idx, model_name in enumerate(
        set(phone_brand['phone_model'].values), start=all_device_start
    )
}

In [119]:
# Build the device -> phone-model edges as two parallel index arrays.
#
# The previous version scanned the whole phone_brand frame twice for every
# device (a membership test plus a boolean-mask filter), i.e. work
# proportional to devices x rows. The frame is now indexed once into a dict;
# the resulting arrays are the same, in the same order.
model_of_device = {}
for dev, model in zip(phone_brand['device_id'].values,
                      phone_brand['phone_model'].values):
    # setdefault keeps the FIRST model seen for a device, matching the
    # earlier `.values[0]` pick for devices that occur on several rows.
    model_of_device.setdefault(dev, model)

from_idx = []
to_idx = []
for device, device_idx in idx_map.items():
    # Devices without a phone_brand row contribute no edge.
    if device in model_of_device:
        from_idx.append(device_idx)
        to_idx.append(phone_map.get(model_of_device[device]))

from_idx = np.array(from_idx, dtype=np.int64)
to_idx = np.array(to_idx, dtype=np.int64)

In [126]:
# Square sparse matrix over (devices + phone models) nodes holding one entry
# of value 1 at (device index, phone-model index) per collected edge.
adj_phone = sp.coo_matrix(
    (np.ones(len(from_idx)), (from_idx, to_idx)),
    shape=(len(idx_map) + len(phone_map),) * 2,
    dtype=np.int32,
)

In [133]:
# Persist the device/phone matrix in compressed .npz form.
sp.save_npz('data/adj_phone', adj_phone, compressed=True)

## Build the adjacency matrix between device_id and app_id (installed apps)

In [144]:
# Give each distinct app an index placed directly after the device indices
# (first app gets all_device_start, the next one +1, ...).
app_map = {
    app: node_idx
    for node_idx, app in enumerate(
        set(app_device['app_id'].values), start=all_device_start
    )
}

In [145]:
# Build the device -> app edges for rows flagged is_installed, as two
# parallel index arrays (one entry per matching row, duplicates included).
#
# The previous version ran a full scan of app_device twice for every device
# (membership test + boolean-mask filter), i.e. work proportional to
# devices x rows. The lookups are now vectorised.
installed_rows = app_device[app_device['is_installed']]
from_idx = installed_rows['device_id'].map(idx_map).values
to_idx = installed_rows['app_id'].map(app_map).values

# A failed lookup would surface as NaN and be silently mangled by the integer
# cast below, so fail loudly instead.
if pd.isnull(from_idx).any() or pd.isnull(to_idx).any():
    raise KeyError('device_id or app_id missing from idx_map / app_map')

# The per-device loop emitted edges grouped by device index, keeping row
# order within each device; a stable sort on the device index reproduces
# that exact ordering.
order = np.argsort(from_idx, kind='mergesort')
from_idx = from_idx[order].astype(np.int64)
to_idx = to_idx[order].astype(np.int64)

In [148]:
# Square sparse matrix over (devices + apps) nodes holding one entry of
# value 1 at (device index, app index) per collected edge.
app_installed = sp.coo_matrix(
    (np.ones(len(from_idx)), (from_idx, to_idx)),
    shape=(len(idx_map) + len(app_map),) * 2,
    dtype=np.int32,
)

In [153]:
# Persist the device/installed-app matrix in compressed .npz form.
sp.save_npz('data/adj_app_installed', app_installed, compressed=True)

## Build the adjacency matrix between device_id and app_id (active apps)

In [155]:
# Build the device -> app edges for rows flagged is_active, as two parallel
# index arrays (one entry per matching row, duplicates included).
#
# The previous version ran a full scan of app_device twice for every device
# (membership test + boolean-mask filter), i.e. work proportional to
# devices x rows. The lookups are now vectorised.
active_rows = app_device[app_device['is_active']]
from_idx = active_rows['device_id'].map(idx_map).values
to_idx = active_rows['app_id'].map(app_map).values

# A failed lookup would surface as NaN and be silently mangled by the integer
# cast below, so fail loudly instead.
if pd.isnull(from_idx).any() or pd.isnull(to_idx).any():
    raise KeyError('device_id or app_id missing from idx_map / app_map')

# The per-device loop emitted edges grouped by device index, keeping row
# order within each device; a stable sort on the device index reproduces
# that exact ordering.
order = np.argsort(from_idx, kind='mergesort')
from_idx = from_idx[order].astype(np.int64)
to_idx = to_idx[order].astype(np.int64)

In [156]:
# Square sparse matrix over (devices + apps) nodes holding one entry of
# value 1 at (device index, app index) per collected edge.
app_active = sp.coo_matrix(
    (np.ones(len(from_idx)), (from_idx, to_idx)),
    shape=(len(idx_map) + len(app_map),) * 2,
    dtype=np.int32,
)

In [157]:
# Persist the device/active-app matrix in compressed .npz form.
sp.save_npz('data/adj_app_active', app_active, compressed=True)

In [158]:
import gc

# Run a full (generation 2) collection; the value returned by the call is
# what the cell displays.
gc.collect(generation=2)

153