In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
import pickle
import scipy.sparse as sp

## Run once: combine phone brand and model into a single `phone_model` column

In [50]:
# Build data/phone_model_device.csv: one "device_id,phone_model" row per input
# row, where phone_model is "<brand> <model>" taken from the 2nd and 3rd
# comma-separated fields of data/phone_brand_device_model.csv.
# Both files are opened in one `with` statement so the output handle is closed
# (and flushed) even if a malformed line raises part-way through.
with open('data/phone_brand_device_model.csv') as f, \
        open('data/phone_model_device.csv', 'w') as out:
    out.write('device_id,phone_model\n')
    for line in f:
        fields = line.split(',')
        if fields[0] == 'device_id':
            # Header row of the input file; the new header was written above.
            continue
        device_id, phone, model = fields[0], fields[1], fields[2]
        # `model` still carries the line's trailing newline, which terminates
        # the output row.
        out.write(device_id + ',' + phone + ' ' + model)

## Preprocess (drop GPS data)

In [3]:
# Load the per-event app records, keeping only the four columns used below and
# reading the two flag columns directly as booleans.
app_event = pd.read_csv(
    'data/app_events.csv',
    usecols=['event_id', 'app_id', 'is_installed', 'is_active'],
    dtype=dict(is_installed=bool, is_active=bool),
)

In [28]:
app_event.head()

Unnamed: 0,event_id,app_id,is_installed,is_active
0,2,5927333115845830913,True,True
1,2,-5720078949152207372,True,False
2,2,-1633887856876571208,True,False
3,2,-653184325010919369,True,True
4,2,8693964245073640147,True,True


In [4]:
# Load the event log indexed by event_id, which is the key the app records are
# joined on further down.
events = pd.read_csv(
    'data/events.csv',
    index_col='event_id',
    parse_dates=['timestamp'],
)

  mask |= (ar1 == a)


In [5]:
# This variant of the preprocessing ignores time and location: only device_id
# is left on the events frame.
events.drop(columns=['timestamp', 'longitude', 'latitude'], inplace=True)

In [31]:
events.head()

Unnamed: 0_level_0,device_id
event_id,Unnamed: 1_level_1
1,29182687948017175
2,-6401643145415154744
3,-4833982096941402721
4,-6815121365017318426
5,-5373797595892518570


In [6]:
# Attach each event's device_id to its app records. The left join keeps every
# app_event row, matching its event_id column against the events index.
app_device = app_event.join(
    events,
    on='event_id',
    how='left',
    rsuffix='_r',
)

In [7]:
app_device.head()

Unnamed: 0,event_id,app_id,is_installed,is_active,device_id
0,2,5927333115845830913,True,True,-6401643145415154744
1,2,-5720078949152207372,True,False,-6401643145415154744
2,2,-1633887856876571208,True,False,-6401643145415154744
3,2,-653184325010919369,True,True,-6401643145415154744
4,2,8693964245073640147,True,True,-6401643145415154744


## Preprocess (keeping GPS data)

In [53]:
# Per-event app records: only the four columns used below, flags as booleans.
app_event = pd.read_csv(
    'data/app_events.csv',
    usecols=['event_id', 'app_id', 'is_installed', 'is_active'],
    dtype={'is_active': bool, 'is_installed': bool},
)
# Event log indexed by event_id. The timestamp column is not used in this
# section, so it is skipped at read time instead of being parsed as dates and
# dropped straight afterwards; the resulting frame has the same columns
# (device_id, longitude, latitude).
events = pd.read_csv(
    'data/events.csv',
    usecols=['event_id', 'device_id', 'longitude', 'latitude'],
    index_col='event_id',
)

  mask |= (ar1 == a)


In [16]:
events.head()

Unnamed: 0_level_0,device_id,GPS
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,29182687948017175,121.4 31.2
2,-6401643145415154744,103.7 31.0
3,-4833982096941402721,106.6 29.7
4,-6815121365017318426,104.3 23.3
5,-5373797595892518570,115.9 28.7


In [61]:
# Number of distinct devices that appear in the event log (a missing value, if
# any, counts as one distinct entry, as with len(unique())).
events['device_id'].nunique(dropna=False)

29591

In [56]:
# Coarsen both coordinates to one decimal place so nearby events share the
# same location value.
for coord in ('longitude', 'latitude'):
    events[coord] = events[coord].apply(lambda degrees: round(degrees, 1))

In [57]:
# Keep longitudes inside [75.9, 134.2] (inclusive) and blank out everything
# else as NaN, so that the `events.dropna(inplace=True)` cell removes those
# rows. The previous code assigned the whole filtered DataFrame to this single
# column instead of masking the column's own values.
# NOTE(review): the bounds look like a bounding box for mainland China - confirm.
events['longitude'] = events['longitude'].where(
    events['longitude'].between(75.9, 134.2)
)

In [58]:
# Keep latitudes inside [18.2, 52.3] (inclusive) and blank out everything else
# as NaN, so that the `events.dropna(inplace=True)` cell removes those rows.
# The previous code assigned the whole filtered DataFrame to this single
# column instead of masking the column's own values.
# NOTE(review): the bounds look like a bounding box for mainland China - confirm.
events['latitude'] = events['latitude'].where(
    events['latitude'].between(18.2, 52.3)
)

In [60]:
# Discard every event row that has a missing value in any column.
events.dropna(axis=0, how='any', inplace=True)

In [8]:
# The longitude column is reused as the combined location column, so give it
# its final name first.
events.rename(columns={'longitude': 'GPS'}, inplace=True)

In [13]:
# Turn the location into a single "<longitude> <latitude>" string per event.
longitude_text = events['GPS'].astype(str)
latitude_text = events['latitude'].astype(str)
events['GPS'] = longitude_text + ' ' + latitude_text

In [15]:
# Latitude now lives inside the GPS string; the separate column is redundant.
events.drop(columns=['latitude'], inplace=True)

In [19]:
# Attach each event's device_id and GPS string to its app records. The left
# join keeps every app_event row; rows whose event is absent from `events`
# get missing values.
app_device = app_event.join(
    events,
    on='event_id',
    how='left',
    rsuffix='_r',
)

In [20]:
app_device.head()

Unnamed: 0,event_id,app_id,is_installed,is_active,device_id,GPS
0,2,5927333115845830913,True,True,-6401643145415154744,103.7 31.0
1,2,-5720078949152207372,True,False,-6401643145415154744,103.7 31.0
2,2,-1633887856876571208,True,False,-6401643145415154744,103.7 31.0
3,2,-653184325010919369,True,True,-6401643145415154744,103.7 31.0
4,2,8693964245073640147,True,True,-6401643145415154744,103.7 31.0


## Get labels (skip this section if data/labels.csv already exists)

In [21]:
# Training labels, indexed by device_id.
label = pd.read_csv(
    'data/gender_age_train.csv',
    index_col='device_id',
)

In [22]:
# Only the combined `group` column is used as the target.
label.drop(columns=['gender', 'age'], inplace=True)

In [23]:
# Replace each group name by its integer category code.
group_as_category = label['group'].astype('category')
label['group'] = group_as_category.cat.codes

In [24]:
# Convert to a nested dict of the form {'group': {device_id: code}}.
label = label.to_dict(orient='dict')

In [128]:
# Drop labels for devices that are not graph nodes.
# NOTE(review): `idx_map` is only created further down (adjacency section), so
# this cell relies on it already being in the kernel - confirm the intended
# execution order.
unknown_devices = [dev for dev in label['group'] if dev not in idx_map]
for dev in unknown_devices:
    del label['group'][dev]

In [136]:
# Back to a frame (device_id index, `group` column) and persist it for reuse.
labels = pd.DataFrame.from_dict(label, orient='columns')
labels.to_csv(path_or_buf='data/labels.csv')

## Get Adjacency Matrix for Device_id and Phone

In [8]:
# Reload the saved labels. The device ids come back as an unnamed first column
# ('Unnamed: 0'), which the following cell renames.
labels = pd.read_csv('data/labels.csv')

In [9]:
# Give the unnamed first column of the reloaded CSV its proper name.
labels.rename(columns={'Unnamed: 0': 'device_id'}, inplace=True)

In [10]:
labels.head()

Unnamed: 0,device_id,group
0,-9222956879900151005,10
1,-9221026417907250887,3
2,-9220830859283101130,6
3,-9220061629197656378,11
4,-9218960997324667698,5


In [11]:
# device_id -> "<brand> <model>" table written by the brand/model combining
# cell at the top of the notebook.
phone_brand = pd.read_csv('data/phone_model_device.csv')

In [12]:
# Labeled device ids, in the row order of `labels`.
labeled_device_id = [device for device in labels['device_id']]

In [162]:
# 80/20 split of the labeled devices: the training part is rounded up, the
# test part gets whatever remains.
n_train = math.ceil(len(labeled_device_id) / 10.0 * 8)
# The remainder must be computed from n_train. The previous code subtracted
# `train_idx`, a range object that is only created in the next cell.
n_test = len(labeled_device_id) - n_train

In [163]:
# Node positions of the split: the first n_train labeled devices are training
# nodes, the next n_test are test nodes.
train_idx = range(0, n_train)
test_idx = range(n_train, n_train + n_test)

In [13]:
# Every device that either has app events or has a label.
all_id = set(app_device['device_id']) | set(labeled_device_id)

In [14]:
# Devices seen in the app events that have no label.
unlabeled_id = all_id.difference(labeled_device_id)

In [15]:
# Labeled devices go first so that their positions in the node list line up
# with train_idx and test_idx; unlabeled devices follow.
labeled_ids = np.array(labeled_device_id, dtype=np.int64)
unlabeled_ids = np.array(list(unlabeled_id), dtype=np.int64)
all_id_list = np.concatenate((labeled_ids, unlabeled_ids))

In [16]:
# Number of device nodes.
n_node = all_id_list.shape[0]

In [17]:
# device_id -> node index, following the order of all_id_list.
idx_map = {device: node for node, device in enumerate(all_id_list)}

In [18]:
# First node index available for phone models: they are numbered right after
# the device nodes.
phone_start = len(idx_map)

In [19]:
# phone model -> node index, numbered consecutively from phone_start.
phone_map = {
    model: node
    for node, model in enumerate(set(phone_brand['phone_model'].values), start=phone_start)
}

In [20]:
# First node index available for apps: they are numbered right after the
# phone-model nodes.
app_start = phone_start + len(phone_map)

In [21]:
# app_id -> node index, numbered consecutively from app_start.
app_map = {
    app: node
    for node, app in enumerate(set(app_device['app_id'].values), start=app_start)
}

In [46]:
# Only for the GPS variant of the preprocessing: location strings are numbered
# consecutively right after the app nodes.
GPS_start = app_start + len(app_map)
GPS_map = {
    location: node
    for node, location in enumerate(set(app_device['GPS'].values), start=GPS_start)
}

In [22]:
# Size of the shared node space: devices, then phone models, then apps.
# NOTE(review): GPS nodes (GPS_map) are not counted here - confirm that is
# intended when the GPS variant is used.
total_node = sum((n_node, len(phone_map), len(app_map)))

In [23]:
total_node

81726

In [190]:
# Edge list device node -> phone-model node.
#
# The previous loop scanned the whole device_id column (membership test) and
# filtered the whole frame once per device. A single device_id -> phone_model
# dict gives the same edges with one lookup per device. `keep='first'`
# reproduces the old `.values[0]` choice when a device appears more than once.
device_to_model = (
    phone_brand
    .drop_duplicates('device_id', keep='first')
    .set_index('device_id')['phone_model']
    .to_dict()
)

from_idx = []
to_idx = []
for device, node in idx_map.items():
    if device in device_to_model:
        from_idx.append(node)
        to_idx.append(phone_map.get(device_to_model[device]))

from_idx = np.array(from_idx, dtype=np.int64)
to_idx = np.array(to_idx, dtype=np.int64)

In [197]:
total_node

81726

In [198]:
# Sparse device -> phone-model adjacency over the full node space, with one
# unit entry per edge.
phone_edge_weights = np.ones(len(from_idx))
adj_phone = sp.coo_matrix(
    (phone_edge_weights, (from_idx, to_idx)),
    shape=(total_node, total_node),
    dtype=np.int32,
)

In [199]:
# Persist the phone adjacency in scipy's .npz sparse format.
sp.save_npz(file='data/adj_phone', matrix=adj_phone)

## Get adjacency matrices between device_id and app_id: one for is_installed, one for is_active

In [201]:
# Edge lists device node -> app node, one pair of arrays per flag
# (is_installed / is_active).
#
# Fixes relative to the previous version of this cell:
#   * the final arrays were built from `from_idx` / `to_idx` (the device->phone
#     edge lists of the phone section) instead of the lists collected here, so
#     both app matrices became copies of the phone adjacency;
#   * every device triggered a full scan of app_device twice (membership test
#     plus boolean filter); node ids are now looked up for all rows at once.
device_node = app_device['device_id'].map(idx_map)
# app_map is built from this same app_id column, so every row gets a node id.
app_node = app_device['app_id'].map(app_map)
# Rows whose device_id has no entry in idx_map map to NaN and are skipped,
# just as the per-device loop never visited them.
has_device_node = device_node.notna().values


def _edges_for_flag(flag_column):
    """Return (from_idx, to_idx) int64 arrays for rows where `flag_column` is True."""
    keep = has_device_node & app_device[flag_column].values
    src = device_node.values[keep].astype(np.int64)
    dst = app_node.values[keep].astype(np.int64)
    # Stable sort by device node keeps rows of one device in app_device order.
    # With idx_map numbered in insertion order this presumably matches the
    # edge order of the old device-by-device loop; the resulting sparse
    # matrix does not depend on the order either way.
    order = np.argsort(src, kind='mergesort')
    return src[order], dst[order]


is_installed_from_idx, is_installed_to_idx = _edges_for_flag('is_installed')
is_active_from_idx, is_active_to_idx = _edges_for_flag('is_active')

In [202]:
# Sparse device -> app adjacencies over the full node space, one matrix per
# flag, with one unit entry per edge.
installed_weights = np.ones(len(is_installed_from_idx))
app_installed = sp.coo_matrix(
    (installed_weights, (is_installed_from_idx, is_installed_to_idx)),
    shape=(total_node, total_node),
    dtype=np.int32,
)

active_weights = np.ones(len(is_active_from_idx))
app_active = sp.coo_matrix(
    (active_weights, (is_active_from_idx, is_active_to_idx)),
    shape=(total_node, total_node),
    dtype=np.int32,
)

In [203]:
# Persist both app adjacencies in scipy's .npz sparse format.
for npz_path, adjacency in (
    ('data/adj_app_installed', app_installed),
    ('data/adj_app_active', app_active),
):
    sp.save_npz(npz_path, adjacency)

In [207]:
import gc
gc.collect()

0

In [205]:
# Store the train and test node positions as pickled plain lists.
for idx_path, idx_range in (
    ('data/train_idx', train_idx),
    ('data/test_idx', test_idx),
):
    with open(idx_path, 'wb') as f:
        pickle.dump(list(idx_range), f)

In [206]:
app_installed

<81726x81726 sparse matrix of type '<class 'numpy.int32'>'
	with 28979178 stored elements in COOrdinate format>