In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
import pickle
import scipy.sparse as sp

In [3]:
from sklearn.utils import shuffle

## Run once: combine phone brand and model into a single `phone_model` column

In [50]:
# Build data/phone_model_device.csv with one "<brand> <model>" string per input row.
# Both files are opened with `with`, so the output is flushed and closed even
# if a malformed row raises part-way through.
with open('data/phone_brand_device_model.csv') as f, \
        open('data/phone_model_device.csv', 'w') as out:
    out.write('device_id,phone_model\n')
    for line in f:
        fields = line.rstrip('\n').split(',')
        # Skip the header row and blank lines (a blank line used to raise IndexError).
        if fields[0] == 'device_id' or fields == ['']:
            continue
        device_id, phone, model = fields[0], fields[1], fields[2]
        # Write the row terminator explicitly rather than relying on the newline
        # that happened to be attached to the last field of the input line.
        out.write(device_id + ',' + phone + ' ' + model + '\n')

## Preprocess (drop GPS data)

In [5]:
# Load only the columns used downstream; both flag columns are parsed as bool.
app_event = pd.read_csv(
    'data/app_events.csv',
    usecols=['event_id', 'app_id', 'is_installed', 'is_active'],
    dtype={'is_installed': bool, 'is_active': bool},
)

In [6]:
# Preview the first five app-event rows.
app_event.head(n=5)

Unnamed: 0,event_id,app_id,is_installed,is_active
0,2,5927333115845830913,True,True
1,2,-5720078949152207372,True,False
2,2,-1633887856876571208,True,False
3,2,-653184325010919369,True,True
4,2,8693964245073640147,True,True


In [7]:
# Load the event log indexed by event_id so it can be joined onto app_event.
# (Alternative: omit index_col to keep event_id as a regular column.)
events = pd.read_csv(
    'data/events.csv',
    index_col='event_id',
    parse_dates=['timestamp'],
)

  mask |= (ar1 == a)


In [8]:
# Keep only device_id: the timestamp and both GPS columns are unused in this variant.
events.drop(columns=['timestamp', 'longitude', 'latitude'], inplace=True)

In [9]:
# Preview the event -> device mapping.
events.head(n=5)

Unnamed: 0_level_0,device_id
event_id,Unnamed: 1_level_1
1,29182687948017175
2,-6401643145415154744
3,-4833982096941402721
4,-6815121365017318426
5,-5373797595892518570


In [10]:
# Attach each app event's device_id by looking up event_id in the events index.
app_device = app_event.join(
    events,
    on='event_id',
    how='left',
    rsuffix='_r',
)

In [11]:
# Preview the joined app-event / device table.
app_device.head(n=5)

Unnamed: 0,event_id,app_id,is_installed,is_active,device_id
0,2,5927333115845830913,True,True,-6401643145415154744
1,2,-5720078949152207372,True,False,-6401643145415154744
2,2,-1633887856876571208,True,False,-6401643145415154744
3,2,-653184325010919369,True,True,-6401643145415154744
4,2,8693964245073640147,True,True,-6401643145415154744


## Preprocess (keeping GPS data)

In [42]:
# GPS variant: reload both tables and drop only the timestamp, keeping longitude/latitude.
app_event = pd.read_csv(
    'data/app_events.csv',
    usecols=['event_id', 'app_id', 'is_installed', 'is_active'],
    dtype={'is_installed': bool, 'is_active': bool},
)
events = pd.read_csv(
    'data/events.csv',
    index_col='event_id',
    parse_dates=['timestamp'],
)
events.drop(columns='timestamp', inplace=True)

  mask |= (ar1 == a)


In [43]:
# Preview events with their GPS columns.
events.head(n=5)

Unnamed: 0_level_0,device_id,longitude,latitude
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,29182687948017175,121.38,31.24
2,-6401643145415154744,103.65,30.97
3,-4833982096941402721,106.6,29.7
4,-6815121365017318426,104.27,23.28
5,-5373797595892518570,115.88,28.66


In [44]:
# Number of distinct devices in the event log (a NaN, if present, counts once, as with unique()).
events['device_id'].nunique(dropna=False)

60865

In [56]:
# Coarsen both coordinates to one decimal place.
for coord in ('longitude', 'latitude'):
    events[coord] = events[coord].apply(lambda value: round(value, 1))

In [57]:
# Blank out longitudes outside the 75.9-134.2 window (bounds inclusive) so that
# the later dropna() removes those events. The previous code assigned the whole
# filtered DataFrame to this single column, which does not mask the longitude
# values and raises a ValueError on current pandas.
events['longitude'] = events['longitude'].where(events['longitude'].between(75.9, 134.2))

In [58]:
# Blank out latitudes outside the 18.2-52.3 window (bounds inclusive) so that
# the later dropna() removes those events. The previous code assigned the whole
# filtered DataFrame to this single column, which does not mask the latitude
# values and raises a ValueError on current pandas.
events['latitude'] = events['latitude'].where(events['latitude'].between(18.2, 52.3))

In [60]:
# Remove every event row that has a missing value in any column.
events.dropna(axis=0, how='any', inplace=True)

In [8]:
# The longitude column is reused as the combined 'GPS' column.
events.rename(columns={'longitude': 'GPS'}, inplace=True)

In [13]:
# Build the "<longitude> <latitude>" text key for each event.
lon_text = events['GPS'].astype(str)
lat_text = events['latitude'].astype(str)
events['GPS'] = lon_text + ' ' + lat_text

In [15]:
# latitude is now part of the GPS string, so the separate column can go.
events.drop(columns='latitude', inplace=True)

In [19]:
# Attach device_id and GPS to every app event via its event_id.
# Events may have been removed by the GPS filtering/dropna above; a left join
# would keep their app events with NaN device_id/GPS, which also coerces the
# int64 device ids to float64 (losing precision on 19-digit ids). An inner join
# keeps only app events whose event survived, and is identical to the left join
# when every event_id matches.
app_device = app_event.join(events, on='event_id', how='inner', rsuffix='_r')

In [20]:
# Preview the joined table including the GPS key.
app_device.head(n=5)

Unnamed: 0,event_id,app_id,is_installed,is_active,device_id,GPS
0,2,5927333115845830913,True,True,-6401643145415154744,103.7 31.0
1,2,-5720078949152207372,True,False,-6401643145415154744,103.7 31.0
2,2,-1633887856876571208,True,False,-6401643145415154744,103.7 31.0
3,2,-653184325010919369,True,True,-6401643145415154744,103.7 31.0
4,2,8693964245073640147,True,True,-6401643145415154744,103.7 31.0


## Get labels (skip if data/labels.csv already exists)

In [9]:
# Training labels, indexed by device_id.
label = pd.read_csv(
    'data/gender_age_train.csv',
    index_col='device_id',
)

In [10]:
# Only the combined 'group' column is kept as the target.
label.drop(columns=['gender', 'age'], inplace=True)

In [11]:
# Treat the group string as a categorical so it can be integer-encoded later.
label['group'] = label['group'].astype(dtype='category')

In [90]:
# Show the category labels; their positions are the integer codes used as class ids.
label['group'].dtype.categories

Index(['F23-', 'F24-26', 'F27-28', 'F29-32', 'F33-42', 'F43+', 'M22-',
       'M23-26', 'M27-28', 'M29-31', 'M32-38', 'M39+'],
      dtype='object')

In [13]:
# Preview the label table.
label.head(n=5)

Unnamed: 0_level_0,group
device_id,Unnamed: 1_level_1
-8076087639492063270,10
-2897161552818060146,10
-8260683887967679142,10
-4938849341048082022,9
245133531816851882,9


In [12]:
# Replace each category label with its integer code.
group_codes = label['group'].cat.codes
label['group'] = group_codes

In [24]:
# Convert to {'group': {device_id: code}} so entries can be deleted by key.
label = label.to_dict(orient='dict')

In [128]:
# Drop labels for devices that have no node index.
# NOTE(review): idx_map is defined in a later section of this notebook; this
# cell only works once that section has been run.
missing_ids = [dev for dev in label['group'] if dev not in idx_map]
for dev in missing_ids:
    label['group'].pop(dev)

In [136]:
# Rebuild a DataFrame from the pruned dict and write it out (index = device_id).
labels = pd.DataFrame.from_dict(label, orient='columns')
labels.to_csv(path_or_buf='data/labels.csv')

## Get Adjacency Matrix for Device_id and Phone

In [12]:
# Load the full-size label table saved by an earlier run.
labels = pd.read_csv(filepath_or_buffer='data/full_size_labels.csv')

In [13]:
# For data/labels.csv use instead:
# labels.rename({'Unnamed: 0': 'device_id'}, axis=1, inplace=True)
# For the full-size labels the unnamed column is only the saved row index, so drop it.
labels.drop(columns='Unnamed: 0', inplace=True)

In [14]:
# Preview the label table.
labels.head(n=5)

Unnamed: 0,device_id,group
0,-2408208566724105772,6
1,-4671788193261470496,2
2,-74515834348903802,6
3,473067357855167588,3
4,4577005331619877737,10


In [15]:
# Device -> "<brand> <model>" table written by the combining cell at the top.
phone_brand = pd.read_csv(filepath_or_buffer='data/phone_model_device.csv')

In [16]:
# Labeled device ids, in label-table order.
labeled_device_id = [*labels['device_id']]

In [17]:
# 80/20 split of the labeled devices; the train share is rounded up.
n_labeled = len(labeled_device_id)
n_train = math.ceil(n_labeled / 10.0 * 8)
n_test = n_labeled - n_train

In [18]:
# Train indices are the first n_train positions; test indices follow directly after.
train_idx, test_idx = range(0, n_train), range(n_train, n_train + n_test)

In [17]:
# Node universe: devices seen in app events plus all labeled devices.
all_id = set(app_device['device_id'].values) | set(labeled_device_id)

# Devices known only through the phone table are included as nodes too,
# even without app information.
all_id |= set(phone_brand['device_id'].values)

In [18]:
# Every node id that has no label.
unlabeled_id = all_id.difference(labeled_device_id)

In [19]:
# Put all labeled device ids first so that train_idx and test_idx
# (positions 0..n_labeled-1) index directly into this array.
labeled_ids_arr = np.array(labeled_device_id, dtype=np.int64)
unlabeled_ids_arr = np.array(list(unlabeled_id), dtype=np.int64)
all_id_list = np.concatenate((labeled_ids_arr, unlabeled_ids_arr))

In [20]:
# Number of device nodes.
n_node = all_id_list.shape[0]

In [21]:
# Reload the saved train/test index lists.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself.
with open('data/full_size_train_idx', 'rb') as train_file:
    full_size_train_idx = pickle.load(train_file)

with open('data/full_size_test_idx', 'rb') as test_file:
    full_size_test_idx = pickle.load(test_file)

In [22]:
# Translate saved node positions back into device ids.
full_size_train_device = [all_id_list[pos] for pos in full_size_train_idx]
full_size_test_device = [all_id_list[pos] for pos in full_size_test_idx]

In [34]:
# Persist the train/test device-id lists.
for device_path, device_ids in (
    ('data/full_size_train_device', full_size_train_device),
    ('data/full_size_test_device', full_size_test_device),
):
    with open(device_path, 'wb') as sink:
        pickle.dump(device_ids, sink)

In [24]:
# device_id -> node index (position in all_id_list).
idx_map = dict(zip(all_id_list, range(len(all_id_list))))

In [25]:
# Phone-model nodes are numbered right after the device nodes.
phone_start = len(idx_map.keys())

In [26]:
# Distinct "<brand> <model>" strings.
phones = set(phone_brand.phone_model.values)

In [27]:
# phone_model string -> node index, numbered from phone_start.
# The names are sorted first: iterating a set of str yields a different order
# in each interpreter session (string hash randomization), which would give
# every phone model a different node index on each kernel restart and make
# matrices saved from different runs inconsistent.
phone_map = {name: phone_start + offset for offset, name in enumerate(sorted(phones))}

In [28]:
# App nodes are numbered right after the phone-model nodes.
app_start = len(phone_map) + phone_start

In [29]:
# Distinct app ids that appear in the joined app-event table.
apps = set(app_device.app_id.values)

In [30]:
# app_id -> node index, numbered from app_start.
app_map = {app: offset + app_start for offset, app in enumerate(apps)}

In [46]:
# with GPS: GPS-key nodes are numbered right after the app nodes.
GPS_start = app_start + len(app_map)
# Sort the distinct non-null keys: a set of str iterates in a different order in
# each interpreter session, so node indices would not be reproducible, and a
# missing GPS value (NaN) must not become a node key.
GPS_map = {
    gps_key: GPS_start + offset
    for offset, gps_key in enumerate(sorted(app_device['GPS'].dropna().unique()))
}

In [31]:
# Total graph size: device nodes + phone-model nodes + app nodes.
total_node = sum((n_node, len(phone_map), len(app_map)))

In [32]:
# Display the combined node count (device + phone-model + app nodes; GPS nodes are not part of this sum).
total_node

209980

In [None]:
# NOTE(review): never-executed cell (In [None]). `phone` is only bound as a loop
# variable in the brand/model combining cell at the top, so this is presumably a
# leftover scratch cell that can be deleted — confirm.
phone

In [52]:
# Build the device -> phone-model edge list (from_idx[k] -> to_idx[k]).
# A vectorized lookup replaces the previous per-device full scans of
# phone_brand (a membership test plus a boolean filter for every node), which
# made this cell quadratic. The same edges are produced, including one edge per
# duplicated phone_brand row, but in phone_brand row order; order does not
# matter for the COO matrix built from them.
known_rows = phone_brand[phone_brand['device_id'].isin(list(idx_map))]
phone_node = known_rows['phone_model'].map(phone_map)

# Fail with a clear message instead of letting a missing id be cast to int64.
if phone_node.isnull().any():
    raise ValueError('phone_brand contains phone_model values that are missing from phone_map')

from_idx = known_rows['device_id'].map(idx_map).values.astype(np.int64)
to_idx = phone_node.values.astype(np.int64)

In [53]:
# Sparse device -> phone-model adjacency: one unit entry per edge.
edge_ones = np.ones(len(from_idx))
adj_phone = sp.coo_matrix(
    (edge_ones, (from_idx, to_idx)),
    shape=(total_node, total_node),
    dtype=np.int32,
)

In [54]:
# Display the device -> phone-model adjacency matrix summary (shape, dtype, stored entries).
adj_phone

<209980x209980 sparse matrix of type '<class 'numpy.int32'>'
	with 187245 stored elements in COOrdinate format>

In [55]:
# Persist the phone adjacency matrix.
sp.save_npz(file='data/full_size_adj_phone', matrix=adj_phone)

## Get adjacency matrices for device_id and app_id (one for is_installed, one for is_active)

In [96]:
# Build device -> app edge lists for the "installed" and "active" relations.
# One vectorized pass over app_device replaces the previous per-device full
# scans (a membership test plus a boolean filter for every node), which made
# this cell quadratic in practice.
# The results are int64 arrays rather than Python lists; the edges are the same
# but appear in app_device row order.
device_known = app_device['device_id'].isin(list(idx_map))
app_known = app_device['app_id'].isin(list(app_map))

# Rows whose app has no node index used to append None to the edge lists, which
# then fails when the lists are converted to int64 arrays. Report each such
# (device, app) pair once and leave it out of the edge lists.
unmapped = app_device.loc[device_known & ~app_known, ['device_id', 'app_id']].drop_duplicates()
for device, app_id in unmapped.itertuples(index=False):
    print(device)
    print(app_id)

edge_rows = app_device[device_known & app_known]
edge_from = edge_rows['device_id'].map(idx_map).values.astype(np.int64)
edge_to = edge_rows['app_id'].map(app_map).values.astype(np.int64)
installed_mask = edge_rows['is_installed'].values.astype(bool)
active_mask = edge_rows['is_active'].values.astype(bool)

is_installed_from_idx = edge_from[installed_mask]
is_installed_to_idx = edge_to[installed_mask]
is_active_from_idx = edge_from[active_mask]
is_active_to_idx = edge_to[active_mask]

-2460817541082341650
1315647097418443009
5191524460861112478
-9215239281845835707
2676705509953163022
2377952478135020944
-8708101143520808336
1672478563154724331
-5620837370408644703
-6029404337627124685
-5979343926158819362
-6966470489827360022


In [36]:
# Build device -> app edge lists for the "installed" and "active" relations.
# One vectorized pass over app_device replaces the previous per-device full
# scans (a membership test plus a boolean filter for every node), which made
# this cell quadratic in practice.
# The results are int64 arrays rather than Python lists; the edges are the same
# but appear in app_device row order.
device_known = app_device['device_id'].isin(list(idx_map))
app_known = app_device['app_id'].isin(list(app_map))

# Rows whose app has no node index used to append None to the edge lists, which
# then fails when the lists are converted to int64 arrays. Report each such
# (device, app) pair once and leave it out of the edge lists.
unmapped = app_device.loc[device_known & ~app_known, ['device_id', 'app_id']].drop_duplicates()
for device, app_id in unmapped.itertuples(index=False):
    print(device)
    print(app_id)

edge_rows = app_device[device_known & app_known]
edge_from = edge_rows['device_id'].map(idx_map).values.astype(np.int64)
edge_to = edge_rows['app_id'].map(app_map).values.astype(np.int64)
installed_mask = edge_rows['is_installed'].values.astype(bool)
active_mask = edge_rows['is_active'].values.astype(bool)

is_installed_from_idx = edge_from[installed_mask]
is_installed_to_idx = edge_to[installed_mask]
is_active_from_idx = edge_from[active_mask]
is_active_to_idx = edge_to[active_mask]

In [37]:
# Convert the "installed" edge lists to int64 arrays.
is_installed_from_idx, is_installed_to_idx = (
    np.array(edge_list, dtype=np.int64)
    for edge_list in (is_installed_from_idx, is_installed_to_idx)
)

In [38]:
# Convert the "active" edge lists to int64 arrays.
is_active_from_idx, is_active_to_idx = (
    np.array(edge_list, dtype=np.int64)
    for edge_list in (is_active_from_idx, is_active_to_idx)
)

In [39]:
# Sparse device -> app adjacency matrices, one unit entry per edge.
graph_shape = (total_node, total_node)
app_installed = sp.coo_matrix(
    (np.ones(len(is_installed_from_idx)), (is_installed_from_idx, is_installed_to_idx)),
    shape=graph_shape,
    dtype=np.int32,
)
app_active = sp.coo_matrix(
    (np.ones(len(is_active_from_idx)), (is_active_from_idx, is_active_to_idx)),
    shape=graph_shape,
    dtype=np.int32,
)

In [43]:
# Persist both app adjacency matrices.
for npz_path, adjacency in (
    ('data/full_size_adj_app_installed', app_installed),
    ('data/full_size_adj_app_active', app_active),
):
    sp.save_npz(npz_path, adjacency)

In [41]:
# Display the "installed" adjacency matrix summary (shape, dtype, stored entries).
app_installed

<209980x209980 sparse matrix of type '<class 'numpy.int32'>'
	with 32473067 stored elements in COOrdinate format>

In [42]:
# Display the "active" adjacency matrix summary (shape, dtype, stored entries).
app_active

<209980x209980 sparse matrix of type '<class 'numpy.int32'>'
	with 12732996 stored elements in COOrdinate format>

In [40]:
# Force a garbage-collection pass; the displayed value is the number of unreachable objects found.
# NOTE(review): `gc` is imported mid-notebook rather than in the import cells at the top.
import gc
gc.collect()

713

In [39]:
# Persist the train/test node positions as plain lists.
for idx_path, idx_range in (
    ('data/full_size_train_idx', train_idx),
    ('data/full_size_test_idx', test_idx),
):
    with open(idx_path, 'wb') as sink:
        pickle.dump(list(idx_range), sink)

In [40]:
# Save the label table; the row index is written too, which is why an
# 'Unnamed: 0' column appears when the file is read back.
labels.to_csv(path_or_buf="data/full_size_labels.csv", index=True)

In [89]:
# Display the "installed" adjacency matrix summary.
# NOTE(review): the recorded output here is 81726x81726, unlike the 209980-node
# output shown earlier — presumably from a different run; confirm before relying on it.
app_installed

<81726x81726 sparse matrix of type '<class 'numpy.int32'>'
	with 32473067 stored elements in COOrdinate format>

In [90]:
# Display the "active" adjacency matrix summary.
# NOTE(review): the recorded output here is 81726x81726, unlike the 209980-node
# output shown earlier — presumably from a different run; confirm before relying on it.
app_active

<81726x81726 sparse matrix of type '<class 'numpy.int32'>'
	with 12732996 stored elements in COOrdinate format>

In [94]:
# Display the device -> phone-model adjacency matrix summary.
# NOTE(review): the recorded output here is 81726x81726, unlike the 209980-node
# output shown earlier — presumably from a different run; confirm before relying on it.
adj_phone

<81726x81726 sparse matrix of type '<class 'numpy.int32'>'
	with 58462 stored elements in COOrdinate format>