## Import Libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import Normalizer
from tqdm import tqdm
import math
import numpy as np

## Data Import

In [2]:
# Load the train and test device tables (the "with_specs" pickles).
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
df_train, df_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "dataset/dataset_train_with_specs.pkl",
        "dataset/dataset_test_with_specs.pkl",
    )
)

In [3]:
# Show which columns the raw training frame provides before any feature engineering.
print(df_train.columns)

Index(['device_id', 'brand', 'model', 'installed_apps', 'activity', 'gender',
       'age', 'group', 'installed_app_categories', 'app_usage',
       'app_usage_session', 'active_app_categories', 'active_app_usage_time',
       'active_apps', 'activity_hour', 'activity_day', 'num_travels',
       'mean_latitude', 'mean_longitude', 'screen_size', 'ram_gb',
       'release_month', 'release_year', 'camera'],
      dtype='object')


## Data Cleaning

In [4]:
# Drop the 'activity' column from both the train and the test frame (in place).
for frame in (df_train, df_test):
    frame.drop(columns=['activity'], inplace=True)

In [5]:
# Register `progress_apply` on pandas objects so the row-wise applies below show a progress bar.
tqdm.pandas()
# Silence pandas' chained-assignment warning (the pandas default for this option is 'warn').
pd.set_option('mode.chained_assignment', None)

  from pandas import Panel


In [6]:
# Flatten each device's list of installed / active app ids into one space-separated
# string, which can then be one-hot encoded downstream.
for source_column, string_column in (
    ('installed_apps', 'installed_apps_string'),
    ('active_apps', 'active_apps_string'),
):
    for frame in (df_train, df_test):
        frame[string_column] = frame.progress_apply(
            lambda row: " ".join(map(str, row[source_column])), axis=1
        )

100%|█████████████████████████████████████████████████████████████████████████| 74645/74645 [00:02<00:00, 29785.71it/s]
100%|███████████████████████████████████████████████████████████████████████| 112071/112071 [00:02<00:00, 52484.07it/s]
100%|█████████████████████████████████████████████████████████████████████████| 74645/74645 [00:01<00:00, 53833.91it/s]
100%|███████████████████████████████████████████████████████████████████████| 112071/112071 [00:02<00:00, 55322.11it/s]


In [7]:
def specs_present(row):
    """Flag whether device specifications are present for this record.

    A ``screen_size`` equal to 0 is treated as "no specifications"; every other
    value counts as present.

    Returns 0 when ``row['screen_size']`` equals 0, otherwise 1.
    """
    return 0 if row['screen_size'] == 0 else 1

In [8]:
# Add the 0/1 'specs_available' flag to both frames.
for frame in (df_train, df_test):
    frame['specs_available'] = frame.progress_apply(specs_present, axis=1)

100%|█████████████████████████████████████████████████████████████████████████| 74645/74645 [00:00<00:00, 76902.93it/s]
100%|███████████████████████████████████████████████████████████████████████| 112071/112071 [00:01<00:00, 76291.39it/s]


In [9]:
# Lookup table whose `label_id` and `category` columns are used by the label helpers defined in this cell.
# NOTE(review): this path starts with "../" while the pickles in this notebook are read from and
# written to "dataset/" -- confirm which working directory is intended.
app_labels = pd.read_csv("../dataset/label_categories.csv")

# Lazily-built {label_id: category} dictionary shared by the two helpers below.
# It is reset to None whenever this cell is re-run, so it is rebuilt from the
# freshly loaded `app_labels`.
_CATEGORY_BY_LABEL_ID = None


def _category_by_label_id():
    """Return a ``{label_id: category}`` dict, built once from the global ``app_labels``."""
    global _CATEGORY_BY_LABEL_ID
    if _CATEGORY_BY_LABEL_ID is None:
        # Keep the first row per label_id, matching the former `.values[0]` lookup.
        unique_labels = app_labels.drop_duplicates(subset='label_id', keep='first')
        _CATEGORY_BY_LABEL_ID = dict(
            zip(unique_labels['label_id'], unique_labels['category'])
        )
    return _CATEGORY_BY_LABEL_ID


def _labels_as_string(label_ids):
    """Return the category name of every id in `label_ids`, each followed by one space."""
    category_of = _category_by_label_id()
    return "".join(str(category_of[label_id]) + " " for label_id in label_ids)


def installed_app_labels(row):
    """Return the category names of the apps installed on the device.

    Reads ``row['installed_app_categories']`` (an iterable of label ids) and
    returns one string in which every category name is followed by a single
    space, in the order the ids appear. An empty iterable yields ``""``.

    The id -> name mapping is a dictionary built once from ``app_labels``
    instead of a boolean-mask scan of the whole table for every id.

    Raises
    ------
    KeyError
        If an id is not present in ``app_labels['label_id']``.
    """
    return _labels_as_string(row['installed_app_categories'])


def active_app_labels(row):
    """Return the category names of the apps active on the device.

    Same contract as :func:`installed_app_labels`, but reads
    ``row['active_app_categories']``.

    Raises
    ------
    KeyError
        If an id is not present in ``app_labels['label_id']``.
    """
    return _labels_as_string(row['active_app_categories'])

In [10]:
# Attach the space-separated category names of the installed apps to both frames.
for frame in (df_train, df_test):
    frame['installed_app_labels'] = frame.progress_apply(installed_app_labels, axis=1)

100%|███████████████████████████████████████████████████████████████████████████| 74645/74645 [06:03<00:00, 205.10it/s]
100%|█████████████████████████████████████████████████████████████████████████| 112071/112071 [09:00<00:00, 207.35it/s]


In [11]:
# Attach the space-separated category names of the active apps to both frames.
for frame in (df_train, df_test):
    frame['active_app_labels'] = frame.progress_apply(active_app_labels, axis=1)

100%|███████████████████████████████████████████████████████████████████████████| 74645/74645 [02:52<00:00, 431.98it/s]
100%|█████████████████████████████████████████████████████████████████████████| 112071/112071 [04:20<00:00, 429.56it/s]


In [12]:
# Shared L1 normaliser: `norm.transform([vector])` rescales each given vector by its L1 norm.
norm = Normalizer(norm = 'l1')

In [13]:
# Overwrite each device's 'activity_hour' and 'activity_day' vectors with their
# L1-normalised versions (train first, then test, for each column).
for activity_column in ('activity_hour', 'activity_day'):
    for frame in (df_train, df_test):
        frame[activity_column] = frame.progress_apply(
            lambda row: norm.transform([row[activity_column]])[0], axis=1
        )

100%|██████████████████████████████████████████████████████████████████████████| 74645/74645 [00:12<00:00, 5891.85it/s]
100%|████████████████████████████████████████████████████████████████████████| 112071/112071 [00:18<00:00, 5925.18it/s]
100%|██████████████████████████████████████████████████████████████████████████| 74645/74645 [00:12<00:00, 5987.08it/s]
100%|████████████████████████████████████████████████████████████████████████| 112071/112071 [00:18<00:00, 5946.81it/s]


In [14]:
def installed_app_onehot(row, categories=None):
    """Encode the installed app categories of one device as a 0/1 indicator list.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row['installed_app_categories']``, an iterable of
        category ids.
    categories : sequence, optional
        Ordered category ids that define the output positions. When omitted,
        the keys of ``df_train.loc[4]['active_app_usage_time']`` are used, as
        before, but they are now read once and cached on the function instead
        of being looked up and rebuilt for every row.

    Returns
    -------
    list of int
        One entry per category, in category order: 1 if the category occurs in
        the row, else 0. Ids in the row that are not known categories are ignored.
    """
    if categories is None:
        categories = getattr(installed_app_onehot, '_default_categories', None)
        if categories is None:
            # NOTE(review): row label 4 is hard-coded; it must hold a dict keyed by
            # category id -- presumably every such dict shares the same keys. TODO confirm.
            categories = list(df_train.loc[4]['active_app_usage_time'].keys())
            installed_app_onehot._default_categories = categories

    installed = set(row['installed_app_categories'])
    return [1 if category in installed else 0 for category in categories]

In [15]:
# Add the 0/1 indicator list of installed app categories to both frames.
for frame in (df_train, df_test):
    frame['installed_app_onehot'] = frame.progress_apply(installed_app_onehot, axis=1)

100%|██████████████████████████████████████████████████████████████████████████| 74645/74645 [00:15<00:00, 4883.51it/s]
100%|████████████████████████████████████████████████████████████████████████| 112071/112071 [00:23<00:00, 4845.34it/s]


In [16]:
def active_app_onehot(row, categories=None):
    """Encode the active app categories of one device as a 0/1 indicator list.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row['active_app_categories']``, an iterable of
        category ids.
    categories : sequence, optional
        Ordered category ids that define the output positions. When omitted,
        the keys of ``df_train.loc[4]['active_app_usage_time']`` are used, as
        before, but they are now read once and cached on the function instead
        of being looked up and rebuilt for every row.

    Returns
    -------
    list of int
        One entry per category, in category order: 1 if the category occurs in
        the row, else 0. Ids in the row that are not known categories are ignored.
    """
    if categories is None:
        categories = getattr(active_app_onehot, '_default_categories', None)
        if categories is None:
            # NOTE(review): row label 4 is hard-coded; it must hold a dict keyed by
            # category id -- presumably every such dict shares the same keys. TODO confirm.
            categories = list(df_train.loc[4]['active_app_usage_time'].keys())
            active_app_onehot._default_categories = categories

    active = set(row['active_app_categories'])
    return [1 if category in active else 0 for category in categories]

In [17]:
# Add the 0/1 indicator list of active app categories to both frames.
for frame in (df_train, df_test):
    frame['active_app_onehot'] = frame.progress_apply(active_app_onehot, axis=1)

100%|██████████████████████████████████████████████████████████████████████████| 74645/74645 [00:15<00:00, 4829.00it/s]
100%|████████████████████████████████████████████████████████████████████████| 112071/112071 [00:22<00:00, 4891.70it/s]


In [18]:
def time_bin(seq):
    """Collapse an hour-indexed usage sequence into four normalised day periods.

    `seq` is sliced by position (index = hour of day) and summed per period;
    the four totals are then passed through the shared `norm` transformer.

    Returns the transformed ``[morning, afternoon, evening, night]`` vector.
    """
    daytime_slices = (
        slice(6, 12),    # morning:   6AM to 12PM
        slice(12, 17),   # afternoon: 12PM to 5PM
        slice(17, 23),   # evening:   5PM to 11PM
    )
    period_totals = [sum(seq[hours]) for hours in daytime_slices]
    # night: 11PM to 6AM, i.e. the two ends of the sequence
    period_totals.append(sum(seq[0:6]) + sum(seq[23:]))

    return norm.transform([period_totals])[0]

In [19]:
def time_bin_app_usage(row):
    """Store the time-binned active app usage of a record in ``row['active_app_usage']``.

    When ``row['active_app_usage_time']`` equals 0 the feature is a list of
    4 * 485 zeros. Otherwise it is treated as a mapping of per-key usage
    sequences: every key contributes four values -- zeros when its sequence
    sums to 0, else the output of `time_bin` for that sequence.

    The row is modified in place and returned.
    """
    usage_by_key = row['active_app_usage_time']

    if usage_by_key == 0:
        # NOTE(review): 485 is presumably the number of keys in a populated usage dict -- confirm.
        binned = [0.0] * (4 * 485)
    else:
        binned = []
        for _, hourly_usage in usage_by_key.items():
            if sum(hourly_usage) == 0:
                binned.extend([0.0, 0.0, 0.0, 0.0])
            else:
                binned.extend(time_bin(hourly_usage))

    row['active_app_usage'] = binned
    return row

In [20]:
# Add the 'active_app_usage' feature to every row. `time_bin_app_usage` returns the
# modified row, so each apply produces a new frame that replaces the old one.
df_train = df_train.progress_apply(time_bin_app_usage, axis=1)
df_test = df_test.progress_apply(time_bin_app_usage, axis=1)

100%|███████████████████████████████████████████████████████████████████████████| 74645/74645 [04:17<00:00, 289.65it/s]
100%|█████████████████████████████████████████████████████████████████████████| 112071/112071 [07:20<00:00, 254.56it/s]


In [21]:
def location_outlier(row):
    """Flag whether the record has a usable location.

    Returns 0 when both ``mean_latitude`` and ``mean_longitude`` are exactly
    0.0, and 1 otherwise. Note that, despite the function name, 1 is the
    "location present" case.
    """
    latitude_is_zero = row['mean_latitude'] == 0.0
    longitude_is_zero = row['mean_longitude'] == 0.0
    return 0 if (latitude_is_zero & longitude_is_zero) else 1

In [22]:
# Add the 0/1 'location_available' flag to both frames.
for frame in (df_train, df_test):
    frame['location_available'] = frame.progress_apply(location_outlier, axis=1)

100%|█████████████████████████████████████████████████████████████████████████| 74645/74645 [00:02<00:00, 25347.32it/s]
100%|███████████████████████████████████████████████████████████████████████| 112071/112071 [00:02<00:00, 43883.33it/s]


In [23]:
# Map every release year from 2009 to 2020 to four consecutive bin ids:
# 2009 -> [1, 2, 3, 4], 2010 -> [5, 6, 7, 8], ..., 2020 -> [45, 46, 47, 48].
bins = {
    year: list(range((year - 2009) * 4 + 1, (year - 2009) * 4 + 5))
    for year in range(2009, 2021)
}

print(bins)

{2009: [1, 2, 3, 4], 2010: [5, 6, 7, 8], 2011: [9, 10, 11, 12], 2012: [13, 14, 15, 16], 2013: [17, 18, 19, 20], 2014: [21, 22, 23, 24], 2015: [25, 26, 27, 28], 2016: [29, 30, 31, 32], 2017: [33, 34, 35, 36], 2018: [37, 38, 39, 40], 2019: [41, 42, 43, 44], 2020: [45, 46, 47, 48]}


In [24]:
# Repair the two malformed 'release_year' values (2.16 and 20116) by setting them to 2016.
for frame in (df_train, df_test):
    for malformed_year in (2.16, 20116):
        frame.loc[frame['release_year'] == malformed_year, 'release_year'] = 2016

In [25]:
def release_bins(row):
    """Store the release-time bin of the device in ``row['release_bin']``.

    For records with ``specs_available == 1`` the bin is looked up in the
    global ``bins`` table (four bin ids per release year) using the quarter of
    ``release_month``: months 1-3 -> first bin, 4-6 -> second, 7-9 -> third,
    10-12 -> fourth. Records without specifications get bin 0.

    The previous index ``int(release_month / 4)`` split the year unevenly
    (months 1-3, 4-7, 8-11 and 12), which did not match the four-per-year
    layout of ``bins``.

    The row is modified in place and returned.

    Raises
    ------
    KeyError
        If ``release_year`` is not a key of ``bins``.
    """
    if row['specs_available'] == 1:
        # assumes release_month is 1-based (1..12) -- TODO confirm against the specs data
        quarter = (int(row['release_month']) - 1) // 3
        # Clamp so unexpected months (e.g. 0 or > 12) still land in a valid bin of that year.
        quarter = min(max(quarter, 0), 3)
        row['release_bin'] = bins[row['release_year']][quarter]
    else:
        row['release_bin'] = 0

    return row

In [26]:
# Add the 'release_bin' feature to every row. `release_bins` returns the modified
# row, so each apply produces a new frame that replaces the old one.
df_train = df_train.progress_apply(release_bins, axis=1)
df_test = df_test.progress_apply(release_bins, axis=1)

100%|██████████████████████████████████████████████████████████████████████████| 74645/74645 [00:51<00:00, 1446.13it/s]
100%|████████████████████████████████████████████████████████████████████████| 112071/112071 [01:18<00:00, 1423.15it/s]


In [28]:
#dropping unwanted columns
df_train.drop(columns=['installed_app_categories','active_app_usage_time','active_app_categories','release_month','release_year','installed_apps','active_apps'], inplace=True)
df_test.drop(columns=['installed_app_categories','active_app_usage_time','active_app_categories','release_month','release_year','installed_apps','active_apps'], inplace=True)

In [29]:
# Show the final column sets of the train and test frames.
for frame in (df_train, df_test):
    print(frame.columns)

Index(['device_id', 'brand', 'model', 'gender', 'age', 'group', 'app_usage',
       'app_usage_session', 'activity_hour', 'activity_day', 'num_travels',
       'mean_latitude', 'mean_longitude', 'screen_size', 'ram_gb', 'camera',
       'installed_apps_string', 'active_apps_string', 'specs_available',
       'installed_app_labels', 'active_app_labels', 'installed_app_onehot',
       'active_app_onehot', 'active_app_usage', 'location_available',
       'release_bin'],
      dtype='object')
Index(['device_id', 'brand', 'model', 'app_usage', 'app_usage_session',
       'activity_hour', 'activity_day', 'num_travels', 'mean_latitude',
       'mean_longitude', 'screen_size', 'ram_gb', 'camera',
       'installed_apps_string', 'active_apps_string', 'specs_available',
       'installed_app_labels', 'active_app_labels', 'installed_app_onehot',
       'active_app_onehot', 'active_app_usage', 'location_available',
       'release_bin'],
      dtype='object')


In [30]:
# Persist the engineered feature frames as pickles.
for frame, output_path in (
    (df_train, "dataset/train_features.pkl"),
    (df_test, "dataset/test_features.pkl"),
):
    frame.to_pickle(output_path)