# Analyze devices and browsers



In [2]:
import numpy as np
import pandas as pd
import os
import pickle

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
pd.options.display.precision = 15

from sklearn.preprocessing import minmax_scale
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist, pdist

import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn import metrics
from sklearn import linear_model
import gc
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

import eli5
import shap
from IPython.display import HTML
import json
import altair as alt

import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

# alt.renderers.enable('notebook')

In [90]:
folder_path = './data/'
files = [f'{folder_path}train_identity.csv', 
         f'{folder_path}test_identity.csv']

from functions import load_data
train_identity,  test_identity = map(load_data, files)

train_identity.drop(['id_%02d'%i for i in list(range(1,30)) + list(range(34,39))+[32]],axis=1,inplace=True)
test_identity.drop(['id_%02d'%i for i in list(range(1,30)) + list(range(34,39))+[32]],axis=1,inplace=True)

data = pd.concat([train_identity, test_identity], axis=0, sort=False )
data.set_index('TransactionID',inplace=True)

del train_identity
del test_identity

In [91]:
data['OS'] = np.NaN
data['OSVersion'] = np.NaN

In [92]:
data['id_30'].value_counts()

Windows 10          42170
Windows 7           23478
iOS 12.1.0           6349
Mac OS X 10_12_6     3884
iOS 11.2.1           3824
                    ...  
Mac OS X 10_12_2       51
Mac OS X 10_6_8        50
func                   21
other                  19
Windows                 6
Name: id_30, Length: 87, dtype: int64

In [93]:
def create_dev_map_dict():
    import re
    vc = data['id_30'].value_counts()
    id_30_to_OS = {}
    id_30_to_OSVersion = {}
    for s in vc.index:
        M = re.match(r'Windows\s+(\S+)',s)
        if M is not None:
            id_30_to_OS[s] = 'Windows'
            id_30_to_OSVersion[s] = M.groups()[0]
            continue
        M = re.match(r'iOS\s+(\S+)',s)
        if M is not None:
            id_30_to_OS[s] = 'iOS'
            id_30_to_OSVersion[s] = M.groups()[0]
            continue
        M = re.match(r'Mac OS X\s+(\S+)',s)
        if M is not None:
            id_30_to_OS[s] = 'Mac'
            id_30_to_OSVersion[s] = M.groups()[0]
            continue   
        M = re.match(r'Android\s+(\S+)',s)
        if M is not None:
            id_30_to_OS[s] = 'Android '
            id_30_to_OSVersion[s] = M.groups()[0]
            continue         
        id_30_to_OS[s] = s
    
    return id_30_to_OS, id_30_to_OSVersion

id_30_to_OS, id_30_to_OSVersion = create_dev_map_dict()


In [94]:
found_index = data.loc[data['id_30'].isin(id_30_to_OS.keys())].index
data.loc[found_index,'OS'] = data.loc[found_index]['id_30'].replace(id_30_to_OS)
data.loc[found_index,'OSVersion'] = data.loc[found_index]['id_30'].replace(id_30_to_OSVersion)

In [95]:
data.head()

Unnamed: 0_level_0,id_30,id_31,id_33,DeviceType,DeviceInfo,OS,OSVersion
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2987004,Android 7.0,samsung browser 6.2,2220x1080,mobile,SAMSUNG SM-G892A Build/NRD90M,Android,7.0
2987008,iOS 11.1.2,mobile safari 11.0,1334x750,mobile,iOS Device,iOS,11.1.2
2987010,,chrome 62.0,,desktop,Windows,,
2987011,,chrome 62.0,,desktop,,,
2987016,Mac OS X 10_11_6,chrome 62.0,1280x800,desktop,MacOS,Mac,10_11_6


In [96]:
data['screen_width'] = data['id_33'].str.split('x', expand=True)[0]
data['screen_height'] = data['id_33'].str.split('x', expand=True)[1]

In [97]:
data['device_name'] = data['DeviceInfo'].str.split('/', expand=True)[0]
data['device_version'] = data['DeviceInfo'].str.split('/', expand=True)[1]
    
data.loc[data['device_name'].str.contains('SM', na=False), 'device_name'] = 'Samsung'
data.loc[data['device_name'].str.contains('SAMSUNG', na=False), 'device_name'] = 'Samsung'
data.loc[data['device_name'].str.contains('GT-', na=False), 'device_name'] = 'Samsung'
data.loc[data['device_name'].str.contains('Moto G', na=False), 'device_name'] = 'Motorola'
data.loc[data['device_name'].str.contains('Moto', na=False), 'device_name'] = 'Motorola'
data.loc[data['device_name'].str.contains('moto', na=False), 'device_name'] = 'Motorola'
data.loc[data['device_name'].str.contains('LG-', na=False), 'device_name'] = 'LG'
data.loc[data['device_name'].str.contains('rv:', na=False), 'device_name'] = 'RV'
data.loc[data['device_name'].str.contains('HUAWEI', na=False), 'device_name'] = 'Huawei'
data.loc[data['device_name'].str.contains('ALE-', na=False), 'device_name'] = 'Huawei'
data.loc[data['device_name'].str.contains('-L', na=False), 'device_name'] = 'Huawei'
data.loc[data['device_name'].str.contains('Blade', na=False), 'device_name'] = 'ZTE'
data.loc[data['device_name'].str.contains('BLADE', na=False), 'device_name'] = 'ZTE'
data.loc[data['device_name'].str.contains('Linux', na=False), 'device_name'] = 'Linux'
data.loc[data['device_name'].str.contains('XT', na=False), 'device_name'] = 'Sony'
data.loc[data['device_name'].str.contains('HTC', na=False), 'device_name'] = 'HTC'
data.loc[data['device_name'].str.contains('ASUS', na=False), 'device_name'] = 'Asus'
data.loc[data.device_name.isin(data.device_name.value_counts()[data.device_name.value_counts() < 200].index), 'device_name'] = "Others"


In [99]:
cols = ['OS','OSVersion',	'screen_width',	'screen_height'	,'device_name',	'device_version']

In [100]:
with pd.HDFStore('./hdf/Store.h5') as store:
    for col in cols:
        store[f'features/{col}'] = data[col]

In [18]:
data['id_31'].value_counts()

mobile safari 11.0     23655
chrome 63.0            22168
chrome 70.0            16054
ie 11.0 for desktop    14203
mobile safari 12.0     13098
                       ...  
Nokia/Lumia                1
rim                        1
seamonkey                  1
iron                       1
Cherry                     1
Name: id_31, Length: 172, dtype: int64

In [45]:
def create_browser_map_dict():
    import re
    vc = data['id_31'].value_counts()
    id_31_to_Browser = {}
    id_31_to_BrowserVersion = {}
    for s in vc.index:
        M = re.match(r'mobile safari\s+(\S+)',s)
        if M is not None:
            id_31_to_Browser[s] = 'mobile safari'
            id_31_to_BrowserVersion[s] = M.groups()[0]
            continue
        M = re.match(r'chrome\s+(\S+)',s)
        if M is not None:
            id_31_to_Browser[s] = 'chrome'
            id_31_to_BrowserVersion[s] = M.groups()[0]
            continue    
        M = re.match(r'ie (\S+) for desktop',s)
        if M is not None:
            id_31_to_Browser[s] = 'ie desktop'
            id_31_to_BrowserVersion[s] = M.groups()[0]
            continue
        M = re.match(r'safari generic',s)
        if M is not None:
            id_31_to_Browser[s] = 'safari'
            id_31_to_BrowserVersion[s] = np.NaN
            continue              
        M = re.match(r'safari (\S+)',s)
        if M is not None:
            id_31_to_Browser[s] = 'safari'
            id_31_to_BrowserVersion[s] = M.groups()[0]
            continue              
        M = re.match(r'edge (\S+)',s)
        if M is not None:
            id_31_to_Browser[s] = 'edge'
            id_31_to_BrowserVersion[s] = M.groups()[0]
            continue       
        M = re.match(r'firefox (\S+)',s)
        if M is not None:
            id_31_to_Browser[s] = 'firefox'
            id_31_to_BrowserVersion[s] = M.groups()[0]
            continue 
        M = re.match(r'samsung browser (\S+)',s)
        if M is not None:
            id_31_to_Browser[s] = 'samsung browser'
            id_31_to_BrowserVersion[s] = M.groups()[0]
            continue   
        M = re.match(r'ie (\S+) for tablet',s)
        if M is not None:
            id_31_to_Browser[s] = 'ie tablet'
            id_31_to_BrowserVersion[s] = M.groups()[0]
            continue             
        M = re.match(r'google search application (\S+)',s)
        if M is not None:
            id_31_to_Browser[s] = 'google search application'
            id_31_to_BrowserVersion[s] = M.groups()[0]
            continue    
        M = re.match(r'android webview (\S+)',s)
        if M is not None:
            id_31_to_Browser[s] = 'android webview'
            id_31_to_BrowserVersion[s] = M.groups()[0]
            continue
        M = re.match(r'android browser (\S+)',s)
        if M is not None:
            id_31_to_Browser[s] = 'android browser'
            id_31_to_BrowserVersion[s] = M.groups()[0]
            continue               
        M = re.match(r'opera (\S+)',s)
        if M is not None:
            id_31_to_Browser[s] = 'opera'
            id_31_to_BrowserVersion[s] = M.groups()[0]
            continue               
        M = re.match(r'Generic/Android (\S+)',s)
        if M is not None:
            id_31_to_Browser[s] = 'Generic/Android'
            id_31_to_BrowserVersion[s] = M.groups()[0]
            continue               
        id_31_to_Browser[s] = s
        id_31_to_BrowserVersion[s] = np.NaN
        
        
    return id_31_to_Browser, id_31_to_BrowserVersion

id_31_to_Browser, id_31_to_BrowserVersion = create_browser_map_dict()


In [46]:
data['Browser'] = np.NaN
data['BrowserVersion'] = np.NaN
found_index = data.loc[data['id_31'].isin(id_31_to_Browser.keys())].index
data.loc[found_index,'Browser'] = data.loc[found_index]['id_31'].replace(id_31_to_Browser)
data.loc[found_index, 'BrowserVersion'] = data['id_31'].replace(id_31_to_BrowserVersion)

In [47]:
data.loc[data.Browser.isin(data.Browser.value_counts()[data.Browser.value_counts() < 10].index), 'Browser'] = "other"

In [51]:
l = list(data[data.Browser == 'edge']['BrowserVersion'].value_counts().index)
print(l)
# l.remove('mobile')
# l.remove('generic')
# l.remove('chrome')
for v in sorted([int(float(i)) for i in l]):
    print(f'edge["{v}"]=""')

['17.0', '16.0', '15.0', '14.0', '13.0', '18.0']
edge["13"]=""
edge["14"]=""
edge["15"]=""
edge["16"]=""
edge["17"]=""
edge["18"]=""


In [52]:
edge = {}
edge["13"]="2015-09-18"
edge["14"]="2016-02-18"
edge["15"]="2016-10-07"
edge["16"]="2017-09-26"
edge["17"]="2018-04-30"
edge["18"]="2018-11-13"
edge_map={}
for k,v in edge.items():
    edge_map[str(k)+'.0'] = datetime.datetime.strptime(v, "%Y-%m-%d")

In [53]:
firefox = {}
firefox["47"]="2016-06-07"
firefox["48"]="2016-08-01"
firefox["52"]="2017-03-07"
firefox["55"]="2017-08-08"
firefox["56"]="2017-09-28"
firefox["57"]="2017-11-14"
firefox["58"]="2018-01-23"
firefox["59"]="2018-03-13"
firefox["60"]="2018-05-09"
firefox["61"]="2018-06-26"
firefox["62"]="2018-09-05"
firefox["63"]="2018-10-23"
firefox["64"]="2018-12-11"
firefox_map={}
for k,v in firefox.items():
    firefox_map[str(k)+'.0'] = datetime.datetime.strptime(v, "%Y-%m-%d")

In [54]:
safari = {}
safari["9"]="2015-09-30"
safari["10"]="2016-09-20"
safari["11"]="2017-09-19"
safari["12"]="2018-09-17"
safari_map={}
for k,v in safari.items():
    safari_map[str(k)+'.0'] = datetime.datetime.strptime(v, "%Y-%m-%d")

In [55]:
chrome = {}
chrome["39"]="2014-11-18"
chrome["43"]="2015-05-19"
chrome["46"]="2015-10-13"
chrome["49"]="2016-03-02"
chrome["50"]="2016-04-13"
chrome["51"]="2016-05-25"
chrome["52"]="2016-07-20"
chrome["53"]="2016-08-31"
chrome["54"]="2016-10-12"
chrome["55"]="2016-12-01"
chrome["56"]="2017-01-25"
chrome["57"]="2017-03-09"
chrome["58"]="2017-04-19"
chrome["59"]="2017-06-05"
chrome["60"]="2017-07-25"
chrome["61"]="2017-09-05"
chrome["62"]="2017-10-17"
chrome["63"]="2017-12-05"
chrome["64"]="2018-01-24"
chrome["65"]="2018-03-06"
chrome["66"]="2018-04-17"
chrome["67"]="2018-05-29"
chrome["68"]="2018-07-24"
chrome["69"]="2018-09-04"
chrome["70"]="2018-10-16"
chrome["71"]="2018-12-04"
chrome_map={}
for k,v in chrome.items():
    chrome_map[str(k)+'.0'] = datetime.datetime.strptime(v, "%Y-%m-%d")

In [56]:
data['BrowserAge'] = np.NaN

In [57]:
with pd.HDFStore('./hdf/Store.h5',complib='blosc:blosclz') as store:
    full_data = store['initial/encoded']


In [58]:
# full_data['Date']
data = pd.merge(data, full_data['Date'], on='TransactionID', how='left')

In [60]:
chrome_map

{'39.0': datetime.datetime(2014, 11, 18, 0, 0),
 '43.0': datetime.datetime(2015, 5, 19, 0, 0),
 '46.0': datetime.datetime(2015, 10, 13, 0, 0),
 '49.0': datetime.datetime(2016, 3, 2, 0, 0),
 '50.0': datetime.datetime(2016, 4, 13, 0, 0),
 '51.0': datetime.datetime(2016, 5, 25, 0, 0),
 '52.0': datetime.datetime(2016, 7, 20, 0, 0),
 '53.0': datetime.datetime(2016, 8, 31, 0, 0),
 '54.0': datetime.datetime(2016, 10, 12, 0, 0),
 '55.0': datetime.datetime(2016, 12, 1, 0, 0),
 '56.0': datetime.datetime(2017, 1, 25, 0, 0),
 '57.0': datetime.datetime(2017, 3, 9, 0, 0),
 '58.0': datetime.datetime(2017, 4, 19, 0, 0),
 '59.0': datetime.datetime(2017, 6, 5, 0, 0),
 '60.0': datetime.datetime(2017, 7, 25, 0, 0),
 '61.0': datetime.datetime(2017, 9, 5, 0, 0),
 '62.0': datetime.datetime(2017, 10, 17, 0, 0),
 '63.0': datetime.datetime(2017, 12, 5, 0, 0),
 '64.0': datetime.datetime(2018, 1, 24, 0, 0),
 '65.0': datetime.datetime(2018, 3, 6, 0, 0),
 '66.0': datetime.datetime(2018, 4, 17, 0, 0),
 '67.0': datet

In [86]:
supported_browsers = [
    ('chrome', chrome_map),
    ('safari', safari_map),
    ('edge', edge_map),
    ('firefox',firefox_map)
]
for browser, browser_map in supported_browsers:
    idx = data[data.Browser == browser][data.BrowserVersion.isin(browser_map.keys())].index
    fdata = data[data.Browser == browser]
    fdata.loc[idx,'BrowserAge'] = ((fdata.loc[idx]['Date'].astype('datetime64[s]') -  (fdata.loc[idx]['BrowserVersion'].replace(browser_map)).astype('datetime64[s]')))/ np.timedelta64(1, 'D')


In [88]:
cols = ['Browser', 'BrowserVersion', 'BrowserAge']
with pd.HDFStore('./hdf/Store.h5') as store:
    for col in cols:
        store[f'features/{col}'] = fdata[col]

In [5]:
d.value_counts()

57.0       3331
63.0       1899
61.0       1321
64.0       1251
59.0       1152
62.0       1107
58.0        851
52.0        621
60.0        607
55.0        528
48.0        254
mobile      223
56.0        156
generic     110
47.0         51
Name: BrowserVersion, dtype: int64

In [16]:
from functions import LabelEncoderPopularity
#encode
categorial = ['Browser',  'OS', 'OSVersion', 'device_name', 'device_version']
numerical = ['BrowserAge', 'BrowserVersion', 'screen_height', 'screen_width']
with pd.HDFStore('./hdf/Store.h5') as store:
    num_features = store.get_node('features')
    for feature in num_features:
        feature_name = feature._v_name
        print(feature_name)
        d = store.select(feature._v_pathname)
        if feature_name in numerical:
            if feature_name == 'BrowserVersion':
                d.replace({'mobile': 100, 'generic': 0}, inplace=True)
            d = d.astype(np.float32)
        elif feature_name in categorial:
            L = LabelEncoderPopularity(convert_nan=True)
            L.fit(d)
            d = L.transform(d)
        else:
            print ('Unknown ', feature_name)
            break
#         print('features_encoded/'+feature_name)
        store.put(f'encodedFeatures/{feature_name}',d)
            

Browser
BrowserAge
BrowserVersion
OS
OSVersion
device_name
device_version
screen_height
screen_width
