In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF
import gc

# Load Sample Data

In [99]:
# Directory that holds the preprocessed training table.
# NOTE(review): absolute, machine-specific path — change it before running elsewhere.
PATH = "/home/kai/data/kaggle/talkingdata/data/"
# CSV alternative kept for reference; the Feather file below is the one actually loaded.
# train = pd.read_csv(PATH + 'train_cleaned_final.csv')
train = pd.read_feather(PATH + 'train_cleaned_final.ftr')

In [5]:
# Work on a sample: keep only the first 500,000 rows.
# NOTE(review): this rebinds `train`, so the full table has to be reloaded before
# any later cell that needs every row.
train = train.iloc[:500000]
# Trigger a garbage-collection pass; the integer echoed below the cell is its return value.
gc.collect()

12

# General Function

In [6]:
# Columns that may take part in a combined group key.
# (Remove 'day' and 'hour' to restrict the keys to the five raw columns.)
feature_col = ['ip',
               'app',
               'device',
               'os',
               'channel',
               'day',
               'hour']

# orders[col] is a power of ten chosen to be larger than the column's maximum,
# so get_group can reserve a fixed block of decimal digits for that column.
orders = {}
for col in feature_col:
    largest = train[col].max()
    exponent = int(np.log(largest + 1) / np.log(10)) + 1
    orders[col] = 10 ** exponent
def get_group(df, cols):
    """
    Collapse several columns into one integer key per row.

    The key starts as a copy of the first column. Every further column is appended
    by multiplying the running key by that column's entry in ``orders`` and adding
    the column's value, so the columns are packed like fixed-width decimal fields.
    Distinct value combinations are intended to map to distinct keys.
    eg: get_group(df, ['ip', 'app']) returns ip * orders['app'] + app
    """
    combined = df[cols[0]].copy()
    for name in cols[1:]:
        shifted = combined * orders[name]
        combined = shifted + df[name]

    return combined

In [58]:
orders

{'app': 1000,
 'channel': 1000,
 'day': 10,
 'device': 10000,
 'hour': 100,
 'ip': 1000000,
 'os': 1000}

# LDA LSA NMF

# Notes:
Next, we tried categorical feature embedding using LDA/NMF/LSA. Here is the pseudo code to compute LDA topics of IPs related to apps.
We computed this feature for all 20 (= 5 × (5 − 1)) ordered pairs of the 5 raw features, with the topic size set to 5. This produced 100 new features. We also computed similar features using NMF and PCA, for 300 new features in total. Score: 0.9821 with a single LGB model.

In [None]:
def _lda_nmf_lsa_from_sklearn(key_col_series, value_col_series, n_components,
                              n_jobs=16, random_state=None):
    """Embed every key by factorizing the bag of values that occur with it.

    All values that share a key are joined into one space-separated "sentence",
    the sentences are turned into a key-by-value count matrix, and that matrix is
    reduced to ``n_components`` columns with LDA, NMF and TruncatedSVD (LSA).

    Args:
        key_col_series: Series of keys (for example ``train.ip``).
        value_col_series: Series of values, paired with the keys by position
            (for example ``train.app``). Values are converted with ``str``.
        n_components: number of components for each of the three models.
        n_jobs: passed to ``LatentDirichletAllocation``; defaults to the value
            that used to be hard-coded.
        random_state: seed forwarded to all three models; ``None`` keeps the
            previous unseeded behaviour.

    Returns:
        Three Series aligned with ``key_col_series`` (LDA, NMF, LSA). Each element
        is the length-``n_components`` vector of that row's key.

    Notes:
        Tokens are runs of word characters, so a value whose string form contains
        other characters (such as the '.' in '3.0') is split into several tokens.
        Rows whose key is missing are left out of the model and map to NaN.
    """
    # Pair keys and values by position rather than by index label.
    keys = np.asarray(key_col_series)
    tokens = pd.Series(np.asarray(value_col_series)).astype(str)

    # One sentence per distinct key; sort=False keeps keys in first-seen order.
    sentences = tokens.groupby(keys, sort=False).agg(' '.join)
    key_list = sentences.index.tolist()

    # The default token_pattern requires two or more characters and would silently
    # drop every single-character value (ids 0-9), so accept one-character tokens.
    matrix = CountVectorizer(token_pattern=r'\b\w+\b').fit_transform(sentences.tolist())

    lda = LatentDirichletAllocation(n_components=n_components, n_jobs=n_jobs,
                                    random_state=random_state)
    nmf = NMF(n_components=n_components, random_state=random_state)
    lsa = TruncatedSVD(n_components=n_components, random_state=random_state)

    # Row i of each transformed matrix belongs to key_list[i].
    lda_dict = dict(zip(key_list, lda.fit_transform(matrix)))
    nmf_dict = dict(zip(key_list, nmf.fit_transform(matrix)))
    lsa_dict = dict(zip(key_list, lsa.fit_transform(matrix)))

    return key_col_series.map(lda_dict), key_col_series.map(nmf_dict), key_col_series.map(lsa_dict)

### Step 1: get key col and value col
Each key (here `ip`) will be characterized by the values (here `app`) that occur with it.

In [13]:
# Worked example: describe every `ip` (key) by the `app` values that appear with it.
key_col_series, value_col_series = train.ip, train.app
# Containers that the following steps fill in.
dictionary, key_list = {}, []
# Peek at the first five entries of each column.
print(key_col_series.iloc[:5], '---', value_col_series.iloc[:5], sep='\n')

0     83230
1     17357
2     35810
3     45745
4    161007
Name: ip, dtype: int64
---
0     3
1     3
2     3
3    14
4     3
Name: app, dtype: int64


### Step 2: construct the dictionary
    e.g. {'ip1': ['app1', 'app2', 'app4', 'app1']}
Note: this step is slow and needs improvement.

In [14]:
# Start from an empty mapping so that re-running this cell cannot append every
# value a second time.
dictionary = {}
# Iterate over the underlying arrays: this avoids one `.iloc` lookup per row,
# which dominated the run time of the loop.
for key, value in zip(key_col_series.values, value_col_series.values):
    # key -> list of its values as strings, in row order (duplicates kept).
    dictionary.setdefault(key, []).append(str(value))

In [23]:
dictionary[91749]

['3', '18', '15', '12', '9', '12', '9', '3']

### Step 3: construct sentence

In [29]:
# Fix the key order once: entry i of `sentences` belongs to key_list[i].
key_list = [*dictionary]
# One space-separated "sentence" of values per key.
sentences = [' '.join(tokens) for tokens in dictionary.values()]
# Sanity check: both numbers should equal the count of distinct ips.
print(len(sentences), train.ip.nunique(), sep='\n')

31395
31395


In [59]:
key_list[:3]

[83230, 17357, 35810]

In [26]:
sentences[:3]

['3 2 12 9 2 15 6 2 3 3 9 25 25 2 9 3 15 2 14 6 13 15 3 1 12 3 26 6 8 8 9 15 8 1 12 14 3 13 10 18 9 2 11 2 3 14 15 18 3 2 12 9 3 2 36 2',
 '3 3 2 18 12 15 18 12 15 2 9 3 12 15 18 9 12 9 12 9 12 18 15 2 12 15 18 14 2 18 1 9 18 9 15 12 18 9 18 15 12 18 15 12 12 15 18 12 15 18 8 15 9 8 1 8 18 8 9 15 8 8 1 12 3 9 24 20 6 12 3 12 9 12 21 3 12 3 6 3 3 3 25 25 2 9 3 15 12 15 3 2 14 13 1 12',
 '3 3 2 9 13 12 2 15 11 18 3 233 12 3 8 9 15 1 8 12 8 9 15 8 1 8 8 15 9 8 1 8 9 12 12 9 3 9 15 12 18']

### Step 4: build the CountVectorizer matrix

#### Default token pattern: single-character tokens are dropped

In [31]:
# Default settings, fitted on the first three sentences only.
# Note that the vocabulary shown below contains no single-character tokens.
exp_ori = CountVectorizer().fit(sentences[:3])
exp_ori.vocabulary_

{'10': 0,
 '11': 1,
 '12': 2,
 '13': 3,
 '14': 4,
 '15': 5,
 '18': 6,
 '20': 7,
 '21': 8,
 '233': 9,
 '24': 10,
 '25': 11,
 '26': 12,
 '36': 13}

In [34]:
import re
# CountVectorizer's default `token_pattern`: a token needs at least two word
# characters, which is why the one-digit ids are missing from the result below.
re.findall('(?u)\\b\\w\\w+\\b', sentences[0])

['12',
 '15',
 '25',
 '25',
 '15',
 '14',
 '13',
 '15',
 '12',
 '26',
 '15',
 '12',
 '14',
 '13',
 '10',
 '18',
 '11',
 '14',
 '15',
 '18',
 '12',
 '36']

#### Modified token pattern: single-character tokens are kept

In [32]:
# Same fit as above, but with a pattern that accepts one-character tokens,
# so the one-digit ids now appear in the vocabulary.
exp_mod = CountVectorizer(token_pattern='\\b\\w+\\b').fit(sentences[:3])
exp_mod.vocabulary_

{'1': 0,
 '10': 1,
 '11': 2,
 '12': 3,
 '13': 4,
 '14': 5,
 '15': 6,
 '18': 7,
 '2': 8,
 '20': 9,
 '21': 10,
 '233': 11,
 '24': 12,
 '25': 13,
 '26': 14,
 '3': 15,
 '36': 16,
 '6': 17,
 '8': 18,
 '9': 19}

In [35]:
import re
# Relaxed pattern: one or more word characters, so one-digit ids are matched too.
re.findall('(?u)\\b\\w+\\b', sentences[0])

['3',
 '2',
 '12',
 '9',
 '2',
 '15',
 '6',
 '2',
 '3',
 '3',
 '9',
 '25',
 '25',
 '2',
 '9',
 '3',
 '15',
 '2',
 '14',
 '6',
 '13',
 '15',
 '3',
 '1',
 '12',
 '3',
 '26',
 '6',
 '8',
 '8',
 '9',
 '15',
 '8',
 '1',
 '12',
 '14',
 '3',
 '13',
 '10',
 '18',
 '9',
 '2',
 '11',
 '2',
 '3',
 '14',
 '15',
 '18',
 '3',
 '2',
 '12',
 '9',
 '3',
 '2',
 '36',
 '2']

#### Get CountVectorize Matrix

In [38]:
# Count matrix over all sentences: one row per key, one column per distinct token.
cvt = CountVectorizer(token_pattern='\\b\\w+\\b')
matrix = cvt.fit_transform(sentences)
# The second number (vocabulary size) should equal the matrix's column count.
print(matrix.shape, len(cvt.vocabulary_), sep='\n')

(31395, 182)
182


In [40]:
cvt.vocabulary_

{'0': 0,
 '1': 1,
 '10': 2,
 '100': 3,
 '102': 4,
 '103': 5,
 '104': 6,
 '105': 7,
 '107': 8,
 '108': 9,
 '109': 10,
 '11': 11,
 '110': 12,
 '112': 13,
 '115': 14,
 '118': 15,
 '119': 16,
 '12': 17,
 '120': 18,
 '121': 19,
 '122': 20,
 '124': 21,
 '125': 22,
 '126': 23,
 '127': 24,
 '128': 25,
 '13': 26,
 '130': 27,
 '136': 28,
 '137': 29,
 '14': 30,
 '141': 31,
 '143': 32,
 '145': 33,
 '146': 34,
 '148': 35,
 '15': 36,
 '150': 37,
 '151': 38,
 '152': 39,
 '153': 40,
 '154': 41,
 '155': 42,
 '158': 43,
 '159': 44,
 '16': 45,
 '160': 46,
 '162': 47,
 '165': 48,
 '166': 49,
 '167': 50,
 '168': 51,
 '17': 52,
 '170': 53,
 '172': 54,
 '173': 55,
 '175': 56,
 '176': 57,
 '18': 58,
 '181': 59,
 '182': 60,
 '183': 61,
 '186': 62,
 '188': 63,
 '19': 64,
 '190': 65,
 '192': 66,
 '193': 67,
 '194': 68,
 '197': 69,
 '2': 70,
 '20': 71,
 '202': 72,
 '207': 73,
 '208': 74,
 '21': 75,
 '210': 76,
 '215': 77,
 '218': 78,
 '22': 79,
 '222': 80,
 '229': 81,
 '23': 82,
 '231': 83,
 '233': 84,
 '239': 85

In [41]:
matrix.todense()[:3,:20]

matrix([[ 0,  2,  1,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,
          4,  0,  0],
        [ 0,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         19,  0,  0],
        [ 0,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,
          6,  0,  0]], dtype=int64)

### LDA

In [45]:
# Number of topics, i.e. the width of the embedding produced for every key.
n_components = 5
# NOTE(review): no random_state is passed, so the fitted topics can differ between runs.
lda = LatentDirichletAllocation(n_components=n_components, n_jobs=16)
# One row per sentence/key, one column per topic.
lda_matrix = lda.fit_transform(matrix)
print(lda_matrix.shape)



(31395, 5)


In [46]:
lda_matrix

array([[ 0.38234714,  0.15388811,  0.34257712,  0.00351763,  0.11767   ],
       [ 0.48401368,  0.03006309,  0.2964955 ,  0.02644147,  0.16298626],
       [ 0.39849761,  0.00502299,  0.15465864,  0.00478641,  0.43703436],
       ..., 
       [ 0.59885595,  0.10000118,  0.10000034,  0.10000262,  0.1011399 ],
       [ 0.10026713,  0.59973186,  0.10000099,  0.1       ,  0.10000002],
       [ 0.10000005,  0.10000001,  0.59999992,  0.10000002,  0.1       ]])

### LSA

In [73]:
# Number of components kept by the truncated SVD.
n_components = 5
# TruncatedSVD on the count matrix is what this notebook refers to as LSA.
# NOTE(review): no random_state is passed — results may vary slightly between runs.
lsa = TruncatedSVD(n_components=n_components)
# One row per sentence/key, one column per component.
lsa_matrix = lsa.fit_transform(matrix)
print(lsa_matrix.shape)

(31395, 5)


In [49]:
lsa_matrix

array([[  1.58257976e+01,   6.33504563e+00,  -2.31089716e-01,
          7.35113939e-01,  -2.74577238e+00],
       [  3.21226539e+01,  -2.03122058e+00,  -3.40827026e+00,
         -5.19094325e+00,   2.18860761e+00],
       [  1.28249003e+01,  -7.70624606e-01,  -6.30415413e+00,
          2.81931284e+00,  -4.69762422e-01],
       ..., 
       [  6.93655755e-04,   2.54613819e-04,  -9.72083629e-04,
         -1.91898134e-03,  -1.41713870e-04],
       [  2.77296705e-01,   9.21656987e-01,   2.17072966e-01,
          1.37395002e-01,   5.78570024e-03],
       [  6.23419986e-02,   1.21431713e-03,  -1.39939508e-02,
         -4.25817591e-02,  -6.00880278e-02]])

In [50]:
print(lsa.explained_variance_ratio_)
# Percentage of variance explained by each of the selected components.

array([ 0.7452255 ,  0.10477226,  0.06171417,  0.02703803,  0.01768965])

In [51]:
lsa.singular_values_ 

array([ 2841.98029924,  1023.70565176,   786.0277348 ,   518.38397405,
         419.08936088])

### NMF
Non-Negative Matrix Factorization (NMF)

In [74]:
# Number of non-negative factors.
n_components = 5
# NOTE(review): no random_state is passed — confirm whether reproducibility matters here.
nmf = NMF(n_components=n_components)
# One row per sentence/key, one column per factor.
nmf_matrix = nmf.fit_transform(matrix)
print(nmf_matrix.shape)

(31395, 5)


In [57]:
nmf_matrix

array([[  1.93122060e-01,   2.91712583e-01,   1.32214252e-01,
          1.25294191e-01,   0.00000000e+00],
       [  4.53978766e-01,   1.60075496e-01,   2.30497781e-01,
          3.85526244e-01,   9.57598566e-02],
       [  1.62191821e-01,   5.43949114e-02,   3.08390301e-01,
          6.31075561e-02,   3.59693074e-03],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.40005367e-05,   2.68245633e-05],
       [  0.00000000e+00,   2.81677037e-02,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00],
       [  5.88626976e-04,   2.42124497e-04,   5.00473551e-05,
          1.67553030e-03,   0.00000000e+00]])

In [68]:
# Two one-document corpora that differ only in their last token (apple2 vs apple3).
t1 = ['apple apple1 apple2']
t2 = ['apple apple1  apple3']

In [69]:
# Learn the vocabulary from t1 only, then encode both corpora with it.
x = CountVectorizer().fit(t1)
# 'apple3' was not seen during fit, so it gets no column in t2's encoding.
t1_t, t2_t = x.transform(t1), x.transform(t2)

In [70]:
t1_t.todense()

matrix([[1, 1, 1]])

In [71]:
t2_t.todense()

matrix([[1, 1, 0]])

### Click count in the next / previous N hours

In [None]:
# Columns that may take part in a combined group key.
# (Remove 'day' and 'hour' to restrict the keys to the five raw columns.)
feature_col = ['ip',
               'app',
               'device',
               'os',
               'channel',
               'day',
               'hour']

# orders[col] is a power of ten chosen to be larger than the column's maximum,
# so get_group can reserve a fixed block of decimal digits for that column.
orders = {}
for col in feature_col:
    largest = train[col].max()
    exponent = int(np.log(largest + 1) / np.log(10)) + 1
    orders[col] = 10 ** exponent
def get_group(df, cols):
    """
    Collapse several columns into one integer key per row.

    The key starts as a copy of the first column. Every further column is appended
    by multiplying the running key by that column's entry in ``orders`` and adding
    the column's value, so the columns are packed like fixed-width decimal fields.
    Distinct value combinations are intended to map to distinct keys.
    eg: get_group(df, ['ip', 'app']) returns ip * orders['app'] + app
    """
    combined = df[cols[0]].copy()
    for name in cols[1:]:
        shifted = combined * orders[name]
        combined = shifted + df[name]

    return combined

In [119]:
from collections import defaultdict
from tqdm import tqdm

def _click_count_in_next_n_hour(df, cols, n_hour, time_col):
    # dataframe should maintain a descending order
    rev_train = df.sort_index(ascending=False)
    encodings = get_group(rev_train, cols).values
    times = rev_train[time_col].values
    
    dict_count = defaultdict(int)
    result = []
    bound = 0
    for cur in tqdm(range(len(encodings))):

        while times[bound] - times[cur] > n_hour:
            dict_count[encodings[bound]] -= 1
            bound += 1
        encoding = encodings[cur]
        result.append(dict_count[encoding])
        dict_count[encoding] += 1
    return result[::-1]
        
def _click_count_in_previous_n_hour(df, cols, n_hour, time_col):
    # dataframe should maintain an ascending order
    encodings = get_group(df, cols).values
    times = df[time_col].values
    
    dict_count = defaultdict(int)
    result = []
    bound = 0
    for cur in tqdm(range(len(encodings))):

        while times[cur] - times[bound] > n_hour:
            dict_count[encodings[bound]] -= 1
            bound += 1
        encoding = encodings[cur]
        result.append(dict_count[encoding])
        dict_count[encoding] += 1
    return result

In [113]:
# Window length: 60*60*6 = 21,600.
# NOTE(review): this is six hours only if 'timestamp' is expressed in seconds — confirm.
gap = 60*60*6
# For every row: number of later rows with the same `app` inside the window.
nextc = _click_count_in_next_n_hour(train,['app'],gap, 'timestamp')

100%|██████████| 184903890/184903890 [03:27<00:00, 889670.75it/s]


In [120]:
# Window length: 60*60*6 = 21,600.
# NOTE(review): this is six hours only if 'timestamp' is expressed in seconds — confirm.
gap = 60*60*6
# For every row: number of earlier rows with the same `app` inside the window.
prevc = _click_count_in_previous_n_hour(train,['app'],gap, 'timestamp')

100%|██████████| 184903890/184903890 [03:16<00:00, 939635.59it/s]
