# Import

In [1]:
# !pip install sentence_transformers

In [2]:
from psutil import virtual_memory
virtual_memory().available / 1e9

53.545017344

In [4]:
from google.colab import drive
drive.mount('/content/drive/')

from tqdm import tqdm

import numpy as np
import pandas as pd

# from sentence_transformers import SentenceTransformer

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

import pickle

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Load data

In [4]:
train = pd.read_csv('drive/MyDrive/spb/train.csv')

# Work with tokens

## Choice representative tokens

In [None]:
# tokens = []
# tokens_scores = []
# tokens_indxs = []

# for i in tqdm(range(train.shape[0])):
#     x = train.tokens.iloc[i]
#     if type(x) == str:
#         x = x.split(' ')
#         for j in range(0, len(x), 2):

#             tokens_indxs.append(i)
            
#             tokens.append(x[j])
#             tokens_scores.append(x[j+1])


# tokens_data = pd.DataFrame()
# tokens_data['indx'] = tokens_indxs
# tokens_data['users'] = train.CLIENT_ID.values[tokens_indxs]
# tokens_data['items'] = tokens
# tokens_data['freq'] = tokens_scores
# tokens_data['target'] = train.DEF.values[tokens_indxs]

# tokens_data.to_feather('tokens_data.ftr')

In [5]:
tokens_data = pd.read_feather('tokens_data.ftr')

tokens_data_agg = tokens_data['items'].value_counts().reset_index()
tokens_data_agg = tokens_data_agg.rename(columns={'items': 'all_counts', 'index': 'items'})

tokens_data_agg = pd.merge(tokens_data_agg,
                           tokens_data[tokens_data.target == 0]['items'].value_counts().reset_index().rename(columns={
                               'items': '0_counts', 'index': 'items'
                               }),
                           on='items', how='left')

tokens_data_agg = pd.merge(tokens_data_agg,
                           tokens_data[tokens_data.target == 1]['items'].value_counts().reset_index().rename(columns={
                               'items': '1_counts', 'index': 'items'
                               }),
                           on='items', how='left')

tokens_data_agg['0_counts'] = tokens_data_agg['0_counts'].fillna(0)
tokens_data_agg['1_counts'] = tokens_data_agg['1_counts'].fillna(0)

tokens_data_agg['0_1_ratio'] = tokens_data_agg['1_counts'] / tokens_data_agg['0_counts']

tokens_data_agg.to_feather('drive/MyDrive/spb/tokens_data_agg.ftr')

In [13]:
ratio_default = train[train.DEF == 1].shape[0] / train[train.DEF == 0].shape[0]
ratio_shift = 0.25
all_counts_th = tokens_data_agg.all_counts.median()

tokens_data_agg_choosen = tokens_data_agg[(tokens_data_agg.all_counts >= all_counts_th)
                                          &((tokens_data_agg['0_1_ratio'] < ratio_default - ratio_shift*ratio_default)
                                          |(tokens_data_agg['0_1_ratio'] > ratio_default + ratio_shift*ratio_default))]

In [14]:
good_tokens_data_agg_choosen = set(tokens_data_agg_choosen['items'].values)

In [15]:
tokens_processed = []
ratio_mean_0_1 = []

for x in tqdm(train.tokens):
    if type(x) == str:
        x = list(set(x.split(' ')) & good_tokens_data_agg_choosen)
        ratio_mean_0_1.append(tokens_data_agg_choosen[tokens_data_agg_choosen['items'].isin(x)]['0_1_ratio'].mean())
        tokens_processed.append(' '.join(x))
    else:
        ratio_mean_0_1.append(ratio_default)
        tokens_processed.append('')

100%|██████████| 303777/303777 [13:26<00:00, 376.64it/s]


In [16]:
temp = pd.DataFrame()
temp['tokens_processed'] = tokens_processed
temp['tokens_ratio_mean_0_1'] = ratio_mean_0_1
temp['target'] = train.DEF.values
temp.to_feather('drive/MyDrive/spb/train_processed.ftr')

## Transformer embeddings on sentences

In [5]:
train_data = pd.read_feather('drive/MyDrive/spb/train_processed.ftr')

In [None]:
embed_model = SentenceTransformer('LaBSE')

In [7]:
embeddings = embed_model.encode(train_data.tokens_processed.values,
                                show_progress_bar=True)

Batches:   0%|          | 0/9494 [00:00<?, ?it/s]

In [8]:
temp = pd.DataFrame()
temp[['labse_'+str(i) for i in range(embeddings.shape[1])]] = embeddings

  self[col] = igetitem(value, i)


In [9]:
temp.to_feather('drive/MyDrive/spb/LaBSE_embeddings.ftr')

## Transformer embeddings on selected words (TOO LONG)

In [19]:
# embed_model = SentenceTransformer('DeepPavlov/distilrubert-base-cased-conversational')

In [20]:
# word_embeddings = []

# for x in tqdm(train_data.tokens_processed):
#     word_embeddings.append(np.mean(embed_model.encode(x.split(' ')), axis=0))

## SVD features

### Functions

In [8]:
def transform_indices(data, users, items):
    data_index = {}
    for entity, field in zip(['users', 'items'], [users, items]):
        idx, idx_map = to_numeric_id(data, field)
        data_index[entity] = idx_map
        data.loc[:, field] = idx
    return data, data_index

def to_numeric_id(data, field):
    idx_data = data[field].astype("category")
    idx = idx_data.cat.codes
    idx_map = idx_data.cat.categories.rename(field)
    return idx, idx_map


def matrix_from_data(data, data_description, dtype=None):
    '''
    Converts pandas DataFrame into sparse CSR matrix.
    Assumes data in the DataFrame is alread normalized via `transform_indices`.
    '''
    # get indices of observed data
    user_idx = data[data_description['users']].values
    item_idx = data[data_description['items']].values
    feedback_data = data_description.get('feedback', None)
    if feedback_data is not None:
        feedback = data[feedback_data].values
    else:
        feedback = np.ones(len(user_idx))
    # construct rating matrix
    shape = (data_description['n_users'], data_description['n_items'])
    return csr_matrix((feedback, (user_idx, item_idx)), shape=shape, dtype=dtype)


def build_svd_model(config, data, data_description):
    source_matrix = matrix_from_data(data, data_description).asfptype()
    _, s, vt = svds(source_matrix, k=config['rank'], return_singular_vectors='vh')
    singular_values = s[::-1]
    item_factors = np.ascontiguousarray(vt[::-1, :].T)
    return item_factors, singular_values

### Prepare data and make SVD

In [61]:
train = pd.read_csv('drive/MyDrive/spb/train.csv', usecols=['CLIENT_ID'])
train_data = pd.read_feather('drive/MyDrive/spb/train_processed.ftr')

In [62]:
tokens = []
tokens_indxs = []

for i in tqdm(range(train_data.shape[0])):
    x = train_data.tokens_processed.iloc[i]
    if type(x) == str:
        x = x.split(' ')
        for j in range(len(x)):
            tokens_indxs.append(i)
            tokens.append(x[j])

tokens_data = pd.DataFrame()
tokens_data['users'] = train.CLIENT_ID.values[tokens_indxs]
tokens_data['items'] = tokens
tokens_data['feedback'] = train_data.target.values[tokens_indxs]

tokens_data['items'] = tokens_data['items'].fillna('')

# tokens_data.to_feather('drive/MyDrive/spb/tokens_data.ftr')

100%|██████████| 303777/303777 [00:15<00:00, 19633.40it/s]


In [64]:
tokens_data, tokens_data_index = transform_indices(tokens_data, 'users', 'items')

data_description = dict(
    users = tokens_data_index['users'].name,
    items = tokens_data_index['items'].name,
    feedback = 'feedback',
    n_users = len(tokens_data_index['users']),
    n_items = len(tokens_data_index['items']),
)
data_description

{'users': 'users',
 'items': 'items',
 'feedback': 'feedback',
 'n_users': 303777,
 'n_items': 16455}

In [66]:
with open('drive/MyDrive/spb/tokens_data_index.pkl', 'wb') as f:
    pickle.dump(tokens_data_index, f)
        
with open('drive/MyDrive/spb/tokens_data_index.pkl', 'rb') as f:
    tokens_data_index = pickle.load(f)

In [68]:
svd_params = build_svd_model(
    {'rank': 512},
    tokens_data,
    data_description
)

item_factors, singvals = svd_params
# item_factors[:, :rank], singvals[:rank]

In [69]:
with open('drive/MyDrive/spb/tokens_svd_params', 'wb') as f:
    pickle.dump(svd_params, f)

with open('drive/MyDrive/spb/tokens_svd_params', 'rb') as f:
    svd_params = pickle.load(f)

item_factors, singvals = svd_params

In [70]:
item_factors[np.where(tokens_data_index['items'].isin(train_data.tokens_processed.iloc[0].split(' ')))[0]]

array([[-0.02346056, -0.01449324, -0.06026073, ...,  0.01530854,
        -0.00370783, -0.00530452],
       [-0.0271323 , -0.01410808, -0.0638887 , ..., -0.04364347,
         0.00885151,  0.01223644],
       [-0.03160005, -0.0195356 , -0.06111496, ..., -0.01289258,
        -0.0265864 , -0.01211456],
       ...,
       [-0.01061813, -0.00892987, -0.00258476, ...,  0.00318992,
         0.01431621,  0.01324005],
       [-0.02297744, -0.01469494, -0.06134045, ...,  0.02196294,
         0.00171998,  0.00991457],
       [-0.00689277, -0.00558148, -0.00423889, ..., -0.00138592,
         0.01165234, -0.00078933]])

In [74]:
svd_embeddings_tokens = []

for x in tqdm(train_data.tokens_processed):
    svd_embeddings_tokens.append(np.mean(item_factors[np.where(tokens_data_index['items'].isin(x.split(' ')))[0]], axis=0))

100%|██████████| 303777/303777 [06:15<00:00, 809.10it/s]


In [75]:
temp = pd.DataFrame()
temp[['token_svd'+str(i) for i in range(512)]] = svd_embeddings_tokens

  self[col] = igetitem(value, i)


In [76]:
temp

Unnamed: 0,token_svd0,token_svd1,token_svd2,token_svd3,token_svd4,token_svd5,token_svd6,token_svd7,token_svd8,token_svd9,...,token_svd502,token_svd503,token_svd504,token_svd505,token_svd506,token_svd507,token_svd508,token_svd509,token_svd510,token_svd511
0,-0.025618,0.001603,-0.025271,0.024141,-0.011024,0.009730,-0.013304,-0.018563,0.026192,-0.005080,...,0.012672,0.005334,-0.002362,-0.005492,-0.003683,-0.006354,-0.010796,0.008326,0.000775,0.006198
1,-0.030923,0.022015,-0.001370,-0.004261,0.003452,-0.002362,-0.003983,0.005825,-0.003140,-0.005729,...,-0.001201,-0.002972,0.000241,-0.001230,0.001829,-0.000176,-0.000407,-0.000566,0.000105,0.000304
2,-0.011306,-0.009623,0.004240,0.000900,0.004812,0.002258,0.003669,0.002179,0.000548,-0.007150,...,0.003337,0.001280,0.001633,-0.001175,-0.001698,0.002102,-0.001516,-0.003324,0.000352,0.001588
3,-0.040344,0.034953,-0.007776,0.039196,-0.010065,-0.006125,0.001590,0.006649,-0.007654,-0.001558,...,0.002134,-0.000290,0.000580,-0.000577,0.002394,0.002347,0.002382,0.002308,0.000460,-0.001242
4,-0.010803,-0.005876,-0.002457,-0.000753,-0.000412,0.002586,-0.004238,-0.000149,-0.000551,-0.000883,...,0.000125,-0.000691,-0.000151,-0.000861,-0.000313,-0.000888,-0.001240,-0.000495,-0.000007,0.000165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303772,-0.006467,-0.003028,-0.008880,-0.000183,0.005209,0.000678,-0.008070,0.002846,-0.006156,-0.003117,...,-0.001166,0.010925,-0.002809,-0.000232,-0.021474,-0.007430,-0.010603,-0.025468,0.022997,0.025860
303773,-0.003802,-0.001934,-0.003716,-0.001361,0.001409,-0.008042,-0.007708,-0.000085,-0.002688,0.000967,...,0.020165,-0.001577,-0.006597,-0.013066,0.010729,-0.006092,0.001467,-0.001987,-0.004562,-0.010100
303774,-0.010688,-0.003417,-0.000979,-0.002496,0.000781,-0.001891,-0.002404,-0.000037,-0.000483,-0.000304,...,0.001346,0.002532,0.001338,0.000189,-0.001413,-0.000343,0.001752,-0.002234,0.001800,-0.001020
303775,-0.008410,-0.007036,-0.002794,0.000598,-0.001468,0.000165,0.002766,-0.000552,0.002593,0.000527,...,-0.001883,-0.000814,-0.000434,-0.004515,0.001052,-0.001342,0.003149,0.000180,0.001109,-0.002020


In [77]:
temp.to_feather('drive/MyDrive/spb/svd_embeddings_tokens.ftr')

# Load data

In [19]:
train = pd.read_csv('drive/MyDrive/spb/train.csv')

# Work with urls

## Choice representative tokens

In [20]:
urls = []
urls_scores = []
urls_indxs = []

for i in tqdm(range(train.shape[0])):
    x = train.urls_hashed.iloc[i]
    if type(x) == str:
        x = x.split(' ')
        for j in range(0, len(x), 2):

            urls_indxs.append(i)
            
            urls.append(x[j])
            urls_scores.append(x[j+1])


urls_data = pd.DataFrame()
urls_data['indx'] = urls_indxs
urls_data['users'] = train.CLIENT_ID.values[urls_indxs]
urls_data['items'] = urls
urls_data['freq'] = urls_scores
urls_data['target'] = train.DEF.values[urls_indxs]

urls_data.to_feather('urls_data.ftr')

100%|██████████| 303777/303777 [00:26<00:00, 11644.76it/s]


In [21]:
urls_data = pd.read_feather('urls_data.ftr')

urls_data_agg = urls_data['items'].value_counts().reset_index()
urls_data_agg = urls_data_agg.rename(columns={'items': 'all_counts', 'index': 'items'})

urls_data_agg = pd.merge(urls_data_agg,
                         urls_data[urls_data.target == 0]['items'].value_counts().reset_index().rename(columns={
                               'items': '0_counts', 'index': 'items'
                               }),
                           on='items', how='left')

urls_data_agg = pd.merge(urls_data_agg,
                           urls_data[urls_data.target == 1]['items'].value_counts().reset_index().rename(columns={
                               'items': '1_counts', 'index': 'items'
                               }),
                           on='items', how='left')

urls_data_agg['0_counts'] = urls_data_agg['0_counts'].fillna(0)
urls_data_agg['1_counts'] = urls_data_agg['1_counts'].fillna(0)

urls_data_agg['0_1_ratio'] = urls_data_agg['1_counts'] / urls_data_agg['0_counts']

urls_data_agg.to_feather('drive/MyDrive/spb/urls_data_agg.ftr')

In [29]:
ratio_default = train[train.DEF == 1].shape[0] / train[train.DEF == 0].shape[0]
ratio_shift = 0.25
all_counts_th = urls_data_agg.all_counts.quantile(0.9)

urls_data_agg_choosen = urls_data_agg[(urls_data_agg.all_counts >= all_counts_th)
                                          &((urls_data_agg['0_1_ratio'] < ratio_default - ratio_shift*ratio_default)
                                          |(urls_data_agg['0_1_ratio'] > ratio_default + ratio_shift*ratio_default))]

In [31]:
good_urls_data_agg_choosen = set(urls_data_agg_choosen['items'].values)

In [32]:
urls_processed = []
ratio_mean_0_1 = []

for x in tqdm(train.urls_hashed):
    if type(x) == str:
        x = list(set(x.split(' ')) & good_urls_data_agg_choosen)
        ratio_mean_0_1.append(urls_data_agg_choosen[urls_data_agg_choosen['items'].isin(x)]['0_1_ratio'].mean())
        urls_processed.append(' '.join(x))
    else:
        ratio_mean_0_1.append(ratio_default)
        urls_processed.append('')

100%|██████████| 303777/303777 [23:42<00:00, 213.49it/s]


In [33]:
temp = pd.DataFrame()
temp['urls_processed'] = urls_processed
temp['urls_ratio_mean_0_1'] = ratio_mean_0_1
temp['target'] = train.DEF.values
temp.to_feather('drive/MyDrive/spb/urls_train_processed.ftr')

## SVD features

### Functions

In [78]:
def transform_indices(data, users, items):
    data_index = {}
    for entity, field in zip(['users', 'items'], [users, items]):
        idx, idx_map = to_numeric_id(data, field)
        data_index[entity] = idx_map
        data.loc[:, field] = idx
    return data, data_index

def to_numeric_id(data, field):
    idx_data = data[field].astype("category")
    idx = idx_data.cat.codes
    idx_map = idx_data.cat.categories.rename(field)
    return idx, idx_map


def matrix_from_data(data, data_description, dtype=None):
    '''
    Converts pandas DataFrame into sparse CSR matrix.
    Assumes data in the DataFrame is alread normalized via `transform_indices`.
    '''
    # get indices of observed data
    user_idx = data[data_description['users']].values
    item_idx = data[data_description['items']].values
    feedback_data = data_description.get('feedback', None)
    if feedback_data is not None:
        feedback = data[feedback_data].values
    else:
        feedback = np.ones(len(user_idx))
    # construct rating matrix
    shape = (data_description['n_users'], data_description['n_items'])
    return csr_matrix((feedback, (user_idx, item_idx)), shape=shape, dtype=dtype)


def build_svd_model(config, data, data_description):
    source_matrix = matrix_from_data(data, data_description).asfptype()
    _, s, vt = svds(source_matrix, k=config['rank'], return_singular_vectors='vh')
    singular_values = s[::-1]
    item_factors = np.ascontiguousarray(vt[::-1, :].T)
    return item_factors, singular_values

### Prepare data and make SVD

In [79]:
train = pd.read_csv('drive/MyDrive/spb/train.csv', usecols=['CLIENT_ID'])
train_data = pd.read_feather('drive/MyDrive/spb/urls_train_processed.ftr')

In [80]:
urls = []
urls_indxs = []

for i in tqdm(range(train_data.shape[0])):
    x = train_data.urls_processed.iloc[i]
    if type(x) == str:
        x = x.split(' ')
        for j in range(len(x)):
            urls_indxs.append(i)
            urls.append(x[j])

urls_data = pd.DataFrame()
urls_data['users'] = train.CLIENT_ID.values[urls_indxs]
urls_data['items'] = urls
urls_data['feedback'] = train_data.target.values[urls_indxs]

urls_data['items'] = urls_data['items'].fillna('')

# tokens_data.to_feather('drive/MyDrive/spb/tokens_data.ftr')

100%|██████████| 303777/303777 [00:06<00:00, 47539.57it/s]


In [81]:
urls_data, urls_data_index = transform_indices(urls_data, 'users', 'items')

data_description = dict(
    users = urls_data_index['users'].name,
    items = urls_data_index['items'].name,
    feedback = 'feedback',
    n_users = len(urls_data_index['users']),
    n_items = len(urls_data_index['items']),
)
data_description

{'users': 'users',
 'items': 'items',
 'feedback': 'feedback',
 'n_users': 303777,
 'n_items': 32587}

In [84]:
with open('drive/MyDrive/spb/urls_data_index.pkl', 'wb') as f:
    pickle.dump(urls_data_index, f)
        
with open('drive/MyDrive/spb/urls_data_index.pkl', 'rb') as f:
    urls_data_index = pickle.load(f)

In [86]:
svd_params = build_svd_model(
    {'rank': 512},
    urls_data,
    data_description
)

item_factors, singvals = svd_params
# item_factors[:, :rank], singvals[:rank]

In [87]:
with open('drive/MyDrive/spb/urls_svd_params', 'wb') as f:
    pickle.dump(svd_params, f)

with open('drive/MyDrive/spb/urls_svd_params', 'rb') as f:
    svd_params = pickle.load(f)

item_factors, singvals = svd_params

In [90]:
svd_embeddings_urls = []

for x in tqdm(train_data.urls_processed):
    svd_embeddings_urls.append(np.mean(item_factors[np.where(urls_data_index['items'].isin(x.split(' ')))[0]], axis=0))

100%|██████████| 303777/303777 [08:59<00:00, 563.52it/s]


In [91]:
temp = pd.DataFrame()
temp[['urls_svd'+str(i) for i in range(512)]] = svd_embeddings_urls

  self[col] = igetitem(value, i)


In [92]:
temp

Unnamed: 0,urls_svd0,urls_svd1,urls_svd2,urls_svd3,urls_svd4,urls_svd5,urls_svd6,urls_svd7,urls_svd8,urls_svd9,...,urls_svd502,urls_svd503,urls_svd504,urls_svd505,urls_svd506,urls_svd507,urls_svd508,urls_svd509,urls_svd510,urls_svd511
0,-0.000723,0.001216,-0.002138,-2.722877e-16,-0.001446,-0.001577,0.000120,0.000246,0.000175,-0.000331,...,-0.009556,0.009382,0.003504,0.004731,0.003349,-0.005997,0.002606,0.006116,0.008277,0.010517
1,-0.055528,0.008235,-0.021262,-4.367629e-15,0.010034,0.009677,0.008870,-0.004536,-0.008857,-0.009549,...,-0.005554,-0.002813,-0.001566,-0.005648,0.002843,0.003020,0.008535,0.000808,-0.000012,0.006023
2,-0.000692,0.000626,-0.004123,9.112752e-17,-0.004907,-0.005931,0.000722,0.000162,-0.000328,-0.000446,...,0.013396,-0.011277,0.007336,-0.025562,-0.016322,0.016656,0.005439,-0.016036,-0.009849,0.004479
3,-0.073662,-0.033545,0.023136,6.946809e-15,-0.036185,-0.008227,0.021582,-0.030511,0.022909,0.009363,...,-0.000615,-0.006833,-0.000322,-0.001454,0.000704,-0.000837,0.003476,0.001106,0.000476,-0.001213
4,-0.014650,0.004242,-0.028976,-1.848874e-15,-0.010656,-0.031477,0.008949,-0.002908,0.008714,-0.004331,...,-0.001326,-0.004734,0.004881,-0.000341,0.001290,0.000270,0.001956,0.000385,0.000150,-0.000525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303772,-0.003324,0.003373,-0.018664,4.712499e-16,-0.016093,-0.020208,0.005144,0.003430,-0.004483,-0.000571,...,0.017878,0.004712,0.015581,0.003491,0.003966,-0.033181,0.004723,-0.001732,0.004082,-0.015115
303773,-0.000146,0.000062,-0.000374,2.048702e-16,-0.000491,-0.000247,-0.000028,-0.000356,-0.000024,-0.000483,...,0.000141,0.000613,0.001933,0.001385,-0.000960,-0.001530,-0.000127,0.002363,-0.000764,-0.000444
303774,-0.007265,0.002759,-0.014219,-1.549514e-15,0.002058,-0.015691,-0.005324,-0.001660,0.003737,0.005356,...,-0.001798,-0.003622,-0.001894,-0.001807,-0.002271,0.002272,-0.003042,-0.002180,-0.001454,0.001712
303775,-0.000379,0.000570,-0.002305,3.044021e-17,-0.001858,-0.002593,0.000396,0.000081,-0.000332,0.000153,...,0.002172,0.002923,0.005600,-0.003898,-0.000849,0.011744,0.003051,-0.000776,0.004829,0.002461


In [93]:
temp.to_feather('drive/MyDrive/spb/svd_embeddings_urls.ftr')