# Import

In [4]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.1 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 28.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 79.0 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 97.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 60.7 MB/s 
Building wheels for collected p

In [2]:
# Report the currently available RAM in gigabytes (bytes / 1e9) before loading
# the model and the embedding matrices.
from psutil import virtual_memory
virtual_memory().available / 1e9

53.42810112

In [5]:
# Mount Google Drive: every data path below is given relative to it as
# 'drive/MyDrive/spb/...'.
# NOTE(review): those relative paths assume the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive/')

from tqdm import tqdm

import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

import pickle

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Load data

In [13]:
# Only the DEF column of the training set is read here; it is used below to
# compute the class ratio. The test frame supplies the raw `tokens` column.
train = pd.read_csv('drive/MyDrive/spb/train.csv', usecols=['DEF'])
test = pd.read_feather('drive/MyDrive/spb/test.ftr')

# Work with tokens

## Choose representative tokens

In [14]:
# Per-token aggregates; the cells below read its 'items', 'all_counts' and
# '0_1_ratio' columns.
# NOTE(review): presumably computed on the training set in another notebook — confirm.
tokens_data_agg = pd.read_feather('drive/MyDrive/spb/tokens_data_agg.ftr')

In [15]:
# Class ratio of the training set: rows with DEF == 1 over rows with DEF == 0.
n_def_1 = train[train.DEF == 1].shape[0]
n_def_0 = train[train.DEF == 0].shape[0]
ratio_default = n_def_1 / n_def_0

# Relative half-width of the band around `ratio_default` that is treated as
# uninformative, and the minimum count a token needs to be kept.
ratio_shift = 0.25
all_counts_th = tokens_data_agg.all_counts.median()

# NOTE(review): this compares '0_1_ratio' against a 1-over-0 class ratio —
# confirm both ratios are defined in the same direction.
ratio_lower = ratio_default - ratio_shift*ratio_default
ratio_upper = ratio_default + ratio_shift*ratio_default
is_frequent = tokens_data_agg.all_counts >= all_counts_th
is_outside_band = (
    (tokens_data_agg['0_1_ratio'] < ratio_lower)
    | (tokens_data_agg['0_1_ratio'] > ratio_upper)
)

# Keep tokens that are frequent enough AND whose ratio lies outside the band.
tokens_data_agg_choosen = tokens_data_agg[is_frequent & is_outside_band]

In [16]:
# Selected tokens as a set, so each row's tokens can be filtered with a set
# intersection in the loop below.
good_tokens_data_agg_choosen = set(tokens_data_agg_choosen['items'].values)

In [17]:
# Lookup of '0_1_ratio' by token, built once instead of scanning the whole
# selection frame with `isin` for every row.
ratio_by_token = tokens_data_agg_choosen.set_index('items')['0_1_ratio']

tokens_processed = []
ratio_mean_0_1 = []

for raw_tokens in tqdm(test.tokens):
    if isinstance(raw_tokens, str):
        # Keep only the selected tokens. They are sorted because set iteration
        # order for strings changes between interpreter runs (hash
        # randomisation), and the joined string is later fed to a sentence
        # encoder, where token order affects the result.
        # NOTE(review): apply the same ordering wherever the train features are built.
        kept_tokens = sorted(set(raw_tokens.split(' ')) & good_tokens_data_agg_choosen)
        # Mean ratio of the kept tokens; NaN when no token survives the filter
        # (mean of an empty selection), as before.
        ratio_mean_0_1.append(ratio_by_token.loc[kept_tokens].mean())
        tokens_processed.append(' '.join(kept_tokens))
    else:
        # Missing token string: fall back to the global class ratio and an
        # empty text.
        ratio_mean_0_1.append(ratio_default)
        tokens_processed.append('')

100%|██████████| 154804/154804 [04:06<00:00, 626.94it/s]


In [18]:
# Collect both per-row features in one frame and persist them for the
# embedding steps that follow.
temp = pd.DataFrame({
    'tokens_processed': tokens_processed,
    'tokens_ratio_mean_0_1': ratio_mean_0_1,
})
temp.to_feather('drive/MyDrive/spb/test_processed.ftr')

## Transformer embeddings on sentences

In [20]:
# Reload the processed tokens from the file written in the previous section,
# so this section can start from the cached result.
test_data = pd.read_feather('drive/MyDrive/spb/test_processed.ftr')

In [21]:
# Load the 'LaBSE' sentence-embedding model; the captured output shows the
# model files being downloaded (about 1.9 GB for the largest file).
embed_model = SentenceTransformer('LaBSE')

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/461 [00:00<?, ?B/s]

In [23]:
# Encode each row's space-joined selected tokens as one "sentence".
# Rows that had no token string were stored as '' by the processing loop above.
embeddings = embed_model.encode(test_data.tokens_processed.values,
                                show_progress_bar=True)

Batches:   0%|          | 0/4838 [00:00<?, ?it/s]

In [24]:
# Build the frame in a single constructor call: assigning hundreds of columns
# into an empty DataFrame inserts them one by one and triggers pandas'
# fragmentation PerformanceWarning.
# One 'labse_<i>' column per embedding dimension.
temp = pd.DataFrame(
    embeddings,
    columns=['labse_'+str(i) for i in range(embeddings.shape[1])],
)

  self[col] = igetitem(value, i)


In [25]:
# Persist the LaBSE embedding columns for the test set.
temp.to_feather('drive/MyDrive/spb/test_LaBSE_embeddings.ftr')

## Transformer embeddings on selected words (disabled: takes too long to run)

In [None]:
# embed_model = SentenceTransformer('DeepPavlov/distilrubert-base-cased-conversational')

In [None]:
# word_embeddings = []

# for x in tqdm(train_data.tokens_processed):
#     word_embeddings.append(np.mean(embed_model.encode(x.split(' ')), axis=0))

## SVD features

### Functions

In [49]:
def transform_indices(data, users, items):
    """Replace the `users` and `items` columns of `data` with integer codes.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns. It is modified in place and also returned.
    users, items : hashable
        Labels of the two columns to encode.

    Returns
    -------
    tuple
        ``(data, data_index)`` where ``data_index`` maps ``'users'`` and
        ``'items'`` to the pandas Index of original values; the position of a
        value in that Index is its integer code.
    """
    data_index = {}
    for entity, field in (('users', users), ('items', items)):
        codes, categories = to_numeric_id(data, field)
        data_index[entity] = categories
        # Plain column assignment swaps in the integer column. The previous
        # `data.loc[:, field] = ...` form writes into the existing column on
        # pandas >= 2.0, which leaves an object-dtype column holding ints.
        data[field] = codes
    return data, data_index

def to_numeric_id(data, field):
    """Encode column `field` of `data` as categorical codes.

    Returns
    -------
    tuple
        ``(idx, idx_map)``: `idx` is a Series of integer codes aligned with
        `data` (pandas assigns -1 to missing values), and `idx_map` is the
        Index of categories, renamed to `field`.
    """
    as_category = data[field].astype("category")
    idx = as_category.cat.codes
    idx_map = as_category.cat.categories.rename(field)
    return idx, idx_map


def matrix_from_data(data, data_description, dtype=None):
    """Build a sparse CSR matrix from a DataFrame of (user, item) records.

    Assumes the user and item columns already hold integer positions, e.g.
    after normalization via `transform_indices`.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per observed entry.
    data_description : dict
        Must provide 'users' and 'items' (column labels) and 'n_users' and
        'n_items' (matrix shape). An optional 'feedback' entry names the
        column holding the values; without it every entry is 1.
    dtype : optional
        Passed through to `csr_matrix`.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (n_users, n_items).
    """
    # row / column positions of the observed entries
    rows = data[data_description['users']].values
    cols = data[data_description['items']].values

    feedback_field = data_description.get('feedback')
    if feedback_field is None:
        values = np.ones(len(rows))
    else:
        values = data[feedback_field].values

    n_rows, n_cols = data_description['n_users'], data_description['n_items']
    return csr_matrix((values, (rows, cols)), shape=(n_rows, n_cols), dtype=dtype)


def build_svd_model(config, data, data_description):
    """Compute a truncated SVD of the matrix built from `data`.

    Parameters
    ----------
    config : dict
        Must provide 'rank', the number of singular triplets to compute.
    data, data_description
        Forwarded unchanged to `matrix_from_data`.

    Returns
    -------
    tuple
        ``(item_factors, singular_values)``: `item_factors` is the transposed,
        row-reversed `vh` as a C-contiguous array (one row per matrix column),
        and `singular_values` is the reversed vector returned by `svds`.
    """
    interactions = matrix_from_data(data, data_description).asfptype()
    # Only the singular values and `vh` are used; the first return value is discarded.
    _, sigma, vh = svds(interactions, k=config['rank'], return_singular_vectors='vh')
    # Both outputs are reversed with the same slice so they stay aligned.
    # NOTE(review): the reversal presumably turns ascending svds output into
    # descending order — confirm against the installed SciPy version.
    reverse = slice(None, None, -1)
    item_factors = np.ascontiguousarray(vh[reverse, :].T)
    singular_values = sigma[reverse]
    return item_factors, singular_values

### Prepare data and make SVD

In [68]:
# Reload the processed tokens for the test set (same file as in the sentence
# embedding section).
test_data = pd.read_feather('drive/MyDrive/spb/test_processed.ftr')

In [69]:
# Token index mapping; its 'items' entry is used below to locate rows of the
# item-factor matrix.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open('drive/MyDrive/spb/tokens_data_index.pkl', 'rb') as f:
    tokens_data_index = pickle.load(f)

In [70]:
# The pickle holds an (item_factors, singular_values) pair.
# NOTE(review): presumably the output of `build_svd_model` fitted elsewhere — confirm.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open('drive/MyDrive/spb/tokens_svd_params', 'rb') as f:
    svd_params = pickle.load(f)

item_factors, singvals = svd_params

In [71]:
# Token -> row position in `item_factors`, built once instead of running
# `isin` over the whole index for every row.
# NOTE(review): assumes the entries of tokens_data_index['items'] are unique
# and aligned with the rows of `item_factors` — confirm.
token_position = {token: pos for pos, token in enumerate(tokens_data_index['items'])}
# Explicit all-NaN vector for rows without any known token; the result is the
# same as averaging an empty slice, without the RuntimeWarning.
no_token_embedding = np.full(item_factors.shape[1], np.nan, dtype=item_factors.dtype)

svd_embeddings_tokens = []

for sentence in tqdm(test_data.tokens_processed):
    # Sorted positions reproduce the row order of the former np.where lookup.
    rows = sorted({token_position[token]
                   for token in sentence.split(' ')
                   if token in token_position})
    if rows:
        svd_embeddings_tokens.append(np.mean(item_factors[rows], axis=0))
    else:
        svd_embeddings_tokens.append(no_token_embedding)

100%|██████████| 154804/154804 [02:42<00:00, 950.72it/s]


In [72]:
# Stack the per-row vectors and build the frame in one constructor call, which
# avoids column-by-column insertion into an empty frame. The column count is
# taken from the data instead of a hard-coded 512.
token_svd_matrix = np.vstack(svd_embeddings_tokens)
temp = pd.DataFrame(
    token_svd_matrix,
    columns=['token_svd'+str(i) for i in range(token_svd_matrix.shape[1])],
)

In [73]:
# Persist the averaged token SVD embeddings for the test set.
temp.to_feather('drive/MyDrive/spb/test_svd_embeddings_tokens.ftr')

# Load data

In [34]:
# Reload the inputs for the URL features; the `tokens` column is dropped
# because this part only reads `urls_hashed`.
train = pd.read_csv('drive/MyDrive/spb/train.csv', usecols=['DEF'])
test = pd.read_feather('drive/MyDrive/spb/test.ftr').drop('tokens', axis=1)

# Work with urls

## Choose representative URLs

In [35]:
# Per-URL aggregates; the cells below read its 'items', 'all_counts' and
# '0_1_ratio' columns.
# NOTE(review): presumably computed on the training set in another notebook — confirm.
urls_data_agg = pd.read_feather('drive/MyDrive/spb/urls_data_agg.ftr')

In [36]:
# Class ratio of the training set: rows with DEF == 1 over rows with DEF == 0.
n_def_1 = train[train.DEF == 1].shape[0]
n_def_0 = train[train.DEF == 0].shape[0]
ratio_default = n_def_1 / n_def_0

# Relative half-width of the band around `ratio_default` that is treated as
# uninformative, and the minimum count a URL needs to be kept (90th percentile).
ratio_shift = 0.25
all_counts_th = urls_data_agg.all_counts.quantile(0.9)

# NOTE(review): this compares '0_1_ratio' against a 1-over-0 class ratio —
# confirm both ratios are defined in the same direction.
ratio_lower = ratio_default - ratio_shift*ratio_default
ratio_upper = ratio_default + ratio_shift*ratio_default
is_frequent = urls_data_agg.all_counts >= all_counts_th
is_outside_band = (
    (urls_data_agg['0_1_ratio'] < ratio_lower)
    | (urls_data_agg['0_1_ratio'] > ratio_upper)
)

# Keep URLs that are frequent enough AND whose ratio lies outside the band.
urls_data_agg_choosen = urls_data_agg[is_frequent & is_outside_band]

In [37]:
# Selected URLs as a set, so each row's URLs can be filtered with a set
# intersection in the loop below.
good_urls_data_agg_choosen = set(urls_data_agg_choosen['items'].values)

In [38]:
# Lookup of '0_1_ratio' by URL, built once instead of scanning the whole
# selection frame with `isin` for every row.
ratio_by_url = urls_data_agg_choosen.set_index('items')['0_1_ratio']

urls_processed = []
ratio_mean_0_1 = []

for raw_urls in tqdm(test.urls_hashed):
    if isinstance(raw_urls, str):
        # Keep only the selected URLs. They are sorted because set iteration
        # order for strings changes between interpreter runs (hash
        # randomisation), which made the stored string differ from run to run.
        kept_urls = sorted(set(raw_urls.split(' ')) & good_urls_data_agg_choosen)
        # Mean ratio of the kept URLs; NaN when no URL survives the filter
        # (mean of an empty selection), as before.
        ratio_mean_0_1.append(ratio_by_url.loc[kept_urls].mean())
        urls_processed.append(' '.join(kept_urls))
    else:
        # Missing URL string: fall back to the global class ratio and an
        # empty text.
        ratio_mean_0_1.append(ratio_default)
        urls_processed.append('')

100%|██████████| 154804/154804 [04:56<00:00, 522.35it/s]


In [39]:
# Collect both per-row URL features in one frame and persist them for the SVD
# step that follows.
temp = pd.DataFrame({
    'urls_processed': urls_processed,
    'urls_ratio_mean_0_1': ratio_mean_0_1,
})
temp.to_feather('drive/MyDrive/spb/urls_test_processed.ftr')

## SVD features

### Functions

In [59]:
# NOTE(review): these helpers are also defined earlier in this notebook; once
# this cell runs, the definitions below are the ones in effect.
def transform_indices(data, users, items):
    """Replace the `users` and `items` columns of `data` with integer codes.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns. It is modified in place and also returned.
    users, items : hashable
        Labels of the two columns to encode.

    Returns
    -------
    tuple
        ``(data, data_index)`` where ``data_index`` maps ``'users'`` and
        ``'items'`` to the pandas Index of original values; the position of a
        value in that Index is its integer code.
    """
    data_index = {}
    for entity, field in (('users', users), ('items', items)):
        codes, categories = to_numeric_id(data, field)
        data_index[entity] = categories
        # Plain column assignment swaps in the integer column. The previous
        # `data.loc[:, field] = ...` form writes into the existing column on
        # pandas >= 2.0, which leaves an object-dtype column holding ints.
        data[field] = codes
    return data, data_index

def to_numeric_id(data, field):
    """Encode column `field` of `data` as categorical codes.

    Returns
    -------
    tuple
        ``(idx, idx_map)``: `idx` is a Series of integer codes aligned with
        `data` (pandas assigns -1 to missing values), and `idx_map` is the
        Index of categories, renamed to `field`.
    """
    as_category = data[field].astype("category")
    idx = as_category.cat.codes
    idx_map = as_category.cat.categories.rename(field)
    return idx, idx_map


def matrix_from_data(data, data_description, dtype=None):
    """Build a sparse CSR matrix from a DataFrame of (user, item) records.

    Assumes the user and item columns already hold integer positions, e.g.
    after normalization via `transform_indices`.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per observed entry.
    data_description : dict
        Must provide 'users' and 'items' (column labels) and 'n_users' and
        'n_items' (matrix shape). An optional 'feedback' entry names the
        column holding the values; without it every entry is 1.
    dtype : optional
        Passed through to `csr_matrix`.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (n_users, n_items).
    """
    # row / column positions of the observed entries
    rows = data[data_description['users']].values
    cols = data[data_description['items']].values

    feedback_field = data_description.get('feedback')
    if feedback_field is None:
        values = np.ones(len(rows))
    else:
        values = data[feedback_field].values

    n_rows, n_cols = data_description['n_users'], data_description['n_items']
    return csr_matrix((values, (rows, cols)), shape=(n_rows, n_cols), dtype=dtype)


def build_svd_model(config, data, data_description):
    """Compute a truncated SVD of the matrix built from `data`.

    Parameters
    ----------
    config : dict
        Must provide 'rank', the number of singular triplets to compute.
    data, data_description
        Forwarded unchanged to `matrix_from_data`.

    Returns
    -------
    tuple
        ``(item_factors, singular_values)``: `item_factors` is the transposed,
        row-reversed `vh` as a C-contiguous array (one row per matrix column),
        and `singular_values` is the reversed vector returned by `svds`.
    """
    interactions = matrix_from_data(data, data_description).asfptype()
    # Only the singular values and `vh` are used; the first return value is discarded.
    _, sigma, vh = svds(interactions, k=config['rank'], return_singular_vectors='vh')
    # Both outputs are reversed with the same slice so they stay aligned.
    # NOTE(review): the reversal presumably turns ascending svds output into
    # descending order — confirm against the installed SciPy version.
    reverse = slice(None, None, -1)
    item_factors = np.ascontiguousarray(vh[reverse, :].T)
    singular_values = sigma[reverse]
    return item_factors, singular_values

### Prepare data and make SVD

In [60]:
# Reload the processed URLs written in the previous section.
# Note: this rebinds `test_data`, which held the processed tokens earlier in
# the notebook.
test_data = pd.read_feather('drive/MyDrive/spb/urls_test_processed.ftr')

In [61]:
# URL index mapping; its 'items' entry is used below to locate rows of the
# item-factor matrix.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open('drive/MyDrive/spb/urls_data_index.pkl', 'rb') as f:
    urls_data_index = pickle.load(f)

In [62]:
# The pickle holds an (item_factors, singular_values) pair; this rebinds
# `item_factors` and `singvals`, which held the token factors earlier in the notebook.
# NOTE(review): presumably the output of `build_svd_model` fitted elsewhere — confirm.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open('drive/MyDrive/spb/urls_svd_params', 'rb') as f:
    svd_params = pickle.load(f)

item_factors, singvals = svd_params

In [65]:
# URL -> row position in `item_factors`, built once instead of running `isin`
# over the whole index for every row.
# NOTE(review): assumes the entries of urls_data_index['items'] are unique and
# aligned with the rows of `item_factors` — confirm.
url_position = {url: pos for pos, url in enumerate(urls_data_index['items'])}
# Explicit all-NaN vector for rows without any known URL; the result is the
# same as averaging an empty slice, without the RuntimeWarning.
no_url_embedding = np.full(item_factors.shape[1], np.nan, dtype=item_factors.dtype)

svd_embeddings_urls = []

for url_string in tqdm(test_data.urls_processed):
    # Sorted positions reproduce the row order of the former np.where lookup.
    rows = sorted({url_position[url]
                   for url in url_string.split(' ')
                   if url in url_position})
    if rows:
        svd_embeddings_urls.append(np.mean(item_factors[rows], axis=0))
    else:
        svd_embeddings_urls.append(no_url_embedding)

100%|██████████| 154804/154804 [03:56<00:00, 654.69it/s]


In [66]:
# Stack the per-row vectors and build the frame in one constructor call, which
# avoids column-by-column insertion into an empty frame (the source of the
# fragmentation PerformanceWarning). The column count is taken from the data
# instead of a hard-coded 512.
urls_svd_matrix = np.vstack(svd_embeddings_urls)
temp = pd.DataFrame(
    urls_svd_matrix,
    columns=['urls_svd'+str(i) for i in range(urls_svd_matrix.shape[1])],
)

  self[col] = igetitem(value, i)


In [67]:
# Persist the averaged URL SVD embeddings for the test set.
temp.to_feather('drive/MyDrive/spb/test_svd_embeddings_urls.ftr')