In [1]:
import pandas as pd
import s3fs
import boto3
from io import StringIO # python3; python2: BytesIO 
from boto3.s3.transfer import TransferConfig
import torch
from transformers import *
import numpy as np
import ast
import time

In [None]:
# Load the cased multilingual BERT checkpoint once: the tokenizer keeps the
# original casing (do_lower_case=False) and the encoder is moved to GPU index 1.
BERT_CHECKPOINT = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT, do_lower_case=False)
model = BertModel.from_pretrained(BERT_CHECKPOINT).cuda(1)

In [None]:
# Per-user token table, read with its own header row.
user_tokens = pd.read_csv('s3://recsys-challenge-2020/user_tokens_like.csv')

# The validation file is \x01-separated; only the columns at positions 0 and 14
# are read and they are labelled explicitly. The space inside "text_ tokens" is
# kept on purpose: later cells turn it into '_' and rely on the resulting
# "text__tokens" spelling.
column_of_interest = ["text_ tokens", "engaging_user_id"]
val_set = pd.read_csv(
    's3://recsys-challenge-2020/val.tsv',
    sep="\x01",
    names=column_of_interest,
    usecols=[0, 14],
    encoding="utf-8",
)
print('loaded datasets')

In [None]:
# Left join on the engaging user so every validation row is kept; rows whose
# user is absent from `user_tokens` get NaN in the right-hand columns.
# Spaces in the resulting column labels are then replaced by underscores.
user_tokens_val_set = val_set.merge(
    user_tokens,
    how='left',
    on='engaging_user_id',
    sort=False,
).rename(columns=lambda label: label.replace(' ', '_'))
print('join completed')
row_count = len(user_tokens_val_set)
print('number of rows for which score needs to be computed: ' + str(row_count))

In [None]:
def _first_token_embedding(token_string, max_tokens=512):
    """Encode one tab-separated token-id string and return its first-position embedding.

    Args:
        token_string: token ids as decimal integers separated by tabs.
        max_tokens: ids beyond this count are dropped before encoding.

    Returns:
        1-D tensor: first output of the global `model`, batch item 0,
        sequence position 0 (presumably the [CLS] vector -- this assumes the
        id strings already start with the [CLS] id; confirm against the data).
    """
    token_ids = [int(tok) for tok in token_string.split('\t')][:max_tokens]
    # Put the input wherever the model lives instead of hard-coding a GPU index.
    device = next(model.parameters()).device
    input_ids = torch.tensor(token_ids, device=device).unsqueeze(0)
    # Inference only: no autograd graph is needed, which saves memory and time.
    with torch.no_grad():
        hidden_states = model(input_ids)[0]
    return hidden_states[0, 0]


def calculate_average(row1, row2, index, time1, prior=0.439, max_tokens=512):
    """Score how likely a user is to like a tweet from BERT embedding similarity.

    The user's history embeddings are averaged and L2-normalised, the candidate
    tweet embedding is L2-normalised, and their cosine similarity is used as
    the likelihood P(B|A) in Bayes' rule:

        P(A|B) = P(B|A) * P(A) / P(B)
        P(B)   = P(B|A) * P(A) + P(B|~A) * P(~A),  with P(B|~A) = 1 - P(B|A)

    Args:
        row1: NaN/None when the user has no history, otherwise the string form
            of a Python list whose items are tab-separated token-id strings.
        row2: tab-separated token ids of the tweet being scored.
        index: row number, used only for progress output every 1000 rows.
        time1: `time.time()` value taken when the run started.
        prior: P(A), returned unchanged when there is no usable history.
        max_tokens: maximum number of token ids fed to the model per tweet.

    Returns:
        The posterior as a Python float.
    """
    if index % 1000 == 0:
        print(index)
        print(time.time() - time1)

    # No history for this user: fall back to the prior without touching the model.
    if pd.isna(row1):
        return prior

    user_tweets = ast.literal_eval(row1)
    # An empty history would give a zero vector and a NaN score after normalising.
    if not user_tweets:
        return prior

    # Average over the number of tweets (not the length of the raw string).
    user_embeddings = torch.stack(
        [_first_token_embedding(tweet, max_tokens) for tweet in user_tweets]
    )
    user_mean = user_embeddings.mean(dim=0)
    user_unit = user_mean / torch.norm(user_mean)

    tweet_embedding = _first_token_embedding(row2, max_tokens)
    tweet_unit = tweet_embedding / torch.norm(tweet_embedding)

    cosine = torch.dot(user_unit, tweet_unit).item()
    # Cosine similarity lies in [-1, 1]; clamp it so it is a valid probability
    # and the posterior cannot come out negative.
    # NOTE(review): clamping (rather than rescaling) is a modelling choice -- confirm.
    likelihood = min(max(cosine, 0.0), 1.0)

    numerator = likelihood * prior                             # P(B|A) * P(A)
    evidence = numerator + (1.0 - likelihood) * (1.0 - prior)  # P(B)
    return numerator / evidence

In [None]:
# Start from an empty frame; the scoring cell adds the 'like_score' column to it.
user_val_set_like_score = pd.DataFrame()

In [None]:
# Score every joined row and report the total wall-clock time in seconds.
time1 = time.time()


def _score_row(row):
    """Pass one joined row to calculate_average: user history (_y), tweet tokens (_x), row label."""
    return calculate_average(row.text__tokens_y, row.text__tokens_x, row.name, time1)


user_val_set_like_score['like_score'] = user_tokens_val_set.apply(_score_row, axis=1)
time2 = time.time()
print(time2 - time1)

In [None]:
# Write the scores to S3 without the index column.
LIKE_SCORE_URI = 's3://recsys-challenge-2020/user_like_score.csv'
user_val_set_like_score.to_csv(LIKE_SCORE_URI, index=False)