In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = [os.path.join(dirname, filename) for filename in filenames]
    if file_paths:
        print('\n'.join(file_paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Checkpoint file used by the training and inference cells of this notebook.
model_path = '/kaggle/working/foursquare.pth'

In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from transformers import BertTokenizer
from transformers import BertModel
import math

def distance(lat1, lon1, lat2, lon2):
    """Return the great-circle angle between two points, in radians times 100.

    Args:
        lat1, lon1: latitude and longitude of the first point, in degrees.
        lat2, lon2: latitude and longitude of the second point, in degrees.
            Any coordinate that is NaN is replaced by 0 before the computation.

    Returns:
        float in ``[0, 100 * pi]``: the central angle between the two points
        (spherical law of cosines) multiplied by 100.

    NOTE(review): the previous version had latitude and longitude swapped in
    the formula. Any code that feeds this value to a model as a feature (and
    any checkpoint trained on the old values) should be regenerated.
    """
    # NaN is the only value that is not equal to itself; missing coordinates
    # fall back to 0 so the function never propagates NaN.
    lat1, lon1, lat2, lon2 = (
        math.radians(0 if coord != coord else coord)
        for coord in (lat1, lon1, lat2, lon2)
    )
    # Spherical law of cosines: latitudes go into the sin/cos products and the
    # longitude difference goes into the final cosine.
    value = (math.sin(lat1) * math.sin(lat2)
             + math.cos(lat1) * math.cos(lat2) * math.cos(lon2 - lon1))
    # Rounding can push `value` marginally outside [-1, 1], where acos raises.
    return math.acos(np.clip(value, -1, 1)) * 100

# Module-level tokenizer shared by the notebook; it is loaded from the same
# 'bert-base-multilingual-cased' checkpoint name that BertClassifier uses for
# its encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

class BertClassifier(nn.Module):
    """BERT encoder whose pooled output is reduced to one score and combined
    with two extra numeric features to produce two output logits.

    Args:
        dropout: drop probability applied to BERT's pooled output.
    """

    def __init__(self, dropout=0.5):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.dropout = nn.Dropout(dropout)
        # Collapse the 768-wide pooled vector to a single text score.
        self.linear1 = nn.Linear(768, 1)
        # 3 inputs = the columns of `data_input` plus the text score, so
        # `data_input` must have exactly two columns.
        self.linear2 = nn.Linear(3, 2)

    def forward(self, data_input, input_id, mask):
        """Return logits of shape (batch, 2).

        Args:
            data_input: numeric features, concatenated with the text score
                along dim 1.
            input_id: token ids forwarded to BERT as ``input_ids``.
            mask: forwarded to BERT as ``attention_mask``.
        """
        # With return_dict=False BERT returns a tuple; only the pooled output
        # (second element) is used.
        _sequence_output, pooled = self.bert(
            input_ids=input_id, attention_mask=mask, return_dict=False
        )
        text_score = self.linear1(self.dropout(pooled))
        features = torch.cat([data_input, text_score], dim=1).float()
        return self.linear2(features)

In [None]:
# Read pairs.csv from the competition input directory. The bare name on the
# last line makes the notebook render the resulting DataFrame as cell output.
df_pairs = pd.read_csv('/kaggle/input/foursquare-location-matching/pairs.csv')
df_pairs

In [None]:
from torch.utils.data import DataLoader

# Number of pairs sampled from df_pairs for the train/val/test experiment.
N = 10000

# Draw N rows once (fixed seed for reproducibility) and cut them 80/10/10 by
# position. Plain iloc slicing replaces np.split(), which on a DataFrame goes
# through the deprecated DataFrame.swapaxes and emits a FutureWarning on
# recent pandas; the resulting rows are the same.
df_sample = df_pairs.sample(n=N, random_state=42)
train_end, val_end = int(.8 * N), int(.9 * N)
df_train = df_sample.iloc[:train_end]
df_val = df_sample.iloc[train_end:val_end]
df_test = df_sample.iloc[val_end:]

print(len(df_train), len(df_val), len(df_test))

In [None]:
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs):
    """Fine-tune ``model`` and checkpoint it whenever validation loss improves.

    Args:
        model: module called as ``model(data_input, input_id, mask)`` that
            returns one row of class logits per sample.
        train_data: training rows, wrapped in ``Dataset(train_data)``.
        val_data: validation rows, wrapped in ``Dataset(val_data)``.
        learning_rate: learning rate passed to Adam.
        epochs: number of full passes over ``train_data``.

    Side effects:
        Prints per-epoch metrics and writes ``model.state_dict()`` to the
        module-level ``model_path`` each time the validation loss reaches a
        new minimum.
    """
    # NOTE(review): `Dataset` is not defined in the visible part of this
    # notebook. Each item is assumed to be ((features, tokenizer_output), label)
    # with a 2-D label (argmax over dim 1 is taken below) - confirm.
    train_dataset, val_dataset = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=8)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    # Lowest validation loss seen so far; the first epoch always improves on inf.
    best_val_loss = float("inf")

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0

        # Enable dropout for the optimisation pass (it is disabled again for
        # validation below).
        model.train()
        with tqdm(train_dataloader) as pbar:
            for train_input, train_label in pbar:
                train_label = train_label.to(device)
                data_input = train_input[0].to(device)
                mask = train_input[1]['attention_mask'].to(device)
                # Drop the singleton dimension left at dim 1 by batching the
                # tokenizer output.
                input_id = train_input[1]['input_ids'].squeeze(1).to(device)

                output = model(data_input, input_id, mask)

                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()

                acc = (output.argmax(dim=1) == train_label.argmax(dim=1)).int().sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
                pbar.set_postfix({'loss': batch_loss.item() / train_label.size()[0], 'acc': acc / train_label.size()[0]})

        total_acc_val = 0
        total_loss_val = 0

        # Validation must be deterministic: switch dropout off and skip
        # gradient bookkeeping.
        model.eval()
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                data_input = val_input[0].to(device)
                mask = val_input[1]['attention_mask'].to(device)
                input_id = val_input[1]['input_ids'].squeeze(1).to(device)

                output = model(data_input, input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label.argmax(dim=1)).int().sum().item()
                total_acc_val += acc

        # NOTE(review): the losses below divide a sum of per-batch mean losses
        # by the number of samples, so they are roughly 1/batch_size of the
        # true mean loss. Kept as-is so logged numbers stay comparable.
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
            | Train Accuracy: {total_acc_train / len(train_data): .3f} \
            | Val Loss: {total_loss_val / len(val_data): .3f} \
            | Val Accuracy: {total_acc_val / len(val_data): .3f}')

        # Only overwrite the checkpoint when this epoch is the best so far.
        val_loss = total_loss_val / len(val_data)
        if val_loss < best_val_loss:
            print(f'save model {model_path} from val_loss {best_val_loss} to {val_loss: .3f}')
            torch.save(model.state_dict(), model_path)
            best_val_loss = val_loss

In [None]:
# Training hyper-parameters.
EPOCHS = 20
LR = 1e-5

model = BertClassifier()
# Resume from an earlier checkpoint when one exists. map_location='cpu' lets a
# checkpoint saved from a CUDA model load in a CPU-only session as well;
# train() moves the model to the GPU itself when one is available.
if os.path.exists(model_path):
    model.load_state_dict(torch.load(model_path, map_location='cpu'))

train(model, df_train, df_val, LR, EPOCHS)

In [None]:
# Rebuild the classifier and restore the weights stored at model_path.
model = BertClassifier()
# map_location='cpu' lets a checkpoint saved from a CUDA model load in a
# CPU-only session too.
model.load_state_dict(torch.load(model_path, map_location='cpu'))
# Run inference on the GPU when one is available; the tensors built in
# is_match() must end up on the same device as the model.
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
# Inference mode: turns the dropout layer off so predictions are deterministic.
model.eval()

def _pair_text(x1, x2):
    """Build the text fed to the tokenizer for a pair of records.

    The name and categories of both records are concatenated in order, each
    followed by '[SEP]'; a missing (NaN) field contributes an empty string.
    """
    fields = (x1['name'], x1['categories'], x2['name'], x2['categories'])
    # NaN is the only value that is not equal to itself.
    return ''.join(('' if field != field else field) + '[SEP]' for field in fields)


def is_match(x1, x2):
    """Return True when the model predicts that two records are the same place.

    Args:
        x1, x2: records indexable by 'latitude', 'longitude', 'country',
            'name' and 'categories' (e.g. rows of the test DataFrame).

    Returns:
        bool: False whenever the two countries differ (or either is NaN);
        otherwise True when the second output logit is at least as large as
        the first.

    NOTE(review): the previous text expression chained conditional expressions
    without parentheses, so only one of the four fields ever reached the
    tokenizer. The code that builds training text is not visible here - make
    sure it builds the text the same way.
    """
    # A pair from different countries is never reported as a match, so skip
    # the expensive BERT forward pass for it.
    country = x1['country'] == x2['country']
    if not country:
        return False

    # Follow the model's own device instead of guessing, so inputs and weights
    # always live in the same place.
    device = next(model.parameters()).device
    # Dropout must be off at inference time or predictions become random.
    if model.training:
        model.eval()

    dist = distance(x1['latitude'], x1['longitude'], x2['latitude'], x2['longitude'])
    text = tokenizer(
        _pair_text(x1, x2),
        padding='max_length', max_length=512, truncation=True, return_tensors='pt'
    )

    data_input = torch.tensor([[dist, float(country)]]).to(device)
    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].squeeze(1).to(device)

    with torch.no_grad():
        output = model(data_input, input_id, mask)

    return (output[0][0] <= output[0][1]).item()

df = pd.read_csv('/kaggle/input/foursquare-location-matching/test.csv')

# Collect one record per test row and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic anyway.
submission_rows = []
for i in range(len(df.index)):
    # Every id is listed as matching itself first, then each other row that
    # is_match() accepts. All ordered pairs are checked, so this is quadratic
    # in the number of test rows.
    matches = [df.loc[i, 'id']]
    for j in range(len(df.index)):
        if i != j and is_match(df.loc[i, :], df.loc[j, :]):
            matches.append(df.loc[j, 'id'])
    submission_rows.append({'id': df.loc[i, 'id'], 'matches': ' '.join(matches)})

df_match = pd.DataFrame(submission_rows, columns=['id', 'matches'])

df_match.to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
# For offline model
# Shell commands (IPython `!` syntax): install the git-lfs package, enable it
# for the current user, then clone the bert-base-multilingual-cased repository
# from huggingface.co into the current working directory.
# NOTE(review): presumably the clone is meant to be saved and used in place of
# the hub download in an internet-disabled session - confirm; nothing in this
# notebook reads from the cloned directory.
!sudo apt-get install git-lfs
!git lfs install
!git clone https://huggingface.co/bert-base-multilingual-cased