In [None]:
# Install the third-party packages this notebook relies on.
# transformers: imported below for the tokenizers and models.
!pip install -U transformers
# geopy: provides `geodesic`, used for the pairwise distance computation.
!pip install geopy
# fugashi / ipadic are not imported directly in this notebook; presumably they
# are required by the Japanese BERT tokenizers loaded later — TODO confirm.
!pip install fugashi[unidic-lite]
!pip install ipadic
# soyclustering: provides SphericalKMeans for the clustering section.
!pip install soyclustering

In [None]:
import os
import torch
import pandas as pd
import numpy as np
import math
from geopy.distance import geodesic
from google.colab import drive
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments, BertTokenizerFast, PreTrainedTokenizerFast, EncoderDecoderModel
from scipy.sparse import csr_matrix

In [None]:
# Mount Google Drive at /content/drive; the CSV files read below live under this mount point.
drive.mount('/content/drive')

In [None]:
# Read the three input tables from the mounted Drive folder, in this order:
# the Airbnb sample, the Oshimaland dataset, and the open-data sample.
csv_locations = (
    '/content/drive/MyDrive/Colab Notebooks/airbnb_data_sample.csv',
    '/content/drive/MyDrive/Colab Notebooks/oshimaland_dataset_final.csv',
    '/content/drive/MyDrive/Colab Notebooks/open_data_sample.csv',
)
airbnb_data, oshima_data, open_data = (
    pd.read_csv(location) for location in csv_locations
)

In [None]:
# Choose the compute device once: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

----

### 일본어 -> 한국어 번역

In [None]:
def translate(text_src):
    """Translate one source text with the notebook's encoder-decoder model.

    Uses the notebook-level `src_tokenizer`, `trg_tokenizer` and `model`
    objects, so those must be loaded before calling this function.

    Args:
        text_src: Source text handed to `src_tokenizer`.

    Returns:
        The string decoded by `trg_tokenizer` from the generated token ids.
    """
    embeddings = src_tokenizer(text_src, return_attention_mask=False, return_token_type_ids=False, return_tensors='pt')
    # Move the input tensors to the device the model lives on. The previous
    # comprehension copied the dict without moving anything, so generate()
    # failed whenever the model had been placed on the GPU.
    embeddings = {k: v.to(model.device) for k, v in embeddings.items()}
    # Drop the first and last generated ids before decoding
    # (presumably the start/end special tokens — TODO confirm).
    output = model.generate(**embeddings)[0, 1:-1]
    text_trg = trg_tokenizer.decode(output.cpu())
    return text_trg

In [None]:
# Checkpoint names whose tokenizers are used for the source (encoder) and
# target (decoder) sides of the translator.
encoder_model_name, decoder_model_name = (
    "cl-tohoku/bert-base-japanese-v2",
    "skt/kogpt2-base-v2",
)

# Tokenizer for the input text and tokenizer used to decode the generated ids.
src_tokenizer = BertTokenizerFast.from_pretrained(encoder_model_name)
trg_tokenizer = PreTrainedTokenizerFast.from_pretrained(decoder_model_name)

# Load the translation checkpoint, then place it on the selected device.
model = EncoderDecoderModel.from_pretrained("figuringoutmine/translator-for-travel-jp-to-kr")
model = model.to(device)

In [None]:
# Translate every `info` entry one text at a time, collecting the decoded
# strings in row order. Gradients are disabled because this is inference only.
translated = []
with torch.no_grad():
    for source_text in oshima_data['info'].values:
        model_inputs = src_tokenizer(
            source_text,
            return_attention_mask=False,
            return_token_type_ids=False,
            return_tensors='pt',
        )
        # Inputs must sit on the same device as the model.
        model_inputs = {name: tensor.to(device) for name, tensor in model_inputs.items()}
        # Keep everything except the first and last generated ids.
        generated_ids = model.generate(**model_inputs)[0, 1:-1]
        translated.append(trg_tokenizer.decode(generated_ids.cpu()))

In [None]:
# Store the translations as a new `translated` column; the list was built by
# iterating `oshima_data['info']` in order, so it lines up with the rows.
oshima_data['translated'] = translated

### BERT Embedding-Based Clustering

In [None]:
from soyclustering import SphericalKMeans

In [None]:
# Encoder and tokenizer used to embed the texts for clustering.
# NOTE(review): this rebinds `model`, which until now referred to the
# translation model; `translate()` reads the global `model`, so it will pick
# up this encoder if it is called after this cell. The checkpoint also differs
# from the "-v2" one used for the translator's source tokenizer — confirm
# that this is intentional.
embedding_checkpoint = 'cl-tohoku/bert-base-japanese'
model = AutoModel.from_pretrained(embedding_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(embedding_checkpoint)

In [None]:
# Embed every `info` text with the encoder and collect the pooled output
# vector of each text as one row.
# Upper bound on sequence length supported by the encoder's position embeddings.
max_tokens = model.config.max_position_embeddings
embeddings = list()
with torch.no_grad():
    for info in oshima_data['info'].values:
        # Truncate to the encoder's limit: without this, a text that tokenizes
        # to more positions than the model supports makes the forward pass fail.
        encoded = tokenizer(info, return_tensors='pt', truncation=True, max_length=max_tokens)
        # Keep the inputs on whatever device the model currently lives on.
        encoded = {k: v.to(model.device) for k, v in encoded.items()}
        output = model(**encoded)
        # Batch size is 1, so take the single pooled vector and store it as a plain list.
        embedding = output.pooler_output[0].cpu().numpy().tolist()
        embeddings.append(embedding)
# Wrap the dense rows in a CSR matrix before handing them to the clustering
# step (presumably because soyclustering expects sparse input — TODO confirm).
embeddings = csr_matrix(embeddings)

In [None]:
# Configure the SphericalKMeans clusterer that is fitted on the embedding matrix below.
# NOTE(review): no seed is passed here, so cluster ids may differ between runs,
# while the per-cluster notes further down refer to fixed ids — confirm whether
# the class accepts a seed.
spherical_kmeans = SphericalKMeans(
    n_clusters=5,    # group the texts into 5 clusters
    max_iter=100,    # maximum number of iterations
    verbose=1,
    init='similar_cut'
)

In [None]:
# Fit the clusterer on the embedding matrix and keep the predicted cluster labels.
labels = spherical_kmeans.fit_predict(embeddings)

In [None]:
# Build a labelled copy of the dataset so the original frame is left without the `labels` column.
oshima_kmeans = oshima_data.assign(labels=labels)

In [None]:
oshima_kmeans[oshima_kmeans['labels']==0].sample(10)    # suicide by jumping..? (author's tentative reading of cluster 0)

In [None]:
oshima_kmeans[oshima_kmeans['labels']==1].sample(10)    # suicide (author's reading of cluster 1)

In [None]:
oshima_kmeans[oshima_kmeans['labels']==2].sample(10)    # accidental death / solitary death (author's reading of cluster 2)

In [None]:
oshima_kmeans[oshima_kmeans['labels']==3].sample(10)    # fire / suicide by jumping (author's reading of cluster 3)

In [None]:
oshima_kmeans[oshima_kmeans['labels']==4].sample(10)    # knife...? (author's tentative reading of cluster 4)

----

### 각 숙소별 반경 nKm 내 사고 발생 건수

In [None]:
def get_distances(locs_1, locs_2):
    """Return the matrix of geodesic distances, in kilometres, between two location sets.

    Entry ``[i, j]`` is ``geodesic(locs_1[i], locs_2[j]).km``. One geodesic
    call is made per pair, so the cost grows with the product of the two
    lengths.

    Args:
        locs_1: Indexable collection with a ``shape`` attribute; its first
            dimension gives the number of rows of the result.
        locs_2: Same kind of collection; its first dimension gives the
            number of columns of the result.

    Returns:
        A float array of shape ``(locs_1.shape[0], locs_2.shape[0])``.
    """
    grid_shape = (locs_1.shape[0], locs_2.shape[0])
    # np.ndindex walks the grid row by row, i.e. in the same order as a
    # nested "for i / for j" loop.
    pairwise_km = (
        geodesic(locs_1[row], locs_2[col]).km
        for row, col in np.ndindex(*grid_shape)
    )
    flat = np.fromiter(pairwise_km, dtype=float, count=grid_shape[0] * grid_shape[1])
    return flat.reshape(grid_shape)

In [None]:
# `data_path` is not defined anywhere earlier in this notebook, so this cell
# raised NameError on a fresh kernel; define it here.
# Assumes the file sits in the same Drive folder as the other CSVs — TODO confirm.
data_path = '/content/drive/MyDrive/Colab Notebooks'
# Sample table whose latitude/longitude columns are used below.
oshima_with_location = pd.read_csv(os.path.join(data_path, 'oshima_data_sample.csv'))

In [None]:
# Incident coordinates as an array with one [latitude, longitude] row per record.
crime_locations = oshima_with_location[['latitude', 'longitude']].values

In [None]:
# Use the Airbnb data to compute the incident counts:
# one [latitude, longitude] row per listing.
airbnb_locations = airbnb_data[['latitude', 'longitude']].values

In [None]:
# Pairwise distances in km: rows follow the Airbnb locations, columns follow the incident locations.
distance_matrix = get_distances(airbnb_locations, crime_locations)

In [None]:
n_km = 3    # radius in km: count the crimes that occurred within 3 km

In [None]:
# For each row of the distance matrix, count the entries strictly closer than `n_km`.
within_radius = distance_matrix < n_km
n_crimes = within_radius.sum(axis=1)

In [None]:
# Attach the per-listing counts as a new `n_crimes` column (modifies `airbnb_data` in place).
airbnb_data['n_crimes'] = n_crimes

In [None]:
# Display the table, which now includes the `n_crimes` column.
airbnb_data