In [4]:
import pandas as pd
from transformers import BertTokenizer, TFBertModel, BertConfig
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import time
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
import tensorflow as tf
import transformers

# Checkpoint name handed to `from_pretrained` when the tokenizer and model are loaded.
model_name = "bert-base-chinese"

# Record the library versions of this run next to the results.
print(f"TensorFlow version: {tf.__version__}")
print(f"Transformers version: {transformers.__version__}")

TensorFlow version: 2.7.0
Transformers version: 4.30.2


In [6]:
# Load the tokenizer first, then the TensorFlow BERT weights, both from the
# checkpoint named by `model_name`.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
bert_model = TFBertModel.from_pretrained(pretrained_model_name_or_path=model_name)

Downloading vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

Downloading (…)ir-Id=K3RPWS32NSSJCE:   0%|          | 0.00/412M [00:00<?, ?B/s]

2025-03-03 21:39:37.291744: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-03-03 21:39:37.411697: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-03-03 21:39:37.412122: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-03-03 21:39:37.415382: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [7]:
def get_bert_embedding(text):
    """Return the final-layer embedding at token position 0 ([CLS]) for ``text``.

    Parameters
    ----------
    text : str or list of str
        Text to encode with the module-level ``tokenizer``. Inputs longer
        than 512 tokens are truncated.

    Returns
    -------
    numpy.ndarray
        ``last_hidden_state[:, 0, :]`` converted to NumPy with all size-1
        axes squeezed out, so a single string yields a 1-D vector.
    """
    # padding=True pads only up to the longest sequence in the call, which is
    # a no-op for a single string. The previous padding='max_length' forced
    # every text to 512 tokens, so each forward pass paid for full-length
    # attention over pad tokens that the attention mask hides from the
    # position-0 vector anyway (result expected to match up to float rounding).
    inputs = tokenizer(text, return_tensors='tf', max_length=512, truncation=True, padding=True)
    outputs = bert_model.bert(inputs)
    # Position 0 of every sequence is the [CLS] token added by the tokenizer.
    cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
    return cls_embedding.squeeze()

In [8]:
# Read the cleaned lyrics table and pull out the two columns used below as
# plain Python lists (same row order as the file).
df = pd.read_csv('../clean_data/lyrics_clean.csv')
city_names = df.loc[:, 'city'].tolist()
texts = df.loc[:, 'content'].tolist()

In [9]:
# Prepare the lyrics for BERT: drop every space character from each text.
texts = [''.join(lyric.split(' ')) for lyric in texts]

In [10]:
# Encode each text into one embedding row; rows follow the order of `texts`.
embeddings = np.array(list(map(get_bert_embedding, texts)))
print(embeddings)

# Pairwise cosine similarity between all embedding rows, kept to 4 decimals.
cosine_sim = np.round(cosine_similarity(embeddings), 4)
print('====similarities=====')
print(cosine_sim)

[[ 0.90331715 -0.17201401  0.06823187 ... -0.70175636  0.5859843
  -0.36856073]
 [ 0.37233388  0.22014777 -1.1495106  ... -0.3151221  -0.08851374
  -0.41829032]
 [ 0.8583468  -0.34119886 -0.9801506  ... -0.660444    0.09695947
  -0.21728826]
 ...
 [ 0.926067    0.08867551 -0.29929233 ...  0.039749    0.5902205
  -0.08975361]
 [ 1.1440822   0.66263145 -0.6485996  ...  0.65169895 -0.07311955
  -0.32334164]
 [ 0.63165754 -0.31760013 -0.27759105 ... -0.5521531   0.38353455
  -0.52129334]]
====similarities=====
[[1.     0.6448 0.8839 ... 0.8446 0.8002 0.8326]
 [0.6448 1.     0.6461 ... 0.665  0.7188 0.7503]
 [0.8839 0.6461 1.     ... 0.8488 0.7501 0.8397]
 ...
 [0.8446 0.665  0.8488 ... 1.     0.7494 0.8191]
 [0.8002 0.7188 0.7501 ... 0.7494 1.     0.7798]
 [0.8326 0.7503 0.8397 ... 0.8191 0.7798 1.    ]]


In [11]:
# Label both axes of the similarity matrix with the city names, save the
# table next to the notebook, and show it.
df_sim = pd.DataFrame(data=cosine_sim, index=city_names, columns=city_names)
df_sim.to_csv('./city_lyrics_sim.csv')
print(df_sim)

            唐山市    秦皇岛市     承德市    石家庄市     廊坊市     衡水市    张家口市     保定市  \
唐山市      1.0000  0.6448  0.8839  0.9367  0.8062  0.8459  0.8443  0.9115   
秦皇岛市     0.6448  1.0000  0.6461  0.6788  0.7398  0.6825  0.6928  0.7312   
承德市      0.8839  0.6461  1.0000  0.8883  0.8026  0.8897  0.8834  0.8774   
石家庄市     0.9367  0.6788  0.8883  1.0000  0.8346  0.8679  0.8522  0.9270   
廊坊市      0.8062  0.7398  0.8026  0.8346  1.0000  0.7953  0.8339  0.8561   
...         ...     ...     ...     ...     ...     ...     ...     ...   
澳门特别行政区  0.8270  0.6853  0.7917  0.8357  0.7911  0.8149  0.8064  0.8648   
北京市      0.8829  0.7373  0.8422  0.8849  0.8540  0.8227  0.8602  0.8975   
天津市      0.8446  0.6650  0.8488  0.8502  0.7782  0.8522  0.8185  0.8318   
上海市      0.8002  0.7188  0.7501  0.8084  0.7644  0.7979  0.7469  0.8299   
重庆市      0.8326  0.7503  0.8397  0.8523  0.7968  0.7955  0.8093  0.8435   

            邢台市     邯郸市  ...  昌吉回族自治州  博尔塔拉蒙古自治州  克孜勒苏柯尔克孜自治州  伊犁哈萨克自治州  \
唐山市      0.8754  0.9092 