In [1]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 讀取CSV檔案
df = pd.read_csv('items_colors.csv')
# df['match']=df.shape[0]*[1]

In [3]:
labels = torch.tensor(df['match'].tolist(), dtype=torch.float)

In [4]:

# 將物品和顏色轉換為BERT的輸入
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
item_tokens = tokenizer(df['item'].tolist(), padding=True, truncation=True, max_length=16, return_tensors='pt')
color_tokens = tokenizer(df['color'].tolist(), padding=True, truncation=True, max_length=8, return_tensors='pt')


In [16]:
item_tokens[0]

KeyError: 'Indexing with integers (to access backend Encoding for a given batch index) is not available when using Python based tokenizers'

In [5]:

# 將BERT輸入轉換為BERT輸出
bert_model = BertModel.from_pretrained('bert-base-uncased')
item_outputs = bert_model(**item_tokens)[0]
color_outputs = bert_model(**color_tokens)[0]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:

# 將BERT輸出作為LSTM輸入
class ColorPredictor(nn.Module):
    def __init__(self):
        super(ColorPredictor, self).__init__()
        self.lstm = nn.LSTM(input_size=768, hidden_size=128, num_layers=2, batch_first=True)
        self.fc = nn.Linear(in_features=128, out_features=1)
        self.sigmoid = torch.nn.Sigmoid() 
    def forward(self, item_outputs, color_outputs):
        inputs = torch.cat([item_outputs, color_outputs], dim=1)
        _, (hn, cn) = self.lstm(inputs)
        outputs = self.fc(hn[-1])
        # output = self.sigmoid(outputs)
        return outputs


In [82]:
df['color'][i]


'red'

In [7]:


model = ColorPredictor()
# criterion = torch.nn.functional.binary_cross_entropy
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(200):
    running_loss = 0.0
    for i in range(0, len(df), 10):
        item_output = item_outputs[i:i+10].detach()
        color_output = color_outputs[i:i+10].detach()
        label = labels[i:i+10].unsqueeze(1)

        optimizer.zero_grad()
        output = model(item_output, color_output)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print('Epoch %d loss: %.3f' % (epoch + 1, running_loss / len(df)))


Epoch 1 loss: 0.095
Epoch 2 loss: 0.071
Epoch 3 loss: 0.072
Epoch 4 loss: 0.072
Epoch 5 loss: 0.072
Epoch 6 loss: 0.072
Epoch 7 loss: 0.072
Epoch 8 loss: 0.072
Epoch 9 loss: 0.072
Epoch 10 loss: 0.072
Epoch 11 loss: 0.072
Epoch 12 loss: 0.072
Epoch 13 loss: 0.072
Epoch 14 loss: 0.071
Epoch 15 loss: 0.071
Epoch 16 loss: 0.071
Epoch 17 loss: 0.070
Epoch 18 loss: 0.070
Epoch 19 loss: 0.069
Epoch 20 loss: 0.068
Epoch 21 loss: 0.065
Epoch 22 loss: 0.062
Epoch 23 loss: 0.057
Epoch 24 loss: 0.053
Epoch 25 loss: 0.066
Epoch 26 loss: 0.051
Epoch 27 loss: 0.047
Epoch 28 loss: 0.040
Epoch 29 loss: 0.036
Epoch 30 loss: 0.032
Epoch 31 loss: 0.027
Epoch 32 loss: 0.023
Epoch 33 loss: 0.017
Epoch 34 loss: 0.014
Epoch 35 loss: 0.009
Epoch 36 loss: 0.008
Epoch 37 loss: 0.006
Epoch 38 loss: 0.006
Epoch 39 loss: 0.003
Epoch 40 loss: 0.002
Epoch 41 loss: 0.001
Epoch 42 loss: 0.001
Epoch 43 loss: 0.001
Epoch 44 loss: 0.001
Epoch 45 loss: 0.000
Epoch 46 loss: 0.000
Epoch 47 loss: 0.000
Epoch 48 loss: 0.000
E

In [55]:

# 測試模型
test_items = ['bat', 'coffee', 'sky', 'grape', 'lemon']
test_colors = ['red', 'brown', 'blue', 'purple', 'green']

for i in range(len(test_items)):
    test_item_tokens = tokenizer(test_items[i], padding=True, truncation=True, max_length=16, return_tensors='pt')
    test_color_tokens = tokenizer(test_colors[i], padding=True, truncation=True, max_length=8, return_tensors='pt')
    test_item_output = bert_model(**test_item_tokens)[0].detach()
    test_color_output = bert_model(**test_color_tokens)[0].detach()
    test_output = model(test_item_output, test_color_output)
    test_prediction = torch.sigmoid(test_output) > 0.5
    
    # 計算兩個向量的餘弦相似度
    item_vectors = item_outputs.detach().numpy()
    color_vectors = color_outputs.detach().numpy()
    cosine_similarities = cosine_similarity(item_vectors, color_vectors)

    print(cosine_similarities)

ValueError: Found array with dim 3. check_pairwise_arrays expected <= 2.

In [56]:
item_vectors

array([[[-0.2069165 ,  0.08809706, -0.19849141, ..., -0.10356671,
          0.08773239,  0.15185225],
        [-0.49911833, -0.0348646 , -0.6298923 , ...,  0.4416717 ,
          0.08368811, -0.8565332 ],
        [ 0.89667416,  0.1536845 , -0.3115343 , ...,  0.09887315,
         -0.693995  , -0.3875741 ],
        [-0.05032158,  0.3333085 , -0.01176501, ...,  0.33261266,
          0.09453274,  0.12265699],
        [-0.29818192, -0.10951727, -0.23225972, ...,  0.30622137,
          0.22214079,  0.06901016],
        [-0.14767048,  0.0456373 , -0.07858583, ...,  0.23744729,
          0.15700723,  0.11565822]],

       [[-0.21428354,  0.03756715, -0.2317439 , ..., -0.08002454,
          0.13585404,  0.01386099],
        [-0.58986866, -0.5431918 , -0.6305308 , ...,  0.5202858 ,
          0.4029937 , -0.4703741 ],
        [ 0.93898654,  0.16755196, -0.26549673, ...,  0.07251855,
         -0.77950716, -0.35671908],
        [-0.3460751 ,  0.03673025, -0.03448339, ...,  0.12769046,
          0.21

In [9]:
sentence_model = SentenceTransformer('bert-base-uncased')

No sentence-transformers model found with name C:\Users\Dylan.Tang/.cache\torch\sentence_transformers\bert-base-uncased. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at C:\Users\Dylan.Tang/.cache\torch\sentence_transformers\bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassif

In [18]:

# 測試模型
test_items = ['tree', 'watercress', 'black tea', 'grape', 'white board']
test_colors = ['green', 'pink', 'green', 'green', 'blue']

for i in range(len(test_items)):
    test_item_tokens = tokenizer(test_items[i], padding=True, truncation=True, max_length=16, return_tensors='pt')
    test_color_tokens = tokenizer(test_colors[i], padding=True, truncation=True, max_length=8, return_tensors='pt')
    test_item_output = bert_model(**test_item_tokens)[0]
    test_color_output = bert_model(**test_color_tokens)[0]
    test_output = model(test_item_output, test_color_output)
    test_prediction = torch.sigmoid(test_output) > 0.5

    # 計算兩個向量的餘弦相似度
    vector_a = sentence_model.encode(test_items[i])
    vector_b = sentence_model.encode(test_colors[i])
    cosine_similarity = np.dot(vector_a, vector_b) / (np.linalg.norm(vector_a) * np.linalg.norm(vector_b))


    
    print('%s: %s (%s)' % (test_items[i], test_colors[i], 'correct' if test_prediction.item() else 'incorrect'))
    print("兩個向量之間的餘弦相似度：", cosine_similarity)

tree: green (incorrect)
兩個向量之間的餘弦相似度： 0.7305856
watercress: pink (incorrect)
兩個向量之間的餘弦相似度： 0.58263755
black tea: green (incorrect)
兩個向量之間的餘弦相似度： 0.7028487
grape: green (incorrect)
兩個向量之間的餘弦相似度： 0.71846133
white board: blue (incorrect)
兩個向量之間的餘弦相似度： 0.752831
