In [1]:
!gdown --id "1Vh9rV1ctXVCrg3Zw0nVbUsvjtt5Re5qK" -O /content/your_data.zip

!unzip /content/your_data.zip -d /content/extracted_data

Downloading...
From: https://drive.google.com/uc?id=1Vh9rV1ctXVCrg3Zw0nVbUsvjtt5Re5qK
To: /content/your_data.zip
100% 26.0M/26.0M [00:00<00:00, 127MB/s]
Archive:  /content/your_data.zip
   creating: /content/extracted_data/img_small/
  inflating: /content/extracted_data/__MACOSX/._img_small  
  inflating: /content/extracted_data/img_small/4ca7f99294e090c370dcd8b96de5ed394c4a4f87_2.jpg  
  inflating: /content/extracted_data/__MACOSX/img_small/._4ca7f99294e090c370dcd8b96de5ed394c4a4f87_2.jpg  
  inflating: /content/extracted_data/img_small/e98db57d65cc4f4381589f83dddaaea052e4fcd3_1.jpg  
  inflating: /content/extracted_data/__MACOSX/img_small/._e98db57d65cc4f4381589f83dddaaea052e4fcd3_1.jpg  
  inflating: /content/extracted_data/img_small/de4373dc886db5aa8bbf9e56daf20714b24a22fa_4.jpg  
  inflating: /content/extracted_data/__MACOSX/img_small/._de4373dc886db5aa8bbf9e56daf20714b24a22fa_4.jpg  
  inflating: /content/extracted_data/img_small/01f3d382064ea8fed7603ee74361d632e3b13120_1.jpg  
 

In [2]:
import pandas as pd
import os
import torch
from PIL import Image
from torchvision import transforms
from transformers import BartTokenizer, BartForConditionalGeneration, VisualBertModel
from torchvision.models import resnet50
data = pd.read_csv('train_small.csv')

data['article'] = data['article'].apply(lambda x: x.strip().lower())


### **文字特徵提取**

In [3]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
bart_model.eval()

def extract_text_features(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    with torch.no_grad():
        outputs = bart_model(**inputs)
    return outputs.encoder_last_hidden_state.squeeze(0).mean(dim=0).numpy()

data['text_features'] = data['article'].apply(extract_text_features)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

### **圖像特徵提取**

In [4]:
# 多圖片特徵提取
import os
import torch
from torchvision import models, transforms
from PIL import Image
from tqdm import tqdm

resnet = models.resnet50(pretrained=True)
resnet.eval()

transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def extract_features(image_path):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0)
    with torch.no_grad():
        features = resnet(image)
    return features.squeeze(0)

def process_images(file_name):
    images_dir = f'extracted_data/img_small/{file_name}_'
    features_list = []
    i = 1
    while os.path.exists(f'{images_dir}{i}.jpg'):
        features = extract_features(f'{images_dir}{i}.jpg')
        features_list.append(features)
        i += 1
    return torch.stack(features_list).mean(0)

data['image_features'] = data['fileName'].apply(process_images)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 100MB/s]


### **VisualBERT文字與圖像特徵結合**

In [5]:
import torch.nn as nn
from transformers import VisualBertModel
visual_bert = VisualBertModel.from_pretrained('uclanlp/visualbert-vqa-coco-pre')

feature_size = 768
text_feature_transform = nn.Linear(1024, feature_size)
image_feature_transform = nn.Linear(1000, feature_size)


def combine_features_and_process(row):

    text_features = torch.tensor(row['text_features']).unsqueeze(0)
    image_features = torch.tensor(row['image_features']).unsqueeze(0)

    text_features = text_feature_transform(text_features)
    image_features = image_feature_transform(image_features)

    text_features = text_features.unsqueeze(1)
    image_features = image_features.unsqueeze(1)

    combined_features = torch.cat((text_features, image_features), dim=1)  # [1, 2, 768]

    inputs = {'inputs_embeds': combined_features}

    with torch.no_grad():
        outputs = visual_bert(**inputs)
        processed_features = outputs.last_hidden_state.mean(dim=1)

    return processed_features.squeeze().tolist()

data['combined_features'] = data.apply(combine_features_and_process, axis=1)



config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/448M [00:00<?, ?B/s]

  image_features = torch.tensor(row['image_features']).unsqueeze(0)


### **生成摘要**

In [6]:
from transformers import BartForConditionalGeneration, BartTokenizer

tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

def generate_summary(combined_features, input_text, tokenizer, model):
    input_ids = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True, padding="max_length").input_ids

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, max_length=150, num_beams=5, early_stopping=True)

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

def process_and_generate_summary(row):
    summary = generate_summary(row['combined_features'], row['article'], tokenizer, bart_model)
    return summary


data['summary'] = data.apply(process_and_generate_summary, axis=1)


In [7]:
data

Unnamed: 0,id,fileName,article,summary,text_features,image_features,combined_features
0,1,3a248cb770b4d6fe080f7f95a5d595895d2be08d,"if you listen to twitter , kendall and kylie j...",Kendall + kylie clothing brand released an ima...,"[0.021653915, -0.11024898, -0.0676152, -0.0596...","[tensor(-0.9112), tensor(-1.3345), tensor(-0.7...","[-0.5087100863456726, 0.6678800582885742, 0.13..."
1,2,ddebde43f3ea7f7b337458d2343bfa9c588f9314,a new jersey family is reeling tuesday after a...,A new jersey family is reeling t Tuesday after...,"[0.10130456, -0.23067634, -0.03522276, -0.0583...","[tensor(0.5088), tensor(-0.8531), tensor(0.120...","[-0.6280848979949951, 0.6786677837371826, -0.1..."
2,3,39bde3d0752da393151a88b5fd6e3e31420da5d3,the bank of england is set to launch a crackdo...,Bank of england set to launch a crackdown this...,"[0.0022704469, -0.12262584, -0.36360878, -0.06...","[tensor(-0.8591), tensor(-1.0623), tensor(-3.0...","[0.5949293375015259, 0.2056165337562561, -0.07..."
3,4,efa330032f8ec3542641cde4987b4586387d8d8c,russian president vladimir putin is fond of ol...,Russian president vladimir putin is fond of ol...,"[0.13126874, -0.1570257, -0.09135373, -0.02925...","[tensor(-0.7379), tensor(-1.1619), tensor(0.41...","[-0.6749867796897888, 0.4144148826599121, 0.05..."
4,5,244f37a23707ae549d2b593179444aaeb9c95578,it is 26 years since europe 's most prestigiou...,It is 26 years since europe's most prestigious...,"[0.10788613, -0.16137244, -0.06229935, -0.0389...","[tensor(-0.2991), tensor(-1.1571), tensor(0.52...","[-0.469675213098526, 0.6555590629577637, -0.10..."
5,6,0936f771136a2c42b1964bdec9dea77ed08a3ec3,tamara taylor is chomping at the bit to appear...,tamara taylor is chomping at the bit to appear...,"[0.04333487, -0.11163817, -0.270213, -0.076347...","[tensor(-1.7187), tensor(-1.6331), tensor(-1.4...","[-0.1837862879037857, 0.4726242423057556, -0.4..."
6,7,ccc07dc8f51660232297409a6857eb98c02f00fc,they could become one of the premier league 's...,The re-signing of zlatan ibrahimovic on a one-...,"[0.1055755, -0.12388456, -0.039798126, -0.0552...","[tensor(-0.9841), tensor(-1.0619), tensor(0.09...","[-0.22483065724372864, 0.7355657815933228, 0.1..."
7,8,b59e86169bfc81b8d87abcde78f680118cfd3e6e,two cousins charged with the brutal deaths of ...,"cosmo dinardo, a 20-year-old drug dealer with ...","[0.10521533, -0.13344398, -0.0568666, -0.02211...","[tensor(1.5255), tensor(0.2995), tensor(-0.374...","[-1.061955451965332, 0.44963696599006653, -0.0..."
8,9,ccf503f94bff06e5b3028321ead8c227ec1230b2,former democratic national committee chair rep...,Former democratic national committee chair rep...,"[0.061565414, -0.068867505, -0.1003509, -0.038...","[tensor(-0.7756), tensor(-0.9958), tensor(0.28...","[-0.5605597496032715, 0.6661292910575867, 0.11..."
9,10,194cdfcb73a1d9c5b4a17763c4df87d8cdbe1c45,a disgusted easyjet passenger has complained a...,A disgusted easyjet passenger has complained a...,"[0.033479735, -0.11282287, -0.19271486, -0.038...","[tensor(-0.0837), tensor(-0.2329), tensor(-0.4...","[-0.6560893058776855, 0.6043345928192139, 0.04..."


### **評估指標**

In [None]:
from rouge import Rouge


rouge = Rouge()


rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []


for index, row in data.iterrows():
    scores = rouge.get_scores(row['summary'], row['article'])
    rouge_1_scores.append(scores[0]['rouge-1']['f'])
    rouge_2_scores.append(scores[0]['rouge-2']['f'])
    rouge_l_scores.append(scores[0]['rouge-l']['f'])

avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

print(f"Average ROUGE-1 Score: {avg_rouge_1}")
print(f"Average ROUGE-2 Score: {avg_rouge_2}")
print(f"Average ROUGE-L Score: {avg_rouge_l}")

### **加載預訓練模型**

In [8]:
import os
import torch
from torchvision import models, transforms
from PIL import Image
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

resnet = models.resnet50(pretrained=True)
resnet.eval()


tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModel.from_pretrained('bert-base-uncased')
bert_model.eval()


transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

### **特徵提取函數**

In [9]:
from sklearn.decomposition import PCA

def extract_image_features(image_path):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0)
    with torch.no_grad():
        features = resnet(image)
    return features.cpu().numpy().flatten()[:768]


def extract_text_features(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()



### **找出與summary最符合的圖片**

In [11]:
def find_most_relevant_image(file_name, summary):
    base_path = f'extracted_data/img_small/{file_name}_'
    text_features = extract_text_features(summary).flatten()
    best_image = None
    best_similarity = -1

    i = 1
    while os.path.exists(f'{base_path}{i}.jpg'):
        img_path = f'{base_path}{i}.jpg'
        image_features = extract_image_features(img_path)
        similarity = cosine_similarity([text_features], [image_features])[0][0]

        if similarity > best_similarity:
            best_similarity = similarity
            best_image = img_path

        i += 1

    return best_image


In [12]:
data['most_relevant_image'] = data.apply(lambda x: find_most_relevant_image(x['fileName'], x['summary']), axis=1)

In [13]:
data

Unnamed: 0,id,fileName,article,summary,text_features,image_features,combined_features,most_relevant_image
0,1,3a248cb770b4d6fe080f7f95a5d595895d2be08d,"if you listen to twitter , kendall and kylie j...",Kendall + kylie clothing brand released an ima...,"[0.021653915, -0.11024898, -0.0676152, -0.0596...","[tensor(-0.9112), tensor(-1.3345), tensor(-0.7...","[-0.5087100863456726, 0.6678800582885742, 0.13...",extracted_data/img_small/3a248cb770b4d6fe080f7...
1,2,ddebde43f3ea7f7b337458d2343bfa9c588f9314,a new jersey family is reeling tuesday after a...,A new jersey family is reeling t Tuesday after...,"[0.10130456, -0.23067634, -0.03522276, -0.0583...","[tensor(0.5088), tensor(-0.8531), tensor(0.120...","[-0.6280848979949951, 0.6786677837371826, -0.1...",extracted_data/img_small/ddebde43f3ea7f7b33745...
2,3,39bde3d0752da393151a88b5fd6e3e31420da5d3,the bank of england is set to launch a crackdo...,Bank of england set to launch a crackdown this...,"[0.0022704469, -0.12262584, -0.36360878, -0.06...","[tensor(-0.8591), tensor(-1.0623), tensor(-3.0...","[0.5949293375015259, 0.2056165337562561, -0.07...",extracted_data/img_small/39bde3d0752da393151a8...
3,4,efa330032f8ec3542641cde4987b4586387d8d8c,russian president vladimir putin is fond of ol...,Russian president vladimir putin is fond of ol...,"[0.13126874, -0.1570257, -0.09135373, -0.02925...","[tensor(-0.7379), tensor(-1.1619), tensor(0.41...","[-0.6749867796897888, 0.4144148826599121, 0.05...",extracted_data/img_small/efa330032f8ec3542641c...
4,5,244f37a23707ae549d2b593179444aaeb9c95578,it is 26 years since europe 's most prestigiou...,It is 26 years since europe's most prestigious...,"[0.10788613, -0.16137244, -0.06229935, -0.0389...","[tensor(-0.2991), tensor(-1.1571), tensor(0.52...","[-0.469675213098526, 0.6555590629577637, -0.10...",extracted_data/img_small/244f37a23707ae549d2b5...
5,6,0936f771136a2c42b1964bdec9dea77ed08a3ec3,tamara taylor is chomping at the bit to appear...,tamara taylor is chomping at the bit to appear...,"[0.04333487, -0.11163817, -0.270213, -0.076347...","[tensor(-1.7187), tensor(-1.6331), tensor(-1.4...","[-0.1837862879037857, 0.4726242423057556, -0.4...",extracted_data/img_small/0936f771136a2c42b1964...
6,7,ccc07dc8f51660232297409a6857eb98c02f00fc,they could become one of the premier league 's...,The re-signing of zlatan ibrahimovic on a one-...,"[0.1055755, -0.12388456, -0.039798126, -0.0552...","[tensor(-0.9841), tensor(-1.0619), tensor(0.09...","[-0.22483065724372864, 0.7355657815933228, 0.1...",extracted_data/img_small/ccc07dc8f516602322974...
7,8,b59e86169bfc81b8d87abcde78f680118cfd3e6e,two cousins charged with the brutal deaths of ...,"cosmo dinardo, a 20-year-old drug dealer with ...","[0.10521533, -0.13344398, -0.0568666, -0.02211...","[tensor(1.5255), tensor(0.2995), tensor(-0.374...","[-1.061955451965332, 0.44963696599006653, -0.0...",extracted_data/img_small/b59e86169bfc81b8d87ab...
8,9,ccf503f94bff06e5b3028321ead8c227ec1230b2,former democratic national committee chair rep...,Former democratic national committee chair rep...,"[0.061565414, -0.068867505, -0.1003509, -0.038...","[tensor(-0.7756), tensor(-0.9958), tensor(0.28...","[-0.5605597496032715, 0.6661292910575867, 0.11...",extracted_data/img_small/ccf503f94bff06e5b3028...
9,10,194cdfcb73a1d9c5b4a17763c4df87d8cdbe1c45,a disgusted easyjet passenger has complained a...,A disgusted easyjet passenger has complained a...,"[0.033479735, -0.11282287, -0.19271486, -0.038...","[tensor(-0.0837), tensor(-0.2329), tensor(-0.4...","[-0.6560893058776855, 0.6043345928192139, 0.04...",extracted_data/img_small/194cdfcb73a1d9c5b4a17...


### **用餘弦定理計算summary與圖像的相關性**

In [14]:
# 餘弦定理
import os
import torch
from torchvision import models, transforms
from PIL import Image
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity


resnet = models.resnet50(pretrained=True)
resnet.eval()


tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModel.from_pretrained('bert-base-uncased')
bert_model.eval()

transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def extract_image_features(image_path):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0)
    with torch.no_grad():
        features = resnet(image)
    return features.cpu().numpy().flatten()[:768]

def extract_text_features(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()

def find_most_relevant_image(file_name, summary):
    base_path = f'extracted_data/img_small/{file_name}_'
    text_features = extract_text_features(summary).flatten()
    best_image = None
    best_similarity = -1

    i = 1
    while os.path.exists(f'{base_path}{i}.jpg'):
        img_path = f'{base_path}{i}.jpg'
        image_features = extract_image_features(img_path)
        similarity = cosine_similarity([text_features], [image_features])[0][0]

        if similarity > best_similarity:
            best_similarity = similarity
            best_image = img_path

        i += 1

    return best_image, best_similarity


data['result'] = data.apply(lambda x: find_most_relevant_image(x['fileName'], x['summary']), axis=1)
data['most_relevant_image'] = data['result'].apply(lambda x: x[0])
data['image_text_relevance'] = data['result'].apply(lambda x: x[1])
data.drop('result', axis=1, inplace=True)




In [None]:
average_relevance_score = data['image_text_relevance'].mean()

print("餘弦定理平均圖文相關度分數為:", average_relevance_score)

### **用歐式距離計算summary與圖像的相關性**

In [17]:
# 歐式距離
import os
import torch
from torchvision import models, transforms
from PIL import Image
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import euclidean_distances


resnet = models.resnet50(pretrained=True)
resnet.eval()

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModel.from_pretrained('bert-base-uncased')
bert_model.eval()

transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def extract_image_features(image_path):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0)
    with torch.no_grad():
        features = resnet(image)
    return features.cpu().numpy().flatten()[:768]

def extract_text_features(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()

def find_most_relevant_image(file_name, summary):
    base_path = f'extracted_data/img_small/{file_name}_'
    text_features = extract_text_features(summary).flatten()
    best_image = None
    best_distance = float('inf')

    i = 1
    while os.path.exists(f'{base_path}{i}.jpg'):
        img_path = f'{base_path}{i}.jpg'
        image_features = extract_image_features(img_path)
        distance = euclidean_distances([text_features], [image_features])[0][0]

        if distance < best_distance:
            best_distance = distance
            best_image = img_path

        i += 1

    return best_image, best_distance


data['result'] = data.apply(lambda x: find_most_relevant_image(x['fileName'], x['summary']), axis=1)
data['most_relevant_image'] = data['result'].apply(lambda x: x[0])
data['image_text_relevance'] = data['result'].apply(lambda x: x[1])
data.drop('result', axis=1, inplace=True)




In [None]:
average_relevance_score = data['image_text_relevance'].mean()

print("歐式距離平均圖文相關度分數為:", average_relevance_score)