In [1]:
# Notebook parameters:
#   test_size - fraction of the samples held out for the test set
#   k         - number of neighbours used by the KNN classifier
test_size, k = 0.5, 5

In [2]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import torch
from transformers import BertTokenizer, BertModel
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Number of neighbours for the KNN classifier, taken from the `k` parameter.
n_neighbors = k
###############################################
# 1. Text Prior: obtain hidden state of first Excel row via BERT
###############################################
# Please update text_data_path to your Excel file path
text_data_path = '../Sample Data Texts.xlsx'
df_text = pd.read_excel(text_data_path)
# Retrieve the first row from the "List of Store Names" column
first_text = df_text['List of Store Names'].iloc[0]
print("First row text:", first_text)

# An empty Excel cell is read back as NaN and a purely numeric cell as a
# number; the tokenizer only accepts strings, so fail early with a clear
# message for a missing value and coerce everything else to str.
if pd.isna(first_text):
    raise ValueError(
        f"First row of 'List of Store Names' in {text_data_path!r} is empty"
    )
first_text = str(first_text)

# Use a pre-trained BERT model (e.g., bert-base-chinese if your text is Chinese)
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
bert_model = BertModel.from_pretrained("bert-base-chinese")
# eval() switches the model to inference mode; it does not freeze any
# parameters. Gradient tracking is disabled separately by torch.no_grad().
bert_model.eval()
with torch.no_grad():
    inputs = tokenizer(first_text, return_tensors="pt", truncation=True, padding=True)
    outputs = bert_model(**inputs)
    # Hidden state of the first token ([CLS]) of the single input sequence.
    # Indexing batch element 0 directly yields a 1-D vector
    # (768 values for bert-base-chinese, as printed below).
    text_hidden_state = outputs.last_hidden_state[0, 0, :]
print("Text hidden state shape:", text_hidden_state.shape)

###############################################
# 2. Image Loading and Preprocessing
###############################################
# Please update img_data_dir to your image dataset folder path
# Assume each subfolder name corresponds to a class label
img_data_dir = '../raw'

image_features = []  # One flattened 32x32 grayscale vector (1024 values) per image
labels = []          # Class label (sub-folder name) of each image, in the same order

image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')

# os.listdir() returns entries in arbitrary order; sorting makes the sample
# order (and everything derived from it) deterministic across runs/machines.
for class_name in sorted(os.listdir(img_data_dir)):
    class_path = os.path.join(img_data_dir, class_name)
    if not os.path.isdir(class_path):
        continue
    for filename in sorted(os.listdir(class_path)):
        if not filename.lower().endswith(image_extensions):
            continue
        img_path = os.path.join(class_path, filename)
        try:
            # The context manager closes the underlying file handle, which
            # a bare Image.open() would leave open for every image read.
            with Image.open(img_path) as img:
                gray = img.convert('L').resize((32, 32))  # grayscale, 32x32
                img_array = np.array(gray).flatten()      # 1024-dim vector
        except Exception as e:
            # Deliberately best-effort: report the unreadable file and go on.
            print(f"Error reading {img_path}:", e)
            continue
        image_features.append(img_array)
        labels.append(class_name)

# Without this guard an empty dataset fails later with an opaque IndexError
# when the feature dimension is read from shape[1].
if not image_features:
    raise ValueError(f"No readable images found under {img_data_dir!r}")

# Convert to NumPy arrays and normalize to [0,1]
image_features = np.array(image_features, dtype='float32') / 255.0
labels = np.array(labels)
print("Number of image samples:", image_features.shape[0])
print("Original image feature dimension:", image_features.shape[1])
print("Classes:", np.unique(labels))

###############################################
# 3. Split into Training and Test Sets
###############################################
# Stratified split so both sets keep the class proportions of `labels`.
# A fixed random_state makes the split, and therefore every metric reported
# below, reproducible from run to run; change the seed to draw another split.
X_train, X_test, y_train, y_test = train_test_split(
    image_features,
    labels,
    test_size=test_size,
    stratify=labels,
    random_state=42,
)
print("Training samples:", X_train.shape[0], "Test samples:", X_test.shape[0])

###############################################
# 4. Dimensionality Reduction on Image Features (PCA)
###############################################
# Reduce image features to (at most) 128 dimensions.
# PCA cannot produce more components than min(n_samples, n_features), so the
# target is capped to keep small datasets from raising a ValueError.
n_components = min(128, X_train.shape[0], X_train.shape[1])
# random_state pins the result when scikit-learn picks a randomized SVD solver.
pca = PCA(n_components=n_components, random_state=42)
# Fit on the training set only; the test set is projected with the same
# components so no test information leaks into the projection.
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced  = pca.transform(X_test)
print("Reduced training feature dimension:", X_train_reduced.shape[1])

###############################################
# 5. Multimodal Feature Construction: concatenate image features with text prior
###############################################
# Text prior is the fixed 768-dim vector obtained from BERT
def fuse_features(X, text_vector):
    """Append the same text vector to every row of ``X``.

    Args:
        X: 2-D NumPy array of per-sample features, shape (n_samples, d_img).
        text_vector: 1-D text embedding of length d_text, given either as a
            ``torch.Tensor`` (on any device, with or without gradient
            tracking) or as a NumPy array / array-like.

    Returns:
        NumPy array of shape (n_samples, d_img + d_text) whose first d_img
        columns are ``X`` and whose remaining columns repeat ``text_vector``.
        Those trailing columns are identical for every sample.
    """
    if isinstance(text_vector, torch.Tensor):
        # detach() is required before numpy() for tensors that track
        # gradients; it is a no-op for tensors created under no_grad().
        text_array = text_vector.detach().cpu().numpy()
    else:
        text_array = np.asarray(text_vector)

    n_samples = X.shape[0]
    # Repeat the text vector once per sample: shape (n_samples, d_text)
    text_expanded = np.tile(text_array, (n_samples, 1))
    return np.concatenate([X, text_expanded], axis=1)

# Attach the shared text prior to the PCA-reduced train and test features.
fused_train, fused_test = (
    fuse_features(reduced, text_hidden_state)
    for reduced in (X_train_reduced, X_test_reduced)
)
fused_dim = fused_train.shape[1]
print("Fused feature dimension:", fused_dim)

###############################################
# 6. Train KNN Model
###############################################
# NOTE(review): fuse_features appends the same text vector to every sample,
# so those columns are identical across rows and presumably add nothing to
# the distances between samples - confirm the text prior is meant to be
# constant.
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(fused_train, y_train)
y_pred = knn.predict(fused_test)

acc = accuracy_score(y_test, y_pred)
print("Test set accuracy:", acc)
print("Classification report:")
# Pass `labels` together with `target_names`: with target_names alone,
# classification_report raises if any class is missing from both y_test and
# y_pred, because the number of names no longer matches the classes found.
class_names = np.unique(labels)
print(
    classification_report(
        y_test, y_pred, labels=class_names, target_names=class_names
    )
)


First row text: WM HOUSE, MC HOUSE 展览馆, 交通银行, 良品铺子
Text hidden state shape: torch.Size([768])
Number of image samples: 3344
Original image feature dimension: 1024
Classes: ['Eh-1-1' 'Eh-1-2' 'Eh-1-3' 'Eh-1-4' 'N-1-1' 'N-1-2' 'N-1-3' 'N-1-4'
 'N-1-5']
Training samples: 1672 Test samples: 1672
Reduced training feature dimension: 128
Fused feature dimension: 896
Test set accuracy: 0.65311004784689
Classification report:
              precision    recall  f1-score   support

      Eh-1-1       0.60      0.83      0.70       187
      Eh-1-2       0.59      0.80      0.68       303
      Eh-1-3       0.68      0.75      0.72       211
      Eh-1-4       0.74      0.82      0.78       243
       N-1-1       0.78      0.62      0.69       434
       N-1-2       0.67      0.16      0.26       110
       N-1-3       0.43      0.30      0.35        64
       N-1-4       1.00      0.10      0.19        29
       N-1-5       0.33      0.31      0.32        91

    accuracy                         