In [None]:
'''
Tensorflow请使用2.15版本
'''

In [8]:
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model
from keras.optimizers import Adam
from keras.layers import Input, Dense, Flatten, concatenate, LSTM, Dropout, Embedding
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input, decode_predictions
from keras.preprocessing import image
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.utils import to_categorical
import os

In [2]:
# Read the training index (guid, tag) and the test index from their CSV files.
train_data = pd.read_csv("train.txt")
test_data = pd.read_csv("test_without_label.txt")

# Encode the sentiment labels as integer class ids.
dic = {'negative':0, 'neutral':1, 'positive':2}
encoded_tag = train_data['tag'].map(dic)

# Series.map yields NaN for every value that is not a key of `dic`; fail loudly
# here instead of letting NaN labels reach the one-hot encoding later on.
unmapped = train_data.loc[encoded_tag.isna(), 'tag'].unique()
if len(unmapped) > 0:
    raise ValueError('Unknown tag values in train.txt: {}'.format(list(unmapped)))
train_data['tag'] = encoded_tag

# Per-sample loaders for the image and the text stored under data/<guid>.*
def read_img(id):
    """Load ``data/<id>.jpg`` resized to 224x224 and scaled by 1/255.

    Args:
        id: Sample identifier (guid); converted with ``str`` to build the path.

    Returns:
        The array produced by ``image.img_to_array`` with every value divided
        by 255 (224x224 pixels, 3 channels with the default RGB colour mode).
    """
    # load_img documents target_size as (height, width); the channel count is
    # controlled by color_mode, so no third element belongs in the tuple.
    img = image.load_img('data/'+ str(id) + '.jpg', target_size=(224, 224))
    img = image.img_to_array(img)
    # NOTE(review): the image branch uses ResNet50 with ImageNet weights, which
    # normally expects keras.applications.resnet50.preprocess_input rather than
    # plain /255 scaling -- confirm whether this scaling is intentional.
    img = img/255
    return img

def read_txt(id):
    """Return the contents of ``data/<id>.txt`` with all newlines removed.

    The file is decoded as GBK and bytes that cannot be decoded are dropped
    (``errors='ignore'``).

    Args:
        id: Sample identifier (guid); converted with ``str`` to build the path.

    Returns:
        The file's text as one string without ``'\\n'`` characters.
    """
    txt_path = 'data/' + str(id) + '.txt'
    with open(txt_path, 'r', encoding='GBK', errors='ignore') as handle:
        pieces = handle.read().split('\n')
    # Joining on the empty string removes the newlines without inserting spaces.
    return ''.join(pieces)

# Attach the text and the decoded image of every sample to its row.
train_data['text'] = train_data['guid'].apply(read_txt)
train_data['img'] = train_data['guid'].apply(read_img)

# Hold out 20% of the samples for validation.
# random_state makes the split reproducible across kernel restarts; stratify
# keeps the class proportions of `tag` equal in both parts, so every class
# is present in the training and in the validation split.
train, val = train_test_split(
    train_data,
    test_size=0.2,
    random_state=42,
    stratify=train_data['tag'],
)

In [3]:
# Display the training split as the cell's output.
train

Unnamed: 0,guid,tag,text,img
2595,2845,2,RT @orrie_yes: Need to calm myself so here's o...,"[[[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0,..."
2982,430,0,#ANIMALABUSE #TORONTO #PUPPY #TORTURE WE OFFER...,"[[[0.96862745, 0.96862745, 0.96862745], [0.984..."
246,4392,2,RT @WorIdStarComedy: #TodaysKidsWillNeverKnow ...,"[[[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0,..."
862,4012,2,Thank u for your understanding heart and shini...,"[[[0.8156863, 0.67058825, 0.49411765], [0.8039..."
1941,2379,1,RT @theIeansquad: When you rob a black persons...,"[[[0.07058824, 0.07058824, 0.07058824], [0.070..."
...,...,...,...,...
1557,4193,2,Euro buoyant ahead of Greek vote http://t.co/q...,"[[[0.72156864, 0.6784314, 0.43529412], [0.7254..."
2997,333,0,"#trashcomics lmao dick, so incensed. HOW COULD...","[[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,..."
3004,1652,2,#February #Winter #Rainy #Stormy #Windy #Tuesd...,"[[[0.4, 0.38039216, 0.30588236], [0.3764706, 0..."
3687,2609,2,RT @neiltyson: I once showed Pluto to Pluto. H...,"[[[0.39215687, 0.4392157, 0.29803923], [0.2431..."


In [4]:
## Image branch
# ResNet50 with ImageNet weights and without its classification head
# (include_top=False); its output is flattened and projected to 128 features.
input_image = Input(shape=(224,224,3))
base_model = ResNet50(weights='imagenet', include_top=False, input_tensor=input_image)
x = base_model.output
x = Flatten()(x)
output_image = Dense(128, activation='relu')(x)

## Text branch
# Better embeddings could be generated with an open-source model from the
# Transformers library.
# The backslash is written as "\\": a bare "\]" is an invalid escape sequence
# that Python only tolerates with a warning. The resulting filter string is
# unchanged.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
# The vocabulary is fitted on the training split only.
tokenizer.fit_on_texts(train['text'].values)
word_index = tokenizer.word_index

train_X = tokenizer.texts_to_sequences(train['text'].values)
val_X = tokenizer.texts_to_sequences(val['text'].values)

# Bring every sequence to a fixed length of 500 token ids.
train_X = pad_sequences(train_X, maxlen=500)
val_X = pad_sequences(val_X, maxlen=500)

# One-hot targets with a fixed number of columns (one per entry of `dic`),
# independent of which classes happen to occur in this split.
Y = to_categorical(train['tag'].values, num_classes=len(dic))
X_img = np.array(train['img'].tolist())
val_X_img = np.array(val['img'].tolist())


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [31]:
# Model 1
# Text branch: embedding -> LSTM returning the full sequence -> dense layers.
# The input length is taken from the padded training matrix so that it cannot
# drift away from the maxlen used for padding.
input_text = Input(shape=(train_X.shape[1], ))
emb_text = Embedding(len(word_index) + 1, 100)(input_text)
lstm_out = LSTM(300,dropout=0.2, recurrent_dropout=0.2,return_sequences=True)(emb_text)
x = Flatten()(lstm_out)
x = Dense(200,activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(100,activation='relu')(x)

# Fuse the text features with the image features and classify.
num_classes = len(dic)
merge = concatenate([x, output_image])
output = Dense(num_classes,activation='softmax')(merge)

model = Model(inputs=[input_text, input_image],outputs =output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# One-hot validation targets with a fixed number of columns, so the shape
# matches the softmax output even if a class is absent from the split.
val_Y = to_categorical(val['tag'].values, num_classes=num_classes)

model.fit([train_X, X_img], Y, validation_data=([val_X, val_X_img], val_Y), epochs=2, batch_size=32, verbose=1)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x4371aa740>

In [None]:
# # Model 2 (more complex and slower to train; kept for reference)
# from keras.layers import Input, Embedding, LSTM, Dense, Dropout, Flatten, BatchNormalization
# from keras.models import Model
# from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# input_text = Input(shape=(500, ))
# emb_text = Embedding(len(word_index) + 1, 100)(input_text)
# lstm_out = LSTM(300, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)(emb_text)
# lstm_out = Dropout(0.5)(lstm_out)  # higher dropout rate
# lstm_out = BatchNormalization()(lstm_out)  # add a batch-normalization layer
# flat_text = Flatten()(lstm_out)
# dense_text_1 = Dense(200, activation='relu', kernel_regularizer='l2')(flat_text)  # add L2 regularization
# dense_text_1 = Dropout(0.5)(dense_text_1)  # higher dropout rate
# dense_text_2 = Dense(100, activation='relu', kernel_regularizer='l2')(dense_text_1)  # add L2 regularization

# merge = concatenate([dense_text_2, output_image])
# output = Dense(3, activation='softmax')(merge)

# model = Model(inputs=[input_text, input_image], outputs=output)
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# # Early stopping to guard against overfitting
# early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

# # ReduceLROnPlateau lowers the learning rate when the monitored metric stops improving
# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1, min_lr=1e-6)

# val_Y = pd.get_dummies(val['tag']).values

# # Pass the callbacks to fit
# model.fit(
#     [train_X, X_img], Y,
#     validation_data=([val_X, val_X_img], val_Y),
#     epochs=3,
#     batch_size=32,
#     verbose=1,
#     callbacks=[early_stopping, reduce_lr]
# )

In [32]:
# Load the test index and attach the text and the image of every sample,
# using the same loaders as for the training data.
test_data = pd.read_csv('test_without_label.txt').assign(
    text=lambda frame: frame['guid'].apply(read_txt),
    img=lambda frame: frame['guid'].apply(read_img),
)


In [33]:
# Text and image preprocessing: encode the test texts with the tokenizer
# already fitted above, pad them to length 500, and collect the images
# into one array.
test_sequences = tokenizer.texts_to_sequences(test_data['text'].values)
test_txt = pad_sequences(test_sequences, maxlen=500)
test_img = np.array(list(test_data['img']))

# Run the model on both inputs and keep the index of the highest score
# of every prediction row.
pred = model.predict([test_txt, test_img])
labels = pred.argmax(axis=-1)



In [34]:
# Build the labels: invert the name -> id mapping and translate every
# predicted class index back to its label name.
dic_reverse = dict((index, name) for name, index in dic.items())
labels = list(map(dic_reverse.get, np.argmax(pred, axis=-1)))

In [35]:
# Write the predictions: a "guid,tag" header followed by one line per sample.
with open("test_with_label.txt", "w") as outfile:
    rows = ['{},{}\n'.format(guid, label) for guid, label in zip(test_data['guid'], labels)]
    outfile.writelines(['guid,tag\n'] + rows)