In [None]:
import os
import shutil

# Split the raw dataset: copy every .txt file and every .jpg file found
# directly inside `source_folder` into its own folder. Files are copied, not
# moved, so the originals stay in `source_folder`.
source_folder = "data"
txt_folder = "txt_data"
jpg_folder = "jpg_data"

# Create the destination folders; exist_ok makes the cell safe to re-run.
os.makedirs(txt_folder, exist_ok=True)
os.makedirs(jpg_folder, exist_ok=True)

# Every entry (files and sub-directories) directly under the source folder.
file_list = os.listdir(source_folder)

for file_name in file_list:
    source_path = os.path.join(source_folder, file_name)
    if not os.path.isfile(source_path):
        # shutil.copy cannot copy a directory, even one named like "x.txt".
        continue
    if file_name.endswith(".txt"):
        target_folder = txt_folder
    elif file_name.endswith(".jpg"):
        target_folder = jpg_folder
    else:
        # Any other extension is left where it is.
        continue
    shutil.copy(source_path, os.path.join(target_folder, file_name))

# Report where the files went; these two lines are the cell's recorded output.
print("txt文件已复制到:", txt_folder)
print("jpg文件已复制到:", jpg_folder)

txt文件已复制到: txt_data
jpg文件已复制到: jpg_data


In [None]:
# 读入并处理训练数据
import os
from sklearn.model_selection import train_test_split


# Read the labelled sentiment data. Each row of train.txt is parsed as
# "<guid>,<label>"; the first line is skipped (presumably a header row).
# The `with` block closes the file handle as soon as the rows are read.
with open('train.txt') as label_file:
    traindata = label_file.readlines()[1:]

# The three sentiment names mapped to the integer classes 0, 1, 2.
label_ids = {"positive": 0, "neutral": 1, "negative": 2}

txt_label = {}
for content in traindata:
    # strip() removes the line ending, so "\r\n" endings and a final row
    # without a trailing newline are parsed like every other row.
    row = content.strip().partition(',')
    guid_text, label_name = row[0], row[2]
    # Rows whose label is not one of the three names (blank rows included)
    # are skipped.
    if label_name in label_ids:
        txt_label[int(guid_text)] = label_ids[label_name]

# Build one record per guid holding its label, a placeholder for the text and
# the path of its image.
dataset = [
    {
        'guid': guid,
        'label': label,
        'text': 0,                                # replaced below once the text file is read
        'photo': 'data/' + str(guid) + '.jpg',    # image path is derived from the guid
    }
    for guid, label in txt_label.items()
]
print(dataset[1])

# Nothing in this cell reads txt_data; it is kept as an empty dict as before.
txt_data = {}

# Index the records by guid so each text file is matched with one dict lookup
# instead of a scan over the whole dataset.
record_by_guid = {record['guid']: record for record in dataset}

# Attach the text of "<guid>.<ext>" files in txt_data/ to the matching record.
text_dir = 'txt_data/'
for file_name in os.listdir(text_dir):
    try:
        guid = int(file_name.split('.')[0])
    except ValueError:
        # The name does not start with a numeric guid; not a data file.
        continue
    record = record_by_guid.get(guid)
    if record is None:
        # The guid is not in the labelled set; do not read the file at all.
        continue
    with open(os.path.join(text_dir, file_name), 'r', encoding='gb18030') as file:
        # 'text' becomes the list of lines of the file.
        record['text'] = file.readlines()

# Split into training and validation sets (80% / 20%, fixed seed).
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)
print(len(train_data))
print(len(val_data))


{'guid': 26, 'label': 1, 'text': 0, 'photo': 'data/26.jpg'}
3200
800


In [None]:
# Load the data to be predicted.
# Read the guid list of the unlabelled data: the first comma-separated field
# of each row is the guid; the first line is skipped (presumably a header row).
# The `with` block closes the file handle as soon as the rows are read.
with open('test_without_label.txt') as guid_file:
    testdata = guid_file.readlines()[1:]

txt_label2 = {}
for content in testdata:
    guid_text = content.split(',')[0].strip()
    if not guid_text:
        # Blank row (for example a trailing empty line): nothing to parse.
        continue
    # 3 stands for the missing (null) label.
    txt_label2[int(guid_text)] = 3

# Build one record per guid holding its placeholder label, a placeholder for
# the text and the path of its image.
test_dataset = [
    {
        'guid': guid,
        'label': label,
        'text': 0,                                # replaced below once the text file is read
        'photo': 'data/' + str(guid) + '.jpg',    # image path is derived from the guid
    }
    for guid, label in txt_label2.items()
]
print(test_dataset[1])

# Nothing in this cell reads txt_data; it is kept as an empty dict as before.
txt_data = {}

# Index the records by guid so each text file is matched with one dict lookup
# instead of a scan over the whole dataset.
record_by_guid = {record['guid']: record for record in test_dataset}

# Attach the text of "<guid>.<ext>" files in txt_data/ to the matching record.
text_dir = 'txt_data/'
for file_name in os.listdir(text_dir):
    try:
        guid = int(file_name.split('.')[0])
    except ValueError:
        # The name does not start with a numeric guid; not a data file.
        continue
    record = record_by_guid.get(guid)
    if record is None:
        # The guid is not in the prediction set; do not read the file at all.
        continue
    with open(os.path.join(text_dir, file_name), 'r', encoding='gb18030') as file:
        # 'text' becomes the list of lines of the file.
        record['text'] = file.readlines()

# Images are not loaded in this cell: 'photo' stays a file path.

print(len(test_dataset))

{'guid': 1576, 'label': 3, 'text': 0, 'photo': 'data/1576.jpg'}
511


In [None]:
# 将训练集、验证集、测试集写入json文件中，便于后续调用
import json
        
def _write_json(path, records):
    """Serialize `records` as JSON into the file at `path` (UTF-8, overwritten)."""
    with open(path, "w", encoding="utf-8") as handle:
        json.dump(records, handle)


# Persist the three splits, one file each, in the order train, val, test.
_write_json("train.json", train_data)
_write_json("val.json", val_data)
_write_json("test.json", test_dataset)