In [101]:
import os
import random
import numpy as np
import pandas as pd
import scipy.io
from datetime import datetime
from statistics import median

In [11]:
def read_txt_lines(file_path):
    """Read a text file and return its non-blank lines with surrounding whitespace removed."""
    with open(file_path, 'r') as handle:
        stripped = [raw.strip() for raw in handle.readlines()]
    # Lines that are empty after stripping (blank or whitespace-only) are dropped.
    return [text for text in stripped if text]

In [12]:
# Demo: list the non-blank lines of one recording's time.txt (the cell displays the returned list).
read_txt_lines("../data/original_food_intake_data/01_DryNoodles/01/time.txt")

['2022-10-15 12:15:57.356631',
 '2022-10-15 12:15:57.441168',
 '2022-10-15 12:15:57.470091',
 '2022-10-15 12:15:57.502005',
 '2022-10-15 12:15:57.531925',
 '2022-10-15 12:15:57.562842',
 '2022-10-15 12:15:57.593759',
 '2022-10-15 12:15:57.609717',
 '2022-10-15 12:15:57.639641',
 '2022-10-15 12:15:57.672551',
 '2022-10-15 12:15:57.751337',
 '2022-10-15 12:15:57.783253',
 '2022-10-15 12:15:57.815169',
 '2022-10-15 12:15:57.847509',
 '2022-10-15 12:15:57.877429',
 '2022-10-15 12:15:57.893386',
 '2022-10-15 12:15:57.912336',
 '2022-10-15 12:15:57.944249',
 '2022-10-15 12:15:57.976767',
 '2022-10-15 12:15:58.009679',
 '2022-10-15 12:15:58.041594',
 '2022-10-15 12:15:58.072512',
 '2022-10-15 12:15:58.152350',
 '2022-10-15 12:15:58.184463',
 '2022-10-15 12:15:58.217107',
 '2022-10-15 12:15:58.248011',
 '2022-10-15 12:15:58.264964',
 '2022-10-15 12:15:58.280164',
 '2022-10-15 12:15:58.313307',
 '2022-10-15 12:15:58.344798',
 '2022-10-15 12:15:58.376420',
 '2022-10-15 12:15:58.409953',
 '2022-1

In [13]:
def get_start_end_timestamp_index_eat(file_path):
    """Load the annotation CSV and pair each row's start and end frame index.

    Returns:
        A list of ``(video_start_frame, video_end_frame)`` tuples, one per CSV row.
    """
    annotations = pd.read_csv(file_path)
    frame_columns = ('video_start_frame', 'video_end_frame')
    # zip(*columns) walks both columns in lockstep, yielding one (start, end) pair per row.
    return list(zip(*(annotations[column] for column in frame_columns)))

In [14]:
# Demo: show the (start_frame, end_frame) pairs read from one recording's data.csv.
get_start_end_timestamp_index_eat("../data/original_food_intake_data/01_DryNoodles/01/data.csv")

[(543, 3688), (4015, 9340)]

In [15]:
def get_start_end_timestamp_eat(range_csv_path, timestamp_txt_path):
    """Translate the annotated frame-index ranges into datetime ranges.

    The frame indices from ``range_csv_path`` are used as positions into the
    non-blank lines of ``timestamp_txt_path``; each selected line is parsed as
    ``YYYY-MM-DD HH:MM:SS.ffffff``.

    Returns:
        A list of ``(start, end)`` ``datetime`` tuples, one per annotated range.
    """
    frame_times = read_txt_lines(timestamp_txt_path)

    def frame_to_datetime(frame_index):
        # Swap the date/time space for 'T' so one strptime pattern covers the line.
        iso_text = frame_times[frame_index].replace(' ', 'T').strip()
        return datetime.strptime(iso_text, '%Y-%m-%dT%H:%M:%S.%f')

    return [
        (frame_to_datetime(first_frame), frame_to_datetime(last_frame))
        for first_frame, last_frame in get_start_end_timestamp_index_eat(range_csv_path)
    ]

In [21]:
# Demo: resolve the annotated ranges of one recording to datetimes.
# The result is kept in a notebook-level variable that a later cell passes to get_single_sensor_data.
start_end_timestamp_eat = get_start_end_timestamp_eat("../data/original_food_intake_data/01_DryNoodles/01/data.csv", 
                                                      "../data/original_food_intake_data/01_DryNoodles/01/time.txt")
start_end_timestamp_eat

[(datetime.datetime(2022, 10, 15, 12, 16, 15, 577171),
  datetime.datetime(2022, 10, 15, 12, 18, 0, 551483)),
 (datetime.datetime(2022, 10, 15, 12, 18, 11, 481154),
  datetime.datetime(2022, 10, 15, 12, 21, 9, 319780))]

In [22]:
def get_single_sensor_data(start_end_timestamp_eat, sensor_path):
    """Load one sensor file and split its rows into eating and non-eating rows.

    Each non-blank line holds three whitespace-separated readings followed by a
    timestamp, written either as a single token or as separate date and time tokens:

        0.899 0.512 -0.035 2022-10-15T12:21:20.337
        0.895 0.493 -0.016  2022-10-15 12:16:05.340

    Args:
        start_end_timestamp_eat: sequence of ``(start, end)`` datetime pairs.
        sensor_path: path of the sensor text file.

    Returns:
        ``(data_eat, data_not_eat)``. Every row is ``[v0, v1, v2, timestamp]`` with the
        three readings left as strings and the timestamp parsed to ``datetime``. A row is
        an eating row when its timestamp lies inside at least one interval (bounds
        inclusive); file order is preserved in both lists.

    Raises:
        ValueError: if a timestamp does not match ``%Y-%m-%dT%H:%M:%S.%f``.
        IndexError: if a non-blank line has fewer than four fields.
    """
    with open(sensor_path, 'r') as file:
        # A blank line splits to [] and would raise IndexError on row[3] below,
        # so such lines are dropped while reading.
        data = [fields for fields in (line.split() for line in file) if fields]

    data_eat, data_not_eat = [], []
    for row in data:
        if len(row) == 5:
            # Date and time arrived as two tokens: merge them into one ISO-style token.
            row[3:] = [row[3] + 'T' + row[4]]
        row[3] = datetime.strptime(row[3], '%Y-%m-%dT%H:%M:%S.%f')

        inside_interval = any(
            interval[0] <= row[3] <= interval[1] for interval in start_end_timestamp_eat
        )
        (data_eat if inside_interval else data_not_eat).append(row)

    return data_eat, data_not_eat

In [28]:
# Demo: split one recording's accelerometer and gyroscope rows using the intervals computed above.
# NOTE(review): `gyt_not` looks like a typo for `gyr_not_eat`; no use of it is visible in this notebook.
# NOTE(review): the bare `acc_eat` below displays the entire list; a slice such as `acc_eat[:5]` would keep the output short.
acc_eat, acc_not_eat = get_single_sensor_data(start_end_timestamp_eat, "../data/original_food_intake_data/01_DryNoodles/01/Others/accData.txt")
gyr_eat, gyt_not = get_single_sensor_data(start_end_timestamp_eat, "../data/original_food_intake_data/01_DryNoodles/01/Others/gyrData.txt")
acc_eat

[['-1.091127',
  '-2.416751',
  '9.674182',
  datetime.datetime(2022, 10, 15, 12, 16, 15, 592000)],
 ['-1.122234',
  '-2.658426',
  '9.274581',
  datetime.datetime(2022, 10, 15, 12, 16, 15, 613000)],
 ['-1.294517',
  '-1.473979',
  '9.217153',
  datetime.datetime(2022, 10, 15, 12, 16, 15, 633000)],
 ['-1.275374',
  '-1.055235',
  '8.886944',
  datetime.datetime(2022, 10, 15, 12, 16, 15, 652000)],
 ['-1.048056',
  '-1.399801',
  '8.999406',
  datetime.datetime(2022, 10, 15, 12, 16, 15, 673000)],
 ['-0.993021',
  '-0.634098',
  '9.028119',
  datetime.datetime(2022, 10, 15, 12, 16, 15, 692000)],
 ['-1.124627',
  '0.385244',
  '9.640682',
  datetime.datetime(2022, 10, 15, 12, 16, 15, 713000)],
 ['-0.973879',
  '0.007178',
  '9.961320',
  datetime.datetime(2022, 10, 15, 12, 16, 15, 734000)],
 ['-1.158126',
  '-0.665205',
  '8.927622',
  datetime.datetime(2022, 10, 15, 12, 16, 15, 752000)],
 ['-1.378266',
  '-0.842274',
  '9.219545',
  datetime.datetime(2022, 10, 15, 12, 16, 15, 773000)],
 [

In [89]:
def data_split(data_acc, data_gyr, sample_length, stride):
    """Cut time-aligned accelerometer/gyroscope rows into fixed-length windows.

    For every window start ``i`` (0, stride, 2*stride, ...) in the accelerometer rows,
    the gyroscope start index is the row whose timestamp is closest to
    ``data_acc[i]``'s timestamp; ``i`` itself is kept unless another index is
    strictly closer, and among equally close alternatives the earliest wins.
    The window then pairs ``sample_length`` consecutive rows from both streams.

    Args:
        data_acc: accelerometer rows ``[x, y, z, timestamp]`` (timestamp is a ``datetime``).
        data_gyr: gyroscope rows in the same layout.
        sample_length: number of rows per window.
        stride: step between consecutive window starts.

    Returns:
        A list of windows; each window is a list of ``sample_length`` rows
        ``[acc_x, acc_y, acc_z, gyr_x, gyr_y, gyr_z]``. Empty when either stream has
        fewer than ``sample_length`` rows.
    """
    result = []
    # Largest start index that still leaves room for a full window in both streams.
    last_start = min(len(data_acc), len(data_gyr)) - sample_length

    # `last_start` itself is a valid start, hence the +1: without it the final window
    # is lost whenever last_start is a multiple of stride (e.g. exactly sample_length rows).
    for i in range(0, last_start + 1, stride):
        acc_time = data_acc[i][3]
        align_index = i
        # timedelta supports abs() and ordering, so no float conversion is needed.
        align_time_diff = abs(acc_time - data_gyr[i][3])
        for j in range(last_start + 1):
            time_diff = abs(acc_time - data_gyr[j][3])
            if time_diff < align_time_diff:
                align_index, align_time_diff = j, time_diff

        result.append([
            list(data_acc[i + k][0: 3]) + list(data_gyr[align_index + k][0: 3])
            for k in range(sample_length)
        ])

    print("处理结果: acc数据总条数: {},\tgyr数据总条数: {},\t切割得到样本数量: {}".format(len(data_acc), len(data_gyr), len(result)))

    return result

In [90]:
def get_split_sensor_data(range_csv_path, timestamp_txt_path, sensor_acc_path, sensor_gyr_path, sample_length, stride):
    """Run the full per-recording pipeline: intervals -> eat/not-eat rows -> windows.

    Returns:
        ``(eat_result, not_eat_result)``, each the output of ``data_split`` for the
        rows inside / outside the annotated intervals respectively.
    """
    eating_intervals = get_start_end_timestamp_eat(range_csv_path, timestamp_txt_path)

    acc_eat_rows, acc_other_rows = get_single_sensor_data(eating_intervals, sensor_acc_path)
    gyr_eat_rows, gyr_other_rows = get_single_sensor_data(eating_intervals, sensor_gyr_path)

    # Tuple elements are evaluated left to right: eating windows first, then the rest.
    return (
        data_split(acc_eat_rows, gyr_eat_rows, sample_length, stride),
        data_split(acc_other_rows, gyr_other_rows, sample_length, stride),
    )

In [91]:
# Demo: window one recording with sample_length=512 and stride=128 (the last two positional arguments).
eat_result, not_eat_result = get_split_sensor_data("../data/original_food_intake_data/01_DryNoodles/01/data.csv", 
                                                   "../data/original_food_intake_data/01_DryNoodles/01/time.txt", 
                                                   "../data/original_food_intake_data/01_DryNoodles/01/Others/accData.txt", 
                                                   "../data/original_food_intake_data/01_DryNoodles/01/Others/gyrData.txt", 512, 128)

处理结果: acc数据总条数: 14148,	gyr数据总条数: 14148,	切割得到样本数量: 107
处理结果: acc数据总条数: 1545,	gyr数据总条数: 1542,	切割得到样本数量: 9


In [92]:
def save_111_data(parent_path, data, category_index, person_index, train, eat):
    """Save one person's eating or non-eating windows of one category as .mat files.

    Files are written to ``<parent_path>/<CC>/<train|test>/<eat|not_eat>/`` and named
    ``<CC>_<PP>_<index>.mat`` where ``CC``/``PP`` are the zero-padded category and
    person indices. The directory is created even when ``data`` is empty.

    Args:
        parent_path: root directory of the exported data set.
        data: list of windows; each window is a list of rows whose first three values
            are accelerometer readings and next three are gyroscope readings.
        category_index: category number; also the label stored for eating windows.
        person_index: person number stored alongside every row.
        train: True to write under ``train``, False under ``test``.
        eat: True to write under ``eat`` with label ``category_index``,
            False to write under ``not_eat`` with label 0.

    Each file contains ``accData`` and ``gyrData`` (float32, one row per sample) plus
    ``label`` and ``person`` (int32 column vectors with one entry per sample).
    """
    data_len = len(data)
    # At least two digits, widened to fit the largest index so all names in one
    # call have the same width (identical to the previous 2/3-digit rule up to 1000 windows).
    zfill_size = max(2, len(str(data_len - 1)))

    target_dir = os.path.join(
        parent_path,
        str(category_index).zfill(2),
        "train" if train else "test",
        "eat" if eat else "not_eat",
    )
    # Create the leaf directory once, rather than once per file.
    os.makedirs(target_dir, exist_ok=True)

    label_value = category_index if eat else 0
    category_tag = str(category_index).zfill(2)
    person_tag = str(person_index).zfill(2)

    for i, window in enumerate(data):
        acc_data = np.array([row[0: 3] for row in window], dtype=np.single)
        gyr_data = np.array([row[3: 6] for row in window], dtype=np.single)

        row_count = len(acc_data)
        label_data = np.full((row_count, 1), label_value, dtype=np.int32)
        person_data = np.full((row_count, 1), person_index, dtype=np.int32)

        mat_file_name = "{}_{}_{}.mat".format(category_tag, person_tag, str(i).zfill(zfill_size))
        mat_file_path = os.path.join(target_dir, mat_file_name)
        scipy.io.savemat(mat_file_path, {"accData": acc_data, "gyrData": gyr_data, "label": label_data, "person": person_data})
    
def save_112_data(parent_path, eat_data, not_eat_data, category_index, person_index, train):
    """Save both the eating and the non-eating windows of one person in one category."""
    # Eating windows are written first, then the non-eating ones.
    for windows, is_eat in ((eat_data, True), (not_eat_data, False)):
        save_111_data(parent_path, windows, category_index, person_index, train, is_eat)

In [93]:
# Demo: write the windows computed above under ../data/test01 as category 1, person 1, training split.
save_112_data("../data/test01", eat_result, not_eat_result, 1, 1, True)

In [94]:
def save_all_data_cross_person(category_num=11, person_num=10, train_person_num=8, sample_length=512, stride=128,
                               original_data_root_path="../data/original_food_intake_data",
                               data_save_root_path=None):
    """Export every category/person recording with a person-level train/test split.

    Persons ``1..train_person_num`` of every category go to the training set, the
    remaining persons to the test set, so no person appears in both.

    Args:
        category_num: number of category directories to process.
        person_num: number of person directories to process per category.
        train_person_num: persons with index <= this value are written as training data.
        sample_length: rows per window (passed to ``get_split_sensor_data``).
        stride: step between window starts (passed to ``get_split_sensor_data``).
        original_data_root_path: root directory holding one sub-directory per category.
        data_save_root_path: output root; defaults to
            ``../data/{sample_length}_{stride}_{train_person_num}_cross_person``.

    Raises:
        IndexError: if fewer category or person directories exist than requested.
    """
    if data_save_root_path is None:
        data_save_root_path = f"../data/{sample_length}_{stride}_{train_person_num}_cross_person"

    def list_subdirs(path):
        # os.scandir yields entries in arbitrary order; sorting makes the mapping from
        # category/person index to directory deterministic. Hidden directories
        # (e.g. .ipynb_checkpoints) are skipped so they cannot shift that mapping.
        with os.scandir(path) as entries:
            return sorted(entry.name for entry in entries
                          if entry.is_dir() and not entry.name.startswith("."))

    category_dirs = list_subdirs(original_data_root_path)

    # Process and save the data category by category.
    for i in range(1, category_num + 1):
        category_dir = os.path.join(original_data_root_path, category_dirs[i - 1])
        person_dirs = list_subdirs(category_dir)
        for j in range(1, person_num + 1):
            print(f"正在处理第 {i} 类的第 {j} 个人的数据...")

            person_dir = os.path.join(category_dir, person_dirs[j - 1])
            range_csv_path = os.path.join(person_dir, "data.csv")
            timestamp_txt_path = os.path.join(person_dir, "time.txt")
            sensor_acc_path = os.path.join(person_dir, "Others/accData.txt")
            sensor_gyr_path = os.path.join(person_dir, "Others/gyrData.txt")

            # Eating and non-eating windows of one person in one category.
            eat_data, not_eat_data = get_split_sensor_data(range_csv_path, timestamp_txt_path, sensor_acc_path, sensor_gyr_path, sample_length, stride)

            # The split is decided per person, not per window.
            is_train_person = j <= train_person_num
            save_112_data(data_save_root_path, eat_data, not_eat_data, i, j, is_train_person)

In [95]:
# Run the person-level (cross-person) export over the whole data set; progress is printed per recording.
save_all_data_cross_person()

正在处理第 1 类的第 1 个人的数据...
处理结果: acc数据总条数: 14148,	gyr数据总条数: 14148,	切割得到样本数量: 107
处理结果: acc数据总条数: 1545,	gyr数据总条数: 1542,	切割得到样本数量: 9
正在处理第 1 类的第 2 个人的数据...
处理结果: acc数据总条数: 15797,	gyr数据总条数: 15797,	切割得到样本数量: 120
处理结果: acc数据总条数: 2412,	gyr数据总条数: 2407,	切割得到样本数量: 15
正在处理第 1 类的第 3 个人的数据...
处理结果: acc数据总条数: 24876,	gyr数据总条数: 24877,	切割得到样本数量: 191
处理结果: acc数据总条数: 1170,	gyr数据总条数: 1163,	切割得到样本数量: 6
正在处理第 1 类的第 4 个人的数据...
处理结果: acc数据总条数: 22743,	gyr数据总条数: 22743,	切割得到样本数量: 174
处理结果: acc数据总条数: 1421,	gyr数据总条数: 1416,	切割得到样本数量: 8
正在处理第 1 类的第 5 个人的数据...
处理结果: acc数据总条数: 15348,	gyr数据总条数: 15348,	切割得到样本数量: 116
处理结果: acc数据总条数: 879,	gyr数据总条数: 877,	切割得到样本数量: 3
正在处理第 1 类的第 6 个人的数据...
处理结果: acc数据总条数: 15997,	gyr数据总条数: 15997,	切割得到样本数量: 121
处理结果: acc数据总条数: 429,	gyr数据总条数: 426,	切割得到样本数量: 0
正在处理第 1 类的第 7 个人的数据...
处理结果: acc数据总条数: 13509,	gyr数据总条数: 13509,	切割得到样本数量: 102
处理结果: acc数据总条数: 2904,	gyr数据总条数: 2900,	切割得到样本数量: 19
正在处理第 1 类的第 8 个人的数据...
处理结果: acc数据总条数: 17790,	gyr数据总条数: 17790,	切割得到样本数量: 135
处理结果: acc数据总条数: 697,	gyr数据总条数: 693,	

In [106]:
def save_all_data_not_cross_person(category_num=11, person_num=10, train_ratio=0.8, sample_length=512, stride=128,
                                   original_data_root_path="../data/original_food_intake_data",
                                   data_save_root_path=None, seed=None):
    """Export every category/person recording with a per-window random train/test split.

    For each recording the eating and non-eating windows are shuffled separately and
    the first ``int(len * train_ratio)`` of each go to the training set, the rest to
    the test set, so every person appears in both sets.

    NOTE(review): windows of one recording overlap whenever ``stride < sample_length``
    (true for the defaults), so a random per-window split places overlapping samples
    in both train and test. Confirm this is intended before comparing scores with
    the cross-person split.

    Args:
        category_num: number of category directories to process.
        person_num: number of person directories to process per category.
        train_ratio: fraction of each recording's windows written as training data.
        sample_length: rows per window (passed to ``get_split_sensor_data``).
        stride: step between window starts (passed to ``get_split_sensor_data``).
        original_data_root_path: root directory holding one sub-directory per category.
        data_save_root_path: output root; defaults to
            ``../data/{sample_length}_{stride}_{train_ratio}_not_cross_person``.
        seed: optional seed for a private random generator, making the split
            reproducible. When None the module-level ``random`` state is used.

    Raises:
        IndexError: if fewer category or person directories exist than requested.
    """
    if data_save_root_path is None:
        data_save_root_path = f"../data/{sample_length}_{stride}_{train_ratio}_not_cross_person"

    # A dedicated generator keeps a seeded run independent of other users of `random`.
    rng = random if seed is None else random.Random(seed)

    def list_subdirs(path):
        # os.scandir yields entries in arbitrary order; sorting makes the mapping from
        # category/person index to directory deterministic. Hidden directories
        # (e.g. .ipynb_checkpoints) are skipped so they cannot shift that mapping.
        with os.scandir(path) as entries:
            return sorted(entry.name for entry in entries
                          if entry.is_dir() and not entry.name.startswith("."))

    category_dirs = list_subdirs(original_data_root_path)

    # Process and save the data category by category.
    for i in range(1, category_num + 1):
        category_dir = os.path.join(original_data_root_path, category_dirs[i - 1])
        person_dirs = list_subdirs(category_dir)
        for j in range(1, person_num + 1):
            print(f"正在处理第 {i} 类的第 {j} 个人的数据...")

            person_dir = os.path.join(category_dir, person_dirs[j - 1])
            range_csv_path = os.path.join(person_dir, "data.csv")
            timestamp_txt_path = os.path.join(person_dir, "time.txt")
            sensor_acc_path = os.path.join(person_dir, "Others/accData.txt")
            sensor_gyr_path = os.path.join(person_dir, "Others/gyrData.txt")

            # Eating and non-eating windows of one person in one category.
            eat_data, not_eat_data = get_split_sensor_data(range_csv_path, timestamp_txt_path, sensor_acc_path, sensor_gyr_path, sample_length, stride)

            # Shuffle in place, then cut each list at the train_ratio boundary.
            rng.shuffle(eat_data)
            rng.shuffle(not_eat_data)
            eat_cut = int(len(eat_data) * train_ratio)
            not_eat_cut = int(len(not_eat_data) * train_ratio)

            save_112_data(data_save_root_path, eat_data[:eat_cut], not_eat_data[:not_eat_cut], i, j, True)
            save_112_data(data_save_root_path, eat_data[eat_cut:], not_eat_data[not_eat_cut:], i, j, False)

In [107]:
# Run the per-window random-split (not cross-person) export over the whole data set; progress is printed per recording.
save_all_data_not_cross_person()

正在处理第 1 类的第 1 个人的数据...
处理结果: acc数据总条数: 14148,	gyr数据总条数: 14148,	切割得到样本数量: 107
处理结果: acc数据总条数: 1545,	gyr数据总条数: 1542,	切割得到样本数量: 9
正在处理第 1 类的第 2 个人的数据...
处理结果: acc数据总条数: 15797,	gyr数据总条数: 15797,	切割得到样本数量: 120
处理结果: acc数据总条数: 2412,	gyr数据总条数: 2407,	切割得到样本数量: 15
正在处理第 1 类的第 3 个人的数据...
处理结果: acc数据总条数: 24876,	gyr数据总条数: 24877,	切割得到样本数量: 191
处理结果: acc数据总条数: 1170,	gyr数据总条数: 1163,	切割得到样本数量: 6
正在处理第 1 类的第 4 个人的数据...
处理结果: acc数据总条数: 22743,	gyr数据总条数: 22743,	切割得到样本数量: 174
处理结果: acc数据总条数: 1421,	gyr数据总条数: 1416,	切割得到样本数量: 8
正在处理第 1 类的第 5 个人的数据...
处理结果: acc数据总条数: 15348,	gyr数据总条数: 15348,	切割得到样本数量: 116
处理结果: acc数据总条数: 879,	gyr数据总条数: 877,	切割得到样本数量: 3
正在处理第 1 类的第 6 个人的数据...
处理结果: acc数据总条数: 15997,	gyr数据总条数: 15997,	切割得到样本数量: 121
处理结果: acc数据总条数: 429,	gyr数据总条数: 426,	切割得到样本数量: 0
正在处理第 1 类的第 7 个人的数据...
处理结果: acc数据总条数: 13509,	gyr数据总条数: 13509,	切割得到样本数量: 102
处理结果: acc数据总条数: 2904,	gyr数据总条数: 2900,	切割得到样本数量: 19
正在处理第 1 类的第 8 个人的数据...
处理结果: acc数据总条数: 17790,	gyr数据总条数: 17790,	切割得到样本数量: 135
处理结果: acc数据总条数: 697,	gyr数据总条数: 693,	