## 字数统计任务隐藏状态参数可视化分析

## 数据可视化处理

#### 针对当前数据文件结构，设计一个用于取用数据的类

In [20]:
import os
import json
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
from IPython.display import clear_output
from config_parse import VisualConfig
# Module-level configuration object; HidDataFetch.__init__ stores it as
# self.config and reads `role_setting_prompt_json` from it.
visualconfig = VisualConfig("./visual_config.json")

class HidDataFetch():
    """Accessor for per-dialogue hidden-state dumps stored on disk.

    Layout read by the methods below: ``<hid_datas_path>/<index>/dialogue.txt``
    is a JSON object with at least the keys ``'tag'`` and ``'dialogues'``, and
    the same directory holds the ``*.npz`` hidden-state files of that dialogue.

    ``NUM_LAYERS`` and ``HIDDEN_SIZE`` are the sizes used when reshaping loaded
    states; override them on a subclass or instance for a different model.
    """

    NUM_LAYERS = 29
    HIDDEN_SIZE = 4096

    def __init__(self, hid_datas_path, task_pattern) -> None:
        """Index every dialogue under *hid_datas_path*.

        Args:
            hid_datas_path: Root directory with one sub-directory per dialogue,
                named ``0``, ``1``, ...
            task_pattern: Key selecting this task's prompt list inside the
                role-setting prompt JSON referenced by the config.
        """
        self.config = visualconfig
        self.task_pattern = task_pattern
        self.hid_datas_path = hid_datas_path
        self.dialogues_num = self.count_dirs(self.hid_datas_path)
        print("dialogues_num:", self.dialogues_num)
        self.task_prompts = None
        self.dialogues = []
        self.tag_list = []
        # Stays empty unless fetch_all_datas() is called explicitly; hidden
        # states are otherwise read from disk on demand.
        self.datas = []
        self.fetch_all_dialogues()
        self.init_tags()
        self.init_task_prompts()

    def init_task_prompts(self):
        """Load the prompt list registered for ``self.task_pattern``."""
        with open(self.config.role_setting_prompt_json, 'r', encoding='utf-8') as f:
            role_setting_prompts = json.load(f)
        self.task_prompts = role_setting_prompts[self.task_pattern]

    def get_task_prompt_index(self, index):
        """Return the position in ``task_prompts`` of dialogue *index*'s first message.

        Raises:
            ValueError: If that message content is not in ``task_prompts``.
        """
        tmp_task_prompt = self.dialogues[index]['dialogues'][0]["content"]
        return self.task_prompts.index(tmp_task_prompt)

    def get_all_task_prompt_index(self):
        """Return ``get_task_prompt_index(i)`` for every dialogue, in order."""
        return [self.get_task_prompt_index(i) for i in range(self.dialogues_num)]

    def fetch_all_dialogues(self):
        """Read every ``dialogue.txt`` and append it to ``self.dialogues``."""
        self.dialogues.extend(
            self.load_dialogue_by_index(i) for i in range(self.dialogues_num)
        )

    def fetch_all_datas(self):
        """Read the hidden states of every dialogue into ``self.datas``."""
        self.datas.extend(
            self.load_data_by_index(i) for i in range(self.dialogues_num)
        )

    def init_tags(self):
        """Collect the ``'tag'`` of every loaded dialogue into ``self.tag_list``."""
        for dialogue in self.dialogues:
            self.tag_list.append(dialogue['tag'])

    def count_dirs(self, path):
        """Return the number of entries (files and directories) in *path*.

        NOTE(review): every entry is counted, so a stray file in the data root
        inflates ``dialogues_num``.
        """
        return len(os.listdir(path))

    def load_dialogue_by_index(self, index):
        """Read and return the parsed ``dialogue.txt`` of dialogue *index*.

        Prints a message and returns ``None`` when *index* is not below
        ``dialogues_num``.
        """
        if index >= self.dialogues_num:
            print("index out of range")
            return None
        hid_data_path = os.path.join(self.hid_datas_path, str(index), 'dialogue.txt')
        with open(hid_data_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def fetch_by_index(self, index):
        """Return the already-loaded dialogue record at *index*."""
        return self.dialogues[index]

    def fetch_tag_by_index(self, index):
        """Return the ``'tag'`` of dialogue *index*, or ``None`` if out of range."""
        if index >= self.dialogues_num:
            print("index out of range")
            return None
        return self.dialogues[index]['tag']

    def fetch_dialogue_by_index(self, index):
        """Return the ``'dialogues'`` entry of dialogue *index*, or ``None`` if out of range."""
        if index >= self.dialogues_num:
            print("index out of range")
            return None
        return self.dialogues[index]['dialogues']

    def fetch_index_list_by_tag(self, tag):
        """Return the indices of all dialogues whose ``'tag'`` equals *tag*."""
        return [
            index for index in range(self.dialogues_num)
            if self.dialogues[index]['tag'] == tag
        ]

    @staticmethod
    def _read_npz(data_path):
        """Stack every array stored in one ``.npz`` file along a new first axis."""
        # The context manager closes the archive even if stacking fails.
        with np.load(data_path) as data_f:
            return np.array([data_f[name] for name in data_f.files])

    def load_data_by_index(self, index):
        """Return one stacked array per ``.npz`` file of dialogue *index*.

        Files are visited in sorted filename order.
        """
        data_dir_path = os.path.join(self.hid_datas_path, str(index))
        return [
            self._read_npz(os.path.join(data_dir_path, file))
            for file in sorted(os.listdir(data_dir_path))
            if file.endswith(".npz")
        ]

    def load_special_data(self, path_index, round, speaker, file_index, layer = None):
        """Load the file picked by :meth:`get_special_filename`.

        Args:
            path_index: Dialogue index (sub-directory name).
            round: Index into that speaker's per-round file groups (shadows the
                builtin, kept for interface compatibility).
            speaker: ``"user"`` or ``"assistant"``.
            file_index: Position of the file inside the chosen round.
            layer: When given, only that entry of the first axis is kept, and
                the result still has a leading axis of length 1.

        Returns:
            The stacked arrays of the file, optionally restricted to *layer*.
        """
        filename = self.get_special_filename(path_index, round, speaker, file_index)
        data_dir_path = os.path.join(self.hid_datas_path, str(path_index))
        datas = self._read_npz(os.path.join(data_dir_path, filename))
        if layer is not None:
            return np.array([datas[layer]])
        return datas

    def get_special_filename(self, path_index, round, speaker, file_index):
        """Return the ``.npz`` filename for a speaker, round and file position.

        Files are classified by size: any file larger than the second file (in
        sorted order) starts a new user round; every other file is appended to
        the assistant group of the current round. Presumably the larger files
        are multi-token prompt passes and the smaller ones single generated
        tokens — verify against the code that writes the dumps.

        Returns ``0`` for an unknown *speaker* (kept for compatibility).

        Raises:
            IndexError: If the directory has fewer than two ``.npz`` files, if
                the first file is not larger than the second, or if *round* /
                *file_index* is out of range.
        """
        data_dir_path = os.path.join(self.hid_datas_path, str(path_index))
        filename_list = sorted(
            name for name in os.listdir(data_dir_path) if name.endswith(".npz")
        )
        user_filename_list = []
        assistant_filename_list = []
        tmp_round = 0
        base_size = os.path.getsize(os.path.join(data_dir_path, filename_list[1]))
        for file in filename_list:
            file_size = os.path.getsize(os.path.join(data_dir_path, file))
            if file_size > base_size:
                user_filename_list.append([file])
                tmp_round = tmp_round + 1
            elif tmp_round > len(assistant_filename_list):
                # First assistant file of the current round.
                assistant_filename_list.append([file])
            else:
                assistant_filename_list[tmp_round-1].append(file)
        if speaker == "user":
            return user_filename_list[round][file_index]
        elif speaker == "assistant":
            return assistant_filename_list[round][file_index]
        else:
            return 0

    def get_use_text_index_by_index(self, index):
        """Return positions of the files of dialogue *index* whose second axis is longer than 1."""
        return [
            position
            for position, element in enumerate(self.load_data_by_index(index))
            if element.shape[1] > 1
        ]

    def load_one_state_by_special_position(self, index, round, speaker, token_index_range, layer = None):
        """Load hidden states of one dialogue at a given token position or range.

        Args:
            index: Dialogue index.
            round: Round index for the chosen speaker (negative values count
                from the end).
            speaker: ``"user"`` or ``"assistant"``.
            token_index_range: An int selects a single token; a 2-sequence
                ``(start, stop)`` selects a range. For ``"user"`` the range is
                a slice of the round's first file and ``stop == 0`` means
                "to the end"; for ``"assistant"`` the values are file positions
                within the round.
            layer: Single layer to keep, or ``None`` for all ``NUM_LAYERS``.

        Returns:
            ``(index, data, label)``. ``data`` has shape ``(layers, hidden)``
            for a single token and ``(tokens, layers, hidden)`` for a range;
            ``label`` is ``str()`` of the dialogue's tag.

        Raises:
            ValueError: If *speaker* is neither ``"user"`` nor ``"assistant"``.
        """
        if speaker not in ("user", "assistant"):
            raise ValueError(f"unknown speaker: {speaker!r}")
        layer_num = self.NUM_LAYERS if layer is None else 1
        hidden_size = self.HIDDEN_SIZE
        single_token = isinstance(token_index_range, (int, np.integer))

        if speaker == "user":
            # File 0 of the round holds all tokens of the user prompt.
            prompt_states = self.load_special_data(index, round, speaker, 0, layer)
            if single_token:
                tmp_data = prompt_states[:, token_index_range, :, :]
                tmp_data = tmp_data.reshape(layer_num, hidden_size)
            else:
                start = token_index_range[0]
                stop = None if token_index_range[1] == 0 else token_index_range[1]
                tmp_data = prompt_states[:, start:stop, :, :]
                tmp_data = tmp_data.reshape(layer_num, token_index_range[1]-token_index_range[0], hidden_size)
                # (layers, tokens, hidden) -> (tokens, layers, hidden)
                tmp_data = np.transpose(tmp_data, (1, 0, 2))
        else:
            if single_token:
                tmp_data = self.load_special_data(index, round, speaker, token_index_range, layer)
                tmp_data = tmp_data.reshape(layer_num, hidden_size)
            else:
                # NOTE(review): the step is -1 whenever the start is non-zero,
                # so a range such as (3, 7) yields only the first file. Kept
                # as-is; confirm the intended semantics.
                step = -1 if token_index_range[0] else 1
                positions = [token_index_range[0]]
                positions.extend(range(token_index_range[0]+1, token_index_range[1], step))
                # Collect first and concatenate once instead of growing the
                # array inside the loop.
                chunks = [
                    self.load_special_data(index, round, speaker, i, layer).reshape(1, layer_num, hidden_size)
                    for i in positions
                ]
                tmp_data = np.concatenate(chunks, axis=0)
        tmp_label = str(self.fetch_tag_by_index(index))
        return index, tmp_data, tmp_label

    def load_all_state_by_special_position(self, round, speaker, token_index_range, layer = None, max_workers = 64):
        """Run :meth:`load_one_state_by_special_position` for every dialogue.

        Loading is spread over a thread pool. Results are stored by dialogue
        index, so the output order does not depend on completion order.
        Progress is printed every 10 completed dialogues.

        Args:
            round, speaker, token_index_range, layer: Forwarded unchanged.
            max_workers: Size of the thread pool.

        Returns:
            ``(datas, labels)`` as NumPy arrays, indexed by dialogue.
        """
        total_dialogues = len(self.dialogues)
        datas = [None] * total_dialogues
        labels = [None] * total_dialogues
        count = 0
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(self.load_one_state_by_special_position, i, round, speaker, token_index_range, layer)
                for i in range(total_dialogues)
            ]
            for future in as_completed(futures):
                i, tmp_data, tmp_label = future.result()
                datas[i] = tmp_data
                labels[i] = tmp_label
                count += 1
                if count % 10 == 0:
                    percent_complete = (count / total_dialogues) * 100
                    clear_output(wait=True)
                    print(f"Progress: {percent_complete:.2f}% Complete ({count}/{total_dialogues})")

        return np.array(datas), np.array(labels)
    




#### 加载数据

In [21]:
# One fetcher per task. The second argument is the key used to pick that
# task's prompt list from the role-setting prompt JSON.
trans_data_fetcher = HidDataFetch("./datas/translator", "translator")
count_data_fetcher = HidDataFetch("./datas/counter", "counter")

dialogues_num: 1034
dialogues_num: 1000


In [22]:
# For every dialogue: last user round (round = -1), token index -2 of that
# round's first file, all layers (layer left at its default of None).
trans_datas,trans_labels = trans_data_fetcher.load_all_state_by_special_position(-1, 'user', -2)
count_datas,count_labels= count_data_fetcher.load_all_state_by_special_position(-1, 'user', -2)

Progress: 100.00% Complete (1000/1000)


In [23]:
# Per dialogue: position of its first message's content in the task prompt list.
trans_task_kind_list = trans_data_fetcher.get_all_task_prompt_index()
count_task_kind_list = count_data_fetcher.get_all_task_prompt_index()

#### 获取目标数据

在这里，我们首先尝试获取了所有任务中，用户将最后一次任务完整输入后，最后一层transformer隐藏层的输出。

In [24]:
# Keep only the output of the last hidden layer (index 28) of each sample.
trans_datas_layer29 = np.stack([per_layer[28] for per_layer in trans_datas])
count_datas_layer29 = np.stack([per_layer[28] for per_layer in count_datas])

In [25]:
# Convert the string tags to ints. Tags other than '0' / '1' (not recognised)
# default to class 0, i.e. the task counted as completed correctly.
# Counter labels are shifted by 2 and counter prompt kinds by 10 so the two
# tasks do not share ids.
trans_labels_int = np.array([int(tag) if tag in ('0', '1') else 0 for tag in trans_labels])
count_labels_int = np.array([int(tag) + 2 if tag in ('0', '1') else 2 for tag in count_labels])
trans_task_kind_list_int = np.array([int(kind) for kind in trans_task_kind_list])
count_task_kind_list_int = np.array([int(kind) + 10 for kind in count_task_kind_list])

In [26]:
# Stack the translator samples first, then the counter samples, keeping
# features, labels and prompt kinds aligned row by row.
datas_layer29 = np.concatenate([trans_datas_layer29, count_datas_layer29])
labels_int = np.concatenate([trans_labels_int, count_labels_int])
task_kind_list = np.concatenate([trans_task_kind_list_int, count_task_kind_list_int])
# Running row number, shown in the hover box to identify a point.
data_id = np.array(range(datas_layer29.shape[0]))

#### 准备数据三维可视化

In [28]:
import plotly.express as px
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

def data_draw_3d(dime_re_tool, datas, labels, color_map = None, hover_data = None, title = 't-SNE 3D Visualization'):
    """Project *datas* to 3-D and show an interactive scatter plot in the browser.

    Args:
        dime_re_tool: Any reducer exposing ``fit_transform`` that returns three
            columns (the notebook passes TSNE, PCA and MDS instances).
        datas: Samples to reduce, one row per sample.
        labels: One label per sample; converted to strings so each distinct
            value gets its own discrete colour.
        color_map: Optional mapping from label string to colour, forwarded as
            ``color_discrete_map``.
        hover_data: Optional mapping of column name to per-sample values that
            are added to the hover box.
        title: Figure title. The default keeps the original fixed title; pass
            another one when the reducer is not t-SNE.
    """
    # A None default avoids sharing one dict object across calls.
    hover_data = {} if hover_data is None else hover_data
    reduced = dime_re_tool.fit_transform(datas)
    df = pd.DataFrame(reduced, columns=['x', 'y', 'z'])
    # np.asarray lets plain lists be passed as labels too.
    df['fig_label'] = np.asarray(labels).astype(str)
    for key, values in hover_data.items():
        print(key)
        df[key] = values
    fig = px.scatter_3d(df, x='x', y='y', z='z',
                        color='fig_label',
                        # The key must name the real column for the display
                        # label to apply.
                        labels={'fig_label': 'Label'},
                        color_discrete_map=color_map,
                        hover_data=['fig_label'] + list(hover_data),
                        title=title)
    fig.update_traces(marker=dict(size=3))
    fig.show(renderer="browser")

In [34]:
# t-SNE to three components with a fixed seed, coloured by the combined
# task/label class; prompt kind and row id appear in the hover box.
tsne = TSNE(n_components=3, perplexity=30, random_state=42)
data_draw_3d(tsne, datas_layer29, labels_int, color_map = None, hover_data = {"kind": task_kind_list, "data_id":data_id})

kind
data_id


#### 应用t-SNE降维到三维

In [29]:
# Same t-SNE call as cell In [34]: three components, fixed seed, hover box
# showing prompt kind and row id.
tsne = TSNE(n_components=3, perplexity=30, random_state=42)
data_draw_3d(tsne, datas_layer29, labels_int, color_map = None, hover_data = {"kind": task_kind_list, "data_id":data_id})

kind
data_id


#### 应用主成分分析（PCA）降维到三维

In [30]:
from sklearn.decomposition import PCA

# PCA to three components. No title is passed, so the figure keeps
# data_draw_3d's 't-SNE 3D Visualization' title.
pca = PCA(n_components=3, random_state=42)
data_draw_3d(pca, datas_layer29, labels_int, color_map = None, hover_data = {"kind": task_kind_list, "data_id":data_id})


kind
data_id


#### 应用多维缩放（MDS）降维到三维

In [36]:
from sklearn.manifold import MDS

# Reduce to three dimensions with MDS. No title is passed, so the figure
# keeps data_draw_3d's 't-SNE 3D Visualization' title.
mds = MDS(n_components=3, random_state=42)
data_draw_3d(mds, datas_layer29, labels_int, color_map = None, hover_data = {"kind": task_kind_list, "data_id":data_id})





kind
data_id


## 接下来尝试动态可视化

#### 首先还是数据准备
显然，多数电脑在进行隐藏层序列可视化时内存有限，只能加载个别层的数据（下面仅取最后一层，即 layer = -1）。

In [110]:
# Last user round, token slice [-15:] of the prompt (a stop of 0 means "to the
# end"), and only layer -1 to keep memory use down.
trans_datas_ml,trans_labels_ml = trans_data_fetcher.load_all_state_by_special_position(-1, 'user', (-15, 0), -1)

Progress: 99.61% Complete (1030/1034)


In [111]:
# Collapse the length-1 layer axis: (dialogues, tokens, 1, hidden) ->
# (dialogues, tokens, hidden). Sizes come from the array itself, so this is
# not tied to 1034 dialogues or a 4096-wide hidden state.
trans_datas_ml = trans_datas_ml.reshape(trans_datas_ml.shape[0], trans_datas_ml.shape[1], -1)
# Put the token (time-step) axis first: (tokens, dialogues, hidden).
trans_datas_ml = trans_datas_ml.transpose(1,0,2)
# Tags other than '0' / '1' default to class 0.
trans_labels_ml_int = np.array([int(y) if y in ('0', '1') else 0 for y in trans_labels_ml])
trans_datas_prompt_kind_list = np.array(trans_data_fetcher.get_all_task_prompt_index())

In [112]:
import plotly.graph_objs as go
import plotly.express as px
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from plotly.subplots import make_subplots
from sklearn.decomposition import PCA
def data_draw_3d_by_tokens(dime_re_tool, datas_array, labels_int_list, color_map = None, hover_data = None):
    """Show a 3-D scatter plot with a slider over the first axis of *datas_array*.

    All time steps are reduced together with one ``fit_transform`` call so
    that they share a coordinate system. One trace is drawn per
    (time step, class) pair and the slider shows the traces of one time step.

    Args:
        dime_re_tool: Reducer exposing ``fit_transform`` that returns three
            columns.
        datas_array: Array of shape ``(time_steps, samples, features)``.
        labels_int_list: One class label per sample.
        color_map: Optional mapping from class label (or its string form) to
            a colour. Classes without an entry get a palette colour that is
            the same in every time step.
        hover_data: Optional mapping of name to per-sample values shown in the
            hover box.
    """
    # A None default avoids sharing one dict object across calls.
    hover_data = {} if hover_data is None else hover_data
    color_map = {} if color_map is None else color_map

    fig = make_subplots(rows=1, cols=1, specs=[[{'type': 'scatter3d'}]])
    ori_shape = datas_array.shape
    # Flatten with the array's own feature width instead of a fixed 4096.
    reduced_flat = dime_re_tool.fit_transform(datas_array.reshape(-1, ori_shape[-1]))
    reduced = reduced_flat.reshape(ori_shape[0], ori_shape[1], 3)
    n_steps = reduced.shape[0]

    # Masks, hover texts and colours do not change between time steps, so
    # they are built once.
    labels_arr = np.asarray(labels_int_list)
    kinds = list(np.unique(labels_arr))
    n_kinds = len(kinds)
    sample_ids = np.arange(reduced.shape[1])
    palette = px.colors.qualitative.Plotly
    per_kind = []
    for position, kind in enumerate(kinds):
        mask = labels_arr == kind
        hovertexts = [
            '<br>'.join([f'{key}: {hover_data[key][idx]}' for key in hover_data])
            for idx in sample_ids[mask]
        ]
        if kind in color_map:
            color = color_map[kind]
        elif str(kind) in color_map:
            color = color_map[str(kind)]
        else:
            # Pin a colour per class; without it each trace would take the
            # next default colour and a class would change colour per step.
            color = palette[position % len(palette)]
        per_kind.append((kind, mask, hovertexts, color))

    for time_step in range(n_steps):
        for kind, mask, hovertexts, color in per_kind:
            fig.add_trace(
                go.Scatter3d(
                    x=reduced[time_step, mask, 0],
                    y=reduced[time_step, mask, 1],
                    z=reduced[time_step, mask, 2],
                    mode='markers',
                    marker=dict(size=3, color=color),
                    hovertext=hovertexts,
                    name=f'Class {kind}',
                    visible=(time_step == 0)  # Only the first time step is visible initially
                )
            )

    # One slider step per time step; each shows that step's n_kinds traces.
    steps = []
    for i in range(n_steps):
        visible = [False] * (n_steps * n_kinds)
        visible[i * n_kinds:(i + 1) * n_kinds] = [True] * n_kinds
        steps.append(dict(
            method='update',
            args=[{'visible': visible}],
            label=f'Time step {i+1}'
        ))

    sliders = [dict(
        active=0,
        currentvalue={"prefix": "Time step: "},
        pad={"t": 50},
        steps=steps
    )]
    fig.update_layout(sliders=sliders)

    def _axis(axis_title):
        # All three axes share the same visible grid/background styling.
        return dict(
            title=axis_title,
            showbackground=True,
            backgroundcolor="rgb(230, 230,230)",
            gridcolor="white",
            showline=True,
            zeroline=True,
            showgrid=True,
            showticklabels=True
        )

    fig.update_layout(scene=dict(
        xaxis=_axis('X Axis'),
        yaxis=_axis('Y Axis'),
        zaxis=_axis('Z Axis'),
        aspectmode='cube'  # This can be "data", "cube", "auto", "manual"
    ))
    fig.show(renderer="browser")


    

In [113]:
from sklearn.decomposition import PCA

# Reducers for the per-token plot. Only `pca` is passed to
# data_draw_3d_by_tokens in the next cell; `tsne` is created but not used there.
pca = PCA(n_components=3, random_state=42)
tsne = TSNE(n_components=3, perplexity=30, random_state=42)
# tmpdata = tsne.fit_transform(trans_datas_ml.reshape(-1,4096))

In [114]:

# Hover ids are derived from the number of samples instead of a fixed 1034,
# so the cell keeps working when the dataset size changes.
data_draw_3d_by_tokens(pca, trans_datas_ml, trans_labels_ml_int, color_map = None, hover_data = {"id":np.arange(len(trans_labels_ml_int)),"prompt_kind":trans_datas_prompt_kind_list})