In [1]:
import pandas as pd
import logging
import os

# Create (or fetch) the notebook-wide logger.
logger = logging.getLogger('visual_logger')
logger.setLevel(logging.INFO)

# Console handler: emits records to the terminal / cell output.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)

# Record layout: timestamp - logger name - level - message.
log_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(log_formatter)

# logging.getLogger returns the same logger object on every call, so re-running
# this cell used to stack another handler and print every record twice.
# Attach the handler only when the logger has none yet.
if not logger.handlers:
    logger.addHandler(console_handler)
# Relative path to the IMDB reviews CSV (file name: IMDB_Dataset.csv).
# Forward slashes are accepted on Windows as well as POSIX, unlike the
# backslash-separated literal used before.
file_path = 'datas/IMDB_Dataset.csv'

# Load the CSV into a DataFrame with pandas.
df = pd.read_csv(file_path)

In [3]:
# Display the loaded DataFrame.
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
# Count how many reviews carry each sentiment label.
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [4]:
# Directory that TaskDataGenerator lists for hidden-state files and moves them out of.
# NOTE(review): absolute, machine-specific Windows path — presumably where the local
# ChatGLM3 model writes its hidden states; confirm and make configurable.
ori_hidden_state_path = "E:\\github_project\\ChatGLM3\\THUDM\\hids"
# Directory the files are moved into, named after the review index.
target_hidden_state_path = "./datas/IMDB_hids_no_extra_prompt"

In [19]:
import sys
from IPython.display import display
from Taotie.agent_use import ChatAgent
# TaskDataGenerator可以向本地chatglm3-6b批量发送任务请求，并将任务推理过程中的相关数据（对话内容与隐藏层状态参数）保存整理
class TaskDataGenerator():
    """Send IMDB reviews to a local chat model and collect the hidden-state files it leaves behind.

    For every review index the review text is posted through a ``ChatAgent``;
    whatever files are then found in ``ori_hidden_state_path`` are moved into
    ``target_hidden_state_path`` under the review's index.

    Relies on notebook globals: ``df`` (needs a ``review`` column), ``logger``,
    ``ori_hidden_state_path`` and ``target_hidden_state_path``.
    """

    def __init__(self, task_pattern, local_model = "chatglm3-6b"):
        """Create the agent and make sure the target directory exists.

        Args:
            task_pattern: Stored on the instance; no method of this class reads it.
            local_model: Model name handed to ``ChatAgent``.
        """
        self.task_pattern = task_pattern
        self.model_name = local_model
        self.task_prompts = None
        self.text_prompts = None
        self.ori_hidden_state_path = ori_hidden_state_path
        self.target_hidden_state_path = target_hidden_state_path
        self.processing_text_index = 0
        self.agent = ChatAgent(self.model_name)
        # Fixed: the f-prefix was missing, so the literal text
        # "{self.model_name}" ended up in the log instead of the model name.
        logger.info(f'using local model: {self.model_name}')
        # makedirs creates missing parent directories too and tolerates an
        # already existing directory (os.mkdir does neither).
        os.makedirs(self.target_hidden_state_path, exist_ok=True)

    def start_fetch(self, start_index, end_index):
        """Process reviews ``start_index`` (inclusive) to ``end_index`` (exclusive), printing progress."""
        total = end_index - start_index
        for i in range(start_index, end_index):
            self.processing_text_index = i
            self.fetch_one_date()

            # Reviews finished so far. Counter and percentage come from the same
            # value so they agree; previously the counter lagged by one and the
            # last line read "total-1/total" next to 100%.
            done = i - start_index + 1
            progress_percentage = (done / total) * 100
            # "\r" rewrites the same output line; flush so it appears immediately.
            print(f"\rProgress: {done}/{total} \t {progress_percentage:.2f}%", end="", flush=True)

    def fetch_one_date(self):
        """Post the current review to the model and file away the hidden states it produced."""
        # Files already present in the source directory would be moved as if
        # they belonged to this review, so make that situation visible.
        if not self.check_ori_hidden_state_path_empty():
            logger.warning(
                f'{self.ori_hidden_state_path} is not empty before processing '
                f'index {self.processing_text_index}; existing files will be moved with it'
            )
        # Use the review text at the current position as the prompt.
        self.agent.prompt_add(df.iloc[self.processing_text_index]['review'])
        # NOTE(review): maxtokens=1 presumably because only the hidden states
        # are wanted, not the generated reply — confirm.
        self.agent.prompt_post(maxtokens=1)
        # Reset the conversation so the next review starts without history.
        self.agent.messages = []
        self.hidden_state_move()

    def check_ori_hidden_state_path_empty(self):
        """Return True when ``self.ori_hidden_state_path`` contains no entries."""
        return not os.listdir(self.ori_hidden_state_path)

    def hidden_state_move(self):
        """Move the files in the source directory to ``<target>/<current index>``.

        NOTE(review): every file is renamed to the same destination, so this
        only behaves well when exactly one file is present per review.
        """
        destination = os.path.join(self.target_hidden_state_path, str(self.processing_text_index))
        for file in os.listdir(self.ori_hidden_state_path):
            os.rename(os.path.join(self.ori_hidden_state_path, file), destination)
    

In [20]:
# Build the generator; its constructor creates the ChatAgent for the local model.
# NOTE(review): task_pattern is only stored on the instance; no method of the class reads it.
test_TaskDataGenerator = TaskDataGenerator(task_pattern="datas/translator")
# (translated) Run the translation task on the first 1000 lines of en_ch_ori_data_processed.txt
# NOTE(review): the comment above does not match this notebook, which processes IMDB
# reviews — presumably a leftover from another notebook. The disabled call below
# covers review indices 33875 up to (not including) 50000.
# test_TaskDataGenerator.start_fetch(33875, 50000)

2023-12-05 20:38:36,147 - my_logger - INFO - api_base:openai	llm version: chatglm3-6b
2023-12-05 20:38:36,151 - visual_logger - INFO - using local model: {self.model_name}


Progress: 16124/16125 	 100.00%

In [33]:
import numpy as np
# Load the hidden-state file stored for review index 0 as a sanity check.
tmp = np.load(f"{target_hidden_state_path}/0")

In [37]:
# Total number of values across all arrays stored in the loaded file.
np.array(list(tmp.values())).size

4096

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from IPython.display import clear_output

# Forward slashes work on Windows and POSIX alike (the previous literal used
# a backslash separator).
file_path = 'datas/IMDB_Dataset.csv'

# Load the CSV into a DataFrame with pandas.
df = pd.read_csv(file_path)

# Split the DataFrame into training and test sets (80% training, 20% test),
# stratified by sentiment; random_state is fixed so the split is reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['sentiment'], random_state=42)

# Function to load a single feature vector from an npz archive
def load_feature(id, folder_path, index):
    """Load every array stored in ``folder_path/<id>`` and stack them into one array.

    Args:
        id: File name inside ``folder_path``; converted with ``str`` so integer
            ids work as well as strings.
        folder_path: Directory containing the archive files.
        index: Value returned unchanged so the caller can place the result.

    Returns:
        Tuple ``(result, index)`` where ``result`` is ``np.array`` built from
        the stored arrays in archive order.
    """
    file_path = os.path.join(folder_path, str(id))
    # The context manager closes the archive even when reading an entry raises.
    with np.load(file_path) as data_f:
        result = np.array([data_f[name] for name in data_f.files])
    return result, index
# Function to load features in parallel and show a progress readout
def load_features_parallel(ids, folder_path, max_workers=128, report_every=50):
    """Load features concurrently using ThreadPoolExecutor.

    Args:
        ids: Sized iterable of file ids; each is converted with ``str`` and
            passed to ``load_feature``.
        folder_path: Directory containing the feature files.
        max_workers: Thread-pool size (default 128, the previous fixed value).
        report_every: Print progress after every this many completed files
            (default 50, the previous fixed value).

    Returns:
        ``np.array`` of the loaded features, in the same order as ``ids``.
    """
    id_num = len(ids)
    # Pre-sized so results can be stored by position as they complete out of order.
    features = [None] * id_num
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Each task carries its position so the result can be placed correctly.
        futures = [
            executor.submit(load_feature, str(file_id), folder_path, index)
            for index, file_id in enumerate(ids)
        ]
        for count, future in enumerate(as_completed(futures), start=1):
            result, index = future.result()
            features[index] = result
            # Also report on the last file so the readout always ends at 100%.
            if count % report_every == 0 or count == id_num:
                percent_complete = (count / id_num) * 100
                clear_output(wait=True)
                print(f"Progress: {percent_complete:.2f}% Complete ({count}/{id_num})")

    return np.array(features)

# Folder path containing the feature files
folder_path = "datas\\IMDB_hids_no_extra_prompt"

# The per-file parallel loading below is disabled (commented out).
# # Loading features for the training and test sets
# X_train = load_features_parallel(train_df.index, folder_path)
# X_test = load_features_parallel(test_df.index, folder_path)

# # Labels for training and test sets
# y_train = train_df['sentiment'].values
# y_test = test_df['sentiment'].values


In [2]:
%%writefile test.py
import numpy as np
def process_file(file_id, num_arrays=4096):
    """Merge the arrays of one hidden-state archive and save them as a single .npy file.

    Reads ``datas/IMDB_hids_no_extra_prompt/<file_id>``, stacks the entries
    ``arr_0`` ... ``arr_<num_arrays - 1>`` into one array, writes it to
    ``datas/npdatas/<file_id>.npy`` and returns it.

    Args:
        file_id: Name of the archive inside the hidden-state folder.
        num_arrays: Number of ``arr_<i>`` entries to merge (default 4096, the
            previous fixed value).

    Returns:
        The merged ``np.ndarray``.

    Raises:
        KeyError: if the archive has fewer than ``num_arrays`` entries.
    """
    import os  # local import: the generated file only imports numpy at the top

    # os.path.join keeps the paths valid on both Windows and POSIX.
    source_path = os.path.join('datas', 'IMDB_hids_no_extra_prompt', str(file_id))
    output_dir = os.path.join('datas', 'npdatas')

    # The context manager closes the archive even when an entry is missing.
    with np.load(source_path) as data:
        merged_array = np.array([data[f'arr_{i}'] for i in range(num_arrays)])

    # np.save does not create directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    np.save(os.path.join(output_dir, f'{file_id}.npy'), merged_array)
    return merged_array




Overwriting test.py


In [2]:
%%writefile loadone.py
import numpy as np
import os
def load_feature(id_and_folder_path):
    """Load every array stored in one archive and stack them into one array.

    Takes a single tuple argument so it can be passed directly to
    ``Pool.imap_unordered``.

    Args:
        id_and_folder_path: Tuple ``(id, folder_path)``; the file read is
            ``folder_path/<str(id)>``.

    Returns:
        Tuple ``(result, id)`` where ``result`` is ``np.array`` built from the
        stored arrays in archive order; ``id`` is returned unchanged so
        unordered results can be put back in place.
    """
    id, folder_path = id_and_folder_path
    file_path = os.path.join(folder_path, str(id))
    # The context manager closes the archive even when reading an entry raises.
    with np.load(file_path) as data_f:
        result = np.array([data_f[name] for name in data_f.files])
    return result, id

Overwriting loadone.py


In [4]:
import loadone
from multiprocessing.pool import Pool
from tqdm import tqdm  # 导入 tqdm
import numpy as np

# The backslash must be escaped: "\I" is an invalid escape sequence that newer
# Python versions warn about. The resulting path string is unchanged.
folder_path = "datas\\IMDB_hids_no_extra_prompt"
task_num = 50000
# Pre-sized so results can be stored by id as they arrive out of order.
features = [None] * task_num
arges = [(i, folder_path) for i in range(task_num)]

# The context manager terminates the workers if the loop raises, so a failed
# load does not leave 30 processes behind.
with Pool(30) as pool:
    # imap_unordered yields results as workers finish; tqdm wraps the iterator
    # to show a progress bar.
    for result, id in tqdm(pool.imap_unordered(loadone.load_feature, arges), total=task_num):
        features[id] = result

    pool.close()  # stop accepting new tasks
    pool.join()   # block until the worker processes have exited

np.save('datas\\50000.npy', np.array(features))


100%|██████████| 50000/50000 [1:11:17<00:00, 11.69it/s]


In [2]:
import numpy as np
# Reload the stacked feature matrix written by the multiprocessing cell.
features_arr = np.load('datas\\50000.npy')

In [3]:
# Check the shape of the loaded feature matrix.
features_arr.shape

(50000, 4096)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from IPython.display import clear_output

In [6]:


# Forward slashes work on Windows and POSIX alike (the previous literal used
# a backslash separator).
file_path = 'datas/IMDB_Dataset.csv'

# Load the CSV into a DataFrame with pandas.
df = pd.read_csv(file_path)

# Split the DataFrame into training and test sets (80% training, 20% test),
# stratified by sentiment; random_state is fixed so the split is reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['sentiment'], random_state=42)

# Folder path containing the feature files
folder_path = "datas\\IMDB_hids_no_extra_prompt"

# Select the feature rows for the training and test sets.
# NOTE(review): this assumes row i of features_arr belongs to the review at
# position i of df, so the default integer index doubles as the row selector.
X_train = features_arr[train_df.index]
X_test = features_arr[test_df.index]

# Labels for training and test sets
y_train = train_df['sentiment'].values
y_test = test_df['sentiment'].values


In [17]:
# Encode the string labels as integers: 1 for 'positive', 0 for anything else.
y_train_binary = np.where(y_train == 'positive', 1, 0)
y_test_binary = np.where(y_test == 'positive', 1, 0)

In [11]:
# Check the shape of the training feature matrix.
X_train.shape

(40000, 4096)

#### 支持向量机

In [12]:
# SVM classifier with scikit-learn's default settings
svm_classifier = SVC()

# Training the SVM classifier on the string labels in y_train
# NOTE(review): no prediction or evaluation of this classifier appears in the visible cells.
svm_classifier.fit(X_train, y_train)

#### 随机森林

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create a Random Forest Classifier
# random_state is fixed for reproducibility; verbose=2 prints one line per tree while fitting.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, verbose=2)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Print classification report
print(classification_report(y_test, y_pred))


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100


[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:  4.6min


building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82

[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.0s


Accuracy: 0.912
              precision    recall  f1-score   support

    negative       0.92      0.90      0.91      5000
    positive       0.91      0.92      0.91      5000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000



#### 简单DNN

In [45]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


In [49]:
class Net_DNN(nn.Module):
    """Fully connected classifier: 4096 -> 1024 -> 256 -> 64 -> 2 logits.

    Every hidden layer is followed by ReLU and dropout (p=0.5); the last
    layer returns raw logits for the two classes.
    """

    def __init__(self):
        super().__init__()
        # Layer widths from input to output; consecutive pairs become
        # fc1 .. fc4, created in that order.
        widths = (4096, 1024, 256, 64, 2)
        for number, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f'fc{number}', nn.Linear(fan_in, fan_out))
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Map a ``(batch, 4096)`` input to ``(batch, 2)`` logits."""
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(torch.relu(hidden(x)))
        return self.fc4(x)


In [12]:
# Display the training labels (still the original strings).
y_train

array(['positive', 'negative', 'negative', ..., 'negative', 'negative',
       'negative'], dtype=object)

In [40]:
# Assumes X_train, X_test and the binary label arrays are already NumPy arrays.
# Features become float tensors, labels become long (integer class index) tensors.
train_dataset = TensorDataset(torch.FloatTensor(X_train), torch.LongTensor(y_train_binary))
test_dataset = TensorDataset(torch.FloatTensor(X_test), torch.LongTensor(y_test_binary))

# Training batches are shuffled each epoch; test batches keep their order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)


In [50]:
model = Net_DNN()  # requires the Net_DNN class defined in an earlier cell

In [51]:
import torch
import matplotlib.pyplot as plt

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# The model itself is created in an earlier cell.


# Move the model to the selected device.
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 50
# Per-epoch history used for the plot at the end of the cell.
train_losses = []
train_accuracies = []

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0

    for inputs, labels in train_loader:
        # Move the batch to the same device as the model.
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Predicted class = index of the largest logit.
        _, predicted = torch.max(outputs, 1)
        correct_preds += (predicted == labels).sum().item()
        total_preds += labels.size(0)

    # Loss is averaged over batches; accuracy over individual samples.
    epoch_loss = running_loss / len(train_loader)
    epoch_acc = correct_preds / total_preds
    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_acc)

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}, Accuracy: {epoch_acc}')

# NOTE(review): the x-axis uses range(num_epochs), so the plot only works when
# all epochs completed; the recorded output shows the run was interrupted
# (KeyboardInterrupt) during epoch 18, so this part did not execute there.
plt.plot(range(num_epochs), train_losses, label='Training Loss')
plt.plot(range(num_epochs), train_accuracies, label='Training Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Loss/Accuracy')
plt.legend()
plt.show()


Using device: cuda:0
Epoch 1/50, Loss: 0.2562997565627098, Accuracy: 0.8976
Epoch 2/50, Loss: 0.18520880760550498, Accuracy: 0.93695
Epoch 3/50, Loss: 0.17346595330238343, Accuracy: 0.941525
Epoch 4/50, Loss: 0.17202396770119668, Accuracy: 0.9415
Epoch 5/50, Loss: 0.16511010469794274, Accuracy: 0.944525
Epoch 6/50, Loss: 0.16351269933879375, Accuracy: 0.9446
Epoch 7/50, Loss: 0.16013710581362248, Accuracy: 0.9468
Epoch 8/50, Loss: 0.15789192883968353, Accuracy: 0.94505
Epoch 9/50, Loss: 0.15947627963721753, Accuracy: 0.94585
Epoch 10/50, Loss: 0.1522055113583803, Accuracy: 0.947825
Epoch 11/50, Loss: 0.15028146826922895, Accuracy: 0.947475
Epoch 12/50, Loss: 0.15590590167939664, Accuracy: 0.9464
Epoch 13/50, Loss: 0.14630998320877553, Accuracy: 0.9485
Epoch 14/50, Loss: 0.14313860405683518, Accuracy: 0.9502
Epoch 15/50, Loss: 0.14641722333729268, Accuracy: 0.949675
Epoch 16/50, Loss: 0.1405715566277504, Accuracy: 0.951025
Epoch 17/50, Loss: 0.138761558842659, Accuracy: 0.95125
Epoch 18

KeyboardInterrupt: 

In [52]:
# Switch to evaluation mode (disables dropout) and collect test predictions.
model.eval()
test_preds = []
test_labels = []

# No gradients are needed for evaluation.
with torch.no_grad():
    for inputs, labels in test_loader:
        # Move the batch to the same device as the model.
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)

        # Bring predictions and labels back to the CPU as NumPy arrays.
        test_preds.extend(predicted.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Compute the test accuracy.
accuracy = accuracy_score(test_labels, test_preds)
print(f'Test Accuracy: {accuracy}')


Test Accuracy: 0.9441


#### 简单CNN

In [68]:
import torch.nn as nn

class Net_CNN(nn.Module):
    """1-D CNN over a 4096-long feature vector, producing 2 logits.

    Two Conv1d + ReLU + MaxPool stages (1 -> 8 -> 64 channels, length halved
    by each pool) feed a three-layer fully connected head.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(8, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
        self.dropout = nn.Dropout(0.5)
        # Flattened size after the conv/pool stages:
        # 64 channels x (4096 / 2 / 2) positions = 65536.
        self.fc1 = nn.Linear(64 * (4096 // 4), 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 2)  # two output classes

    def forward(self, x):
        """Map a ``(batch, 4096)`` input to ``(batch, 2)`` logits."""
        # Add the channel dimension Conv1d expects: (batch, 1, length).
        x = x.unsqueeze(1)
        for conv in (self.conv1, self.conv2):
            x = self.pool(torch.relu(conv(x)))
        # Flatten channels and positions into one feature axis.
        batch_size = x.size(0)
        flat = self.dropout(x.view(batch_size, -1))
        hidden = self.dropout(torch.relu(self.fc1(flat)))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)


In [69]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Assumes X_train and y_train_binary hold the training features and labels.
# Convert them to PyTorch tensors.
train_dataset = TensorDataset(torch.FloatTensor(X_train), torch.LongTensor(y_train_binary))
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Build the model.
model = Net_CNN() # the Net_CNN class defined in the previous cell
model = model.to(device)  # move the model to the selected device

# Loss function and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model.
num_epochs = 50  # or any other number of epochs you choose
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)  # move the batch to the selected device

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Count correct predictions (class = index of the largest logit).
        _, predicted = torch.max(outputs, 1)
        correct_preds += (predicted == labels).sum().item()
        total_preds += labels.size(0)

    # Compute and print the average loss and accuracy for this epoch.
    epoch_loss = running_loss / len(train_loader)
    epoch_acc = correct_preds / total_preds
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}, Accuracy: {epoch_acc}')



Using device: cuda:0
Epoch 1/50, Loss: 0.3562357508420944, Accuracy: 0.88775
Epoch 2/50, Loss: 0.17190107627511025, Accuracy: 0.935225
Epoch 3/50, Loss: 0.15371439773440362, Accuracy: 0.94275
Epoch 4/50, Loss: 0.1383519442409277, Accuracy: 0.9487
Epoch 5/50, Loss: 0.13238102878928185, Accuracy: 0.94985
Epoch 6/50, Loss: 0.12078520303964616, Accuracy: 0.953375
Epoch 7/50, Loss: 0.10907381809055805, Accuracy: 0.95895
Epoch 8/50, Loss: 0.10292542172074318, Accuracy: 0.9612
Epoch 9/50, Loss: 0.0996064413294196, Accuracy: 0.96225
Epoch 10/50, Loss: 0.08987109827548266, Accuracy: 0.96585
Epoch 11/50, Loss: 0.08665137266889214, Accuracy: 0.966425
Epoch 12/50, Loss: 0.08196888773366809, Accuracy: 0.968375
Epoch 13/50, Loss: 0.0770203395985067, Accuracy: 0.96885
Epoch 14/50, Loss: 0.07474115806370973, Accuracy: 0.970425
Epoch 15/50, Loss: 0.07252492879331111, Accuracy: 0.971825
Epoch 16/50, Loss: 0.0689916726782918, Accuracy: 0.973475
Epoch 17/50, Loss: 0.0667158260319382, Accuracy: 0.97335
Epo

In [71]:
# Switch to evaluation mode (disables dropout) and collect test predictions.
# test_loader is not created in this cell; it comes from an earlier cell.
model.eval()
test_preds = []
test_labels = []

# No gradients are needed for evaluation.
with torch.no_grad():
    for inputs, labels in test_loader:
        # Move the batch to the same device as the model.
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)

        # Bring predictions and labels back to the CPU as NumPy arrays.
        test_preds.extend(predicted.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Compute the test accuracy.
accuracy = accuracy_score(test_labels, test_preds)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.944
