In [None]:
!pwd

In [None]:
!nvidia-smi

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
%cd /content

In [None]:
!which python

In [None]:
!pip -q install langchain huggingface_hub transformers sentence_transformers

In [None]:
!pip install torchkeras
!pip install torchmetrics

In [None]:
!pip install langchain
!pip install faiss-gpu

In [None]:
import os
import sys
import traceback

import torch
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

from transformers import AutoModel, AutoTokenizer

In [None]:
# Maximum number of results returned by a vector-store query.
# NOTE(review): the name looks like a typo for TOP_K; kept unchanged because
# other cells may reference it.
TOK_K = 5

VECTOR_STORE_PATH = "vector_store"
DOCUMENT_NAME = "/content/drive/MyDrive/demo/document.txt"

# Run on the GPU when one is visible and fall back to CPU otherwise, so the
# notebook still executes on a CPU-only runtime. The embedding model follows
# the same rule as the LLM instead of being pinned to "cuda".
LLM_DEVICE = 'cuda' if torch.cuda.is_available() else "cpu"
EMBEDDING_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EMBEDDING_MODEL_NAME = "nghuyong/ernie-3.0-base-zh"
LLM_MODEL = "THUDM/chatglm-6b-int8"



In [None]:
def setup_embedding():
  """Build the HuggingFace embedding wrapper.

  The model name and target device are read from the module-level
  EMBEDDING_MODEL_NAME and EMBEDDING_DEVICE constants.

  Returns:
    The constructed HuggingFaceEmbeddings instance.
  """
  device_kwargs = {"device": EMBEDDING_DEVICE}
  return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME,
                               model_kwargs=device_kwargs)

In [None]:
embedding = setup_embedding()

In [None]:
class DocumentTextSplitter(CharacterTextSplitter):
  """Text splitter that keeps each text as a single, unsplit chunk.

  Args:
    threshold: stored on the instance as ``self.threshold``. It is not
      consulted by ``split_text`` in this class.
    **kwargs: forwarded as keyword arguments to ``CharacterTextSplitter``.
  """

  def __init__(self, threshold: int, **kwargs):
    # Must be ``**kwargs``: a single ``*`` would unpack only the dict's keys
    # and hand them to the parent constructor as positional arguments.
    super().__init__(**kwargs)
    self.threshold = threshold

  def split_text(self, text: str):
    """Return ``text`` unchanged as a one-element list (no splitting)."""
    return [text]

In [None]:
# Load the document at DOCUMENT_NAME, letting the loader auto-detect its encoding.
loader = TextLoader(DOCUMENT_NAME, autodetect_encoding=True)
# 100 is the splitter's `threshold` argument; DocumentTextSplitter.split_text
# returns each text as a single chunk.
docs = loader.load_and_split(DocumentTextSplitter(100))

In [None]:
docs

In [None]:
vector_store = FAISS.from_documents(documents=docs, embedding=embedding)

In [None]:
vector_store

In [None]:
# setup_llm: load the ChatGLM weights and tokenizer named by LLM_MODEL.
device = torch.device(LLM_DEVICE)
model = AutoModel.from_pretrained(LLM_MODEL, trust_remote_code=True)
# fp16 is only appropriate on the GPU; for CPU inference the ChatGLM README
# recommends casting to float32 instead.
model = model.half() if device.type == "cuda" else model.float()
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, trust_remote_code=True)

In [None]:
!nvidia-smi

## 直接调用大模型

In [None]:
!pip install transformers==4.27.1

In [None]:
!pip install cpm_kernels

In [None]:
!pip install icetk

In [None]:
#下载模型放在chatglm-6b-int4文件夹内
!git clone -b int4 https://huggingface.co/THUDM/chatglm-6b.git chatglm-6b-int4

In [None]:
from transformers import AutoModel, AutoTokenizer

In [None]:
model_name = "chatglm-6b-int4"

In [None]:
model = AutoModel.from_pretrained(model_name, trust_remote_code=True).half().cuda()

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [None]:
# Few-shot classification prompt. "xxxxxx" is the placeholder that get_prompt()
# replaces with the sentence to classify. The final instruction line quotes the
# two allowed answers as '是' and '否' (balanced quotes).
prompt = """文本分类任务：判断（）中的话是否是“有没有更好的价格”的意思。
下面是一些范例：
请问是否有更好的价格 -> 是
价格能否可以再好一些 -> 是
3.2这个价格可以吗 -> 否
请问有比3.2更好的价格吗 -> 是
有没有好的价格出 -> 是
请问4.5这个价格好吗 -> 否
请对下述句子进行分类。返回'是'或者'否'，无需其它说明和解释，不用返回原话。
xxxxxx ->
"""
def get_prompt(text):
  """Return the few-shot prompt with the "xxxxxx" placeholder replaced by ``text``.

  Args:
    text: the sentence to classify.

  Returns:
    The full prompt string to send to the chat model.
  """
  return prompt.replace("xxxxxx", text)

In [None]:
# Classify one sample sentence with the few-shot prompt, starting from an
# empty chat history.
input_text = "价格能不能再好一些"
# The call's result is unpacked into two values: the reply and `his`
# (presumably the updated chat history — confirm against the model's chat API).
response, his = model.chat(tokenizer, get_prompt(input_text), history=[])
print(response)
print(his)

## 使用bert进行文本分类

In [None]:
!pip install transformers

In [None]:
!pip install datasets

### 1. 数据加载

In [None]:
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader

import datasets

In [None]:
df = pd.read_csv("/content/drive/MyDrive/demo/data/tradingDirection.csv")

In [None]:
df

In [None]:
# Binary target column: 0 by default, 1 for rows whose tradingDirection is "sell".
df["labels"] = 0
# Assign through a single .loc call with a boolean mask. Chained indexing
# (df[mask].loc[...]) operates on a temporary copy, and `==` only compares —
# neither would modify df.
df.loc[df["tradingDirection"] == "sell", "labels"] = 1

In [None]:
df

In [None]:
# Set labels to 1 on every "sell" row; all other rows keep their current label.
sell_mask = df["tradingDirection"] == "sell"
df.loc[sell_mask, "labels"] = 1

In [None]:
df["content"] = df["content"].astype("str")

In [None]:
ds = datasets.Dataset.from_pandas(df)

In [None]:
# Shuffle the rows; 42 is passed positionally (presumably the shuffle seed — confirm).
ds = ds.shuffle(42)
# Rename the "content" column to "text", the name the tokenization cells read.
ds = ds.rename_columns({"content": "text"})

In [None]:
df

### 2. 文本分词

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
print(tokenizer)

In [None]:
text_codes = tokenizer(text="你有bid吗", text_pair=None,
                       max_length=100,
                       truncation=True,
                       padding="do_not_pad")
print(text_codes)

In [None]:
tokens = tokenizer.tokenize(ds["text"][1])
tokens

In [None]:
ds["text"][1]

In [None]:
ds

### 3、传入DataLoader

In [None]:
def _encode_batch(examples):
  """Tokenize the "text" field of a batch, padding/truncating to 50 tokens."""
  return tokenizer(examples["text"],
                   max_length=50,
                   truncation=True,
                   padding="max_length")

# map() supports batched processing and multiple worker processes.
ds_encoded = ds.map(_encode_batch, batched=True, batch_size=10, num_proc=2)

In [None]:
ds_encoded

In [None]:
# 转换成pytorch中的tensor
ds_encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "token_type_ids", "labels"])
ds_encoded[0]

In [None]:
# Hold out 20% of the data for testing, then 20% of the remainder for validation.
# A fixed seed keeps the three splits identical across kernel restarts, so
# reported metrics are comparable between runs.
SPLIT_SEED = 42
ds_train_val, ds_test = ds_encoded.train_test_split(test_size=0.2, seed=SPLIT_SEED).values()
ds_train, ds_val = ds_train_val.train_test_split(test_size=0.2, seed=SPLIT_SEED).values()

In [None]:
def collate_fn(examples):
  """Collate a list of encoded examples into one batch via ``tokenizer.pad``."""
  padded_batch = tokenizer.pad(examples)
  return padded_batch

In [None]:
# Reshuffle the training set at every epoch so batches are not presented in the
# same order each time; validation and test loaders keep a fixed order.
BATCH_SIZE = 8
dl_train = torch.utils.data.DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
dl_val = torch.utils.data.DataLoader(ds_val, batch_size=BATCH_SIZE, collate_fn=collate_fn)
dl_test = torch.utils.data.DataLoader(ds_test, batch_size=BATCH_SIZE, collate_fn=collate_fn)

In [None]:
# Grab only the first batch from the training loader (for inspection in the
# following cells) and stop iterating immediately.
for batch in dl_train:
  break

In [None]:
batch

### 4、定义模型

In [None]:
from transformers import AutoModelForSequenceClassification

# Load bert-base-chinese with a 2-class sequence-classification head.
# NOTE: this rebinds `model`, which earlier cells used for the ChatGLM model.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=2)
# Show the model's immediate child modules by name.
dict(model.named_children())

In [None]:
output = model(**batch)

In [None]:
output

### 5、训练模型

In [None]:
!pip install torchkeras

In [None]:
from torchkeras import KerasModel

class StepRunner:
  """Runs a single train or evaluation step and reports loss and accuracy.

  The network is called as ``net(**batch)`` and must return an object with
  ``loss`` and ``logits`` attributes. The batch must contain a ``"labels"``
  entry. ``loss_fn`` and ``metrics_dict`` are stored but not used by this
  runner.
  """

  def __init__(self, net, loss_fn, accelerator, stage="train", metrics_dict=None,
               optimizer=None, lr_scheduler=None):
    self.net = net
    self.loss_fn = loss_fn
    self.metrics_dict = metrics_dict
    self.stage = stage
    self.optimizer = optimizer
    self.lr_scheduler = lr_scheduler
    self.accelerator = accelerator
    # Put the network in the mode matching the stage.
    if self.stage != "train":
      self.net.eval()
    else:
      self.net.train()

  def __call__(self, batch):
    """Run one step on ``batch``.

    Returns:
      A ``(step_losses, step_metrics)`` pair of dicts. ``step_losses`` holds
      ``<stage>_loss`` and ``<stage>_acc``; ``step_metrics`` holds ``lr``
      during the train stage and is empty otherwise.
    """
    outputs = self.net(**batch)
    batch_loss = outputs.loss
    predicted = outputs.logits.argmax(axis=1)

    is_training = self.stage == "train"

    # Backward pass and parameter update only when training with an optimizer.
    if is_training and self.optimizer is not None:
      self.accelerator.backward(batch_loss)
      self.optimizer.step()
      if self.lr_scheduler is not None:
        self.lr_scheduler.step()
      self.optimizer.zero_grad()

    gathered_loss = self.accelerator.gather(batch_loss).sum()

    # Accuracy over the labels that are greater than -1.
    targets = batch["labels"]
    n_correct = (predicted == targets).sum()
    n_counted = (targets > -1).sum()
    gathered_acc = self.accelerator.gather(n_correct / n_counted).sum()

    step_losses = {
        self.stage + "_loss": gathered_loss.item(),
        self.stage + "_acc": gathered_acc.item(),
    }

    step_metrics = {}
    if is_training:
      if self.optimizer is None:
        step_metrics["lr"] = 0.0
      else:
        step_metrics["lr"] = self.optimizer.state_dict()["param_groups"][0]["lr"]
    return step_losses, step_metrics

In [None]:
KerasModel.StepRunner = StepRunner
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)


In [None]:
keras_model = KerasModel(model, loss_fn=None, optimizer=optimizer)

In [None]:
# Train on dl_train and validate on dl_val for up to 50 epochs, monitoring
# "val_acc" with mode="max". The checkpoint path is on the mounted Drive.
# patience=10 is presumably the early-stopping patience — confirm against the
# torchkeras KerasModel.fit documentation.
keras_model.fit(
    train_data=dl_train,
    val_data=dl_val,
    ckpt_path="/content/drive/MyDrive/demo/ckpt.pt",
    epochs=50,
    patience=10,
    monitor="val_acc",
    mode="max",
    plot=True,
    # wandb=False,
    quiet=True
)

### 6、评估模型

In [None]:
!pip install evaluate

In [None]:
import evaluate

# Accuracy of the fine-tuned classifier on the held-out test split.
metrics = evaluate.load("accuracy")
model.eval()
# Use the accelerator's device rather than a hard-coded "cuda" so this cell
# also runs on a CPU-only runtime and matches where the batches are placed.
model.to(keras_model.accelerator.device)
# Bind the prepared loader to a new name so re-running this cell does not wrap
# an already-prepared loader again.
dl_test_prepared = keras_model.accelerator.prepare(dl_test)
for batch in dl_test_prepared:
  # Inference only: no gradients needed.
  with torch.no_grad():
    outputs = model(**batch)

  logits = outputs.logits
  predictions = torch.argmax(logits, dim=-1)
  metrics.add_batch(predictions=predictions, references=batch["labels"])

metrics.compute()

In [None]:
ds_test["text"]

In [None]:
ds_test

### 7、使用模型

In [None]:
texts = ["你有bid吗", "您有tkn吗", "您有什么净价的bid"]

In [None]:
batch = tokenizer(texts, padding=True, return_tensors="pt")
batch = {k:v.to(keras_model.accelerator.device) for k, v in batch.items()}

In [None]:
from torch import nn

# Inference only: skip autograd bookkeeping so no graph is built or retained.
with torch.no_grad():
  logits = model(**batch).logits
# Softmax over the class dimension, keeping the probability of the last class
# (index 1, the label assigned to "sell" rows earlier in the notebook).
scores = nn.Softmax(dim=-1)(logits)[:,-1]
print(scores)

In [None]:
logits

In [None]:
# A pipeline can bundle the tokenizer and the model together.
from transformers import pipeline
# model.cpu() is called on the model before it is handed to the pipeline.
classifier = pipeline(task="text-classification", tokenizer=tokenizer, model=model.cpu())
classifier("您有什么价格的bid嘛")

In [None]:
!pip list|grep transformers

### 保存模型

In [None]:
# Store human-readable class names in the config. id2label and label2id are
# set together so the saved config maps consistently in both directions.
id2label = {0:"buy", 1:"sell"}
model.config.id2label = id2label
model.config.label2id = {label: idx for idx, label in id2label.items()}
# Save the model and tokenizer side by side so the directory can be loaded
# directly by name (as the later pipeline cell does).
model.save_pretrained("/content/drive/MyDrive/demo/tradingDirection")
tokenizer.save_pretrained("/content/drive/MyDrive/demo/tradingDirection")

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m92.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
Col

In [3]:
from transformers import pipeline
classifier = pipeline("text-classification", model="/content/drive/MyDrive/demo/tradingDirection")

In [5]:
classifier("给一个好的bid呗")

[{'label': 'sell', 'score': 0.9982630610466003}]

## 方向判断

## 使用bert进行命名实体识别

In [6]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
## rasa学习

In [None]:
!pwd

In [None]:
%cd /content/drive/MyDrive

In [None]:
!mkdir rasa-test

In [None]:
%cd rasa-test

In [None]:
!pip install rasa

In [None]:
!pip install -U ipython

In [None]:
!rasa init