# Bert-Base-Chinese模型微调

# data-process

In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import Trainer,TrainingArguments, BertTokenizer, BertModel, BertPreTrainedModel,BertConfig
from torch.utils.data import Dataset, DataLoader
from torch import nn
import warnings
warnings.filterwarnings('ignore')
import sys
sys.setrecursionlimit(3000)


def read_data(data_dir):
    data = pd.read_csv(data_dir)
    data['content'] = data['content'].fillna('')
    data['text'] = data['content']+data['level_1']+data['level_2']+data['level_3']+data['level_4']
    return data

def fill_paddings(data, maxlen):
    '''补全句长'''
    if len(data) < maxlen:
        pad_len = maxlen-len(data)
        paddings = [0 for _ in range(pad_len)]
        data = torch.tensor(data + paddings)
    else:
        data = torch.tensor(data[:maxlen])
    return data

class InputDataSet():

    def __init__(self,data,tokenizer,max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self,):
        return len(self.data)

    def __getitem__(self, item):  # item是索引 用来取数据
        text = str(self.data['text'][item])
        labels = self.data['label'][item]
        labels = torch.tensor(labels, dtype=torch.long)

        ## 手动构建
        tokens = self.tokenizer.tokenize(text)
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids = [101] + tokens_ids + [102]
        input_ids = fill_paddings(tokens_ids,self.max_len)

        attention_mask = [1 for _ in range(len(tokens_ids))]
        attention_mask = fill_paddings(attention_mask,self.max_len)

        token_type_ids = [0 for _ in range(len(tokens_ids))]
        token_type_ids = fill_paddings(token_type_ids,self.max_len)

        return {
            'text':text,
            'input_ids':input_ids,
            'attention_mask':attention_mask,
            'token_type_ids':token_type_ids,
            'labels':labels

        }


if __name__ == '__main__':
    train_dir = 'data/train.csv'
    dev_dir = 'data/dev.csv'
    model_dir = 'bert-base-chinese'
    train = read_data(train_dir)
    test = read_data(dev_dir)
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    train_dataset = InputDataSet(train,tokenizer=tokenizer, max_len=128)
    train_dataloader = DataLoader(train_dataset,batch_size=4)
    batch = next(iter(train_dataloader))

    print(batch)
    print(batch['input_ids'].shape)
    print(batch['attention_mask'].shape)
    print(batch['token_type_ids'].shape)
    print(batch['labels'].shape)

  from .autonotebook import tqdm as notebook_tqdm


{'text': ['使用移动手动电动工具,外接线绝缘皮破损,应停止使用.工业/危化品类（现场）—2016版（二）电气安全6、移动用电产品、电动工具及照明1、移动使用的用电产品和I类电动工具的绝缘线，必须采用三芯(单相)或四芯(三相)多股铜芯橡套软线。', '一般工业/危化品类（现场）—2016版（一）消防检查1、防火巡查3、消防设施、器材和消防安全标志是否在位、完整；', '消防知识要加强工业/危化品类（现场）—2016版（一）消防检查2、防火检查6、重点工种人员以及其他员工消防知识的掌握情况；', '消防通道有货物摆放 清理不及时工业/危化品类（现场）—2016版（一）消防检查1、防火巡查3、消防设施、器材和消防安全标志是否在位、完整；'], 'input_ids': tensor([[ 101,  886, 4500, 4919, 1220, 2797, 1220, 4510, 1220, 2339, 1072,  117,
         1912, 2970, 5296, 5318, 5357, 4649, 4788, 2938,  117, 2418,  977, 3632,
          886, 4500,  119, 2339,  689,  120, 1314, 1265, 1501, 5102, 8020, 4385,
         1767, 8021,  100, 8112, 4276, 8020,  753, 8021, 4510, 3698, 2128, 1059,
          127,  510, 4919, 1220, 4500, 4510,  772, 1501,  510, 4510, 1220, 2339,
         1072, 1350, 4212, 3209,  122,  510, 4919, 1220,  886, 4500, 4638, 4500,
         4510,  772, 1501, 1469,  151, 5102, 4510, 1220, 2339, 1072, 4638, 5318,
         5357, 5296, 8024, 2553, 7557, 7023, 4500,  676, 5708,  113, 1296, 4685,
          1

# modeling

In [2]:
#from data_process import read_data,InputDataSet
from transformers import Trainer,TrainingArguments, BertTokenizer, BertModel, BertPreTrainedModel,BertConfig
from torch.utils.data import Dataset, DataLoader
from torch import nn
from transformers.modeling_outputs import SequenceClassifierOutput
import torch

## 做句子的分类 BertForSequence
class BertForSeq(BertPreTrainedModel):

    def __init__(self,config):  ##  config.json
        super(BertForSeq,self).__init__(config)
        self.config = BertConfig(config)
        self.num_labels = 2 # 类别数目
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.init_weights()

    def forward(
            self,
            input_ids,
            attention_mask = None,
            token_type_ids = None,
            labels = None,
            return_dict = None
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        ## loss损失 预测值preds
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=return_dict
        )  ## 预测值

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        ## logits -—— softmax层的输入（0.4， 0.6）--- 1
        logits = self.classifier(pooled_output)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))  # 二分类任务 这里的参数要做view
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,  ##损失
            logits=logits,  ##softmax层的输入，可以理解为是个概率
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


if __name__ == '__main__':

    ## 加载编码器和模型
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model = BertForSeq.from_pretrained('bert-base-chinese')
    ## 准备数据
    dev = read_data('data/dev.csv')
    dev_dataset = InputDataSet(dev,tokenizer=tokenizer,max_len=128)
    dev_dataloader = DataLoader(dev_dataset,batch_size=4,shuffle=False)
    ## 把数据做成batch
    batch = next(iter(dev_dataloader))
    ## 设置device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    ## 输入embedding
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    token_type_ids = batch['token_type_ids'].to(device)
    labels = batch['labels'].to(device)
    ## 预测
    model.eval()
    ## 得到输出
    outputs = model(input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids,labels=labels)
    ## 取输出里面的loss和logits
    logits = outputs.logits
    loss = outputs.loss

    print(logits)
    print(loss.item())

    preds = torch.argmax(logits,dim=1)
    print(preds)

Some weights of BertForSeq were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[-0.8469,  0.3117],
        [-0.9175,  0.2637],
        [-1.0491,  0.2528],
        [-1.2354, -0.4336]], device='cuda:0', grad_fn=<AddmmBackward0>)
1.398838996887207
tensor([1, 1, 1, 1], device='cuda:0')


# train_and_eval文件

In [32]:
import time
import numpy as np
from torch import nn
import time
import os
import torch
import logging
from torch.optim import AdamW
from transformers import Trainer, TrainingArguments, BertTokenizer, BertModel, BertPreTrainedModel, BertConfig, \
    get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from transformers.utils.notebook import format_time
#from modeling import BertForSeq
#from data_process import InputDataSet,read_data,fill_paddings

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train(batch_size,EPOCHS):

    model = BertForSeq.from_pretrained('bert-base-chinese')

    train = read_data('data/train.csv')
    val = read_data('data/dev.csv')
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

    train_dataset = InputDataSet(train, tokenizer, 128)
    val_dataset = InputDataSet(val, tokenizer, 128)

    train_dataloader = DataLoader(train_dataset,batch_size)
    val_dataloader = DataLoader(val_dataset,batch_size)

    optimizer = AdamW(model.parameters(), lr=2e-5)  #AdamW优化器主用
    total_steps = len(train_dataloader) * EPOCHS  # len(dataset)*epochs / batchsize
    #在BERT微调中，常常使用Warmup策略来在训练初期逐渐增加学习率，以更好地适应新的任务。例如，可以先使用较小的学习率进行预热（warmup），然后再进行学习率衰减。
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps) #warmup调度器
    total_t0 = time.time()

    log = log_creater(output_dir='./cache/logs/')

    log.info("   Train batch size = {}".format(batch_size))
    log.info("   Total steps = {}".format(total_steps))
    log.info("   Training Start!")

    for epoch in range(EPOCHS):
        total_train_loss = 0
        t0 = time.time()
        model.to(device)
        model.train()
        for step, batch in enumerate(train_dataloader):

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)
            model.zero_grad() #for循环中梯度清0

            outputs = model(input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids,labels=labels)

            loss = outputs.loss
            total_train_loss += loss.item()

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)  #进行梯度剪裁,防止梯度爆炸.
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_train_loss / len(train_dataloader)
        train_time = format_time(time.time() - t0)

        log.info('====Epoch:[{}/{}] avg_train_loss={:.5f}===='.format(epoch+1,EPOCHS,avg_train_loss))
        log.info('====Training epoch took: {:}===='.format(train_time))
        log.info('Running Validation...')

        model.eval()
        avg_val_loss, avg_val_acc = evaluate(model, val_dataloader)
        val_time = format_time(time.time() - t0)
        log.info('====Epoch:[{}/{}] avg_val_loss={:.5f} avg_val_acc={:.5f}===='.format(epoch+1,EPOCHS,avg_val_loss,avg_val_acc))
        log.info('====Validation epoch took: {:}===='.format(val_time))
        log.info('')

        if epoch == EPOCHS-1:
            torch.save(model,'model_stu.bin')
            print('Model Saved!')
    log.info('')
    log.info('   Training Completed!')
    print('Total training took{:} (h:mm:ss)'.format(format_time(time.time() - total_t0)))

def evaluate(model,val_dataloader):
    total_val_loss = 0
    corrects = []
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model(input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids,labels=labels)

        logits = torch.argmax(outputs.logits,dim=1)
        ## 把每个batch预测的准确率加入到一个list中
        ## 在加入之前，preds和labels变成cpu的格式
        preds = logits.detach().cpu().numpy()
        labels_ids = labels.to('cpu').numpy()
        corrects.append((preds == labels_ids).mean())  ## [0.8,0.7,0.9]
        ## 返回loss
        loss = outputs.loss
        ## 把每个batch的loss加入 total_val_loss
        ## 总共有len(val_dataloader)个batch
        total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    avg_val_acc = np.mean(corrects)

    return avg_val_loss, avg_val_acc

def log_creater(output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    log_name = '{}.log'.format(time.strftime('%Y-%m-%d-%H-%M'))
    final_log_file = os.path.join(output_dir, log_name)
    # creat a log
    log = logging.getLogger('train_log')
    log.setLevel(logging.DEBUG)

    # FileHandler
    file = logging.FileHandler(final_log_file)
    file.setLevel(logging.DEBUG)

    # StreamHandler
    stream = logging.StreamHandler()
    stream.setLevel(logging.DEBUG)

    # Formatter
    formatter = logging.Formatter(
        '[%(asctime)s][line: %(lineno)d] ==> %(message)s')

    # setFormatter
    file.setFormatter(formatter)
    stream.setFormatter(formatter)

    # addHandler
    log.addHandler(file)
    log.addHandler(stream)

    log.info('creating {}'.format(final_log_file))
    return log

if __name__ == '__main__':
    train(batch_size=16,EPOCHS=10)

Some weights of BertForSeq were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2024-01-09 21:05:47,582][line: 147] ==> creating ./cache/logs/2024-01-09-21-05.log
[2024-01-09 21:05:47,582][line: 147] ==> creating ./cache/logs/2024-01-09-21-05.log
[2024-01-09 21:05:47,586][line: 42] ==>    Train batch size = 16
[2024-01-09 21:05:47,586][line: 42] ==>    Train batch size = 16
[2024-01-09 21:05:47,589][line: 43] ==>    Total steps = 15000
[2024-01-09 21:05:47,589][line: 43] ==>    Total steps = 15000
[2024-01-09 21:05:47,591][line: 44] ==>    Training Start!
[2024-01-09 21:05:47,591][line: 44] ==>    Training Start!
[2024-01-09 21:06:54,314][line: 71] ==> ====Epoch:[1/20] avg_train_loss=0.11010====
[2024-01-09 21:06:54,314][line: 71] ==> ====Epoch:[1/20] avg_train_loss=0.11010====
[2024-01-09 21:0

Model Saved!
Total training took22:53 (h:mm:ss)


# Prediction

In [5]:
comments = pd.read_excel("comments_全.xlsx")

In [6]:
comments

Unnamed: 0,000001,000002,000004,000005,000006,000011,000012,000017,000020,000021,...,688777,688779,688786,688787,688788,688789,688798,688800,688819,688981
0,$平安银行(SZ000001)$,万科2022年销售4202亿，与保利的差距达到371亿了，保利实现了弯道超车，超过碧桂园也就...,什么时候年报预告？别又来一遍预报高增长，年报后就活埋了[捂脸],公司压根就没打算摘帽，继续耗着吧！等实控人股权拍卖再说。,这只股会破八！一定会,$深物业A(SZ000011)$特发信息昨晚也出公告了！深国资对深投控又一大动作！深赛格、英...,$南玻Ａ(SZ000012)$我司即将参与南玻集团的两个项目招投标预祝顺利中标！,你好，请问贵公司的锂电池产品是否对外出售？都有哪些产品，是否可以应用到新能源汽车,深圳本地的资产重组标的,开门红,...,中控技术12月30日被沪股通减持2.16万股,$长远锂科(SH688779)$这家公司值不值得你买呢？不多说很多上市公司你可能没怎么了解你...,悦安新材：融资净偿还8.43万元，融资余额1.11亿元（12-30）,海天瑞声：融资净买入66.86万元，融资余额3604.98万元（12-30）,$科思科技(SH688788)$欺骗深交所！中天国富证券保荐科隆新能失败，章敬富、钟亚桢收监管函,宏华数科：融资净偿还32.41万元，融资余额6359.93万元（12-30）,艾为电子：融资净买入21.54万元，融资余额1.56亿元（12-30）,瑞可达12月30日获沪股通增持4.28万股,天能股份：融资余额2.33亿元，创历史新低（12-30）,中芯国际总结2022，展望2023：
1,$日丰股份(SZ002953)$继续吧$平安银行(SZ000001)$,"万科“裁员”传闻背后:南方区被上海区业绩赶超,郁亮“唱多”下业绩压力仍在",骂我的难道你们都是国华的忠诚散户亏损都在百分之五十以上吗哈哈。一说挂星退市跟仇人一样。谁都想...,几个小律师忙坏了,$深振业Ａ(SZ000006)$出货了，都小心点,$深物业A(SZ000011)$不减持是好还是不好,"$南玻Ａ(SZ000012)$000012,南玻A,凡是资金连断了欠债的走势都不行看下复星糸...",个人预感深中华A是2023年度深市最大的黑马；,总共才几个钱 不至于操碎了心呀,真正的垃圾是看不见的垃圾，到公司看看就知道什么叫垃圾了,...,中控技术：连续6日融资净偿还累计2934.85万元（12-30）,$长远锂科(SH688779)$$欣旺达(SZ300207)$这两个小垃圾选择老技术路线等着...,大宗交易：悦安新材成交1459.4万元，折价20.02%（12-30）,海天瑞声：融资净偿还86.35万元，融资余额3538.12万元（12-29）,$科思科技(SH688788)$牛掰了，上跌幅榜了，恭喜恭喜！,宏华数科：融资净买入5.44万元，融资余额6392.34万元（12-29）,虚拟的题材，巨客的科研投入一年能跌去六成剩4成股价的股票，不是一般的差，而是超级的差，不彻底...,瑞可达：连续3日融资净买入累计522.05万元（12-30）,$天能股份(SH688819)$董秘，我于22年七月下旬以旧换新，一组贵公司电池72v32a...,$中芯国际(SH688981)$维持买入，继续增持，2023加油干，走翻番行情……
2,什么时侯上16,就不退,恭喜st国华的散户今年亏损50.15%明年再接再厉争取退市。,谁知道年报预约时间啊,我已经满仓300万股深振业，成本5元左右,$深物业A(SZ000011)$央行货币政策委员会召开2022年第四季度例会：-要坚持稳字当...,南玻A12月30日获深股通增持78.86万股,$深中华A(SZ000017)$祝各位000017股友新年身体健康，股票大赚！,贵公的董秘还不如的学生,这只股跌的会超越所有人想象，不信走着看,...,$中控技术(SH688777)$垃圾股，扶不起的阿斗，真差劲的股票,$长远锂科(SH688779)$据报道，2022年股民人均亏6万多。只有期待2023再出发。,这要减持到明年3月份,转发,科思科技：连续4日融资净偿还累计224.96万元（12-30）,大宗交易：宏华数科成交1188.44万元，折价9.14%（12-29）,艾为电子：融资净买入128.26万元，融资余额1.56亿元（12-29）,$瑞可达(SH688800)$下周发利好[呲牙],被迫做五年股东,$中芯国际(SH688981)$明年与h股价格看齐
3,说点吓人的言论吧，都说明年股市会涨10%～20%，我的观点正好相反，明年会跌10%～20%，...,600122就是特例,你好，董秘。请问1.公司近期是否申请并且获批“一种大功率快充液冷电桩电缆”实用新,$ST星源(SZ000005)$兄弟们明年见。明年破历史新高,明天放假,$深物业A(SZ000011)$昨天11.51买入，深物业A期待来年我们春暖花开！,转眼一年就要过去了，明年南玻能涨起来吗,[赞][赞],《8.16见》,深科技12月30日被深股通减持6.36万股,...,垃圾。。。,大垃圾，大垃圾，垃圾垃圾垃圾,悦安新材：融资净偿还38.25万元，融资余额1.11亿元（12-29）,海天瑞声：融资净买入60.21万元，融资余额3624.47万元（12-28）,$科思科技(SH688788)$希望公司越来越好，我还想在这里养老[笑哭],专注数码喷印三十年 宏华数科市占率遥遥领先,艾为电子董事程剑涛个人名下持股减少32万股 涉及金额2813.44万元,疯了吗？,$天能股份(SH688819)$看看去年就知道了马上解禁中信证券又要套现了,有一家公司，它的企业口号就是：不怕你抄底，就怕你不玩！它就是大名鼎鼎的国之重器---中芯国际！
4,平安银行的两个区间统计,这个板块是这样的,$ST国华(SZ000004)$一个点就提有意思吗？祝你们卖飞。看看容积二六三。不买不卖。,$ST星源(SZ000005)$年底立案调查的嘞太多了。科林全是申请摘帽被打脸造假最无耻的一个股票,还有最后一张免税牌照，又会花落谁家？深圳横琴,深物业A：减持计划实施时间已经届满 深投控在减持计划期间未通过集中竞价交易或大宗交易方式减持...,玩长线短线现在都不是卖的时候，静候市场企稳,$深中华A(SZ000017)$今天拉到4个点的时候还以为自己套飞了，看样子没有走稳之前还是...,明年见了,$深科技(SZ000021)$昕天上午涨跌收阳，我已凌晨4点就发文……这时就不是“关注了”…...,...,一买一卖盈亏只在一念之间,长远锂科12月30日获沪股通增持11.02万股,大宗交易：悦安新材成交461.44万元，折价20.48%（12-29）,海天瑞声：融资净买入23.04万元，融资余额3564.25万元（12-27）,科思科技：连续3日融资净偿还累计200.22万元（12-29）,宏华数科：融资净买入107.12万元，融资余额6386.9万元（12-28）,艾为电子核心技术人员张忠个人名下持股减少21万股 涉及金额1846.32万元,擦。买早了,2022最后一天，天能雄起！2023年天能股份还是回到发行价之上吧！说太多不现实的，不如实际的。,$中芯国际(SH688981)$2022年已收官，还是要往前看，好的事就是探到了谷底，这一年...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49645,,,,,,,,,,,...,,,,,,,,,,
49646,,,,,,,,,,,...,,,,,,,,,,
49647,,,,,,,,,,,...,,,,,,,,,,
49648,,,,,,,,,,,...,,,,,,,,,,


# 测试样例

In [34]:
# 加载预训练的BERT模型和tokenizer
model_name = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(model_name)


# 加载经过训练好的模型参数
model = torch.load('model_stu.bin')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 输入文本
text = "写的太美好了，值得大家去阅读，赞赞赞。"

# 使用tokenizer对文本进行编码
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
inputs.to(device)

# 使用模型进行预测
outputs = model(**inputs)

# 获取预测结果
predicted_class = torch.argmax(outputs.logits, dim=1).item()
print("Predicted class:", predicted_class)

Predicted class: 0


In [42]:
# 加载预训练的BERT模型和tokenizer
model_name = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(model_name)

# 加载经过训练好的模型参数
model = BertForSeq.from_pretrained('bert-base-chinese')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

bert_results= pd.DataFrame()
results = []
# 假设comments是一个包含所有评论的DataFrame，每一列代表一家公司的所有评论
# 对每一列评论进行情感分类预测

# 输入文本
texts = comments['000670'].tolist()

for text in texts:
    if pd.notnull(text):  # 检查是否为NaN值
        # 使用tokenizer对文本进行编码
        text = str(text)
        print(text)
        print(type(text))
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        inputs = inputs.to(device)  # 将输入移动到设备上

        # 使用模型进行预测
        outputs = model(**inputs)

        # 获取预测结果
        logits = torch.argmax(outputs.logits, dim=1)
        preds = logits.detach().cpu().numpy()
        
        result = np.squeeze(preds)
        results.append(result)
        results = [int(item) for item in results]      
        results = [-1 if item == 0 else item for item in results]
        
print(results)


Some weights of BertForSeq were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


这次票从普通里来讲还缺少一次大跌，等到那只大跌过后就是大涨之日。
<class 'str'>
$盈方微(SZ000670)$大家来新强联，停牌重组了，准备数板板了[呲牙]
<class 'str'>
老陈已经不愿意消耗筹码砸盘了于是派出死鸟牧牛犬等汪汪来忽悠散户割肉离开忽悠散户自己砸盘老陈再这个过程买进吸筹等老陈持筹数量达到预期就会操纵增发通过然后把股价拉到天上毕竟股价越低进来的资金也越多至于增发价太低被管理层否决只是一场戏配合老陈吸筹而已
<class 'str'>
$盈方微(SZ000670)$各位报一下你们的成本
<class 'str'>
$盈方微(SZ000670)$当时被拒的时候定增价格是多少？有知道的吗
<class 'str'>
$盈方微(SZ000670)$这么多人在买[想一下]，下个月肯定要跌一个星期了，等你们割肉离场我在进
<class 'str'>
$盈方微(SZ000670)$没想到当红渣子鸡变成了玩一分钱的僵尸股，
<class 'str'>
$盈方微(SZ000670)$明年什么时候拉到20块？[滴汗]
<class 'str'>
今天两市成交额6000亿，又降低了
<class 'str'>
$盈方微(SZ000670)$人家科华当时9元多，增发价是12，自己去看看吧，陈炎表，一个芯片股，现在的价格比有的st还低，不要脸。
<class 'str'>
公司还有董事会现在都换成老表的人了，虽然股份不多，但是实际已经控制公司了
<class 'str'>
纺织A现在是11元，增发是8.93，陈老表要点脸不？
<class 'str'>
$盈方微(SZ000670)$为什么你要托住不让下跌呢？直接一步到位
<class 'str'>
$盈方微(SZ000670)$盈方微入围2022年度十大牛股！恭喜各位买到牛股的兄弟！
<class 'str'>
$盈方微(SZ000670)$回踩6.8附近是买点。
<class 'str'>
2022.12.3美国公布了最新款战略轰炸机B21，最大特点就是航程长，载弹量增加，极低可探测性！然而我们的贵州茅台却说：尔等怕什么，我们有贵州茅台酒！
<class 'str'>
$盈方微(SZ000670)$9元的本什么时候能解套
<class 'str'>
公司回复重组定价符合相关规定，为什么被否？忽悠哪个？谢

KeyboardInterrupt: 

In [11]:
from joblib import dump, load

In [13]:
MultinomialNB_model = load( 'MultinomialNB_pipe.joblib')

In [14]:
#定义分词函数
def cut_comment(comment_text):
    r = jieba.lcut(comment_text)
    joined_text = " ".join(r)
    return joined_text

In [17]:
import jieba
jieba.load_userdict("userdict.txt")

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.624 seconds.
Prefix dict has been built successfully.


In [18]:
#定义函数统计情感词频
def count_sentiment_words(comment_text,sentiment):
    """
    基于情感词典，获取评论中对应的正面情感词。
    参数：
        comment_text：切词后的评论文本
        sentiment：是获取正面还是负面情感，1表示提取正面情感个数，0表示提取负面情感个数
    返回：
        评论中对应的情感词个数
    """
    comment_words_list = comment_text.split()
    
    if sentiment:
        return len([w for w in comment_words_list if w in positive_finance])
    else:
        return len([w for w in comment_words_list if w in negative_finance])

In [22]:
# 读取正面情感词汇
with open ("positive_finance.txt","rb") as f:
    positive_finance = f.readlines()
    positive_finance = [w.strip() for w in positive_finance]

In [21]:
# 读取负面情感词汇
with open ("negative_finance.txt","rb") as f:
    negative_finance = f.readlines()
    negative_finance = [w.strip() for w in negative_finance]

# 批量计算市场情绪因子

In [53]:
#批量计算市场情绪因子

# 加载预训练的BERT模型和tokenizer
model_name = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(model_name)

# 加载经过训练好的模型参数
model = BertForSeq.from_pretrained('bert-base-chinese')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

results = []


indexlist = []
sumlist = []
meanlist = []
stdlist = []
comments_list = []


for i in comments.columns:
    
    # 输入文本
    texts = comments[i].tolist()
    results = []
    for text in texts:
        if pd.notnull(text):  # 检查是否为NaN值
            text = str(text)
            # 使用tokenizer对文本进行编码
            inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
            inputs = inputs.to(device)  # 将输入移动到设备上

            # 使用模型进行预测
            outputs = model(**inputs)

            # 获取预测结果
            logits = torch.argmax(outputs.logits, dim=1)
            preds = logits.detach().cpu().numpy()
            
            result = np.squeeze(preds)
            results.append(result)
            results = [int(item) for item in results]      
            results = [-1 if item == 0 else item for item in results]
    
    
    tmp = pd.DataFrame()
    tmp["comment"] = [str(k) for k in comments[i].tolist() if type(k)==str]
    tmp["cutted_comment"] = tmp.comment.apply(cut_comment)
    tmp["positive_words_count"] = tmp.cutted_comment.apply(count_sentiment_words,args = (1,))
    tmp["negative_words_count"] = tmp.cutted_comment.apply(count_sentiment_words,args = (0,))
    sentiment_total_score = tmp.positive_words_count - tmp.negative_words_count
    tmp["sentiment_label"] = sentiment_total_score.apply(lambda x: 1 if x>=0 else -1)
    
    print(len(results))
    print(len(tmp))
    
    # 让results的长度和tmp的长度相等
    if len(results) < len(tmp):
        results.extend([0] * (len(tmp) - len(results)))
        print('1')
    elif len(results) > len(tmp):
        results = results[:len(tmp)]
        print('0')
       
    tmp["sentiment_bert"] = results
    tmp["sentiment_MultinomialNB_clf"] = MultinomialNB_model.predict(tmp.cutted_comment)
    vote = tmp.sentiment_label + tmp.sentiment_bert + tmp.sentiment_MultinomialNB_clf 
    tmp["sentiment"] = vote.apply(lambda x: 1 if x>0 else -1)
    sen = list(tmp['sentiment'])
    print('第{}支股票{}的市场指数为{}，情绪总分为{}，均值为{}，标准差为{}'.format(list(comments.columns).index(i)+1, i, len(sen), sum(sen), np.mean(sen), np.std(sen)))
    indexlist.append(len(sen))
    sumlist.append(sum(sen))
    meanlist.append(np.mean(sen))
    stdlist.append(np.std(sen))
    words = []
    for w in tmp["cutted_comment"].tolist():
        words.extend(w.split())
    comments_list.extend(words)

Some weights of BertForSeq were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1793
1793
第1支股票000001的市场指数为1793，情绪总分为-313，均值为-0.17456776352481873，标准差为0.9846451624508917
3098
3098
第2支股票000002的市场指数为3098，情绪总分为-1160，均值为-0.37443511943189156，标准差为0.9272531161101726
416
416
第3支股票000004的市场指数为416，情绪总分为-246，均值为-0.5913461538461539，标准差为0.8064178360697146
535
535
第4支股票000005的市场指数为535，情绪总分为-323，均值为-0.6037383177570094，标准差为0.797182566086299
10407
10407
第5支股票000006的市场指数为10407，情绪总分为-7571，均值为-0.7274911117517056，标准差为0.686117105399849
1824
1824
第6支股票000011的市场指数为1824，情绪总分为-1274，均值为-0.6984649122807017，标准差为0.7156443015302446
1266
1266
第7支股票000012的市场指数为1266，情绪总分为-794，均值为-0.627172195892575，标准差为0.7788806305842286
835
835
第8支股票000017的市场指数为835，情绪总分为-609，均值为-0.7293413173652694，标准差为0.6841500148241562
118
118
第9支股票000020的市场指数为118，情绪总分为-72，均值为-0.6101694915254238，标准差为0.7922709079674742
1298
1298
第10支股票000021的市场指数为1298，情绪总分为-740，均值为-0.5701078582434514，标准差为0.8215698570231655
77
77
第11支股票000023的市场指数为77，情绪总分为-59，均值为-0.7662337662337663，标准差为0.6425619156806743
1427
1427
第12支股票000025的市场指数为1427，情绪总分为-917，均值

# 数据整合

In [54]:
basic_stock = pd.read_excel('基本面2022-3+股价数据.xlsx')
basic_stock['code'] = basic_stock.apply(lambda x:'0'*(6-len(str(x[0])))+str(x[0]),axis=1)

In [55]:
data= basic_stock.copy()
data['sen_index'] = indexlist
data['sen_score'] = sumlist
data['sen_mean'] = meanlist
data['sen_std'] = stdlist
data

Unnamed: 0,code,name,area,industry,market,list_date,eps,eps_yoy,bvps,roe,...,Volatility_15,Volatility_21,Volatility_30,Volatility_60,yield,label,sen_index,sen_score,sen_mean,sen_std
0,000001,平安银行,深圳,银行,主板,19910403,1.78,27.14,18.32,10.15,...,-0.038197,0.095625,0.488579,-0.174041,0.020360,1,1793,-313,-0.174568,0.984645
1,000002,万科A,深圳,全国地产,主板,19910129,1.47,2.35,,7.12,...,0.410888,1.492037,1.289951,1.546284,-0.011432,0,3098,-1160,-0.374435,0.927253
2,000004,ST国华,深圳,软件服务,主板,19910114,-0.38,-281.88,,-6.17,...,0.586980,-0.475211,0.376238,0.779977,0.010571,1,416,-246,-0.591346,0.806418
3,000005,ST星源,深圳,环境保护,主板,19901210,0.01,-94.82,,0.73,...,-0.803684,-0.234872,-0.404787,-0.545122,0.005435,1,535,-323,-0.603738,0.797183
4,000006,深振业A,深圳,区域地产,主板,19920427,0.15,-64.21,,2.61,...,-0.518529,-0.496677,-0.280033,-0.623720,0.014901,1,10407,-7571,-0.727491,0.686117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2794,688789,宏华数科,浙江,专用机械,科创板,20210708,2.45,-7.89,,11.86,...,0.316911,0.199722,-0.147357,1.353955,-0.022152,0,49,41,0.836735,0.547608
2795,688798,艾为电子,上海,半导体,科创板,20210816,0.33,-78.29,,1.45,...,0.520592,0.800187,-0.276064,1.049499,0.009927,1,134,-2,-0.014925,0.999889
2796,688800,瑞可达,江苏,元器件,科创板,20210722,1.82,121.95,,18.24,...,0.853940,0.064290,0.946856,0.515999,0.041214,1,107,59,0.551402,0.834240
2797,688819,天能股份,浙江,电气设备,科创板,20210118,1.47,32.43,,11.14,...,-0.106737,0.730579,-0.059349,0.860177,0.009397,1,419,-179,-0.427208,0.904154


In [56]:
data.to_excel('数据汇总.xlsx',index=False)

In [57]:
data.shape

(2799, 90)

In [58]:
data.info() #查看数据基本信息

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2799 entries, 0 to 2798
Data columns (total 90 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   code                   2799 non-null   object 
 1   name                   2799 non-null   object 
 2   area                   2799 non-null   object 
 3   industry               2799 non-null   object 
 4   market                 2799 non-null   object 
 5   list_date              2799 non-null   int64  
 6   eps                    2799 non-null   float64
 7   eps_yoy                2699 non-null   float64
 8   bvps                   22 non-null     float64
 9   roe                    2790 non-null   float64
 10  epcf                   14 non-null     float64
 11  net_profits            2799 non-null   float64
 12  profits_yoy            2799 non-null   float64
 13  distrib                13 non-null     object 
 14  report_date            2799 non-null   object 
 15  net_

In [59]:
data.describe() #描述性统计

Unnamed: 0,list_date,eps,eps_yoy,bvps,roe,epcf,net_profits,profits_yoy,net_profit_ratio,gross_profit_rate,...,Volatility_15,Volatility_21,Volatility_30,Volatility_60,yield,label,sen_index,sen_score,sen_mean,sen_std
count,2799.0,2799.0,2699.0,22.0,2790.0,14.0,2799.0,2799.0,2691.0,2691.0,...,2799.0,2799.0,2799.0,2799.0,2799.0,2799.0,2799.0,2799.0,2799.0,2799.0
mean,20116100.0,0.572315,-25.898951,10.390909,5.947441,-0.027143,87523.36,-8.67343,-4.980145,29.79155,...,0.1524,0.145596,0.174039,0.784931,-0.00885,0.287603,839.193641,-448.627724,-0.418762,0.86088
std,89871.34,1.578429,960.620823,6.5831,12.459442,5.024325,699489.7,1170.71422,586.048347,19.994751,...,1.000441,0.792784,0.81436,1.149763,0.038575,0.452726,1782.695103,1143.496714,0.271218,0.099973
min,19901210.0,-3.87,-32433.33,3.74,-128.88,-15.87,-1167312.0,-38054.33,-29597.21,-190.9458,...,-1.0,-1.0,-1.0,-1.0,-0.149551,0.0,10.0,-34338.0,-1.0,0.0
25%,20050530.0,0.07,-51.05,5.5625,1.99,0.1225,3025.685,-46.16,2.48,16.2678,...,-0.304504,-0.304808,-0.319351,0.020359,-0.028929,0.0,185.5,-452.5,-0.604141,0.794718
50%,20150130.0,0.29,-11.11,8.355,5.64,0.69,10693.4,-2.88,7.71,25.9144,...,-0.017541,-0.019309,-0.009652,0.537396,-0.014025,0.0,393.0,-180.0,-0.48366,0.871612
75%,20200210.0,0.66,22.22,12.6325,9.8275,1.8325,35007.96,30.82,15.25,39.70435,...,0.409118,0.397823,0.422294,1.228936,0.004275,1.0,858.0,-61.0,-0.30781,0.942809
max,20221030.0,35.34,21852.17,31.0,106.67,5.75,26582200.0,28652.39,909.54,100.0,...,36.972897,22.491265,9.411899,17.096272,0.331561,1.0,49570.0,1117.0,0.836735,1.0


# 评论数据爬虫

In [None]:
def craw(gb_id):
    cookies = {
        'qgqp_b_id': '9aafee8f9867edbe280a3884dea14c1d',
        'st_si': '39434491176058',
        'st_asi': 'delete',
        'show_app_box_time': '1672224935454',
        'st_pvi': '96267766858191',
        'st_sp': '2022-12-28%2018%3A39%3A33',
        'st_inirUrl': 'http%3A%2F%2Fguba.eastmoney.com%2F',
        'st_sn': '59',
        'st_psi': '20221228190006487-117016304298-2296127990',
    }
    
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Origin': 'http://mguba.eastmoney.com',
        'Referer': 'http://mguba.eastmoney.com/mguba/list/600000,f_1',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Mobile Safari/537.36',
    }
    

    
    
    page=1
    
    cd=1
    lst=[]
    while cd>0:
      
        # time.sleep(1)
        
        # print(page)
        
        url='http://mguba.eastmoney.com/mguba2020/interface/GetData.aspx'

        data = {
          'param': f'code={gb_id}&p={page}&ps=20&sorttype=0',
          'plat': 'wap',
          'version': '200',
          'path': '/webexparticlelist/api/article/articlelist',
          'env': '1',
          'origin': '',
          'ctoken': '',
          'utoken': ''
        }
        try:
            response = requests.post(url, headers=headers, cookies=cookies, data=data, verify=False)
            
            sj=response.json()['re']
            
            cd=len(sj)
            
            # print(page)       
            
            try:
                end_post_publish_time=sj[-1]['post_publish_time']
            except:
                end_post_publish_time=''
            
            if '2023' in str(end_post_publish_time) or '2022-12' in str(end_post_publish_time):
            # if '2023' in str(end_post_publish_time):
                
                
                # print(page)
                
                for i in sj:
                    # post_publish_time=i.get("post_publish_time")
                    
                    
                    dic={}
                    dic['gb_id']=gb_id
                    dic['帖子时间']=i.get("post_publish_time")
                    post_title=i.get("post_title")
                    
                    if post_title:
                        
                        dic['帖子标题']=post_title
                    else:
                        dic['帖子标题']=i.get("post_content")
                    post_id=i.get("post_id")
                    dic['帖子URL']=f'http://guba.eastmoney.com/news,{gb_id},{post_id}.html'
                    dic['帖子作者']=i.get("post_user").get("user_nickname")
                    dic['浏览量']=i.get("post_click_count")
                    dic['评论量']=i.get("post_comment_count")
                    dic['点赞量']=i.get("post_like_count")
                    lst.append(dic)
                    print(gb_id,dic['帖子时间'])
                df=pd.DataFrame(lst)
                df.drop_duplicates(subset=['帖子URL'],keep='first',inplace=True)
                df.to_csv(str(gb_id)+'.csv',index=None,encoding='utf-8-sig')            

            else:
                break
            page=page+1
            print(page)
        except Exception as e:
            print(e)
        
            


            
basic_stock=pd.read_excel('股票代码和名字.xlsx',dtype='str')


for k in list(basic_stock['code']):
    
    craw(k)
          
import pandas as pd
ls = []
lst = pd.read_excel('股票代码和名字.xlsx',dtype='str')['code']

for i in lst:
    sj = pd.read_csv(i+'.csv')
    sj['年月'] = sj['帖子时间'].str[0:7]
    cond = sj['年月'] == '2022-12'
    sj2 = pd.DataFrame(sj[cond]['帖子标题'])
    sj2 = sj2.reset_index()
    del sj2['index']
    sj2.rename(columns={"帖子标题": i.split('.')[0]},inplace=True)
    ls.append(sj2)

comments = pd.concat(ls,axis = 1)

In [None]:
import pandas as pd
ls = []
lst = pd.read_excel('股票代码和名字.xlsx',dtype='str')['code']

for i in lst:
    sj = pd.read_csv(i+'.csv')
    sj['年月'] = sj['帖子时间'].str[0:7]
    cond = sj['年月'] == '2022-12'
    sj2 = pd.DataFrame(sj[cond]['帖子标题'])
    sj2 = sj2.reset_index()
    del sj2['index']
    sj2.rename(columns={"帖子标题": i.split('.')[0]},inplace=True)
    ls.append(sj2)

comments = pd.concat(ls,axis = 1)

In [60]:
import pandas as pd

test_data = pd.read_excel('数据汇总.xlsx')
test_data

Unnamed: 0,code,name,area,industry,market,list_date,eps,eps_yoy,bvps,roe,...,Volatility_15,Volatility_21,Volatility_30,Volatility_60,yield,label,sen_index,sen_score,sen_mean,sen_std
0,1,平安银行,深圳,银行,主板,19910403,1.78,27.14,18.32,10.15,...,-0.038197,0.095625,0.488579,-0.174041,0.020360,1,1793,-313,-0.174568,0.984645
1,2,万科A,深圳,全国地产,主板,19910129,1.47,2.35,,7.12,...,0.410888,1.492037,1.289951,1.546284,-0.011432,0,3098,-1160,-0.374435,0.927253
2,4,ST国华,深圳,软件服务,主板,19910114,-0.38,-281.88,,-6.17,...,0.586980,-0.475211,0.376238,0.779977,0.010571,1,416,-246,-0.591346,0.806418
3,5,ST星源,深圳,环境保护,主板,19901210,0.01,-94.82,,0.73,...,-0.803684,-0.234872,-0.404787,-0.545122,0.005435,1,535,-323,-0.603738,0.797183
4,6,深振业A,深圳,区域地产,主板,19920427,0.15,-64.21,,2.61,...,-0.518529,-0.496677,-0.280033,-0.623720,0.014901,1,10407,-7571,-0.727491,0.686117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2794,688789,宏华数科,浙江,专用机械,科创板,20210708,2.45,-7.89,,11.86,...,0.316911,0.199722,-0.147357,1.353955,-0.022152,0,49,41,0.836735,0.547608
2795,688798,艾为电子,上海,半导体,科创板,20210816,0.33,-78.29,,1.45,...,0.520592,0.800187,-0.276064,1.049499,0.009927,1,134,-2,-0.014925,0.999889
2796,688800,瑞可达,江苏,元器件,科创板,20210722,1.82,121.95,,18.24,...,0.853940,0.064290,0.946856,0.515999,0.041214,1,107,59,0.551402,0.834240
2797,688819,天能股份,浙江,电气设备,科创板,20210118,1.47,32.43,,11.14,...,-0.106737,0.730579,-0.059349,0.860177,0.009397,1,419,-179,-0.427208,0.904154
