In [1]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
# from langchain.document_transformers import TextTransformer
from langchain.chains import create_extraction_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, SequentialChain
from kor import create_extraction_chain, Object, Text, Number
import re
import openai
from openai import OpenAI

In [2]:
# Extraction schema for the kor library: it names the financial fields to pull
# out of annual-report text and supplies one worked example (input text paired
# with the expected output dict). It is passed to create_extraction_chain in
# getjson. Every attribute is declared as Text, and the example values are raw
# strings copied from the report rather than parsed numbers.
schema = Object(
    id="script",
    description="Extracting Financial data",
    attributes=[
        # Change in net cash flow from operating activities versus the prior year.
        # NOTE(review): the id appears to be missing the character "动" compared
        # with its description ("经营活动..."); it is a runtime key, so it is
        # left untouched here — confirm whether downstream code relies on it.
        Text(
            id="经营活现金流量增减",
            description="经营活动产生的现金流量净额与上一年度相比的增减情况",
        ),
        # Change in net profit attributable to parent-company shareholders
        # versus the prior year.
        Text(
            id="归属于母公司股东的净利润增减",
            description="归属于母公司股东的净利润与上一年度相比的增减情况",
        ),
        # Total revenue realised by the company in the fiscal year.
        Text(
            id="营业收入",
            description="公司在本财年内实现的总收入",
        ),
        # Gain or loss from the disposal of non-current assets.
        Text(
            id="非流动资产处置损益",
            description="公司处置非流动资产产生的损益",
        ),
        # Diluted earnings per share.
        Text(
            id="稀释每股收益",
            description= "稀释每股收益",
        ),
    ],
    examples=[
        # Single example as an (input text, expected output) pair. The input is
        # a set of "keyword: snippet" fragments; it looks like it was assembled
        # from find_context-style snippets — TODO confirm how it was produced.
        (
            '''
                经营活动: 230  28,840  25.3% 
                经营活动产生的现金流量净额  134,572  (192,733)  (16,161)  上年为负 
                每股比率（元/股）： 
                基本/稀释每股收益  2.20  1.73  1.40  27.2% 
                扣除非经常
                股东的净利润: 169,383  6.2% 
                归属于本行股东的净利润  45,516  36,336  25.3% 
                成本收入比  27.45%  28.30%  -0.85个百分点 
                平均总资产收益率  0.89%  0.77%  +0.12个百分点 
                加权平均净资
                营业收入: 22年  2021年  本年同比增减 
                营业收入  179,895  169,383  6.2% 
                归属于本行股东的净利润  45,516  36,336  25.3% 
                成本收入比  27.45%  28.30%  -0.85个百分点 
                平均总资
                资产处置损益: 入  131  105  24.8% 
                资产处置损益  180  12  1,400.0% 
                其他收益  215  225  (4.4%) 
                其他非利息净收入  19,557  15,985  22.3% 
                其他非利息净收入包括投资收益、公允价值变动损
                稀释每股收益: 年为负 
                每股比率（元/股）： 
                基本/稀释每股收益  2.20  1.73  1.40  27.2% 
                扣除非经常性损益后的基本/稀释每股收益  2.19  1.72  1.40  27.3% 
                每股经营活动产生的现金流量净额  6.93  (9.9''',
            # Expected extraction: one entry per attribute id above, each holding
            # the run of figures copied from the matching row of the input text.
            {"经营活现金流量增减": "134,572  (192,733)  (16,161)", 
             "归属于母公司股东的净利润增减": "45,516  36,336  25.3% ",
             "营业收入": "179,895  169,383  6.2% ", 
             "非流动资产处置损益": "180  12  1,400.0% ", 
             "稀释每股收益": " 2.20  1.73  1.40  27.2% "},
        ),
        
        
    ],
    # NOTE(review): presumably tells kor to expect a single record rather than
    # many; getjson still indexes [0] into the result list — confirm against
    # the kor documentation.
    many=False,
)

In [None]:
# Chat model shared by every extraction request in this notebook.
# The API base URL and key are empty strings in the source; presumably they are
# placeholders to be filled in (or supplied by the environment) before running.
llm = ChatOpenAI(
    openai_api_key="",
    openai_api_base="",
    model="gpt-3.5-turbo-0125",
)

def getjson(results):
    """Run the extraction chain over ``results`` and return the first record.

    Args:
        results: Object describing the text to analyse (the notebook passes the
            keyword -> snippet dict built by ``find_context``). It is converted
            with ``str()`` and sent to the chain as its input text.

    Returns:
        dict: The first entry of the ``'script'`` list found under the chain's
        ``'data'`` key, or an empty dict when the chain returned no record
        (missing ``'script'`` key or empty list).
    """
    extraction_chain = create_extraction_chain(llm, schema)
    output = extraction_chain.run(str(results))['data']
    # The model may return nothing parseable, leaving 'script' missing or empty.
    # Return an empty record instead of raising KeyError/IndexError so a single
    # bad document does not abort a whole batch run.
    records = (output or {}).get('script') or []
    return records[0] if records else {}

In [None]:
import re
import os
def find_context(text, patterns=None, before=20, after=100):
    """Collect, for each keyword, the first surrounding snippet that contains a digit.

    For every keyword the text is scanned left to right. Each occurrence is
    widened to a window of ``before`` characters ahead of it and ``after``
    characters behind it (clipped to the text bounds). The first window that
    contains at least one digit is stored and the scan moves to the next keyword.

    Args:
        text: The document text to search.
        patterns: Iterable of keywords, matched literally. Defaults to the five
            financial keywords used by this notebook.
        before: Number of characters kept in front of a match.
        after: Number of characters kept after the end of a match.

    Returns:
        dict: Maps each keyword that had a digit-bearing window to that window.
        Keywords with no such occurrence are omitted.
    """
    if patterns is None:
        patterns = ['经营活动', '股东的净利润', '营业收入', '资产处置损益', '稀释每股收益']
    digit_pattern = re.compile(r'\d')
    results = {}
    for pattern in patterns:
        # Escape so that caller-supplied keywords containing regex
        # metacharacters are still matched as plain text.
        for match in re.finditer(re.escape(pattern), text):
            start = max(0, match.start() - before)
            end = min(len(text), match.end() + after)
            context = text[start:end]
            # Windows without any digit cannot hold a figure; try the next hit.
            if digit_pattern.search(context):
                results[pattern] = context
                break
    return results

# Batch driver: for every report file in the folder, pull the keyword snippets,
# run the LLM extraction on them, tag the record with stock code and year, and
# print it.
# os.path.join builds the folder path with the platform separator instead of a
# hard-coded backslash.
folder_path = os.path.join('.', '2022年上市公司年报')
year = '2022'
# Sorted so that repeated runs process the reports in the same order.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    # Skip sub-directories and other non-file entries, which open() cannot read.
    if not os.path.isfile(file_path):
        continue
    # Read the file content; the handle is closed before the slow LLM call.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # The stock code is the part of the file name before the first '-'.
    stock_code = filename.split('-')[0]

    results = find_context(content)
    js = getjson(results)
    js['股票代码'] = stock_code
    js['年份'] = year
    print(js)

