In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import os
# os.chdir('/content/drive/MyDrive/MANAGER_implementation/code/Text_preprocessing/')

In [None]:
# !pip install neo4j



In [None]:
# Library imports (standard library first, then third-party)
import os
import re
from getpass import getpass

from neo4j import GraphDatabase

# Neo4j connection settings. They are read from environment variables so that
# credentials are never saved inside the notebook file; adjust to your setup.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
AUTH = (
    # User name; "neo4j" is only a fallback — set NEO4J_USER for your database
    os.environ.get("NEO4J_USER", "neo4j"),
    # Prompt interactively when NEO4J_PASSWORD is not set
    os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: "),
)

# Create the Neo4j driver used by the query functions in this notebook
driver = GraphDatabase.driver(URI, auth=AUTH)

# Load the entity list from Neo4j once so it can be cached in memory
def load_all_entities_from_neo4j():
    """Fetch every distinct ``Entity.name`` from Neo4j as a lower-cased set.

    Uses the module-level ``driver`` to open a session and run a single query.

    Returns:
        set[str]: Lower-cased entity names. Rows whose ``name`` is missing
        (null) or is not a string are skipped, so one malformed node cannot
        abort the whole load with an ``AttributeError`` on ``.lower()``.
    """
    query = "MATCH (e:Entity) RETURN DISTINCT e.name as name"
    with driver.session() as session:
        names = (record["name"] for record in session.run(query))
        # Build the set inside the ``with`` block so the result is consumed
        # while the session is still open.
        return {name.lower() for name in names if isinstance(name, str)}

# Cached entity names, loaded once via load_all_entities_from_neo4j()
cached_entities = load_all_entities_from_neo4j()

# Find which of the cached FinDKG entities are mentioned in the input text
def identify_entities_in_text(T, cached_entities):
    """Return the cached entities that occur in ``T`` as whole phrases.

    Matching is case-insensitive and requires that the entity is not glued to
    a neighbouring word character, so ``"apple"`` matches ``"Apple rose"`` but
    not ``"pineapple"``. Overlapping entities are all reported.

    Args:
        T (str): Input text.
        cached_entities (Iterable[str]): Candidate entity names.

    Returns:
        set[str]: Members of ``cached_entities`` (in their original form)
        that were found in ``T``. Empty names are never reported.
    """
    identified_entities = set()
    T_lower = T.lower()  # lower-case once; every comparison below is case-insensitive

    # Iteration order does not matter: every match is collected into a set.
    for entity in cached_entities:
        entity_lower = entity.lower()
        # Cheap substring pre-filter: a regex hit implies the literal text is
        # present, so most candidates are rejected without compiling a pattern.
        # It also skips empty names, which would otherwise match anywhere.
        if not entity_lower or entity_lower not in T_lower:
            continue
        # Lookarounds instead of \b: \b needs a word character on one side, so
        # names that start or end with punctuation (e.g. "u.s.", "... inc.")
        # could never match when followed by a space or end of text.
        pattern = r'(?<!\w)' + re.escape(entity_lower) + r'(?!\w)'
        if re.search(pattern, T_lower):
            identified_entities.add(entity)

    return identified_entities

# Pull the external knowledge N(e) from Neo4j for every entity found in the text
def extract_external_knowledge(T, cached_entities, start_time=None, end_time=None):
    """Collect outgoing relations from Neo4j for each entity mentioned in ``T``.

    Entities are detected with ``identify_entities_in_text``; for each one the
    outgoing relationships of the ``Entity`` node with that exact ``name`` are
    queried through the module-level ``driver``.

    Args:
        T: Input text.
        cached_entities: Candidate entity names handed to the matcher.
        start_time: Optional exclusive lower bound (adds ``r.time > $start_time``).
        end_time: Optional exclusive upper bound (adds ``r.time < $end_time``).
            NOTE(review): the example call in this notebook passes
            'YYYY-MM-DD' strings; how ``r.time`` is stored is not visible
            here — confirm the expected format.

    Returns:
        dict: Maps each detected entity to a list of
        ``{"relation": ..., "neighbor_entity": ...}`` dicts. Entities without
        any matching relationship map to an empty list. ``r.time`` is returned
        by the query but is not copied into the result.
    """
    matched = identify_entities_in_text(T, cached_entities)

    # The Cypher text depends only on which bounds were supplied, so it is
    # assembled once and reused for every entity.
    cypher_parts = [
        "\n            MATCH (e:Entity {name: $entity})-[r]->(neighbor)"
        "\n            WHERE 1=1"
        "\n            "
    ]
    if start_time is not None:
        cypher_parts.append(" AND r.time > $start_time")
    if end_time is not None:
        cypher_parts.append(" AND r.time < $end_time")
    # RETURN clause
    cypher_parts.append(
        "\n            RETURN r.relation as relation, r.time as time, neighbor.name as neighbor_entity"
        "\n            "
    )
    cypher = "".join(cypher_parts)

    knowledge = {}
    with driver.session() as session:
        for entity in matched:
            # Both bounds are always sent as parameters; an unused one is simply None
            rows = session.run(cypher, entity=entity, start_time=start_time, end_time=end_time)
            knowledge[entity] = [
                {
                    "relation": row["relation"],
                    "neighbor_entity": row["neighbor_entity"],
                }
                for row in rows
            ]

    return knowledge

In [15]:
# Inspect the cached entity names (echoes the whole set as the cell output)
cached_entities

{'libya',
 'u.s. employment data',
 't. rowe price group inc.',
 'u.s.-taliban talks',
 'digital transformation',
 'us presidential election',
 'william ackman',
 'tempe',
 'firms',
 'us stocks',
 'united states currency',
 'direct cash payments',
 'nbc',
 'border',
 'covid-19 outbreak',
 'mr. johnson',
 'jobless claims data',
 'solicitor general noel francisco',
 'ukraine policy',
 'decision',
 'mark mccormick',
 'u.s. tax law',
 'climate goals',
 'education system',
 't. rowe price associates inc.',
 'covid-19 treatment',
 'federal housing administration',
 'john j. ray',
 'uzbekistan',
 'energy information administration',
 'new england patriots',
 'phil flynn',
 'katie roof',
 'pan am flight 103',
 'condolences',
 'talks with u.s.',
 'energy shortages',
 'jim paulsen',
 'electricity market',
 'debt auctions',
 'coronavirus spread',
 'knesset',
 'immigration system',
 'bessemer venture partners',
 'oil revenues',
 'legal strategy',
 'mark begich',
 'afghan interpreters',
 'tightenin

### 테스트

In [3]:
# Smoke test: run the extraction on a short sample text with both time bounds set
T_example = """
    President Trump Administration had an influence on the Volcker rule.
    apple is a company that produces consumer electronics. stock price of apple is 1000 dollars.
    """
external_knowledge = extract_external_knowledge(
    T_example,
    cached_entities,
    start_time='2018-05-01',
    end_time='2018-06-03',
)
print("Extracted Knowledge:", external_knowledge)

Extracted Knowledge: {'influence': [], 'consumer': [{'relation': 'control', 'neighbor_entity': 'interest rates'}, {'relation': 'relate_to', 'neighbor_entity': 'economic conditions'}, {'relation': 'participates_in', 'neighbor_entity': 'global markets'}, {'relation': 'raise', 'neighbor_entity': 'spending'}, {'relation': 'impact', 'neighbor_entity': 'wage gains'}], 'president trump administration': [{'relation': 'raise', 'neighbor_entity': 'china'}, {'relation': 'operate_in', 'neighbor_entity': 'us government'}, {'relation': 'control', 'neighbor_entity': 'north korea'}, {'relation': 'control', 'neighbor_entity': 'qualcomm inc.'}, {'relation': 'relate_to', 'neighbor_entity': 'ford motor co.'}, {'relation': 'introduce', 'neighbor_entity': 'tariffs'}, {'relation': 'control', 'neighbor_entity': 'zte corp.'}, {'relation': 'control', 'neighbor_entity': 'north american free trade agreement'}, {'relation': 'is_member_of', 'neighbor_entity': 'north korean leader kim jong un'}, {'relation': 'contro

In [9]:
# Show only the entities for which at least one relation was returned
for k, v in external_knowledge.items():
    if not v:
        continue
    print(f"Entity: {k}")
    print(v)

Entity: consumer
[{'relation': 'control', 'neighbor_entity': 'interest rates'}, {'relation': 'relate_to', 'neighbor_entity': 'economic conditions'}, {'relation': 'participates_in', 'neighbor_entity': 'global markets'}, {'relation': 'raise', 'neighbor_entity': 'spending'}, {'relation': 'impact', 'neighbor_entity': 'wage gains'}]
Entity: president trump administration
[{'relation': 'raise', 'neighbor_entity': 'china'}, {'relation': 'operate_in', 'neighbor_entity': 'us government'}, {'relation': 'control', 'neighbor_entity': 'north korea'}, {'relation': 'control', 'neighbor_entity': 'qualcomm inc.'}, {'relation': 'relate_to', 'neighbor_entity': 'ford motor co.'}, {'relation': 'introduce', 'neighbor_entity': 'tariffs'}, {'relation': 'control', 'neighbor_entity': 'zte corp.'}, {'relation': 'control', 'neighbor_entity': 'north american free trade agreement'}, {'relation': 'is_member_of', 'neighbor_entity': 'north korean leader kim jong un'}, {'relation': 'control', 'neighbor_entity': 'ice'},

In [None]:
# Flatten relation / neighbor-entity pairs into one '/'-delimited string
def knowledge_to_text(knowledge: dict) -> str:
    """Join every (relation, neighbor_entity) pair into one '/'-separated string.

    Example:
        ``{"fed": [{"relation": "impact", "neighbor_entity": "stocks"},
        {"relation": "raise", "neighbor_entity": "rates"}]}``
        becomes ``"impact/stocks/raise/rates"``.

    Args:
        knowledge: Maps an entity to a list of dicts that each carry the keys
            ``"relation"`` and ``"neighbor_entity"``.

    Returns:
        The pairs of all entities, in iteration order, joined by ``'/'``.
        The entity keys themselves are not part of the output; an empty
        mapping yields ``""``.
    """
    return "/".join(
        f"{fact['relation']}/{fact['neighbor_entity']}"
        for facts in knowledge.values()
        for fact in facts
    )

In [None]:
# Flatten the extracted knowledge into one '/'-delimited string
knowledge_seq = knowledge_to_text(external_knowledge)
# Split it back into a flat list of relation / neighbor tokens (assumes no name
# contains '/'). The list is only shown as the cell output, not assigned.
knowledge_seq.split('/')

# chatglm

In [None]:
# Install the dependency libraries into the kernel's environment
%pip install sentence-transformers==2.2.2
# "torch>=2.0" must be quoted: unquoted, the shell treats ">" as an output
# redirect, so pip would install an unconstrained torch and write its log to a
# file named "=2.0".
%pip install protobuf transformers==4.30.2 cpm_kernels "torch>=2.0" gradio mdtex2html sentencepiece accelerate

Collecting sentence-transformers==2.2.2
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.6.0->sentence-transformers==2.2.2)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.6.0->sentence-transformers==2.2.2)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.6.0->sentence-transformers==2.2.2)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
# Hugging Face model id (example; another checkpoint can be substituted)
model_name = "THUDM/chatglm2-6b"
# NOTE(review): trust_remote_code=True runs the Python files downloaded from the
# model repository (e.g. tokenization_chatglm.py, modeling_chatglm.py). The
# download log recommends pinning a revision so that code cannot change silently.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
chatglm_model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

tokenization_chatglm.py:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/THUDM/chatglm2-6b:
- tokenization_chatglm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.model:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

configuration_chatglm.py:   0%|          | 0.00/2.33k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/THUDM/chatglm2-6b:
- configuration_chatglm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_chatglm.py:   0%|          | 0.00/54.9k [00:00<?, ?B/s]

quantization.py:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/THUDM/chatglm2-6b:
- quantization.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/THUDM/chatglm2-6b:
- modeling_chatglm.py
- quantization.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin.index.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]

pytorch_model-00001-of-00007.bin:   0%|          | 0.00/1.83G [00:00<?, ?B/s]

pytorch_model-00002-of-00007.bin:   0%|          | 0.00/1.97G [00:00<?, ?B/s]

pytorch_model-00003-of-00007.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

pytorch_model-00004-of-00007.bin:   0%|          | 0.00/1.82G [00:00<?, ?B/s]

pytorch_model-00005-of-00007.bin:   0%|          | 0.00/1.97G [00:00<?, ?B/s]

pytorch_model-00006-of-00007.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

pytorch_model-00007-of-00007.bin:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
chatglm_model  # show the model architecture

GLMTransformer(
  (layers): ModuleList(
    (0-27): 28 x GLMBlock(
      (input_layernorm): RMSNorm()
      (self_attention): SelfAttention(
        (query_key_value): Linear(in_features=4096, out_features=4608, bias=True)
        (core_attention): CoreAttention(
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (dense): Linear(in_features=4096, out_features=4096, bias=False)
      )
      (post_attention_layernorm): RMSNorm()
      (mlp): MLP(
        (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
        (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
      )
    )
  )
  (final_layernorm): RMSNorm()
)

### $H = E(X_t)$에서 $X_t$는 서브워드 단위가 아닙니다. 따라서 아래 방법이 아니라, 먼저 nltk로 각 단어를 구분(토큰화)한 뒤 $E$로 임베딩하는 방법을 사용해야 합니다.
### 아래 방법은 실패했습니다.

In [None]:
text = T_example
# Tokenize the sample text and take the id sequence of the single input
encoded = tokenizer(text, return_tensors='pt')
input_ids = encoded["input_ids"][0]   # shape: (seq_len,)

print("input_ids:", input_ids)
# Illustrative form only, e.g. tensor([ 101, 7592, 2088,  102]).
# NOTE(review): these sample ids look BERT-style; the ids this tokenizer
# actually produces are the ones in the cell output.

# ID -> token strings
tokens = tokenizer.convert_ids_to_tokens(input_ids)
print("tokens:", tokens)
# Illustrative form only, e.g. ["[CLS]", "hello", "world", "[SEP]"].
# The recorded output shows subword pieces instead (e.g. '▁Vol', 'cker').

input_ids: tensor([64790, 64792, 30910,    13,   296, 24277,  3586,  7745,   599,   284,
         5036,   331,   267,  3967, 27392,  4606, 30930,    13,   296, 30959,
         7021, 30516,  1645, 30930,   629, 18618,   466, 30930, 30937, 30930,
         5468, 12080,  5250, 30930,  1270,  1656,    13,   296])
tokens: ['', '', '▁', '<0x0A>', '▁▁▁▁', 'President', '▁Trump', '▁Administration', '▁had', '▁an', '▁influence', '▁on', '▁the', '▁Vol', 'cker', '▁rule', '.', '<0x0A>', '▁▁▁▁', 'W', 'ells', '▁Fargo', '▁Co', '.', '▁also', '▁impacted', '▁U', '.', 'S', '.', '▁Federal', '▁Reserve', '▁policies', '.', '▁+', '▁Ne', '<0x0A>', '▁▁▁▁']
