行情数据临时处理，将 JSON 数据转换为 yaml 格式，案例内部默认数据配置

In [None]:
import difflib
import jieba
import numpy as np
from scipy.spatial.distance import cosine

# 考虑字符串长度不一致的新算法
def calculate_similarity(str1, str2):
    # 统一转换为小写
    str1, str2 = str1.lower(), str2.lower()
    
    # 基本滑动窗口匹配
    max_char_ratio = 0
    str1_len = len(str1)
    
    for i in range(len(str2) - str1_len + 1):
        window = str2[i:i + str1_len]
        ratio = difflib.SequenceMatcher(None, str1, window).ratio()
        max_char_ratio = max(max_char_ratio, ratio)
    
    # 使用jieba分词进行语义匹配
    words1 = list(jieba.cut(str1))
    words2 = list(jieba.cut(str2))
    
    # 创建词汇表和向量
    vocab = list(set(words1) | set(words2))
    vector1 = np.zeros(len(vocab))
    vector2 = np.zeros(len(vocab))
    
    # 统计词频
    for i, word in enumerate(vocab):
        vector1[i] = words1.count(word)
        vector2[i] = words2.count(word)
    
    # 计算语义相似度
    semantic_similarity = 1 - cosine(vector1, vector2) if np.sum(vector1) > 0 and np.sum(vector2) > 0 else 0
    
    # 综合考虑字符匹配和语义匹配
    combined_similarity = 0.6 * max_char_ratio + 0.4 * semantic_similarity
    
    return combined_similarity

local_str = '維天運通 LOGORY'
network_str = '合肥維天運通信息科技股份有限公司'
loc = 2

# 计算相似度
print('滑动窗口最佳匹配相似度:', calculate_similarity(local_str, network_str))

# 对比不同方法的结果
print('原始完整字符比较:', difflib.SequenceMatcher(None, local_str, network_str).ratio())
print('定位串字符相似度:', difflib.SequenceMatcher(None, network_str[loc:len(local_str)+loc].lower(), local_str.lower()).ratio())

滑动窗口最佳匹配相似度: 0.36363636363636365
原始完整字符比较: 0.2962962962962963
定位串字符相似度: 0.36363636363636365


In [10]:
import jieba
import numpy as np
from scipy.spatial.distance import cosine

# 定义两个字符串
str1 = "維天運通 LOGORY"
str2 = "合肥維天運通信息科技股份有限公司"

# 统一处理字符串，英文转换为小写
str1 = str1.lower()
str2 = str2.lower()

# 找到第二个字符串中第一个字符的位置
index = str2.find(str1[0])

# 如果找到了，截取第二个字符串
if index != -1:
    str2 = str2[index:index+len(str1)]

# 使用 jieba 库进行分词
words1 = list(jieba.cut(str1))
words2 = list(jieba.cut(str2))

# 创建词汇表
vocab = list(set(words1) | set(words2))

# 初始化词向量
vector1 = np.zeros(len(vocab))
vector2 = np.zeros(len(vocab))

# 统计词频
for i, word in enumerate(vocab):
    vector1[i] = words1.count(word)
    vector2[i] = words2.count(word)

# 计算两个字符串的相似度（以余弦距离作为指标）
similarity = 1 - cosine(vector1, vector2)

print("两个字符串的相似度是：", similarity)

bstr = 'xxx'
print(bstr)

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\core\AppData\Local\Temp\jieba.cache
Loading model cost 0.610 seconds.
Prefix dict has been built successfully.


两个字符串的相似度是： 0.44721359549995787
xxx


In [12]:
# 普通时间格式，转换为 cron 格式
import datetime

init_time = datetime.datetime.strptime("08:50", "%H:%M")
close_price = datetime.datetime.strptime("13:58", "%H:%M")

init_time_cron = f"{init_time.minute} {init_time.hour} * * 1-5"
close_price_cron = f"{close_price.minute} {close_price.hour} * * 1-5"

print("Init Time Cron:", init_time_cron)
print("Close Price Cron:", close_price_cron)

Init Time Cron: 50 8 * * 1-5
Close Price Cron: 58 13 * * 1-5


In [2]:
import pymysql

# connect mysql and select data
def connect_mysql(sql):
    conn = pymysql.connect(host='192.168.19.146', user='root', port=5306, passwd='sErjf&eNh9zv4CSV', db='ykcz_trade', charset='utf8')
    cur = conn.cursor()
    cur.execute(sql)
    # 是否为空
    if cur.rowcount == 0:
        print('No data')
        return None

    data = cur.fetchall()
    # 获取字段名
    field = [i[0] for i in cur.description]
    # 将字段名和数据组合成字典
    data_dict = [dict(zip(field, i)) for i in data]

    cur.close()
    conn.close()

    return data_dict

# compare dict
def compare_dict(dict1, dict2):
    for key in dict1:
        if dict1[key] != dict2[key]:
            # 打印 右对齐
            print('{0:20} {1:20} {2:20}'.format(key, dict1[key], dict2[key]))


if __name__ == '__main__':
    # qas 数据
    data_153 = connect_mysql("select * from ykcz_trade.tsecu_product where product_code = '00152'")
    # pubber 数据
    data_152 = connect_mysql("select * from ykcz_trade.tsecu_product where product_code = '00153'")

    # 其中一个为空
    if data_153 is None or data_152 is None:
        print('No data')
        exit()
        
    compare_dict(data_153[0], data_152[0])

product_code         00152                00153               
product_name         SHENZHEN INT'L       TENCENT             
product_name_tc      深圳國際                 騰訊控股                
product_name_sc      深圳国际                 腾讯控股                
product_long_name    SHENZHEN INT'L       TENCENT             
product_long_name_tc 深圳國際                 騰訊控股                
product_long_name_sc 深圳国际                 腾讯控股                
close_price                      7.580000             0.104000
close_price_prev                 6.500000             8.600000
isin                 BMG8086V1467         KYG875721634        
lot_size                       500.000000           100.000000
listing_date                     19720925             20040616
remarks              close_price [1]                          
last_upd_user        QUOTASVR_STATISTICS  QUOTASVR_PUBBER_STATISTICS
last_upd_dt          20 20
operlog_guid         d734a005f8d540d4b30962648bfda581 efe5f535aea44a89b09f42a6c6a9f4e