In [13]:
import xml.etree.ElementTree as ET
import json
import os

def parse_WS(ws):
    """Collect the header fields stored under a `WS` element.

    Every direct child of `ws` must carry both a `nameCN` and a `value`
    attribute; a missing attribute raises KeyError.

    Args:
        ws: an ElementTree element whose children are the header fields.

    Returns:
        A 4-tuple `(fields, AJLB, WSZL, JBFY)` where `fields` maps each child's
        `nameCN` to its `value`, and the three strings are the values of the
        last `AJLB`, `WSZL` and `JBFY`/`CBJG` children ('' when absent).
    """
    # Tags whose value is also returned on its own; CBJG shares JBFY's slot.
    slot_of_tag = {'AJLB': 0, 'WSZL': 1, 'JBFY': 2, 'CBJG': 2}
    picked = ['', '', '']
    fields = {}
    for node in ws:
        value = node.attrib['value']
        fields[node.attrib['nameCN']] = value
        slot = slot_of_tag.get(node.tag)
        if slot is not None:
            picked[slot] = value
    return fields, picked[0], picked[1], picked[2]


def parse_WW(ww):
    """Collect the names listed in the member groups under a `WW` element.

    Three kinds of direct children are inspected, each with its own layout:
      * `JCZZ`     -> `R_JCRY` children -> `XM` grandchildren
      * `SPZZCY`   -> `SPRYXM` children
      * `CUS_FGCY` -> `FGRYXM` children
    The `value` attribute of each matching element is taken; a matching
    element without `value` raises KeyError.

    Args:
        ww: an ElementTree element.

    Returns:
        List of the collected `value` strings in document order (duplicates
        are kept).
    """
    # Relative path, from each group element, to the elements holding a name.
    name_path_by_group = {
        'JCZZ': 'R_JCRY/XM',
        'SPZZCY': 'SPRYXM',
        'CUS_FGCY': 'FGRYXM',
    }
    names = []
    for group in ww:
        name_path = name_path_by_group.get(group.tag)
        if name_path is None:
            continue
        names.extend(holder.attrib['value'] for holder in group.findall(name_path))
    return names


def parse_SSJL(ssjl):
    """Return the `value` of the first `AY`/`QSZAY` child that has one.

    Children with those tags but no `value` attribute are skipped. The lookup
    uses an explicit attribute check instead of a bare `except`, so unrelated
    errors (and KeyboardInterrupt) are no longer silently swallowed.

    Args:
        ssjl: an ElementTree element.

    Returns:
        The attribute value as a string, or '' when no child qualifies.
    """
    for child in ssjl:
        if child.tag in ('AY', 'QSZAY') and 'value' in child.attrib:
            return child.attrib['value']
    return ''


def parse_CUS_AY(cus_ay):
    """Return the `oValue` attribute of the first `CUS_AY` child.

    Args:
        cus_ay: an ElementTree element whose direct children are searched.

    Returns:
        The `oValue` string of the first `CUS_AY` child, or '' when that child
        has no `oValue` or when there is no `CUS_AY` child at all. Always a
        string: falling off the end used to return None, which callers
        comparing the result with '' mistook for a real value.
    """
    for child in cus_ay:
        if child.tag == 'CUS_AY':
            # Only the first CUS_AY child is consulted, as before.
            return child.attrib.get('oValue', '')
    return ''


def parse_xml(path):
    """Convert one case XML file into a JSON-serialisable dict.

    Args:
        path: file name (or file object) handed to `ET.parse`.

    Returns:
        None when the file cannot be read or parsed; otherwise a tuple
        `(json_file, QW_text)`:
          * `json_file` maps each top-level element's `nameCN` to its `value`
            (the `QW` element becomes a nested dict of its children, `FT`
            values are gathered in the list '法条', `FTRY` and `QW/CUS_SJX`
            are ignored), plus a summary dict under '案例属性'.
          * `QW_text` is the `value` attribute of the `QW` element ('' if the
            file has no `QW`).

    Raises:
        KeyError: a `QW`, `QW/WS` or `FT` element lacks a required attribute,
            a generic `QW` child has `value` but no `nameCN`, or one of the
            section parsers meets a malformed element.
    """
    try:
        root = ET.parse(path).getroot()
    except Exception:
        # Best effort: any read/parse failure marks the file as unusable.
        # `Exception` (not a bare except) lets KeyboardInterrupt through.
        return None

    json_file = {}
    AJLB, WSZL, JBFY, FGCY, AY = '', '', '', [], ''
    QW_text = ''
    for child in root:
        attrib = child.attrib
        if child.tag == 'QW':
            QW = {}
            QW_text = attrib['value']
            for sub_child in child:
                sub_attrib = sub_child.attrib
                # True when the element can be stored as a nameCN -> value pair.
                has_pair = 'nameCN' in sub_attrib and 'value' in sub_attrib
                if sub_child.tag == 'WS':
                    res, AJLB, WSZL, JBFY = parse_WS(sub_child)
                    QW[sub_attrib['nameCN']] = res
                elif sub_child.tag == 'WW':
                    FGCY = parse_WW(sub_child)
                    if has_pair:
                        QW[sub_attrib['nameCN']] = sub_attrib['value']
                elif sub_child.tag == 'SSJL':
                    AY = parse_SSJL(sub_child)
                    if has_pair:
                        QW[sub_attrib['nameCN']] = sub_attrib['value']
                elif sub_child.tag == 'CUS_SJX':
                    continue
                else:
                    if 'value' not in sub_attrib:
                        print(f'in file: {path}, QW->{sub_child.tag} does not have key `value`')
                        continue
                    QW[sub_attrib['nameCN']] = sub_attrib['value']
            json_file[attrib['nameCN']] = QW
        elif child.tag == 'FTRY':
            continue
        elif child.tag == 'FT':
            json_file.setdefault('法条', []).append(attrib['value'])
        elif child.tag == 'CUS_FJD':
            tmp = parse_CUS_AY(child)
            # Only a non-empty result overrides the value from SSJL; the
            # truthiness test also rejects None, which must never reach AY.
            if tmp:
                AY = tmp
        else:
            # Elements without both attributes are silently ignored.
            if 'nameCN' in attrib and 'value' in attrib:
                json_file[attrib['nameCN']] = attrib['value']
    json_file['案例属性'] = {
        "案件类别": AJLB,
        "文书种类": WSZL,
        "经办法院": JBFY,
        "法官成员": FGCY,
        "案由": AY
    }
    return json_file, QW_text


In [None]:
# Convert the first indictment (文书种类 == '起诉书') found under `file_path`
# into a JSON file in the current working directory, then stop.
file_path = '/Legal_data/'
file_names = os.listdir(file_path)
for file_name in file_names:
    parsed = parse_xml(file_path+file_name)
    if parsed is None:
        # parse_xml could not read or parse this file.
        continue
    # parse_xml returns (document, full text); only the document is saved.
    json_file = parsed[0]
    if json_file['案例属性']['文书种类'] == '起诉书':
        save_path = file_name.replace('.xml', '.json')
        # NOTE(review): the file is written with the platform default encoding
        # while ensure_ascii=False emits raw non-ASCII text - confirm that the
        # cell reading these files back uses the same encoding.
        with open(save_path, 'w') as f:
            json.dump(json_file, f, ensure_ascii=False)
        print(f'起诉书案例: {file_name}')
        break

Connect with CA

In [10]:
import getpass
import os

from elasticsearch import Elasticsearch

# Password for the 'elastic' user generated by Elasticsearch.
# It is read from the ELASTIC_PASSWORD environment variable, or prompted for,
# so that the credential is never stored in the notebook.
# NOTE(review): the password previously committed here should be rotated.
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD") or getpass.getpass(
    "Password for the 'elastic' user: "
)

# Create the client instance
es = Elasticsearch(
    "https://localhost:9200",
    ca_certs="./elasticsearch-8.2.0/config/certs/http_ca.crt",
    basic_auth=("elastic", ELASTIC_PASSWORD),
)

# Successful response!
es.info()
# {'name': 'instance-0000000000', 'cluster_name': ...}

ObjectApiResponse({'name': 'LAPTOP-0DFL83JT', 'cluster_name': 'elasticsearch', 'cluster_uuid': '38IaAKHRTOq5kRIgJnu01Q', 'version': {'number': '8.2.0', 'build_flavor': 'default', 'build_type': 'zip', 'build_hash': 'b174af62e8dd9f4ac4d25875e9381ffe2b9282c5', 'build_date': '2022-04-20T10:35:10.180408517Z', 'build_snapshot': False, 'lucene_version': '9.1.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

Connect without CA

In [3]:
# Alternative connection over plain HTTP, without certificate or credentials.
# Running this cell rebinds the global `es` used by every later cell.
from elasticsearch import Elasticsearch
es = Elasticsearch(
    "http://localhost:9200",
)

# Successful response!
es.info()

ObjectApiResponse({'name': 'LAPTOP-0DFL83JT', 'cluster_name': 'elasticsearch', 'cluster_uuid': '38IaAKHRTOq5kRIgJnu01Q', 'version': {'number': '8.2.0', 'build_flavor': 'default', 'build_type': 'zip', 'build_hash': 'b174af62e8dd9f4ac4d25875e9381ffe2b9282c5', 'build_date': '2022-04-20T10:35:10.180408517Z', 'build_snapshot': False, 'lucene_version': '9.1.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [6]:
# Delete the scratch index "test-index" so the next cell can recreate it.
resp = es.indices.delete(index="test-index")
print(resp)

{'acknowledged': True}


In [7]:
# Index settings: the analyzer named "default" is set to ik_max_word and the
# one named "default_search" to ik_smart.
# NOTE(review): presumably both come from the IK analysis plugin - confirm it
# is installed on the cluster.
settings = {
    "analysis": {
        "analyzer": {
            "default": {
                "type": "ik_max_word",
                "tokenizer": "ik_max_word",
            },
            "default_search": {
                "type": "ik_smart",
                "tokenizer": "ik_smart",
            }
        }
    }
}
# Mapping: a single explicit field, "vector", declared as an indexed
# 10-dimensional dense_vector compared with l2_norm similarity.
mappings = {
    "properties": {
      "vector": {
        "type": "dense_vector",
        "dims": 10,
        "index": True,
        "similarity": "l2_norm"
      }
    }
}
# Create the scratch index with the settings and mapping above.
res = es.indices.create(index="test-index", settings=settings, mappings=mappings)
print(res)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'test-index'}


In [None]:
# Fetch and print the definition of "test-index" as stored by the cluster.
resp = es.indices.get(index="test-index")
print(resp)

In [None]:
# Run the analyze API of "test-index" on a sample phrase and print the tokens.
resp = es.indices.analyze(index="test-index", text="知识产权")
print(resp)

In [None]:
# Index every *.json file of the current directory into "test-index".
import json
import os

cur_paths = os.listdir('./')
# Keep only the names ending in ".json".
file_paths = list(filter(lambda x:x[-5:]==".json", cur_paths))

for file_path in file_paths:
    print(f"index {file_path}")
    # NOTE(review): the file is opened with the platform default encoding -
    # confirm it matches the encoding used when the JSON files were written.
    with open(file_path) as f:
        doc = json.load(f)
    # No id is given, so Elasticsearch assigns one.
    resp = es.index(index="test-index", document=doc)
    print(resp)

In [1]:
# Load the stop-word list: one entry per line of ./stop_words (UTF-8).
with open('./stop_words', 'r', encoding='utf-8') as f:
    stop_words = f.read().split('\n')

In [None]:
# Experiment: tokenize a few sample sentences with jieba and drop stop words.
# NOTE(review): TfidfVectorizer is imported but not used in this cell.
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
corpus = [
    '犯罪嫌疑人是一个罪犯',
    '这是一个有趣的灵魂',
    '文本相似度',
    '如何阅读文献找到文献之间的相似度',
]

# Replace each sentence, in place, by its space-joined tokens minus stop words.
for idx in range(len(corpus)):
    text = corpus[idx]
    text = ' '.join(jieba.lcut(text))
    text = ' '.join(list(filter(lambda x:x not in stop_words, text.split(' '))))
    corpus[idx] = text
print(corpus)

# print(jieba.lcut('如何阅读文献找到文献之间的相似度', cut_all=False))

In [20]:

# Hand-written sample document (nested dicts, a list field and a plain string
# field) indexed into "test-index" to try out search and highlighting.
doc = {
    "全文": {
        "法院": "人民法院",
        "法官": "某某人",
        "案由": "未知",
        "案情": {
            "初始": "杀人未遂这是一段文本",
            "结果": "人没了",
            "法律": "知识产权，这是一个严肃的问题"
        }
    },
    "法条": ["劳动法", "知识产权法", "未成年人保护法"],
    "自定义": "暂时没有"
}
resp = es.index(index="test-index", document=doc)
print(resp)

{'_index': 'test-index', '_id': 'i5KVH4EBG8dmy1dH7VVO', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}


In [44]:
import traceback
def update_key(key_list, val, dic):
    """Write a highlighted value into a (possibly nested) document in place.

    The path `key_list` is followed through nested dicts. At the first value
    that is not a dict (or at the last key), the remaining keys are ignored -
    e.g. the trailing 'keyword' of a multi-field name - and:
      * a string value is replaced by `val`;
      * in a list value, the first item equal to `val` with its `<em>` /
        `</em>` tags stripped is replaced by `val`;
      * any other value is left unchanged.

    Args:
        key_list: non-empty sequence of keys, outermost first. It is not
            modified (earlier versions consumed the caller's list).
        val: the highlighted string containing `<em>...</em>` markers.
        dic: the document to update in place.

    Returns:
        None.

    Raises:
        IndexError: if `key_list` is empty. Any other failure (for example a
        missing key) is reported on stderr and the document is left as is.
    """
    key = key_list[0]
    try:
        v = dic[key]
        if isinstance(v, dict) and len(key_list) > 1:
            # Recurse on a copy of the remaining path so the caller's list
            # is left untouched.
            update_key(key_list[1:], val, v)
        elif isinstance(v, list):
            target = val.replace("<em>", "").replace("</em>", "")
            for idx, item in enumerate(v):
                if item == target:
                    v[idx] = val
                    break
        elif isinstance(v, str):
            dic[key] = val
    except Exception:
        # Best effort: report the actual exception instead of aborting the
        # caller's loop (print_stack only showed the call stack).
        traceback.print_exc()

In [3]:
# Search "test-index" and merge the highlight fragments of the first hit back
# into its _source.
resp = es.search(
    index="test-index",
    query={
        "query_string": {
            "query": "",
        }
    },
    fields=[
        "自定义*",
        # The comma after the next entry was missing, which silently turned
        # the last two patterns into the single pattern "全文*法条".
        "全文*",
        "法条",
    ],
    highlight={
        "fields": {
            "自定义*": {},
            "全文*": {},
            "法条": {}
        }
    },
)
print("Got %d Hits:" % resp['hits']['total']['value'])
for hit in resp['hits']['hits']:
    print(hit)
    source = hit['_source']
    # A hit carries no 'highlight' entry when nothing was highlighted.
    highlight = hit.get('highlight', {})
    for (key, val) in highlight.items():
        # Only the first fragment of each field is merged back.
        update_key(key.split('.'), val[0], source)
    print(source)
    break

Got 0 Hits:


In [46]:
# Offline check of update_key: highlight "法院" in the sample document using a
# multi-field style path whose last component ("keyword") is not a real key.
doc = {
    "全文": {
        "法院": "人民法院",
        "法官": "某某人",
        "案由": "未知",
        "案情": {
            "初始": "杀人未遂这是一段文本",
            "结果": "人没了",
            "法律": "知识产权，这是一个严肃的问题"
        }
    },
    "法条": ["劳动法", "知识产权法", "未成年人保护法"],
    "自定义": "暂时没有"
}
update_key("全文.法院.keyword".split("."), "人民<em>法院</em>", doc)
print(doc)
# Show the key path that was passed to update_key.
"全文.法院.keyword".split(".")

{'全文': {'法院': '人民<em>法院</em>', '法官': '某某人', '案由': '未知', '案情': {'初始': '杀人未遂这是一段文本', '结果': '人没了', '法律': '知识产权，这是一个严肃的问题'}}, '法条': ['劳动法', '知识产权法', '未成年人保护法'], '自定义': '暂时没有'}


['全文', '法院', 'keyword']

In [24]:
# Scratch check: a nested dict fetched from its parent is the same object, so
# writing through `t` is visible in `dic`.
dic = {
    "a": {
        "a": 0,
        "b": 1
    },
    "b": 0
}
t = dic["a"]
t["a"] = 1
dic

{'a': {'a': 1, 'b': 1}, 'b': 0}

In [26]:
# Scratch check: list.remove deletes the given value in place.
l = [0,1,2,3]
l.remove(0)
l

[1, 2, 3]

In [28]:

# Scratch check: strip the <em>/</em> highlight markers from a string.
text = "<em>知识产权</em><em>文字</em>"
text = text.replace("<em>", "")
text = text.replace("</em>", "")
text

'知识产权文字'

The cells below are for the project

In [None]:
# Fetch and print the definition of the "case-data" index.
resp = es.indices.get(index="case-data")
print(resp)

In [2]:
# Show how the ik_smart analyzer tokenizes a sample case number.
resp = es.indices.analyze(index="case-data", text="（2017）渝0237民初1622号",
                          analyzer="ik_smart",)
print(resp)

{'tokens': [{'token': '2017', 'start_offset': 1, 'end_offset': 5, 'type': 'ARABIC', 'position': 0}, {'token': '渝', 'start_offset': 6, 'end_offset': 7, 'type': 'CN_CHAR', 'position': 1}, {'token': '0237', 'start_offset': 7, 'end_offset': 11, 'type': 'ARABIC', 'position': 2}, {'token': '民初', 'start_offset': 11, 'end_offset': 13, 'type': 'CN_WORD', 'position': 3}, {'token': '1622号', 'start_offset': 13, 'end_offset': 18, 'type': 'TYPE_CQUAN', 'position': 4}]}


Load data

In [14]:
# Name of the index that the following cells create and fill.
INDEX = "new-case-data"

In [15]:
# Index settings: the analyzer named "default" is set to ik_max_word and the
# one named "default_search" to ik_smart.
# NOTE(review): presumably both come from the IK analysis plugin - confirm it
# is installed on the cluster.
settings = {
    "analysis": {
        "analyzer": {
            "default": {
                "type": "ik_max_word",
                "tokenizer": "ik_max_word",
            },
            "default_search": {
                "type": "ik_smart",
                "tokenizer": "ik_smart",
            }
        }
    }
}
# Mapping: "vec" is an indexed 100-dimensional dense_vector compared with
# l2_norm similarity; no other field is mapped explicitly.
mappings = {
    "properties": {
        "vec": {
            "type": "dense_vector",
            "dims": 100,
            "index": True,
            "similarity": "l2_norm"
        }
    }
}
# Create the index named by INDEX with the settings and mapping above.
res = es.indices.create(index=INDEX, settings=settings, mappings=mappings)
print(res)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'new-case-data'}


删库操作！！谨慎！！

In [49]:
# Destructive: deletes the whole "case-data" index.
# NOTE(review): later cells still query "case-data" - confirm the intended
# execution order before running this.
resp = es.indices.delete(index="case-data")
print(resp)

{'acknowledged': True}


In [17]:
# Load the word vectors (word2vec text format) and the project stop-word list.
# NOTE(review): OrderedDict is imported but not used in this cell.
from gensim.models import KeyedVectors
from collections import OrderedDict

tc_wv_model = KeyedVectors.load_word2vec_format('./search-engine/backend/data/embedding.txt', binary=False)

# One stop word per line; this rebinds the global `stop_words`.
with open('./search-engine/backend/data/stop_words', 'r', encoding='utf-8') as f:
    stop_words = f.read().split('\n')

In [22]:
# Scratch check: membership test on a set.
a_set = set([1,2,3])
print(4 in a_set)

False


In [None]:
# Parse every case file, attach an averaged word-vector to it and index it
# under the id given by its position in the directory listing.
import os
import numpy as np
import jieba

# Position in the listing to resume from after an interrupted run.
START_IDX = 0
stop_words_set = set(stop_words)

file_paths = os.listdir('./Legal_data/')
for idx in range(START_IDX, len(file_paths)):
    file_path = file_paths[idx]
    # print(f"index {file_path}", end=" ")
    try:
        parsed = parse_xml('./Legal_data/'+file_path)
    except Exception:
        # Malformed documents (e.g. missing attributes) are skipped.
        continue
    if parsed is None:
        # parse_xml returns None (not a tuple) for unreadable files.
        continue
    doc, text = parsed
    text = [word for word in jieba.lcut(text) if word not in stop_words_set]
    vec = np.zeros(100)
    for word in text:
        try:
            vec += np.array(tc_wv_model[word])
        except KeyError:
            # Word is not in the embedding vocabulary.
            continue
    # The sum is divided by the number of tokens, including out-of-vocabulary
    # ones. An empty token list keeps the zero vector instead of producing
    # NaN, which cannot be indexed.
    if text:
        vec /= len(text)
    doc['vec'] = vec.tolist()
    resp = es.index(index=INDEX, document=doc, id=idx)
    # print(resp)
    print(f"{idx}", end=" ")

In [22]:
# Look up which file sits at position 28664 of the directory listing.
file_paths[28664]

'49489.xml'

In [None]:
# Full-text search on "case-data", then drop highlight entries of the first
# hit whose text duplicates an earlier entry.
resp = es.search(
    index="case-data",
    from_=0,
    query={
        "query_string": {
            "query": "刑法",
        }
    },
    fields=[
        "自定义*",
        # The comma after the next entry was missing, which silently turned
        # the last two patterns into the single pattern "全文*法条".
        "全文*",
        "法条",
    ],
    highlight={
        "fields": {
            "自定义*": {},
            "全文*": {},
            "法条": {}
        }
    }
)

print("Got %d Hits:" % resp['hits']['total']['value'])
print(len(resp['hits']['hits']))
# Guard against an empty result page and against a hit without highlights.
highlight = resp['hits']['hits'][0].get('highlight', {}) if resp['hits']['hits'] else {}
print('before filter')
print(highlight)
val_set = set()
unique_highlight = dict()
for (key, val) in highlight.items():
    # Fields are compared on the concatenation of their fragments; the first
    # field producing a given text wins.
    v = ''.join(val)
    if v not in val_set:
        val_set.add(v)
        unique_highlight[key] = val
print('after filter')
print(unique_highlight)

In [13]:
def deduplicate_FGCY(resp):
    """Remove repeated names from each hit's '法官成员' list, in place.

    The first occurrence of every name is kept and the original order is
    preserved. Hits that lack `_source` -> '案例属性' -> '法官成员' are skipped
    individually, so one malformed hit no longer stops the processing of the
    remaining ones.

    Args:
        resp: a search response supporting `resp['hits']['hits']`.

    Returns:
        None; `resp` is modified in place.
    """
    try:
        hits = resp['hits']['hits']
    except (KeyError, TypeError):
        return
    for hit in hits:
        try:
            attrs = hit['_source']['案例属性']
            # dict.fromkeys keeps insertion order, unlike set().
            attrs['法官成员'] = list(dict.fromkeys(attrs['法官成员']))
        except (KeyError, TypeError):
            continue

In [14]:
# Match every document of "new-case-data", keep only first/second-instance
# cases through a post_filter, and print the de-duplicated judges of each hit.
resp = es.search(index="new-case-data",
                from_=0,
                query={
                    # "query_string": {
                    #     "query": "刑法",}
                    "match_all": {}
                },
                fields=[
                    "自定义*",
                    # The comma after the next entry was missing, which
                    # silently turned the last two patterns into the single
                    # pattern "全文*法条".
                    "全文*",
                    "法条",
                ],
                highlight={
                    "fields": {
                        "自定义*": {},
                        "全文*": {},
                        "法条": {}
                    }
                },
                post_filter={
                    "bool": {
                        "must": [
                            # {"term": {"法条": "知识产权法"}},
                            # {"match": {"全文.文首.审判程序": "一审案件 二审案件"}},
                            {"query_string": {"fields": ["全文.文首.审判程序"],
                                                "query": "一审案件 OR 二审案件",
                                                }},
                            # {"match": {"全文.文首.经办法院": {
                            #     "query": "贵州省黔南布依族苗族自治州中级人民法院",
                            #     "operator": "and"
                            # }}},
                        ]
                    }
                }
        )

print("Got %d Hits:" % resp['hits']['total']['value'])
deduplicate_FGCY(resp)
print(len(resp['hits']['hits']))
for hit in resp['hits']['hits']:
    # print(hit['_source'])
    print(hit['_source']['案例属性']['法官成员'])
    # print(list(set(hit['_source']['案例属性']['法官成员'])))

Got 10000 Hits:
10
['许天瑶', '甘震']
['许天瑶', '甘震']
['程宇', '李阳', '李军', '张欣']
['李军', '张欣', '程宇', '李阳']
['覃若鹏', '李源', '刘萌', '高翔宇']
['高翔宇', '李源', '刘萌', '覃若鹏']
['白海叶', '陈智慧', '宋任国', '韩卫祥']
['白海叶', '韩卫祥', '宋任国', '陈智慧']
['叶青', '张文清', '蔡芬芳', '杨月斌']
['叶青', '杨月斌', '张文清', '蔡芬芳']
['李留霞', '王慧玲', '孙升', '程登健']
['李留霞', '王慧玲', '孙升', '程登健']
['栗莎', '胡伯韬', '侯宝柱', '朱万君']
['栗莎', '胡伯韬', '侯宝柱', '朱万君']
['沈冰洁', '赵魁', '张敏', '瞿静']
['沈冰洁', '赵魁', '张敏', '瞿静']
['高志强', '陈鸿妍', '鲁恢', '于桂莲']
['高志强', '于桂莲', '鲁恢', '陈鸿妍']
['陈旻尔', '吕倩茜', '黄良飞', '王孜力哈']
['陈旻尔', '吕倩茜', '黄良飞', '王孜力哈']


In [None]:
# Turn a query sentence into a vector the same way the documents were
# embedded: sum of the word vectors divided by the number of kept tokens.
import numpy as np
import jieba

sentence = '死者没有受伤吧'
sentence = list(filter(lambda x:x not in stop_words, jieba.lcut(sentence)))
vec = np.zeros(100)
for word in sentence:
    try:
        vec += np.array(tc_wv_model[word])
    except KeyError:
        # Word is not in the embedding vocabulary.
        continue
# When every token was a stop word, keep the zero vector instead of dividing
# by zero (which would fill the vector with NaN).
if sentence:
    vec /= len(sentence)
# print(vec)
# print(vec)

In [None]:
# Approximate nearest-neighbour search on the "vec" field: 50 results taken
# from 200 candidates.
# NOTE(review): the query vector is all zeros rather than the `vec` computed
# from the sample sentence, and the target is "case-data" rather than INDEX -
# confirm both are intended.
resp = es.knn_search(index="case-data", 
                knn={
                    "field": "vec",
                    "query_vector": np.zeros(100),
                    "k": 50,
                    "num_candidates": 200
                }
        )
print("Got %d Hits:" % resp['hits']['total']['value'])
print(len(resp['hits']['hits']))
# for hit in resp['hits']['hits']:
#     print(hit)
# Raises IndexError when the search returns no hits.
print(resp['hits']['hits'][0])
# print(resp)

In [15]:
# Search the "法条" field for either of two statute citations (the citations
# are joined with " OR ", with AND as the default operator) and highlight the
# matches in that field.
resp = es.search(index="new-case-data", 
                from_=0,
                query={
                    "query_string": {
                            "fields": ["法条"],
                            "query": " OR ".join(["《中华人民共和国民法通则》第九十二条", 
                                            "《中华人民共和国民事诉讼法》第六十四条第一款"]),
                            "default_operator": "AND"
                        }
                },
                highlight={
                    "fields": {
                        "法条": {},
                    }
                }
        )
print(resp)

{'took': 55, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 7491, 'relation': 'eq'}, 'max_score': 11.83526, 'hits': [{'_index': 'new-case-data', '_id': '23302', '_score': 11.83526, '_ignored': ['自定义_裁判文书_证据段.keyword', '自定义_裁判文书_查明事实段.keyword', '自定义_裁判文书_裁判分析过程.keyword', '全文.裁判分析过程.keyword', '自定义_裁判文书_被告辩称段.keyword', '自定义_裁判文书_全文.keyword', '全文.案件基本情况.keyword', '自定义_裁判文书_原告诉称段.keyword'], '_source': {'全文': {'文首': {'文书制作单位': '法院', '法院文书种类': '裁判文书', '经办法院': '山东省潍坊滨海经济技术开发区人民法院', '文书名称': '民事判决书', '案号': '（2017）鲁0792民初173号', '案件类别': '民事案件', '文书种类': '判决书', '审判程序': '一审案件', '案件类型': '民事一审案件'}, '当事人': '原告：尹希平，男，1973年5月23日出生，汉族，住潍坊市寒亭区。 委托诉讼代理人：李涛，山东潍州律师事务所律师。 被告：王树先，男，1971年10月6日出生，汉族，现住潍坊滨海经济技术开发区。 委托诉讼代理人：孙志丽，山东翔华律师事务所律师。', '诉讼记录': ' 原告尹希平与被告王树先不当得利纠纷一案，本院于2017年4月12日立案后，依法适用简易程序，公开开庭进行了审理。原告尹希平委托诉讼代理人李涛、被告王树先委托诉讼代理人孙志丽到庭参加诉讼。本案现已审理终结。', '案件基本情况': ' 原告尹希平向本院提出诉讼请求：1、依法判令被告返还原告不当得利款141000元及利息（自判决确认债权之日起按照同期银行利率4倍计算）；2、本案诉讼费用由被告承担。

In [None]:
# Match query on the judge-name field with several space-separated names (one
# of them repeated), highlighting the matches.
resp = es.search(index="case-data", 
                from_=0,
                query={
                    "match": {
                        "案例属性.法官成员": "沈静 刘颖 沈静",
                }},
                highlight={
                    "fields": {
                        "案例属性.法官成员": {},
                    }
                }
        )
print(resp)

In [None]:
# Match query on the handling-court field, returning up to 20 hits with the
# matches highlighted.
resp = es.search(index="case-data", 
                from_=0,
                size=20,
                query={
                    "match": {
                        "案例属性.经办法院": "安徽省合肥市中级人民法院",
                }},
                highlight={
                    "fields": {
                        "案例属性.经办法院": {},
                    }
                }
        )
print("Got %d Hits:" % resp['hits']['total']['value'])
print(len(resp['hits']['hits']))
print(resp['hits']['hits'])

In [None]:
# Fetch the document with id 1 from "case-data" and wrap its _source in a
# {'data': ...} envelope.
resp = es.get(index="case-data", 
                id=1
        )
print(resp)
data = {
    'data': resp['_source']
}
print(data)

In [42]:
# Scratch check: slicing past the end of a list returns the whole list.
a = [1,2,3,4,5,]
a[:10]

[1, 2, 3, 4, 5]

In [None]:
# Build a tokenized corpus (one token list per case file, stop words removed)
# from the full text of every case and save it to ./corpus.json.
import os
import json
import jieba
from tqdm import tqdm
import xml.etree.ElementTree as ET

file_paths = os.listdir('./Legal_data/')

with open('./stop_words', 'r', encoding='utf-8') as f:
    stop_words = f.read().split('\n')
# Set copy for constant-time membership tests; the list is scanned once here
# instead of once per token.
stop_words_lookup = set(stop_words)

corpus = []
for idx in tqdm(range(len(file_paths))):
    file_path = file_paths[idx]
    root = ET.parse(f'./Legal_data/{file_path}').getroot()
    # The full text of a case is the `value` attribute of its QW element.
    data = root.find('QW').attrib['value']
    # print(data)
    data = list(filter(lambda x:x not in stop_words_lookup, jieba.lcut(data)))
    # print(data)
    corpus.append(data)
    # break

with open('./corpus.json', 'w', encoding='utf-8') as f:
    json.dump(corpus, f, ensure_ascii=False)

In [12]:
" OR ".join(["一审案件"])


'一审案件'