<a href="https://colab.research.google.com/github/xxp-nlp/add_tag_algorithm/blob/main/2021_8_28.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re

# Tokenization of the correct sentences: the parsed file is converted into
# chunk objects exposing .phrase, .trim_phrase, .dst and .tag.
filename = '/content/CiWoTestdata-Correct.txt'

# Verbose regex applied with .match() to every input line: a leading
# "* <digits> " prefix (non-capturing) followed by one captured, optionally
# negative integer, which the parser uses as the dependency target index.
dependancy = re.compile(r'''(?:\*\s\d+\s) # キャプチャ対象外
             (-?\d+)      # 数字(係り先)
                      ''', re.VERBOSE)

class Morph:
    """A single morpheme read from one line of the analysis result."""

    def __init__(self, line):
        # The surface form is everything that precedes the first space.
        head, _sep, _rest = line.partition(' ')
        self.surface = head

class Chunk_correct:
    """A phrase chunk of a correct sentence.

    Attributes:
        morphs: the morpheme objects the chunk was built from.
        dst: index of the chunk this chunk depends on.
        id: identifier passed in by the caller.
        phrase: concatenation of the morpheme surfaces.
        trim_phrase: ``phrase`` with every '、' removed.
        tag: edit tag, always initialised to 'KEEP'.
    """

    def __init__(self, morphs, dst, id):
        surfaces = [m.surface for m in morphs]
        joined = ''.join(surfaces)

        self.morphs = morphs
        self.dst = dst
        self.id = id
        self.phrase = joined
        self.trim_phrase = joined.replace('、', '')
        self.tag = 'KEEP'

# Parse the correct-sentence file into `sentences`, a list of sentences where
# each sentence is a list of Chunk_correct objects.
sentences = []
chunks = []
morphs = []
ids = []
dic_correct = {}
i = 0
# The data is Japanese text; the encoding is stated explicitly so the result
# does not depend on the platform's default locale encoding.
with open(filename, mode='r', encoding='utf-8') as f:
  for line in f:  # read the file line by line
    dependancies = dependancy.match(line)
    fields = line.split(' ')

    # Lines whose first space-separated field is '#' carry the sentence id.
    # NOTE: `id` shadows the builtin; the name is kept because it is a
    # module-level variable of this notebook.
    if fields[0] == '#':
      id = fields[1]
      continue

    # Also recognise a final 'EOS' that has no trailing newline; otherwise the
    # last sentence of such a file would never be flushed.
    is_eos = line.rstrip('\n') == 'EOS'

    # Neither EOS nor a dependency line: the line is a morpheme.
    if not (is_eos or dependancies):
      morphs.append(Morph(line))

    # EOS or a dependency line while morphemes are pending: close the chunk.
    # `dst` still holds the target read from the previous dependency line.
    elif len(morphs) > 0:
      chunks.append(Chunk_correct(morphs, dst, id))
      morphs = []

    # Dependency line: remember the target index for the chunk that follows.
    if dependancies:
      dst = int(dependancies.group(1))

    # EOS with at least one chunk collected: close the sentence.
    if is_eos and len(chunks) > 0:
      sentences.append(chunks)
      chunks = []
      # Stores the shared `sentences` list together with the sentence id.
      dic_correct[i] = [sentences, id]
      i += 1

# Show every chunk of the sentence at index 9 together with its dependency
# target, then the types of both phrase attributes.
for i, chunk in enumerate(sentences[9]):
    summary = '{}: {}  係り先:{} {}'.format(
        i, chunk.trim_phrase, chunk.dst, chunk.phrase)
    print(summary)
    print(type(chunk.phrase), type(chunk.trim_phrase))

0: 同大統領の  係り先:1 同大統領の
<class 'str'> <class 'str'>
1: 首都脱出は  係り先:9 首都脱出は
<class 'str'> <class 'str'>
2: 先に  係り先:9 先に
<class 'str'> <class 'str'>
3: インタファクス通信が  係り先:9 インタファクス通信が
<class 'str'> <class 'str'>
4: 六日  係り先:9 六日、
<class 'str'> <class 'str'>
5: グロズヌイの  係り先:6 グロズヌイの
<class 'str'> <class 'str'>
6: 軍事筋の  係り先:7 軍事筋の
<class 'str'> <class 'str'>
7: 情報と  係り先:8 情報と
<class 'str'> <class 'str'>
8: して  係り先:9 して
<class 'str'> <class 'str'>
9: 伝えたが  係り先:13 伝えたが、
<class 'str'> <class 'str'>
10: ロシア政府が  係り先:12 ロシア政府が
<class 'str'> <class 'str'>
11: 公式に  係り先:12 公式に
<class 'str'> <class 'str'>
12: 認めたのは  係り先:13 認めたのは
<class 'str'> <class 'str'>
13: 初めて。  係り先:-1 初めて。
<class 'str'> <class 'str'>


In [None]:
# Tokenization of the wrong sentences: the parsed file is converted into
# chunk objects exposing .phrase, .trim_phrase, .dst and .tag.

filename = '/content/CiWoTestdata.txt'

# Verbose regex applied with .match() to every input line: a leading
# "* <digits> " prefix (non-capturing) followed by one captured, optionally
# negative integer, which the parser uses as the dependency target index.
dependancy = re.compile(r'''(?:\*\s\d+\s) # キャプチャ対象外
             (-?\d+)      # 数字(係り先)
                      ''', re.VERBOSE)

class Morph:
    """A single morpheme read from one line of the analysis result."""

    def __init__(self, line):
        # The surface form is everything that precedes the first space.
        head, _sep, _rest = line.partition(' ')
        self.surface = head


class Chunk_wrong:
    """A phrase chunk of a wrong sentence.

    Attributes:
        morphs: the morpheme objects the chunk was built from.
        dst: index of the chunk this chunk depends on.
        id: identifier passed in by the caller.
        phrase: concatenation of the morpheme surfaces.
        trim_phrase: ``phrase`` with every '、' removed.
        tag: edit tag, always initialised to 'KEEP'.
    """

    def __init__(self, morphs, dst, id):
        surfaces = [m.surface for m in morphs]
        joined = ''.join(surfaces)

        self.morphs = morphs
        self.dst = dst
        self.id = id
        self.phrase = joined
        self.trim_phrase = joined.replace('、', '')
        self.tag = 'KEEP'

# Parse the wrong-sentence file into `sentences_wrong`, a list of sentences
# where each sentence is a list of Chunk_wrong objects.
dic_wrong = {}
sentences_wrong = []
chunks = []
morphs = []
ids = []
i = 0
# The data is Japanese text; the encoding is stated explicitly so the result
# does not depend on the platform's default locale encoding.
with open(filename, mode='r', encoding='utf-8') as f:
  for line in f:  # read the file line by line
    dependancies = dependancy.match(line)
    fields = line.split(' ')

    # Lines whose first space-separated field is '#' carry the sentence id.
    # NOTE: `id` shadows the builtin; the name is kept because it is a
    # module-level variable of this notebook.
    if fields[0] == '#':
      id = fields[1]
      continue

    # Also recognise a final 'EOS' that has no trailing newline; otherwise the
    # last sentence of such a file would never be flushed.
    is_eos = line.rstrip('\n') == 'EOS'

    # Neither EOS nor a dependency line: the line is a morpheme.
    if not (is_eos or dependancies):
      morphs.append(Morph(line))

    # EOS or a dependency line while morphemes are pending: close the chunk.
    # `dst` still holds the target read from the previous dependency line.
    elif len(morphs) > 0:
      chunks.append(Chunk_wrong(morphs, dst, id))
      morphs = []

    # Dependency line: remember the target index for the chunk that follows.
    if dependancies:
      dst = int(dependancies.group(1))

    # EOS with at least one chunk collected: close the sentence.
    if is_eos and len(chunks) > 0:
      sentences_wrong.append(chunks)
      chunks = []
      # Stores the shared `sentences_wrong` list together with the sentence id.
      dic_wrong[i] = [sentences_wrong, id]
      i += 1


# for i, chunk in enumerate(sentences_wrong[2]):
#     print('{}: {}  係り先:{} {}'.format(i, chunk.trim_phrase, chunk.dst, chunk.tag))

In [None]:
# Align the data: pair every correct sentence with the wrong sentence(s) that
# carry the same id (author's note: 530 pairs in total).
print("読みやすい文：",len(sentences))
print("読みにくい文：",len(sentences_wrong))

# id -> positions of the wrong sentences carrying that id (in file order), so
# the join is a dictionary lookup instead of a scan of every wrong sentence
# for each correct sentence.
wrong_positions_by_id = {}
for j in range(len(sentences_wrong)):
  wrong_positions_by_id.setdefault(dic_wrong[j][1], []).append(j)

correct = []
wrong = []

# Pairs are emitted in the same order as a nested scan would produce them:
# correct sentences in order, matching wrong sentences in order.
for i in range(len(sentences)):
  for j in wrong_positions_by_id.get(dic_correct[i][1], ()):
    correct.append(sentences[i])
    wrong.append(sentences_wrong[j])

# Number of aligned pairs. It is derived from the aligned lists (not from the
# raw list sizes) so that loops over range(lenth) can never index past the
# end of `correct` / `wrong` when some ids have no partner.
lenth = len(correct)

読みやすい文： 530
読みにくい文： 546


In [None]:
# Using the correct sentences as reference, set the .tag of the wrong
# sentences (REDUCE / REDUCE_COMMA / SWAP_F / SWAP_B).
# 2.0

def algo(wrong, correct, limit=None):
  """Tag the chunks of each wrong sentence by comparing it to its correct pair.

  Chunks are matched through ``trim_phrase``. For every adjacent pair
  (correct[i][j], correct[i][j+1]) and every position ``idx`` / ``nidx`` at
  which those two phrases occur in ``wrong[i]``, four rules are tried in order:

  1. the correct chunk depends on its direct successor and the two phrases are
     also adjacent in the same order in the wrong sentence: both sides are
     tagged "REDUCE" ("REDUCE_COMMA" when the correct phrase ends with '、');
  2. the two phrases are adjacent in reversed order in the wrong sentence and
     the wrong chunk is still 'KEEP': tag "SWAP_B" / "SWAP_F";
  3. / 4. three correct chunks share one dependency target and appear rotated
     in the wrong sentence while all involved wrong chunks are still 'KEEP':
     tag "SWAP_F" / "SWAP_B".

  Args:
    wrong: list of sentences (lists of chunks with ``trim_phrase`` and
      ``tag``); tags are modified in place.
    correct: list of sentences (lists of chunks with ``phrase``,
      ``trim_phrase``, ``dst`` and ``tag``); tags are modified in place.
    limit: maximum number of sentence pairs to process. Defaults to the
      module-level ``lenth``.

  Returns:
    None.
  """
  if limit is None:
    limit = lenth
  # Never index past the end of either list, whatever the requested limit is.
  pair_count = min(limit, len(wrong), len(correct))

  for i in range(pair_count):
    wrong_sent = wrong[i]
    correct_sent = correct[i]

    # trim_phrase -> every position at which it occurs in the wrong sentence.
    positions = {}
    for pos, chunk in enumerate(wrong_sent):
      positions.setdefault(chunk.trim_phrase, []).append(pos)

    # The last correct chunk has no successor, so it is never examined.
    for j in range(len(correct_sent) - 1):
      cur = correct_sent[j]
      nxt = correct_sent[j + 1]

      # A phrase that does not occur in the wrong sentence simply yields no
      # candidate positions (instead of raising KeyError).
      for idx in positions.get(cur.trim_phrase, ()):
        for nidx in positions.get(nxt.trim_phrase, ()):

          # Rule 1: same adjacency on both sides and cur depends on nxt.
          if cur.dst == j + 1 and idx == nidx - 1:
            # endswith() also copes with an empty phrase.
            reduce_tag = "REDUCE_COMMA" if cur.phrase.endswith('、') else "REDUCE"
            wrong_sent[idx].tag = reduce_tag
            cur.tag = reduce_tag

          # Rule 2: the two phrases are adjacent but reversed in the wrong text.
          if idx < len(wrong_sent) - 1 and idx == nidx + 1:
            if wrong_sent[idx].tag != 'KEEP':
              # Already tagged: skip the remaining rules for this candidate.
              continue
            wrong_sent[idx].tag = "SWAP_B"
            wrong_sent[nidx].tag = "SWAP_F"

          # Rule 3: correct order (cur, nxt, after) appears as (nxt, after, cur).
          if (j + 3 < len(correct_sent) and idx - 2 >= 0
              and wrong_sent[idx].tag == 'KEEP'
              and wrong_sent[idx - 1].tag == 'KEEP'
              and wrong_sent[idx - 2].tag == 'KEEP'
              and cur.dst == nxt.dst == correct_sent[j + 2].dst
              and nxt.trim_phrase == wrong_sent[idx - 2].trim_phrase
              and correct_sent[j + 2].trim_phrase == wrong_sent[idx - 1].trim_phrase):
            wrong_sent[idx - 1].tag = "SWAP_F"
            wrong_sent[idx].tag = "SWAP_B"

          # Rule 4: correct order (cur, nxt, after) appears as (after, cur, nxt).
          # The extra `idx + 1 < len(wrong_sent)` bound protects the
          # wrong_sent[idx + 1] access when the two sentences differ in length.
          if (j + 3 < len(correct_sent) and idx - 1 >= 0
              and idx + 2 < len(correct_sent)
              and idx + 1 < len(wrong_sent)
              and wrong_sent[idx].tag == 'KEEP'
              and wrong_sent[idx - 1].tag == 'KEEP'
              and wrong_sent[idx + 1].tag == 'KEEP'
              and cur.dst == nxt.dst == correct_sent[j + 2].dst
              and nxt.trim_phrase == wrong_sent[idx + 1].trim_phrase
              and correct_sent[j + 2].trim_phrase == wrong_sent[idx - 1].trim_phrase):
            wrong_sent[idx - 1].tag = "SWAP_F"
            wrong_sent[idx].tag = "SWAP_B"

In [None]:
# main: run the first tagging pass (step 1) on the aligned sentence pairs;
# algo() updates the .tag attributes of both lists in place.
algo(wrong,correct) # step1_tag

In [None]:
# step1_tag: inspect the tags assigned to the wrong sentence at index 515
for i in range(515, 516):
  print("-------  sentence ", i+1, "-------")
  for j, chunk in enumerate(wrong[i]):
    print('{}: {}: {}'.format(j, chunk.trim_phrase, chunk.tag))

-------  sentence  516 -------
0: 工事を: KEEP
1: この: REDUCE
2: 土地の: REDUCE
3: 建物部分だけを: REDUCE
4: 残して: KEEP
5: 今年: REDUCE
6: 三月を: REDUCE
7: めどに: KEEP
8: 完成させる: REDUCE
9: ことに: KEEP
10: この: SWAP_F
11: ため: SWAP_B
12: している。: KEEP


In [None]:
# Manual corrections for repeated phrases that cannot be matched to the right
# occurrence (example: この). Each entry is (sentence index, chunk index, tag).
manual_tags = (
  (41, 3, 'KEEP'),
  (124, 6, 'REDUCE'),
  (124, 7, 'KEEP'),
  (515, 10, 'REDUCE'),
  (515, 11, 'KEEP'),
)
for sent_no, chunk_no, fixed_tag in manual_tags:
  wrong[sent_no][chunk_no].tag = fixed_tag

In [None]:
# step1_tag: inspect the tags assigned to the first 100 correct sentences
for i in range(100):
  print("-------  sentence ", i+1, "-------")
  for j, chunk in enumerate(correct[i]):
    row = (j, chunk.trim_phrase, chunk.tag, chunk.dst, chunk.phrase)
    print('{}: {}: {}: {}: {}'.format(*row))

-------  sentence  1 -------
0: 同社では: KEEP: 5: 同社では
1: すでに: KEEP: 5: すでに
2: 準役員クラス以上に: KEEP: 5: 準役員クラス以上に
3: 能力重視型の: REDUCE: 4: 能力重視型の
4: 年俸制を: KEEP: 5: 年俸制を
5: 導入している。: KEEP: -1: 導入している。
-------  sentence  2 -------
0: 首相は: KEEP: 12: 首相は、
1: 今春の: REDUCE: 2: 今春の
2: 統一地方選後に: KEEP: 5: 統一地方選後に
3: 党全体での: REDUCE: 4: 党全体での
4: 新党移行を: KEEP: 5: 新党移行を
5: 目指す: REDUCE: 6: 目指す
6: 考えを: REDUCE: 7: 考えを
7: 強調: KEEP: 12: 強調、
8: 離党など: KEEP: 10: 離党など
9: 性急な: REDUCE: 10: 性急な
10: 行動への: REDUCE: 11: 行動への
11: 自重を: KEEP: 12: 自重を
12: 求めた。: KEEP: -1: 求めた。
-------  sentence  3 -------
0: しかし: KEEP: 10: しかし、
1: 山花氏は: KEEP: 5: 山花氏は
2: 二十日の: REDUCE: 3: 二十日の
3: 通常国会召集前の: REDUCE: 4: 通常国会召集前の
4: 新党結成を: KEEP: 5: 新党結成を
5: 主張して: KEEP: 7: 主張して
6: 物別れに: REDUCE: 7: 物別れに
7: 終わり: KEEP: 10: 終わり、
8: 同党の: REDUCE: 9: 同党の
9: 亀裂は: REDUCE: 10: 亀裂は
10: 決定的になった。: KEEP: -1: 決定的になった。
-------  sentence  4 -------
0: 山花氏は: KEEP: 6: 山花氏は
1: 予定通り: KEEP: 6: 予定通り
2: 九日: KEEP: 6: 九日、
3: 準備会参加議員の: KEEP: 5: 準備会参加議員の
4: 一回目の: REDUCE: 5: 一回目の
5: 集約を

In [None]:
# Record type for a chunk of a correct sentence
class Chunk_temp:
    """Plain container holding phrase, trim_phrase, tag and dst of one chunk."""

    def __init__(self, phrase, trim_phrase, tag ,dst):
        self.phrase, self.trim_phrase = phrase, trim_phrase
        self.tag, self.dst = tag, dst

# Record type for a chunk of a wrong sentence
class Wrong_chunk_temp:
    """Plain container holding trim_phrase and tag of one chunk."""

    def __init__(self, trim_phrase, tag):
        self.trim_phrase, self.tag = trim_phrase, tag

# Builder for correct sentences: parallel lists -> chunk objects with
# .phrase, .trim_phrase, .tag and .dst
class New_chunk:
    """Turns the parallel lists of a merged correct sentence into chunks.

    ``new_chunk`` is indexed as (phrases, trim_phrases, tags, dsts).
    """

    def __init__(self, new_chunk):
        self.phrase = new_chunk[0]
        self.trim_phrase = new_chunk[1]
        self.tag = new_chunk[2]
        self.dst = new_chunk[3]

    def change(self):
        """Return one Chunk_temp per entry of ``dst``, in order."""
        return [
            Chunk_temp(self.phrase[k], self.trim_phrase[k], self.tag[k], self.dst[k])
            for k in range(len(self.dst))
        ]

# Builder for wrong sentences: parallel lists -> chunk objects with
# .trim_phrase and .tag
class Wrong_new_chunk:
    """Turns the parallel lists of a merged wrong sentence into chunks.

    ``new_chunk`` is indexed as (trim_phrases, tags).
    """

    def __init__(self, new_chunk):
        self.trim_phrase = new_chunk[0]
        self.tag = new_chunk[1]

    def change(self):
        """Return one Wrong_chunk_temp per entry of ``trim_phrase``, in order."""
        return [
            Wrong_chunk_temp(self.trim_phrase[k], self.tag[k])
            for k in range(len(self.trim_phrase))
        ]


# tag_realize step for correct sentences: yields phrase, trim_phrase, tag, dst
class Merged:
  """Applies the tags of one correct sentence and rebuilds its chunk lists.

  Args:
    j: number of chunks in the sentence.
    phrase: list of chunk phrases.
    trim_phrase: list of chunk phrases used as matching keys.
    tag: list of chunk tags ('KEEP', 'REDUCE', 'REDUCE_COMMA', 'SWAP_F',
      'SWAP_B').
    dst: list of dependency target indices.
  """

  def __init__(self, j, phrase, trim_phrase, tag, dst):
    self.j = j
    self.phrase = phrase
    self.trim_phrase = trim_phrase
    self.tag = tag
    self.dst = dst

  def merge(self):
    """Realize the tags and return the rebuilt sentence.

    REDUCE / REDUCE_COMMA chunks are collected together with their successor
    and emitted as a single chunk when the next 'KEEP' chunk is reached (with
    '、' after every REDUCE_COMMA member). SWAP_F emits the following chunk and
    SWAP_B the preceding one, which exchanges the two. All emitted chunks are
    tagged "KEEP".

    Returns:
      Tuple ``(new_phrase, new_trim_phrase, new_tag, new_dst)``. ``new_dst``
      holds the remapped dependency targets followed by a final -1.
      NOTE(review): chunks emitted by SWAP tags add no dependency entry, so
      ``new_dst`` can be shorter than the other three lists.
    """
    new_phrase = []
    new_trim_phrase = []
    new_tag = []
    index = []         # per output chunk: original index, or list of merged indices
    list_dst = []      # original dependency target of each KEEP-produced chunk
    pending = set()    # original indices waiting to be merged
    comma_after = []   # indices tagged REDUCE_COMMA

    for i in range(self.j):
      current = self.tag[i]

      if current == 'REDUCE':
        pending.add(i)
        pending.add(i + 1)

      if current == 'REDUCE_COMMA':
        pending.add(i)
        pending.add(i + 1)
        comma_after.append(i)

      if current == 'KEEP':
        members = sorted(pending)
        if members:
          # The merged chunk inherits the target of its last member.
          list_dst.append(self.dst[members[-1]])

        merged_phrase = ''
        merged_trim = ''
        # A separate loop variable is used on purpose: reusing `i` here would
        # overwrite the sentence position that the SWAP checks below rely on.
        for k in members:
          merged_phrase += self.phrase[k]
          merged_trim += self.trim_phrase[k]
          if k in comma_after:
            # The untrimmed phrase normally already ends with '、'; do not
            # double it.
            if not self.phrase[k].endswith('、'):
              merged_phrase += '、'
            merged_trim += '、'

        if len(merged_trim) > 0:
          new_phrase.append(merged_phrase)
          new_trim_phrase.append(merged_trim)
          index.append(list(pending))
          new_tag.append("KEEP")
          pending.clear()
        else:
          # Nothing to merge: the chunk is copied unchanged.
          new_phrase.append(self.phrase[i])
          new_trim_phrase.append(self.trim_phrase[i])
          new_tag.append("KEEP")
          index.append(i)
          list_dst.append(self.dst[i])

      if current == 'SWAP_F':
        new_phrase.append(self.phrase[i + 1])
        new_trim_phrase.append(self.trim_phrase[i + 1])
        new_tag.append("KEEP")
        index.append(i + 1)
      if current == 'SWAP_B':
        new_phrase.append(self.phrase[i - 1])
        new_trim_phrase.append(self.trim_phrase[i - 1])
        new_tag.append("KEEP")
        index.append(i - 1)

    # Translate every original target index into the position of the output
    # chunk that now contains it. Targets without a match (e.g. -1) add nothing.
    new_dst = []
    for target in list_dst:
      for n, members in enumerate(index):
        if isinstance(members, int):
          if target == members:
            new_dst.append(n)
        elif target in members:
          new_dst.append(n)
    new_dst.append(-1)

    return new_phrase,new_trim_phrase,new_tag,new_dst

# tag_realize step for wrong sentences: yields trim_phrase, tag
class Wrong_merged:
  """Applies the tags of one wrong sentence and rebuilds its chunk lists.

  Args:
    j: number of chunks in the sentence.
    trim_phrase: list of chunk phrases.
    tag: list of chunk tags ('KEEP', 'REDUCE', 'REDUCE_COMMA', 'SWAP_F',
      'SWAP_B').
  """

  def __init__(self, j, trim_phrase, tag):
    self.j = j
    self.trim_phrase = trim_phrase
    self.tag = tag

  def merge(self):
    """Realize the tags and return ``(new_trim_phrase, new_tag)``.

    REDUCE / REDUCE_COMMA chunks are collected together with their successor
    and emitted as a single chunk when the next 'KEEP' chunk is reached (with
    '、' after every REDUCE_COMMA member). SWAP_F emits the following chunk and
    SWAP_B the preceding one, which exchanges the two. All emitted chunks are
    tagged "KEEP".
    """
    new_trim_phrase = []
    new_tag = []
    pending = set()    # original indices waiting to be merged
    comma_after = []   # indices tagged REDUCE_COMMA

    for i in range(self.j):
      current = self.tag[i]

      if current == 'REDUCE':
        pending.add(i)
        pending.add(i + 1)

      if current == 'REDUCE_COMMA':
        pending.add(i)
        pending.add(i + 1)
        comma_after.append(i)

      if current == 'KEEP':
        merged = ''
        # A separate loop variable is used on purpose: reusing `i` here would
        # overwrite the sentence position that the SWAP checks below rely on.
        for k in sorted(pending):
          merged += self.trim_phrase[k]
          if k in comma_after:
            merged += '、'

        if len(merged) > 0:
          new_trim_phrase.append(merged)
          new_tag.append("KEEP")
          pending.clear()
        else:
          # Nothing to merge: the chunk is copied unchanged.
          new_trim_phrase.append(self.trim_phrase[i])
          new_tag.append("KEEP")

      if current == 'SWAP_F':
        new_trim_phrase.append(self.trim_phrase[i + 1])
        new_tag.append("KEEP")
      if current == 'SWAP_B':
        new_trim_phrase.append(self.trim_phrase[i - 1])
        new_tag.append("KEEP")

    return new_trim_phrase,new_tag

In [None]:
# Small experiment: join selected entries of `text`, adding '、' after every
# entry whose tag is 'REDUCE_COMMA'.
tag = ['REDUCE','REDUCE_COMMA','REDUCE','KEEP','REDUCE_COMMA']
text = ['REDUCE1','REDUCE_COMMA1','REDUCE1','KEEP1','REDUCE_COMMA1']
list1 = [1,2,3,4]

# Positions of the REDUCE_COMMA tags.
lag = [pos for pos, label in enumerate(tag) if label == 'REDUCE_COMMA']
print(lag)

str1 = ''
for i in list1:
  str1 += text[i] + ('、' if i in lag else '')
print(str1)

[1, 4]
REDUCE_COMMA1、REDUCE1KEEP1REDUCE_COMMA1、


In [None]:
# Flatten the chunk objects (.trim_phrase / .tag / .dst) into parallel lists
# and run the merge step on them.
def tag_realize_algo(wrong, correct, limit=None):
  """Realize the tags of every sentence pair and return the rebuilt sentences.

  Each wrong sentence goes through Wrong_merged / Wrong_new_chunk and each
  correct sentence through Merged / New_chunk.

  Args:
    wrong: list of sentences whose chunks expose ``trim_phrase`` and ``tag``.
    correct: list of sentences whose chunks expose ``phrase``,
      ``trim_phrase``, ``tag`` and ``dst``.
    limit: maximum number of sentence pairs to process. Defaults to the
      module-level ``lenth``.

  Returns:
    Tuple ``(new_chunks, correct_new_chunks)``: the rebuilt wrong sentences
    and the rebuilt correct sentences, one entry per processed pair.
  """
  if limit is None:
    limit = lenth
  # Never index past the end of either list, whatever the requested limit is.
  pair_count = min(limit, len(wrong), len(correct))

  new_chunks = []
  correct_new_chunks = []

  for i in range(pair_count):
    wrong_sent = wrong[i]
    merged_wrong = Wrong_merged(
      len(wrong_sent),
      [chunk.trim_phrase for chunk in wrong_sent],
      [chunk.tag for chunk in wrong_sent],
    ).merge()
    new_chunks.append(Wrong_new_chunk(merged_wrong).change())

    correct_sent = correct[i]
    merged_correct = Merged(
      len(correct_sent),
      [chunk.phrase for chunk in correct_sent],
      [chunk.trim_phrase for chunk in correct_sent],
      [chunk.tag for chunk in correct_sent],
      [chunk.dst for chunk in correct_sent],
    ).merge()
    correct_new_chunks.append(New_chunk(merged_correct).change())

  return new_chunks,correct_new_chunks

In [None]:
# main step1_realize: apply the step-1 tags and obtain the rebuilt chunk
# lists for the wrong and the correct sentences
(new_chunks,correct_new_chunks) = tag_realize_algo(wrong,correct)

In [None]:
# step1_realize: inspect the rebuilt wrong sentence at index 41
for i in range(41, 42):
  print('------------sentence:',i+1,'----------------')
  for j, chunk in enumerate(new_chunks[i]):
    print(j, chunk.trim_phrase, chunk.tag)

------------sentence: 42 ----------------
0 常任理事国入りによって KEEP
1 そのことが KEEP
2 困難になることが想定されるならば KEEP
3 常任理事国入りは KEEP
4 慎重にすべきだと思う。 KEEP


In [None]:
# step1_realize: inspect the rebuilt correct sentence at index 41
for i in range(41, 42):
  print('------------sentence:',i+1,'----------------')
  for j, chunk in enumerate(correct_new_chunks[i]):
    print(j, chunk.trim_phrase, chunk.tag, chunk.dst)

------------sentence: 42 ----------------
0 そのことが KEEP 2
1 常任理事国入りによって KEEP 2
2 困難になることが想定されるならば KEEP 4
3 常任理事国入りは KEEP 4
4 慎重にすべきだと思う。 KEEP -1


In [None]:
# Repeat the tag -> realize cycle 20 times, feeding each round's rebuilt
# sentences into the next round.
for i in range(20):
  algo(new_chunks, correct_new_chunks)
  rebuilt = tag_realize_algo(new_chunks, correct_new_chunks)
  new_chunks, correct_new_chunks = rebuilt

In [None]:
# Inspect the final wrong-side result and count the sentences that were
# reduced to a single chunk.
# (author's note: 349 of the 530 pairs succeeded; 490)
count = 0
for i in range(lenth):
  print('------------sentence:',i+1,'----------------')
  rebuilt_sentence = new_chunks[i]
  for j, chunk in enumerate(rebuilt_sentence):
    print(j, chunk.trim_phrase, chunk.tag)
  if len(rebuilt_sentence) == 1:
    count += 1

------------sentence: 1 ----------------
0 同社ではすでに準役員クラス以上に能力重視型の年俸制を導入している。 KEEP
------------sentence: 2 ----------------
0 首相は、今春の統一地方選後に党全体での新党移行を目指す考えを強調、離党など性急な行動への自重を求めた。 KEEP
------------sentence: 3 ----------------
0 しかし、山花氏は二十日の通常国会召集前の新党結成を主張して物別れに終わり、同党の亀裂は決定的になった。 KEEP
------------sentence: 4 ----------------
0 山花氏は予定通り九日、準備会参加議員の一回目の集約を行う。 KEEP
------------sentence: 5 ----------------
0 しかし、準備会参加はただちに離党を意味せず、実際の離党者数が焦点になる。 KEEP
------------sentence: 6 ----------------
0 妥協案として統一地方選後に市民団体代表も交えた新党準備会を発足させる考えを示したが、歩み寄りはなかった。 KEEP
------------sentence: 7 ----------------
0 また、山花氏は通常国会で新党による新国会内会派を旗揚げしても村山政権を支える姿勢を表明。 KEEP
------------sentence: 8 ----------------
0 しかし、首相は山花氏らの行動は倒閣につながるとの考えを強調した。 KEEP
------------sentence: 9 ----------------
0 さらに首相は、訪米中に政権基盤を揺るがすような行動を取らないようくぎを刺した。 KEEP
------------sentence: 10 ----------------
0 同大統領の首都脱出は先にインタファクス通信が六日、グロズヌイの軍事筋の情報として伝えたが、ロシア政府が公式に認めたのは初めて。 KEEP
------------sentence: 11 ----------------
0 しかし、チェチェン側からこれを確認する情報は伝えられていない。 KEE

In [None]:
count

490