# Overview

- feedback-prize-2021 コンペ。
- 文分類がどれくらい精度出せるか調査。

In [1]:
# Directories
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/feedback-prize-2021/sample_submission.csv
/kaggle/input/feedback-prize-2021/train.csv
/kaggle/input/feedback-prize-2021/test/0FB0700DAF44.txt
/kaggle/input/feedback-prize-2021/test/D72CB1C11673.txt
/kaggle/input/feedback-prize-2021/test/18409261F5C2.txt
/kaggle/input/feedback-prize-2021/test/DF920E0A7337.txt
/kaggle/input/feedback-prize-2021/test/D46BCB48440A.txt
/kaggle/input/feedback-prize-2021/train/62C57C524CD2.txt
/kaggle/input/feedback-prize-2021/train/80667AD3FFD8.txt
/kaggle/input/feedback-prize-2021/train/21868C40B94F.txt
/kaggle/input/feedback-prize-2021/train/87A6EF3113C6.txt
/kaggle/input/feedback-prize-2021/train/24687D08CFDA.txt
/kaggle/input/feedback-prize-2021/train/AB7706113077.txt
/kaggle/input/feedback-prize-2021/train/DC50F75D327F.txt
/kaggle/input/feedback-prize-2021/train/A860936AC6CD.txt
/kaggle/input/feedback-prize-2021/train/539B8000A428.txt
/kaggle/input/feedback-prize-2021/train/FF71B53EC31F.txt
/kaggle/input/feedback-prize-2021/train/146440EF28

In [2]:
# Parameters
DEBUG_FLAG = False
VERSION = 'nb06'

SUBMISSION_PATH = '/kaggle/input/feedback-prize-2021/sample_submission.csv'
TRAIN_PATH = '/kaggle/input/feedback-prize-2021/train.csv'
TRAIN_DIR = '/kaggle/input/feedback-prize-2021/train'
TEST_DIR = '/kaggle/input/feedback-prize-2021/test'

N_SPLITS = 5 if not DEBUG_FLAG else 2
TEXT_MIN_LENGTH = 4

In [3]:
# Modules
import re
import sys
import time

import datatable as dt
import nltk
import numpy as np
import pandas as pd
import torch
import transformers
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
from matplotlib_venn import venn3
from termcolor import colored
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [4]:
# My functions
def read_df():
    train = dt.fread(TRAIN_PATH).to_pandas()
    train['discourse_id'] = train['discourse_id'].astype(int)
    train['discourse_start'] = train['discourse_start'].astype('int16')
    train['discourse_end'] = train['discourse_end'].astype('int16')
    submission = dt.fread(SUBMISSION_PATH).to_pandas()
    
    print(f'train shape: {train.shape}')
    print(f'submission shape: {submission.shape}')

    train_files = os.listdir(TRAIN_DIR)
    train_files.sort()
    print(f'number of train_files: {len(train_files)}')

    test_files = os.listdir(TEST_DIR)
    test_files.sort()
    print(f'number of test_files: {len(test_files)}')

    dic_train = {}
    for train_file in train_files:
        with open(os.path.join(TRAIN_DIR, train_file), 'r') as f:
            texts = f.read()
        dic_train[train_file.replace('.txt', '')] = texts

    dic_test = {}
    for test_file in test_files:
        with open(os.path.join(TEST_DIR, test_file), 'r') as f:
            texts = f.read()
        dic_test[test_file.replace('.txt', '')] = texts

    return train, submission, dic_train, dic_test


def color_text(id):
    '''
    ref. https://www.kaggle.com/ilialar/feedback-prize-simple-eda
    '''
    color_scheme = {
        'Lead': 'green',
        'Position': 'red',
        'Claim': 'blue',
        'Counterclaim': 'magenta',
        'Rebuttal': 'yellow',
        'Evidence': 'cyan',
        'Concluding Statement': 'grey'
    } 
    
    annot_df = train[train['id'] == id]
    text = dic_train[id]
    
    blocks = [(int(row['discourse_start']), int(row['discourse_end']), color_scheme[row['discourse_type']]) for k, row in annot_df.iterrows()]
    blocks.sort()
    i = 0
    last_symbol = -1
    while i < len(blocks):
        if blocks[i][0] > last_symbol + 1:
            blocks.insert(i, (last_symbol+1, blocks[i][0] - 1, None))
        last_symbol = blocks[i][1]
        i += 1
    if last_symbol < len(text):
        blocks.append((last_symbol+1, len(text) - 1, None))

    colored_text = ''.join([colored(text[x[0]:x[1]+1], x[2]) for x in blocks])
    return colored_text


def text_cleaning(text):
    '''
    ref) # https://www.kaggle.com/manabendrarout/pytorch-roberta-ranking-baseline-jrstc-train

    Cleans text into a basic form for NLP. Operations include the following:-
    1. Remove special charecters like &, #, etc
    2. Removes extra spaces
    3. Removes embedded URL links
    4. Removes HTML tags
    5. Removes emojis
    
    text - Text piece to be cleaned.
    '''
    template = re.compile(r'https?://\S+|www\.\S+') #Removes website links
    text = template.sub(r'', text)
    
    soup = BeautifulSoup(text, 'lxml') #Removes HTML tags
    only_text = soup.get_text()
    text = only_text
    
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    # text = re.sub(r"[^a-zA-Z\d]", " ", text) #Remove special Charecters
    text = re.sub(' +', ' ', text) #Remove Extra Spaces
    ipPattern = re.compile('\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}') # Removes IP address
    text = ipPattern.sub(r'', text)
    bikkuri = re.compile('!') # Removes bikkuri
    text = bikkuri.sub(r' ', text)
    text = text.replace('\n','')
    text = text.replace("\'","")
    text = text.replace("|","")
    text = text.replace("=","")
    text = text.replace("F**K", "FUCK")
    text = text.replace("F__K", "FUCK")
    text = text.replace("f**k", "fuck")
    text = text.replace("f__k", "fuck")
    text = text.replace("f*ck", "fuck")    
    text = text.replace("S$X", "SEX")
    text = text.replace("s$x", "sex")
    text = text.replace(" u ", " you ")
    text = text.replace(" u ", " you ")
    text = text.replace(" U ", " you ")
    text = text.replace(" U ", " you ")
    text = text.replace("YOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUUUUUUUUUU", "YOU")
    text = text.strip() # remove spaces at the beginning and at the end of string
    return text

In [5]:
train, submission, dic_train, dic_test = read_df()

train shape: (144293, 8)
submission shape: (5, 3)
number of train_files: 15594
number of test_files: 5


In [6]:
train

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622627660524,8,229,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622627653021,230,312,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622627671020,313,401,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622627696365,402,758,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622627759780,759,886,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...
...,...,...,...,...,...,...,...,...
144288,4C471936CD75,1618153340639,2234,3203,if I'm not sure what college I want to attend...,Evidence,Evidence 2,386 387 388 389 390 391 392 393 394 395 396 39...
144289,4C471936CD75,1618153383399,3221,4509,seeking multiple opinions before making a har...,Evidence,Evidence 3,576 577 578 579 580 581 582 583 584 585 586 58...
144290,4C471936CD75,1618024996127,4510,4570,it is better to seek multiple opinions instead...,Position,Position 1,828 829 830 831 832 833 834 835 836 837 838
144291,4C471936CD75,1618025268756,4570,4922,The impact of asking people to help you make a...,Evidence,Evidence 4,839 840 841 842 843 844 845 846 847 848 849 85...


In [7]:
submission

Unnamed: 0,id,class,predictionstring
0,18409261F5C2,b'',b''
1,D46BCB48440A,b'',b''
2,0FB0700DAF44,b'',b''
3,D72CB1C11673,b'',b''
4,DF920E0A7337,b'',b''


train のテキストファイルを文で区切り、教師データのラベルを付与する。  
これにより、文分類でどこまで精度を出せるか調査する。

In [8]:
def _get_elements(text_id):
    df = train.query('id == @text_id')
    elements = df[['discourse_start', 'discourse_end', 'discourse_type', 'predictionstring']].sort_values('discourse_start')
    return elements.to_records(index=False).tolist()


def _fill_gaps(text_id):
    elements = _get_elements(text_id)
    start_idx = 0
    final_idx = len(dic_train[text_id])
    new_elements = []
    
    # エッセイの最初の discourse が登録されていないなら、ラベル No Class として登録する
    if elements[0][0] != start_idx and elements[0][0] - 0 > TEXT_MIN_LENGTH:
        predictionstring = len(dic_train[text_id][start_idx:elements[0][0]].split())
        predictionstring = ' '.join(str(i) for i in range(0, predictionstring))
        new_element = (0, elements[0][0], 'No Class', predictionstring)
        new_elements.append(new_element)
    
    # エッセイの最後の discourse が登録されていないなら、ラベル No Class として登録する
    if elements[-1][1] != final_idx and final_idx - elements[-1][1] > TEXT_MIN_LENGTH:
        predictionstring = len(dic_train[text_id][elements[-1][1]:final_idx].split())
        predictionstring_last_end = int(elements[-1][3].split()[-1])+1
        predictionstring = ' '.join(str(i) for i in range(predictionstring_last_end, predictionstring_last_end+predictionstring))
        new_element = (elements[-1][1], final_idx, 'No Class', predictionstring)
        new_elements.append(new_element)
    
    elements += new_elements
    elements = sorted(elements, key=lambda x: x[0])
    
    # エッセイの途中で discourse が登録されていないなら、ラベル No Class として登録する
    new_elements = []
    start_idx = elements[0][0]
    end_idx = elements[0][1]
    
    for i, element in enumerate(elements[1:-1]):
        if end_idx != element[0] and element[0] - end_idx > TEXT_MIN_LENGTH:
            predictionstring_last_end = int(elements[i][3].split()[-1])+1
            predictionstring_next_start = int(element[3].split()[0])
            predictionstring = ' '.join(str(i) for i in range(predictionstring_last_end, predictionstring_next_start))
            new_element = (end_idx, element[0], 'No Class', predictionstring)
            new_elements.append(new_element)
        start_idx = element[0]
        end_idx = element[1]
            
    elements += new_elements
    elements = sorted(elements, key=lambda x: x[0])
    
    return elements
        

def get_sentences(text_id):
    sentences = []
    text = dic_train[text_id]
    elements = _fill_gaps(text_id)
    
    for element in elements:
        sentences.append([text_id, text[element[0]: element[1]], element[2], element[3]])
    
    return pd.DataFrame(sentences, columns=['id', 'discourse_text', 'discourse_type', 'predictionstring'])

In [9]:
id2class = dict(enumerate(train['discourse_type'].unique().tolist() + ['No Class']))
class2id = {v: k for k, v in id2class.items()}

In [10]:
%%time

# train file size: 15594 なので約2分かかる
dfs_sentences = []
for text_id in dic_train.keys():
    df = get_sentences(text_id)
    dfs_sentences.append(df)
df_sentences = pd.concat(dfs_sentences).reset_index(drop=True)

df_sentences['label'] = df_sentences['discourse_type'].map(class2id)
df_sentences

CPU times: user 2min 8s, sys: 2.28 s, total: 2min 10s
Wall time: 2min 9s


Unnamed: 0,id,discourse_text,discourse_type,predictionstring,label
0,0000D23A521A,"Some people belive that the so called ""face"" o...",Position,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...,1
1,0000D23A521A,"It was not created by aliens, and there is no ...",Evidence,34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 4...,2
2,0000D23A521A,"A mesa is a naturally occuring rock formation,...",Evidence,69 70 71 72 73 74 75 76 77 78 79 80 81 82 83,2
3,0000D23A521A,"This ""face"" on mars only looks like a face bec...",Claim,84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 9...,3
4,0000D23A521A,Many conspiracy theorists believe that NASA is...,Counterclaim,117 118 119 120 121 122 123 124 125 126 127 12...,5
...,...,...,...,...,...
173095,FFFD0AF13501,you might be able to look at the pretty things...,Evidence,116 117 118 119 120 121 122 123 124 125 126 12...,2
173096,FFFD0AF13501,"\nIn conclusion,",No Class,191 192,7
173097,FFFD0AF13501,all i'm saying is that the seagoing cowboys wo...,Position,193 194 195 196 197 198 199 200 201 202 203 20...,1
173098,FFFD0AF13501,You can go so many places and you rarely go to...,Concluding Statement,209 210 211 212 213 214 215 216 217 218 219 22...,4


In [11]:
def calc_overlap(row):
    """
    Calculates the overlap between prediction and
    ground truth and overlap percentages used for determining
    true positives.
    """
    set_pred = set(row.predictionstring_pred.split(" "))
    set_gt = set(row.predictionstring_gt.split(" "))
    # Length of each and intersection
    len_gt = len(set_gt)
    len_pred = len(set_pred)
    inter = len(set_gt.intersection(set_pred))
    overlap_1 = inter / len_gt
    overlap_2 = inter / len_pred
    return [overlap_1, overlap_2]


def score_feedback_comp_micro(pred_df, gt_df):
    """
    A function that scores for the kaggle
        Student Writing Competition

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation
    """
    gt_df = (
        gt_df[["id", "discourse_type", "predictionstring"]]
        .reset_index(drop=True)
        .copy()
    )
    pred_df = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()
    pred_df["pred_id"] = pred_df.index
    gt_df["gt_id"] = gt_df.index
    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(
        gt_df,
        left_on=["id", "class"],
        right_on=["id", "discourse_type"],
        how="outer",
        suffixes=("_pred", "_gt"),
    )
    joined["predictionstring_gt"] = joined["predictionstring_gt"].fillna(" ")
    joined["predictionstring_pred"] = joined["predictionstring_pred"].fillna(" ")

    joined["overlaps"] = joined.apply(calc_overlap, axis=1)

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    joined["overlap1"] = joined["overlaps"].apply(lambda x: eval(str(x))[0])
    joined["overlap2"] = joined["overlaps"].apply(lambda x: eval(str(x))[1])

    joined["potential_TP"] = (joined["overlap1"] >= 0.5) & (joined["overlap2"] >= 0.5)
    joined["max_overlap"] = joined[["overlap1", "overlap2"]].max(axis=1)
    tp_pred_ids = (
        joined.query("potential_TP")
        .sort_values("max_overlap", ascending=False)
        .groupby(["id", "predictionstring_gt"])
        .first()["pred_id"]
        .values
    )

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    fp_pred_ids = [p for p in joined["pred_id"].unique() if p not in tp_pred_ids]

    matched_gt_ids = joined.query("potential_TP")["gt_id"].unique()
    unmatched_gt_ids = [c for c in joined["gt_id"].unique() if c not in matched_gt_ids]

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    # calc microf1
    my_f1_score = TP / (TP + 0.5 * (FP + FN))
    return my_f1_score


def score_feedback_comp(pred_df, gt_df, return_class_scores=False):
    class_scores = {}
    pred_df = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()
    for discourse_type, gt_subset in gt_df.groupby("discourse_type"):
        pred_subset = (
            pred_df.loc[pred_df["class"] == discourse_type]
            .reset_index(drop=True)
            .copy()
        )
        class_score = score_feedback_comp_micro(pred_subset, gt_subset)
        class_scores[discourse_type] = class_score
    f1 = np.mean([v for v in class_scores.values()])
    if return_class_scores:
        return f1, class_scores
    return f1

In [12]:
# 適当な id をひとつ選ぶ
selected_id = '423A1CA112E2'

# 適当な id に対応する train を作成
selected_gt = df_sentences.query('id == @selected_id')
display(selected_gt.head(3))

# 適当な id に対応するエッセイを sentence に分割
def split_text(text_id):
    # ファイル読み込み
    with open(os.path.join(TRAIN_DIR, f'{text_id}'+'.txt'), 'r') as f:
        text = f.read()
    sentences = nltk.sent_tokenize(text)
    
    # 単語に分解して番号付け
    lst_id_sentence = []
    word_id = 0
    for sentence in sentences:
        predictionstring = ' '.join(str(i) for i in range(word_id, word_id+len(sentence.split())))
        id_sentence = [text_id, sentence, predictionstring]
        lst_id_sentence.append(id_sentence)
        word_id += len(sentence.split())
    df = pd.DataFrame(lst_id_sentence, columns=['id', 'discourse_text', 'predictionstring'])
    
    return df

selected_sentences = split_text(selected_id)
display(selected_sentences.head(3))

# すべての単語に番号をつけて、train の discourse_text と sentence の類似度に基づいて sentence にラベルづけ
print(len(selected_gt), len(selected_sentences))
merged = selected_sentences.merge(selected_gt, on='id', how='outer', suffixes=('_pred', '_gt'))
display(merged.head(3))

jaccard = lambda x: len(set(x[0].split()).intersection(set(x[1].split()))) / len(set(x[0].split()).union(set(x[1].split())))
merged['jaccard'] = merged[['predictionstring_pred', 'predictionstring_gt']].apply(jaccard, axis=1)
merged_groupby_pred = merged.groupby('discourse_text_pred')
merged = merged.loc[merged_groupby_pred['jaccard'].idxmax(), :]
merged = merged.sort_index()
pred = merged[['id', 'discourse_text_pred', 'predictionstring_pred', 'discourse_type', 'jaccard']]
pred = pred.rename(columns={'predictionstring_pred': 'predictionstring', 'discourse_type': 'class'})
pred.head(3)

# 選択した id で評価
score_feedback_comp(pred[pred['class'] != 'No Class'], train.query('id == @selected_id'))

Unnamed: 0,id,discourse_text,discourse_type,predictionstring,label
44622,423A1CA112E2,Phones\n\n,No Class,0,7
44623,423A1CA112E2,Modern humans today are always on their phone....,Lead,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,0
44624,423A1CA112E2,They are some really bad consequences when stu...,Position,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59,1


Unnamed: 0,id,discourse_text,predictionstring
0,423A1CA112E2,Phones\n\nModern humans today are always on th...,0 1 2 3 4 5 6 7 8
1,423A1CA112E2,They are always on their phone more than 5 hou...,9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24...
2,423A1CA112E2,They even do it while driving.,39 40 41 42 43 44


11 27


Unnamed: 0,id,discourse_text_pred,predictionstring_pred,discourse_text_gt,discourse_type,predictionstring_gt,label
0,423A1CA112E2,Phones\n\nModern humans today are always on th...,0 1 2 3 4 5 6 7 8,Phones\n\n,No Class,0,7
1,423A1CA112E2,Phones\n\nModern humans today are always on th...,0 1 2 3 4 5 6 7 8,Modern humans today are always on their phone....,Lead,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,0
2,423A1CA112E2,Phones\n\nModern humans today are always on th...,0 1 2 3 4 5 6 7 8,They are some really bad consequences when stu...,Position,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59,1


0.5181818181818182

In [13]:
%%time
# すべての id で評価する

# すべての id に対応するエッセイを sentence に分割
def split_text(text_id):
    # ファイル読み込み
    with open(os.path.join(TRAIN_DIR, f'{text_id}'+'.txt'), 'r') as f:
        text = f.read()
    sentences = nltk.sent_tokenize(text)
    
    # 単語に分解して番号付け
    lst_id_sentence = []
    word_id = 0
    for sentence in sentences:
        predictionstring = ' '.join(str(i) for i in range(word_id, word_id+len(sentence.split())))
        id_sentence = [text_id, sentence, predictionstring]
        lst_id_sentence.append(id_sentence)
        word_id += len(sentence.split())
    df = pd.DataFrame(lst_id_sentence, columns=['id', 'discourse_text', 'predictionstring'])
    
    return df


lst_selected_sentences = []
for selected_id in train['id'].unique():
    selected_sentences = split_text(selected_id)
    lst_selected_sentences.append(selected_sentences)
selected_sentences = pd.concat(lst_selected_sentences)

# すべての単語に番号をつけて、train の discourse_text と sentence の類似度に基づいて sentence にラベルづけ
merged = selected_sentences.merge(df_sentences, on='id', how='outer', suffixes=('_pred', '_gt'))

# 各 id で類似度計算
jaccard = lambda x: len(set(x[0].split()).intersection(set(x[1].split()))) / len(set(x[0].split()).union(set(x[1].split())))
merged['jaccard'] = merged[['predictionstring_pred', 'predictionstring_gt']].apply(jaccard, axis=1)
merged_groupby_pred = merged.groupby('discourse_text_pred')
merged = merged.loc[merged_groupby_pred['jaccard'].idxmax(), :]
merged = merged.sort_index()

# すべての id で class と predictionstring を抽出
pred = merged[['id', 'discourse_text_pred', 'predictionstring_pred', 'discourse_type', 'jaccard']]
pred = pred.rename(columns={'predictionstring_pred': 'predictionstring', 'discourse_type': 'class'})

display(pred.head(3))

# 評価
print(score_feedback_comp(pred[pred['class'] != 'No Class'], train))

Unnamed: 0,id,discourse_text_pred,predictionstring,class,jaccard
1,423A1CA112E2,Phones\n\nModern humans today are always on th...,0 1 2 3 4 5 6 7 8,Lead,0.177778
12,423A1CA112E2,They are always on their phone more than 5 hou...,9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24...,Lead,0.681818
23,423A1CA112E2,They even do it while driving.,39 40 41 42 43 44,Lead,0.136364


0.4924182734462578
CPU times: user 3min 38s, sys: 3.76 s, total: 3min 42s
Wall time: 3min 48s


In [14]:
pred.to_csv('pred.csv', index=False)
df_sentences.to_csv('df_sentences.csv', index=False)

In [15]:
# discourse_type の次の discourse_type をクロス集計
df_sentences['next_discourse_type'] = df_sentences.groupby('id')['discourse_type'].transform(lambda x: x.shift(-1))
df_sentences_crosstab = pd.crosstab(df_sentences['discourse_type'], df_sentences['next_discourse_type'], normalize='index', dropna=False)
df_sentences_crosstab.style.background_gradient(axis=1)

next_discourse_type,Claim,Concluding Statement,Counterclaim,Evidence,Lead,No Class,Position,Rebuttal
discourse_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Claim,0.223285,0.022227,0.020909,0.570924,0.0,0.153969,0.008647,4e-05
Concluding Statement,0.019375,0.000969,0.024946,0.007508,0.0,0.882296,0.064907,0.0
Counterclaim,0.035,0.028448,0.011897,0.227759,0.0,0.054138,0.011897,0.630862
Evidence,0.401105,0.22164,0.075939,0.097731,6.7e-05,0.174341,0.01828,0.010896
Lead,0.093733,0.0,0.016446,0.044717,0.000107,0.093841,0.751048,0.000107
No Class,0.485333,0.030821,0.01946,0.256982,0.069162,0.0,0.131053,0.007189
Position,0.487302,0.035501,0.022337,0.179896,0.000199,0.274432,0.000133,0.000199
Rebuttal,0.16788,0.24325,0.054473,0.426626,0.0,0.090162,0.015027,0.002583


In [16]:
# 予測に対しておなじことをやる
pred['next_class'] = pred.groupby('id')['class'].transform(lambda x: x.shift(-1))
pred_crosstab = pd.crosstab(pred['class'], pred['next_class'], normalize='index', dropna=False)
pred_crosstab.style.background_gradient(axis=1)

next_class,Claim,Concluding Statement,Counterclaim,Evidence,Lead,No Class,Position,Rebuttal
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Claim,0.251518,0.024575,0.019286,0.637537,0.001969,0.051713,0.010082,0.003321
Concluding Statement,0.002489,0.936264,0.002657,0.001278,6.7e-05,0.048063,0.008375,0.000807
Counterclaim,0.054365,0.042546,0.2526,0.253388,0.000788,0.020328,0.007564,0.368421
Evidence,0.114142,0.060051,0.018155,0.768087,0.000851,0.027568,0.005844,0.005301
Lead,0.078129,0.008218,0.005312,0.021461,0.661463,0.006962,0.217808,0.000646
No Class,0.120601,0.029679,0.015311,0.394529,0.033887,0.378637,0.023873,0.003483
Position,0.482075,0.037987,0.02163,0.214367,0.000467,0.072234,0.167568,0.003672
Rebuttal,0.111237,0.17402,0.032805,0.276207,0.002262,0.035445,0.011501,0.356523


【結果と考察】
- Claim の次のクラスは、多い順に Evidence, Claim, No Class。これは正しい。
- Concluding Statement の次のクラスは、多い順に Concluding Statement, No Class, Position。これはおかしい。正しくは No Class, Position, Counterclaim。
- Counterclaim の次のクラスは、多い順に Rebuttal, Evidence, Counterclaim。これはおかしい。本来は Rebuttal, Ecidence, No Class。
- Evidence の次のクラスは、多い順に Evidence, Claim, Concluding Statement。これはおかしい。本来は Claim, Concluding Statement, No Class。
- Lead の次のクラスは、多い順に Lead, Position, Claim。これはおかしい。正しくは Position, No Class, Claim。
- No Class の次のクラスは、多い順に Evidence, No Class, Claim。これはおかしい。正しくは Claim, Evidence, Position。
- Position の次のクラスは、多い順に Claim, Evidence, Position。これはおかしい。正しくは Claim, No Class, Evidence。
- Rebuttal の次のクラスは、多い順に Rebuttal, Evidence, Concluding Statement。これはおかしい。正しくは Evidence, Concluding Statement, Claim。

【対策】
- Concluding Statement の直後に Concluding Statement ならひとつにつなげる。
- Counterclaim の直後に Counterclaim ならひとつにつなげる。
- **Claim 以外は、同じラベルが続いていたらつなげるのがよさそう。**
- Lead の次は Position または No Class, Claim に限定する。
- No Class の次は 圧倒的に Claim だが、予測では Evidence 多め。これはどうしようもないのでは？
- **基本、次ラベルの確率が低い場合、どう処理するのが妥当なのか検討の余地がある。**

TODO
- 同じクラスをつなげる
- ひとつの文が分割されているのはどのようなときか（接続表現が関係している？）
- 複数の文がつながっているのはどのようなときか

In [17]:
pred_connect = pred.copy()

# next_predictionstring 列と next_discourse_text を作成
pred_connect['next_predictionstring'] = pred_connect.groupby('id')['predictionstring'].transform(lambda x: x.shift(-1))
pred_connect['next_discourse_text_pred'] = pred_connect.groupby('id')['discourse_text_pred'].transform(lambda x: x.shift(-1))

# next_clas が同一の行に結合フラグを立てる
pred_connect['match'] = (pred_connect['class'] == pred_connect['next_class']) & (pred_connect['class'] != 'Claim')

# next_class が同一の行の次の行に消去フラグを立てる
pred_connect['delete'] = ((pred_connect['class'] == pred_connect['next_class']) & (pred_connect['class'] != 'Claim')).shift(1).fillna(False)

# 結合フラグが立つ行に対して、next_predictionstring と next_discourse_text を結合
f_match = lambda x: x[1]+' '+x[2] if x[0] else x[1]
pred_connect['predictionstring'] = pred_connect[['match', 'predictionstring', 'next_predictionstring']].apply(f_match, axis=1)

# 消去フラグが立つ行に対して、行削除
pred_connect = pred_connect.drop(pred_connect.index[pred_connect['delete']])

pred_connect.head(3)

# 評価
print(score_feedback_comp(pred_connect[pred_connect['class'] != 'No Class'], train))

0.7385307463147738


In [18]:
# もう一回推移確率行列
pred_connect['next_class'] = pred_connect.groupby('id')['class'].transform(lambda x: x.shift(-1))
pred_connect_crosstab = pd.crosstab(pred_connect['class'], pred_connect['next_class'], normalize='index', dropna=False)
pred_connect_crosstab.style.background_gradient(axis=1)

next_class,Claim,Concluding Statement,Counterclaim,Evidence,Lead,No Class,Position,Rebuttal
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Claim,0.251518,0.024575,0.019286,0.637537,0.001969,0.051713,0.010082,0.003321
Concluding Statement,0.03905,0.0,0.041689,0.020053,0.001055,0.75409,0.131398,0.012665
Counterclaim,0.072739,0.056926,0.0,0.339026,0.001054,0.027198,0.01012,0.492937
Evidence,0.492177,0.258936,0.078285,0.0,0.003671,0.118872,0.025201,0.022858
Lead,0.230786,0.024276,0.01569,0.063394,0.0,0.020566,0.64338,0.001908
No Class,0.194091,0.047764,0.024641,0.634941,0.054537,0.0,0.038421,0.005606
Position,0.579116,0.045633,0.025984,0.257519,0.000561,0.086775,0.0,0.004411
Rebuttal,0.172868,0.270437,0.050982,0.429241,0.003516,0.055084,0.017873,0.0


対角を結合したらスコアが 0.738 まで改善した。