# Import Libraries and Data

In [8]:
import numpy as np
import pandas as pd
pd.get_option("display.max_columns")
pd.set_option('display.max_columns', 300)
pd.get_option("display.max_rows")
pd.set_option('display.max_rows', 300)

import matplotlib.pyplot as plt
%matplotlib inline

import os
from os.path import join as opj
import gc

INPUT_PATH = '../../input/feedback-prize-effectiveness/'
LABEL = 'discourse_effectiveness'

In [9]:
train_df = pd.read_csv(opj(INPUT_PATH, 'train.csv'))
test_df = pd.read_csv(opj(INPUT_PATH, 'test.csv'))
sub_df = pd.read_csv(opj(INPUT_PATH, 'sample_submission.csv'))

print('train_df.shape = ', train_df.shape)
print('test_df.shape = ', test_df.shape)
print('sub_df.shape = ', sub_df.shape)

train_df.shape =  (36765, 5)
test_df.shape =  (10, 4)
sub_df.shape =  (10, 4)


In [2]:
unlabeled_df = pd.read_csv('../../input/feedback-prize-2021/train.csv').rename(columns={'id':'essay_id'})
unlabeled_df['discourse_id'] = unlabeled_df['discourse_id'].astype(str)

print('unlabeled_df.shape = ', unlabeled_df.shape)

unlabeled_df.shape =  (144293, 8)


In [3]:
unlabeled_df.head()

Unnamed: 0,essay_id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622627660524.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622627653021.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622627671020.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622627696365.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622627759780.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


# Construct part of full text which includes discourse_text

In [4]:
import pandas as pd
from tqdm import tqdm

def preprocessing(df, num_neighbor=3):
    unique_ids = sorted(df['essay_id'].unique())
    essay_df = []
    for essay_id in tqdm(unique_ids):
        tmp_essay_df = df[df['essay_id']==essay_id].reset_index(drop=True)
        for i in range(len(tmp_essay_df)):
            text = ''
            tmp_df = tmp_essay_df.iloc[max(0,i-num_neighbor):min(i+num_neighbor,len(tmp_essay_df))].reset_index(drop=True)
            for j in range(len(tmp_df)):
                sample = tmp_df.iloc[j]
                text += f'[{sample["discourse_type"].upper()}]{sample["discourse_text"]}'
            tmp_essay_df.loc[i,'neighbor_text'] = text
        essay_df.append(tmp_essay_df)
    essay_df = pd.concat(essay_df).reset_index(drop=True)
    print('essay_df.shape = ', essay_df.shape)
    df = df.merge(essay_df[['discourse_id','neighbor_text']], on='discourse_id', how='left')
    return df

In [5]:
unlabeled_df = preprocessing(unlabeled_df, num_neighbor=3)

100%|█████████████████████████████████████| 15594/15594 [03:11<00:00, 81.36it/s]


essay_df.shape =  (144293, 9)


In [6]:
print(unlabeled_df.shape)

unlabeled_df.head()

(144293, 9)


Unnamed: 0,essay_id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring,neighbor_text
0,423A1CA112E2,1622627660524.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,[LEAD]Modern humans today are always on their ...
1,423A1CA112E2,1622627653021.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59,[LEAD]Modern humans today are always on their ...
2,423A1CA112E2,1622627671020.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75,[LEAD]Modern humans today are always on their ...
3,423A1CA112E2,1622627696365.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...,[LEAD]Modern humans today are always on their ...
4,423A1CA112E2,1622627759780.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...,[POSITION]They are some really bad consequence...


# Remove Labeled Data

In [10]:
train_df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [11]:
unlabeled_df['essay_id'].unique().shape, train_df['essay_id'].unique().shape

((15594,), (4191,))

In [13]:
len(set(unlabeled_df['essay_id'].unique()) - set(train_df['essay_id'].unique()))

11403

In [14]:
15594 - 4191

11403

In [15]:
unlabeled_df = unlabeled_df[~unlabeled_df['essay_id'].isin(train_df['essay_id'])].reset_index(drop=True)
unlabeled_df.shape

(107528, 9)

In [16]:
unlabeled_df['essay_id'].unique().shape

(11403,)

In [17]:
import os
os.makedirs('./result', exist_ok=True)
unlabeled_df.to_csv('./result/unlabeled.csv', index=False)