* EDA for formulation  

# Import Libraries and Data

In [1]:
import numpy as np
import pandas as pd
# Widen the pandas display limits so wide/long frames are shown in full.
# (The former pd.get_option(...) calls were removed: their return values were
# discarded and, not being the last expression of the cell, never displayed.)
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

import matplotlib.pyplot as plt
%matplotlib inline

import os
from os.path import join as opj
import gc

# Directory holding the competition CSV files; it is joined with each file
# name when the tables are read.
INPUT_PATH = '../../input/feedback-prize-effectiveness/'

# Name of the label column in the training table.
# NOTE(review): LABEL is not referenced by any cell visible in this notebook;
# presumably it is the prediction target - confirm or remove.
LABEL = 'discourse_effectiveness'

In [2]:
# Read the three input tables from INPUT_PATH in one pass: the training
# rows, the test rows and the sample submission.
train_df, test_df, sub_df = (
    pd.read_csv(opj(INPUT_PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Report (rows, columns) of each table as a quick sanity check.
print('train_df.shape = ', train_df.shape)
print('test_df.shape = ', test_df.shape)
print('sub_df.shape = ', sub_df.shape)

train_df.shape =  (36765, 5)
test_df.shape =  (10, 4)
sub_df.shape =  (10, 4)


# Check Distribution

# Construct `feature_text`: a partial essay context built from the `discourse_text` of related rows

In [3]:
# Distinct essay ids found in the training table, in ascending order.
unique_ids = list(train_df['essay_id'].unique())
unique_ids.sort()

In [4]:
from tqdm import tqdm

# Size of the local context window around the current row.
# NOTE(review): the original slice was iloc[i-3:i+3], whose end is exclusive,
# so the window is 3 rows before and only 2 rows after the current row.
# That behaviour is preserved here - confirm the asymmetry is intended.
WINDOW_BEFORE = 3
WINDOW_AFTER = 2

# discourse_type values whose context is just the local window.
LOCAL_WINDOW_TYPES = ('Lead', 'Position', 'Evidence')
# discourse_type values whose context is the Position rows plus the local window.
POSITION_PLUS_WINDOW_TYPES = ('Counterclaim', 'Rebuttal')


def _local_window(essay_rows, i):
    """Return the rows of `essay_rows` around position `i`.

    The start is clamped at 0 so a negative offset cannot wrap around;
    the end needs no clamp because iloc slices stop at the last row.
    """
    start = max(0, i - WINDOW_BEFORE)
    return essay_rows.iloc[start:i + WINDOW_AFTER + 1]


def select_context_rows(essay_rows, i):
    """Pick the rows of one essay that form the context of row `i`.

    Parameters
    ----------
    essay_rows : pd.DataFrame
        All rows of a single essay, in their original order, with a
        `discourse_type` column.
    i : int
        Positional index of the row whose context is wanted.

    Returns
    -------
    pd.DataFrame
        - Lead / Position / Evidence: the local window around row `i`.
        - Claim: every Position row of the essay.
        - Counterclaim / Rebuttal: every Position row followed by the local
          window, with fully identical rows removed.
        - Concluding Statement: every Position and Claim row of the essay.

    Raises
    ------
    ValueError
        If the discourse type is none of the above. Previously such a row
        silently reused the context of the preceding row (or failed with a
        NameError on the very first row).
    """
    type_column = essay_rows['discourse_type']
    discourse_type = type_column.iat[i]

    if discourse_type in LOCAL_WINDOW_TYPES:
        return _local_window(essay_rows, i)

    if discourse_type == 'Claim':
        return essay_rows[type_column.isin(['Position'])]

    if discourse_type in POSITION_PLUS_WINDOW_TYPES:
        position_rows = essay_rows[type_column.isin(['Position'])]
        window_rows = _local_window(essay_rows, i)
        # A Position row inside the window would otherwise appear twice.
        return pd.concat([position_rows, window_rows], axis=0).drop_duplicates()

    if discourse_type == 'Concluding Statement':
        return essay_rows[type_column.isin(['Position', 'Claim'])]

    raise ValueError(f'Unhandled discourse_type {discourse_type!r} at row {i}')


def build_feature_text(context_rows):
    """Concatenate the rows as '[TYPE]text' pieces, in row order."""
    return ''.join(
        f'[{row_type.upper()}]{row_text}'
        for row_type, row_text in zip(context_rows['discourse_type'],
                                      context_rows['discourse_text'])
    )


# One groupby pass replaces a full boolean scan of train_df per essay.
# sort=True yields the essays in ascending essay_id order and keeps the
# original row order inside each essay.
essay_groups = train_df.groupby('essay_id', sort=True)

essay_frames = []
for _, essay_rows in tqdm(essay_groups, total=essay_groups.ngroups):
    essay_rows = essay_rows.reset_index(drop=True)
    feature_texts = [
        build_feature_text(select_context_rows(essay_rows, i))
        for i in range(len(essay_rows))
    ]
    # Attach the whole column at once instead of writing it cell by cell.
    essay_frames.append(essay_rows.assign(feature_text=feature_texts))

essay_df = pd.concat(essay_frames).reset_index(drop=True)
print('essay_df.shape = ', essay_df.shape)

100%|██████████████████████████████████████| 4191/4191 [00:33<00:00, 125.79it/s]


essay_df.shape =  (36765, 6)


In [5]:
# Preview the first five rows, including the new feature_text column.
essay_df.head(n=5)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,feature_text
0,fe6dfbd53216,00066EA9880D,Driverless cars are exaclty what you would exp...,Lead,Adequate,[LEAD]Driverless cars are exaclty what you wou...
1,ca9e1b60c9fb,00066EA9880D,The developement of these cars should be stopp...,Position,Effective,[LEAD]Driverless cars are exaclty what you wou...
2,6cf2157f4f19,00066EA9880D,the driver will be alerted when they will need...,Claim,Effective,[POSITION]The developement of these cars shoul...
3,d920880f4341,00066EA9880D,This is such a dangerous thing because we all ...,Evidence,Effective,[LEAD]Driverless cars are exaclty what you wou...
4,7d52c3216a0e,00066EA9880D,Another thing that can go wrong with these car...,Claim,Effective,[POSITION]The developement of these cars shoul...


In [6]:
# Attach feature_text to the training rows, matched on discourse_id.
# A feature_text column left over from an earlier run of this cell is dropped
# first; without that, re-running the cell would produce feature_text_x and
# feature_text_y instead of a single feature_text column.
train_df = (
    train_df
    .drop(columns=['feature_text'], errors='ignore')
    .merge(essay_df[['discourse_id', 'feature_text']], on='discourse_id', how='left')
)

In [7]:
# Note: os is also imported in the first cell of the notebook.
import os
# Create the output folder; exist_ok=True makes this a no-op when it already exists.
os.makedirs('./result', exist_ok=True)
# Write the training table (now carrying feature_text) without the index column.
train_df.to_csv('./result/train.csv', index=False)

In [8]:
# Preview the first five rows of the merged training table.
train_df.head(n=5)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,feature_text
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,"[LEAD]Hi, i'm Isaac, i'm going to be writing a..."
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,"[LEAD]Hi, i'm Isaac, i'm going to be writing a..."
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,"[POSITION]On my perspective, I think that the ..."
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,"[LEAD]Hi, i'm Isaac, i'm going to be writing a..."
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,"[POSITION]On my perspective, I think that the ..."
