In [96]:
import pandas as pd
import re
from collections import defaultdict, Counter
import numpy as np
# Seed NumPy's global random generator so that later np.random calls
# (e.g. np.random.shuffle) produce the same sequence on every fresh run.
np.random.seed(1337)

In [97]:
# Training data: raw CSV location and where its preprocessed copy is written.
input_file_path = 'data/train.csv'
output_file_path = 'data/preprocessed_train_data.csv'

# Inference data: raw CSV location and where its preprocessed copy is written.
input_file_path2 = 'data/test_for_inference.csv'
output_file_path2 = 'data/test_for_inference_preprocessed.csv'

# Load both raw files, each with its header row as found in the file.
train_data = pd.read_csv(input_file_path)
test_data = pd.read_csv(input_file_path2)

In [98]:
# Share of every toxicity class that goes to each split.
train_proportion = 0.7
val_proportion = 0.15
test_proportion = 0.15  # not actually used: the test split takes whatever remains

# Re-read the training file, replacing its header row with fixed column names.
comments = pd.read_csv(
    input_file_path,
    header=0,
    names=['comment', 'toxicity'],
)

# Bucket the rows (as plain dicts) by toxicity label so that every class is
# split independently and keeps the same train/val/test ratio.
by_toxicity = defaultdict(list)
for _, row in comments.iterrows():
    by_toxicity[row.toxicity].append(row.to_dict())

comment_subset = []

# Walk the classes in sorted label order so the shuffles always draw from the
# seeded generator in the same sequence.
for _, item_list in sorted(by_toxicity.items()):
    np.random.shuffle(item_list)

    n_total = len(item_list)
    n_train = int(train_proportion * n_total)
    n_val = int(val_proportion * n_total)
    n_test = n_total - (n_train + n_val)  # all remaining rows become the test set

    # Label each shuffled row by position: the first n_train rows are 'train',
    # the next n_val rows are 'val', and everything after that is 'test'.
    val_end = n_train + n_val
    for position, item in enumerate(item_list):
        if position < n_train:
            item['split'] = 'train'
        elif position < val_end:
            item['split'] = 'val'
        else:
            item['split'] = 'test'

    comment_subset += item_list

final_comments = pd.DataFrame(comment_subset)

In [99]:
def preprocess_text(text):
    """Normalise a raw comment to lowercase ASCII letters and basic punctuation.

    Processing steps, in order:
      1. A missing value (None / NaN / pd.NA) becomes an empty string; any
         other non-string value is converted with ``str``.
      2. Every character that is not an ASCII letter, one of ``. , ! ?``,
         or whitespace is removed.
      3. The text is lower-cased.
      4. Every run of whitespace (spaces, tabs, newlines, carriage returns,
         non-breaking spaces, ...) is collapsed into a single space.

    Args:
        text: The raw comment. Normally a ``str``; missing values are accepted.

    Returns:
        str: The cleaned text. Leading and trailing spaces are not stripped,
        so a comment that starts or ends with removed characters may keep a
        single space at that edge.
    """
    if not isinstance(text, str):
        # pandas reads an empty CSV cell as float NaN, which has no string
        # methods; treat missing values as empty text instead of crashing.
        text = '' if pd.isna(text) else str(text)
    text = re.sub(r'[^a-zA-Z.,!?\s]', '', text)
    text = text.lower()
    # The filter above keeps every whitespace character, so collapse on \s+
    # (not just ' +'): otherwise tabs and other non-space whitespace would
    # survive into the output. This also covers the \n / \r line breaks used
    # by different operating systems.
    text = re.sub(r'\s+', ' ', text)
    return text

# Clean the comment text of the split training data and store the labels as
# strings, then write the three relevant columns without the index.
final_comments['comment'] = final_comments['comment'].apply(preprocess_text)
final_comments['toxicity'] = final_comments['toxicity'].astype(str)
final_comments.to_csv(
    output_file_path,
    columns=['comment', 'toxicity', 'split'],
    index=False,
)

# Apply the same cleaning to the inference data and write only its comment column.
test_data['comment'] = test_data['comment'].apply(preprocess_text)
test_data.to_csv(output_file_path2, columns=['comment'], index=False)

# Print the two output paths.
for written_path in (output_file_path, output_file_path2):
    print(written_path)

data/preprocessed_train_data.csv
data/test_for_inference_preprocessed.csv


In [100]:
# Read the preprocessed training CSV back from disk and print its first 10 rows.
preprocessed_train_data = pd.read_csv('data/preprocessed_train_data.csv')
train_preview = preprocessed_train_data.head(10)
print(train_preview)

                                             comment  toxicity  split
0  indent reset please do. while these look good ...         0  train
1  i would like to known about pointer in java la...         0  train
2  copyright problems hello, . concerning your co...         0  train
3  elbing disambig any objections to redirecting ...         0  train
4  help why am i blocked help help help one more ...         0  train
5   indeed, but odd nature did not write you appe...         0  train
6   to be clear, the reference to each person wen...         0  train
7  i have added a new section, appearances in fic...         0  train
8   fiba it must have been i over saved your revi...         0  train
9            i just removed the whole dumb section.          0  train


In [101]:
# Read the preprocessed inference CSV back from disk and print its first 10 rows.
preprocessed_inference_data = pd.read_csv('data/test_for_inference_preprocessed.csv')
inference_preview = preprocessed_inference_data.head(10)
print(inference_preview)

                                             comment
0  vandalism, or just bad writing? so, i noticed ...
1  congratulations on getting mrio de andrade to ...
2                              link canvassing talk 
3   wpoverlinking policy on wikipediagenerally, a...
4  nomination i am officially nominating tenpound...
5  overall although brief, the article is well wr...
6   after consulting the very wpmosdab, there sho...
7   the hardestworking, most experienced, most in...
8  , january utc declineyou have not addressed an...
9                        redirect talkmar azul album
