# Preprocessing

* 컬럼명은 다음과 같이 
    * comment, score
* 지울 것(Dataset에서 수행)
    * 개행
    * 쌍따옴표
    * 도메인주소, ip address
* 소문자화(tokenizer에서 수행)
* scoring
    * 레이블당 기준
        * 나머지는 그냥하면되고
        * severe_toxic을 어떻게 처리할 것인지
            * 직접 보고 판단 : toxic 보다 더 offensive한지, 맞다면 가중치를 더 주는 방향으로
            * severe_toxic이 더 offensive하다. 가중치를 부여한다.
                * 1st : severe_toxic만 2로
                * 2nd : severe_toxic과 toxic의 상관관계의 역수 (약 2.5)를 severe_toxic에 곱해서 사용한다.
    * method
        * sum : 단순 합은 부적절
        * avg
            * “발생 평균” : 선정
        * max
        * weighted sum/mean
    * ruddit data
        * [-1, 1] —> [0,1] 

## 1st Data

In [2]:
import os
import pandas as pd
import numpy as np

# min max scaling (0~1 범위로 변환)
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()

In [26]:
# Load the raw training CSV of the first dataset.
# NOTE(review): this path has no 'data/' prefix, unlike the 'data/1st/...'
# paths used by the other cells of this section — confirm the working directory.
train_1st = pd.read_csv('1st/train.csv')
# Preview the first rows.
train_1st.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [33]:
# Give severe_toxic a higher weight: every positive label (1) becomes 2,
# all other values are left as they are.
train_1st['severe_toxic'] = train_1st['severe_toxic'].replace(1, 2)
# Show how many rows carry each value after re-weighting.
train_1st['severe_toxic'].value_counts()

0    157976
2      1595
Name: severe_toxic, dtype: int64

In [36]:
# "Occurrence mean": average only over the labels that are set for a row.
# Zeros are turned into NaN so that mean(skipna=True) ignores them; a row with
# no positive label therefore gets a NaN score at this point.
# The label columns are selected by name rather than by position, so running
# this cell again after 'score' exists does not fold the old score into the mean.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_1st['score'] = train_1st[label_cols].replace(0, float('NaN')).mean(axis=1, skipna=True)
train_1st.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,score
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,


In [37]:
# Rows without any positive label have a NaN score; store them as 0.
train_1st['score'] = train_1st['score'].fillna(0)

In [38]:
train_1st['score'].value_counts()

0.000000    143346
1.000000     14630
1.250000      1008
1.200000       329
1.333333       186
1.500000        41
1.166667        31
Name: score, dtype: int64

In [41]:
# Keep only the text and its score, exposed under the target column names
# ('comment', 'score').
train_1st = train_1st[['comment_text', 'score']].rename(columns={'comment_text': 'comment'})
train_1st.head()

Unnamed: 0,comment,score
0,Explanation\nWhy the edits made under my usern...,0.0
1,D'aww! He matches this background colour I'm s...,0.0
2,"Hey man, I'm really not trying to edit war. It...",0.0
3,"""\nMore\nI can't make any real suggestions on ...",0.0
4,"You, sir, are my hero. Any chance you remember...",0.0


In [42]:
# Persist the intermediate training set. index=False keeps the row index out of
# the file, so reading it back does not yield a stray 'Unnamed: 0' column and the
# file has the same layout as the other CSVs written by this notebook.
train_1st.to_csv('data/1st/train_1st.csv', index=False)

In [63]:
# read data
# Load the test comments and the separate file holding their labels.
test = pd.read_csv('data/1st/test.csv')
test_labels = pd.read_csv('data/1st/test_labels.csv')
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [64]:
# Attach the labels to the comments by matching on 'id' instead of assuming that
# both files list their rows in the same order. validate='one_to_one' makes
# pandas raise if an id occurs more than once on either side.
# Assumes the first column of test_labels (the one the positional version
# dropped) is 'id' — TODO confirm.
test = test.merge(test_labels, on='id', how='inner', validate='one_to_one')

In [65]:
test.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,":If you have a look back at the source, the in...",-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,-1,-1,-1,-1,-1,-1


In [62]:
test_labels['insult'].value_counts()

-1    89186
 0    60551
 1     3427
Name: insult, dtype: int64

In [66]:
len(test)

153164

In [67]:
# Drop the rows whose 'toxic' label is -1.
# .copy() turns the filtered result into an independent DataFrame, so the column
# assignments made in later cells do not trigger SettingWithCopyWarning.
test = test[test['toxic'] != -1].copy()
len(test)

63978

In [68]:
# Give severe_toxic a higher weight: every positive label (1) becomes 2,
# all other values are left as they are.
test['severe_toxic'] = test['severe_toxic'].replace(1, 2)
test.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0
7,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0


In [69]:
test.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,63978.0,63978.0,63978.0,63978.0,63978.0,63978.0
mean,0.095189,0.011473,0.057692,0.003298,0.053565,0.011129
std,0.293478,0.151043,0.233161,0.057334,0.22516,0.104905
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,2.0,1.0,1.0,1.0,1.0


In [70]:
# "Occurrence mean": average only over the labels that are set for a row.
# Zeros become NaN so that mean(skipna=True) ignores them; rows without any
# positive label come out as NaN and are then stored as 0.
# The label columns are selected by name rather than by position, so running
# this cell again after 'score' exists does not fold the old score into the mean.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
test['score'] = test[label_cols].replace(0, float('NaN')).mean(axis=1, skipna=True)
test['score'] = test['score'].fillna(0)
test.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,score
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0,0.0
7,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0,0.0
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0,0.0
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0,0.0
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0,0.0


In [71]:
test['score'].value_counts()

0.000000    57735
1.000000     5876
1.250000      185
1.200000      140
1.333333       28
1.166667       14
Name: score, dtype: int64

In [72]:
# Keep only the text and its score, exposed under the target column names
# ('comment', 'score').
test = test[['comment_text', 'score']].rename(columns={'comment_text': 'comment'})
test.head()

Unnamed: 0,comment,score
5,Thank you for understanding. I think very high...,0.0
7,:Dear god this site is horrible.,0.0
11,"""::: Somebody will invariably try to add Relig...",0.0
13,""" \n\n It says it right there that it IS a typ...",0.0
14,""" \n\n == Before adding a new product to the l...",0.0


In [73]:
# Append the labelled test rows to the training rows.
# ignore_index=True renumbers the rows 0..n-1; without it the result would carry
# duplicate index labels, because both inputs keep their own row numbers.
train_1st = pd.concat([train_1st, test], ignore_index=True)
print('len : ',len(train_1st))
train_1st.head()

len :  223549


Unnamed: 0,comment,score
0,Explanation\nWhy the edits made under my usern...,0.0
1,D'aww! He matches this background colour I'm s...,0.0
2,"Hey man, I'm really not trying to edit war. It...",0.0
3,"""\nMore\nI can't make any real suggestions on ...",0.0
4,"You, sir, are my hero. Any chance you remember...",0.0


In [None]:
# Min-max scale the scores. The scaler is handed a 2-D single-column array,
# hence the reshape(-1,1) before fitting and the flattening reshape(-1) after.
train_1st['score'] = min_max_scaler.fit_transform(train_1st['score'].to_numpy().reshape(-1,1)).reshape(-1)
train_1st.describe()

In [74]:
# Write the combined first dataset to disk without the row index.
train_1st.to_csv('data/1st/train_1st.csv', index=False)

## Ruddit

In [35]:
# Load the Ruddit CSV and preview the first rows.
ruddit_path = 'ruddit/ruddit_with_text_1.csv'
ruddit_data = pd.read_csv(ruddit_path)
ruddit_data.head()

Unnamed: 0,post_id,comment_id,txt,url,offensiveness_score
0,42g75o,cza1q49,> The difference in average earnings between m...,https://www.reddit.com/r/changemyview/comments...,-0.083
1,42g75o,cza1wdh,"The myth is that the ""gap"" is entirely based o...",https://www.reddit.com/r/changemyview/comments...,-0.022
2,42g75o,cza23qx,[deleted],https://www.reddit.com/r/changemyview/comments...,0.167
3,42g75o,cza2bw8,The assertion is that women get paid less for ...,https://www.reddit.com/r/changemyview/comments...,-0.146
4,42g75o,cza2iji,You said in the OP that's not what they're mea...,https://www.reddit.com/r/changemyview/comments...,-0.083


In [36]:
# Keep the comment text and its offensiveness score, renamed to the target
# column names ('comment', 'score').
ruddit_data = ruddit_data[['txt', 'offensiveness_score']].rename(
    columns={'txt': 'comment', 'offensiveness_score': 'score'}
)

In [37]:
# Remove the rows whose comment text is the '[deleted]' placeholder.
# .copy() turns the filtered result into an independent DataFrame, so assigning
# to its 'score' column afterwards does not trigger SettingWithCopyWarning.
ruddit_data = ruddit_data[ruddit_data['comment'] != '[deleted]'].copy()
len(ruddit_data)

453

In [38]:
# Min-max scale the offensiveness scores.
# NOTE(review): the scaler is fitted on this data, so the observed minimum and
# maximum are what get mapped to 0 and 1. That only equals the fixed
# [-1, 1] -> [0, 1] mapping named in the notes at the top if the data actually
# contains both -1 and 1 — confirm which mapping is intended.
ruddit_data['score'] = min_max_scaler.fit_transform(ruddit_data['score'].to_numpy().reshape(-1,1)).reshape(-1)
ruddit_data.describe()

Unnamed: 0,score
count,453.0
mean,0.477985
std,0.171724
min,0.0
25%,0.375457
50%,0.482104
75%,0.588751
max,1.0


In [39]:
ruddit_data.head()

Unnamed: 0,comment,score
0,> The difference in average earnings between m...,0.436815
1,"The myth is that the ""gap"" is entirely based o...",0.481373
3,The assertion is that women get paid less for ...,0.390796
4,You said in the OP that's not what they're mea...,0.436815
5,>Men and women are not payed less for the same...,0.466764


In [41]:
# Save the processed Ruddit data without the row index.
# NOTE(review): this writes 'ruddit.csv', while the split step reads
# 'ruddit/ruddit.csv' — confirm the file ends up where it is read from.
ruddit_data.to_csv('ruddit.csv', index=False)

## 2nd

In [3]:
# Load the raw data of the second dataset and show its shape.
# NOTE(review): the name `dir` shadows the Python builtin of the same name for
# the rest of the session.
dir = '2nd/jigsaw-unintended-bias-in-toxicity-classification'
raw_data = pd.read_csv(os.path.join(dir,'all_data.csv'))
raw_data.shape

(1999516, 46)

In [4]:
raw_data.head()

Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,...,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
0,1083994,He got his money... now he lies in wait till a...,train,2017-03-06 15:21:53.675241+00,21,,317120,approved,0,0,...,,,,,,,,,0,67
1,650904,Mad dog will surely put the liberals in mental...,train,2016-12-02 16:44:21.329535+00,21,,154086,approved,0,0,...,,,,,,,,,0,76
2,5902188,And Trump continues his lifelong cowardice by ...,train,2017-09-05 19:05:32.341360+00,55,,374342,approved,1,0,...,,,,,,,,,0,63
3,7084460,"""while arresting a man for resisting arrest"".\...",test,2016-11-01 16:53:33.561631+00,13,,149218,approved,0,0,...,,,,,,,,,0,76
4,5410943,Tucker and Paul are both total bad ass mofo's.,train,2017-06-14 05:08:21.997315+00,21,,344096,approved,0,0,...,,,,,,,,,0,80


### target dataset format
- the raw data is converted into a training dataset with the columns ['comment', 'score']
- comment : the comment text
- score : weighted average of the toxicity feature values, followed by min-max normalization

### toxicity features
- features : toxicity, severe_toxicity, obscene, sexual_explicit, identity_attack, insult, threat
- all toxicity features except "severe_toxicity" are counted with an equal weight of 1.0
- severe_toxicity is given a weight of 2.0

In [5]:
# Keep only the comment text and the toxicity feature columns.
raw_data = raw_data[['comment_text', 'toxicity', 'severe_toxicity', 'obscene', 'sexual_explicit', 'identity_attack', 'insult', 'threat']]
raw_data.head()

Unnamed: 0,comment_text,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat
0,He got his money... now he lies in wait till a...,0.373134,0.044776,0.089552,0.014925,0.0,0.343284,0.014925
1,Mad dog will surely put the liberals in mental...,0.605263,0.013158,0.065789,0.013158,0.092105,0.565789,0.065789
2,And Trump continues his lifelong cowardice by ...,0.666667,0.015873,0.031746,0.0,0.047619,0.666667,0.0
3,"""while arresting a man for resisting arrest"".\...",0.815789,0.065789,0.552632,0.592105,0.0,0.684211,0.105263
4,Tucker and Paul are both total bad ass mofo's.,0.55,0.0375,0.3375,0.275,0.0375,0.4875,0.0


In [6]:
# Apply the weight 2.0 to 'severe_toxicity'. Doubling the whole column gives the
# same values as doubling only the non-zero entries, since zero stays zero.
# Running this cell a second time doubles the column again.
raw_data['severe_toxicity'] = raw_data['severe_toxicity'] * 2.0
raw_data.head()

Unnamed: 0,comment_text,toxicity,severe_toxicity,obscene,sexual_explicit,identity_attack,insult,threat
0,He got his money... now he lies in wait till a...,0.373134,0.089552,0.089552,0.014925,0.0,0.343284,0.014925
1,Mad dog will surely put the liberals in mental...,0.605263,0.026316,0.065789,0.013158,0.092105,0.565789,0.065789
2,And Trump continues his lifelong cowardice by ...,0.666667,0.031746,0.031746,0.0,0.047619,0.666667,0.0
3,"""while arresting a man for resisting arrest"".\...",0.815789,0.131579,0.552632,0.592105,0.0,0.684211,0.105263
4,Tucker and Paul are both total bad ass mofo's.,0.55,0.075,0.3375,0.275,0.0375,0.4875,0.0


In [7]:
# Weighted average over the non-zero toxicity features of each row.
# Zeros become NaN so that mean(skipna=True) averages only the features that are
# present. np.nan is used because the np.NaN alias no longer exists in NumPy 2.
raw_data = raw_data.replace(0.0, np.nan)
# numeric_only=True keeps the 'comment_text' column out of the row-wise mean;
# recent pandas versions raise instead of silently skipping non-numeric columns.
means = raw_data.mean(axis=1, skipna=True, numeric_only=True)
print(means)

# Rows in which every feature was zero have a NaN mean; store those as 0.0.
df = pd.DataFrame({
    'comment': raw_data['comment_text'],
    'score': means.fillna(0.0),
})

df.head()

0          0.154229
1          0.204887
2          0.288889
3          0.480263
4          0.293750
             ...   
1999511    0.300000
1999512    0.400000
1999513    0.260000
1999514    0.233333
1999515    0.275000
Length: 1999516, dtype: float64


Unnamed: 0,comment,score
0,He got his money... now he lies in wait till a...,0.154229
1,Mad dog will surely put the liberals in mental...,0.204887
2,And Trump continues his lifelong cowardice by ...,0.288889
3,"""while arresting a man for resisting arrest"".\...",0.480263
4,Tucker and Paul are both total bad ass mofo's.,0.29375


In [None]:
# Min-max scale the scores. The scaler is handed a 2-D single-column array,
# hence the reshape(-1,1) before fitting and the flattening reshape(-1) after.
df['score'] = min_max_scaler.fit_transform(df['score'].to_numpy().reshape(-1,1)).reshape(-1)
df.describe()

In [None]:
# Save the preprocessed second dataset without the row index.
df.to_csv('2nd/jigsaw_unintended_preprocessing.csv', index=False)

In [8]:
# # normalize
# df = pd.read_csv('2nd/jigsaw_unintended_preprocessing.csv')
# df['score'] = min_max_scaler.fit_transform(df['score'].to_numpy().reshape(-1,1)).reshape(-1)
# df.to_csv('2nd/jigsaw_unintended_preprocessing.csv', index=False)
# df.describe()

Unnamed: 0,score
count,622373.0
mean,0.207814
std,0.110066
min,0.0
25%,0.132936
50%,0.159615
75%,0.249068
max,1.0


## split to train/dev

In [9]:
# Load the three preprocessed datasets.
# NOTE(review): '1st/train_1st.csv' and 'ruddit/ruddit.csv' differ from the
# paths the earlier cells write to ('data/1st/train_1st.csv' and 'ruddit.csv')
# — confirm the working directory / file locations before running.
df_1st = pd.read_csv('1st/train_1st.csv')
df_2nd = pd.read_csv('2nd/jigsaw_unintended_preprocessing.csv')
df_ruddit = pd.read_csv('ruddit/ruddit.csv')

In [11]:
from sklearn.model_selection import train_test_split

# Split every dataset 80/20 into train/dev with the same settings (shuffled,
# fixed seed 42). Each call is independent; the pairs are unpacked in the
# order of the input frames.
(train_1st, dev_1st), (train_2nd, dev_2nd), (train_ruddit, dev_ruddit) = (
    train_test_split(frame, test_size=0.2, random_state=42, shuffle=True)
    for frame in (df_1st, df_2nd, df_ruddit)
)

## Merge dataset

In [12]:
# Stack the train parts and the dev parts of the three datasets row-wise and
# report the resulting sizes.
train_df = pd.concat([train_1st, train_2nd, train_ruddit])
dev_df = pd.concat([dev_1st, dev_2nd, dev_ruddit])
print(f"train : {len(train_df)}, dev : {len(dev_df)}")

train : 1778813, dev : 444705


### clean

In [None]:
def clean(data, col):
    '''
    Normalize the text in ``data[col]`` and strip unwanted tokens.

    The column is rewritten in place and the same DataFrame is returned.

    Steps, in order:
      1. put spaces around '/', '!', '?' or '.' when it sits directly between
         two runs of letters;
      2. shorten runs of four or more identical '*', '!', '?' or "'" to three;
      3. surround runs of those characters with spaces;
      4. shorten a letter repeated three or more times at the end of a word to
         two, and a letter repeated four or more times inside a word to three;
      5. collapse runs of spaces, and turn each newline, into a single space;
      6. delete IBANs starting with "fr", e-mail addresses, web addresses,
         dotted-quad (IPv4-style) addresses, phone numbers and double quotes.

    Args:
        data: pandas DataFrame that holds the text column.
        col: name of the string column to clean.

    Returns:
        ``data`` itself, with ``data[col]`` replaced by the cleaned text.
    '''
    import re

    # Every pattern is a regular expression. They are compiled and passed with
    # regex=True so that they are never treated as literal strings: since
    # pandas 2.0 a plain string pattern is a literal by default, which would
    # turn all of these substitutions into no-ops.
    substitutions = [
        # Clean some punctuation
        (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
        # Replace characters repeated more than 3 times with 3 of them
        (r"([*!?'])\1\1{2,}", r'\1\1\1'),
        # Add space around repeating characters
        (r"([*!?']+)", r' \1 '),
        # Repeated letters at the end of / inside a word
        (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
        (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
        # Collapse spaces and newlines
        (r'[ ]{2,}|\n', ' '),
    ]

    # All pieces are raw strings. In a normal string literal "\b" is the
    # backspace character rather than the regex word boundary, which would keep
    # the web-address alternative from ever matching a URL.
    removal_pattern = (
        # IBANs (international bank account format) starting with "fr"
        r'(fr\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{2}|fr\d{20}|fr[ ]\d{2}[ ]\d{3}[ ]\d{3}[ ]\d{3}[ ]\d{5})|'
        # e-mail addresses
        r'((?:(?!.*?[.]{2})[a-zA-Z0-9](?:[a-zA-Z0-9.+!%-]{1,64}|)|\"[a-zA-Z0-9.+!% -]{1,64}\")@[a-zA-Z0-9][a-zA-Z0-9.-]+(.[a-z]{2,}|.[0-9]{1,}))|'
        # web addresses
        r'((https?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*))|'
        # dotted-quad addresses; the dots are escaped so that only a literal
        # '.' separates the number groups (an unescaped '.' matches any
        # character and would also delete e.g. "1 2 3 4" or long digit runs)
        r'([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)|'
        # phone numbers
        r'((?:(?:\+|00)33[\s.-]{0,3}(?:\(0\)[\s.-]{0,3})?|0)[1-9](?:(?:[\s.-]?\d{2}){4}|\d{2}(?:[\s.-]?\d{3}){2})|(\d{2}[ ]\d{2}[ ]\d{3}[ ]\d{3}))|'
        # quotation marks
        r'"'
    )
    substitutions.append((removal_pattern, ''))

    text = data[col]
    for pat, repl in substitutions:
        text = text.str.replace(re.compile(pat), repl, regex=True)
    data[col] = text

    return data

In [None]:
# Run the text cleaning on the 'comment' column of both splits.
train_df = clean(train_df,'comment')
dev_df = clean(dev_df,'comment')

### save

In [14]:
# Write the final train and dev sets without the row index.
train_df.to_csv('train.csv', index=False)
dev_df.to_csv('dev.csv', index=False)