In [2]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer
import re
import emoji
import gc

## WIKI

In [2]:
wiki = pd.read_csv("./train/Wiki.csv")

In [3]:
wiki.shape

(159571, 8)

In [4]:
wiki.head(100)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
95,003b9f448ee4a29d,"""\n\nThanks. I can see that violating clearly ...",0,0,0,0,0,0
96,003bd094feef5263,"""\nHi\nThanks for our kind words. See you arou...",0,0,0,0,0,0
97,003caacc6ce6c9e9,Collusion in poker \n\nThis is regarded as mos...,0,0,0,0,0,0
98,003d77a20601cec1,"Thanks much - however, if it's been resolved, ...",0,0,0,0,0,0


In [5]:
wiki.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [7]:
wiki.loc[:, "toxic"].value_counts()

0    144277
1     15294
Name: toxic, dtype: int64

In [8]:
for l in labels:
    print(l, wiki.loc[:, l].value_counts(), "\n")

toxic 0    144277
1     15294
Name: toxic, dtype: int64 

severe_toxic 0    157976
1      1595
Name: severe_toxic, dtype: int64 

obscene 0    151122
1      8449
Name: obscene, dtype: int64 

threat 0    159093
1       478
Name: threat, dtype: int64 

insult 0    151694
1      7877
Name: insult, dtype: int64 

identity_hate 0    158166
1      1405
Name: identity_hate, dtype: int64 



In [24]:
def getAvgEmb(df, label, model, comment_key, ratio=1.0):
    sub_df = df[df.loc[:, label] != 0].sort_values(by=label, axis=0, ascending=False)
    total = sub_df.shape[0]
    sub_df = sub_df[:int(ratio*total)]
    print("processing", sub_df.shape)
    embs = model.encode(sub_df[comment_key].tolist())
    # print(embs)
    return np.average(embs, axis=0) 

In [10]:
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1', device="cuda")

In [11]:
all_embs = []
for l in labels:
    all_embs.append(getAvgEmb(wiki, l, model, "comment_text"))


In [20]:
del wiki

## CIVIL

In [6]:
civil = pd.read_csv("./train/Civil.csv")

In [7]:
civil.shape

(1999516, 46)

In [17]:
civil.columns

Index(['id', 'comment_text', 'split', 'created_date', 'publication_id',
       'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes',
       'disagree', 'toxicity', 'severe_toxicity', 'obscene', 'sexual_explicit',
       'identity_attack', 'insult', 'threat', 'male', 'female', 'transgender',
       'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
       'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu',
       'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian',
       'latino', 'other_race_or_ethnicity', 'physical_disability',
       'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
       'other_disability', 'identity_annotator_count',
       'toxicity_annotator_count'],
      dtype='object')

In [10]:
Clabels = ["toxicity", "severe_toxicity", "obscene", "threat", "insult", "identity_attack", "sexual_explicit"] # sexual_explicit not in wiki

In [7]:
civil[Clabels].head()

Unnamed: 0,toxicity,severe_toxicity,obscene,threat,insult,identity_attack
0,0.373134,0.044776,0.089552,0.014925,0.343284,0.0
1,0.605263,0.013158,0.065789,0.065789,0.565789,0.092105
2,0.666667,0.015873,0.031746,0.0,0.666667,0.047619
3,0.815789,0.065789,0.552632,0.105263,0.684211,0.0
4,0.55,0.0375,0.3375,0.0,0.4875,0.0375
5,0.20339,0.016949,0.050847,0.0,0.20339,0.033898
6,0.525,0.0125,0.0375,0.0625,0.4625,0.1
7,0.192982,0.0,0.0,0.070175,0.035088,0.0
8,0.803279,0.065574,0.04918,0.016393,0.754098,0.131148
9,0.710526,0.013158,0.065789,0.013158,0.657895,0.013158


In [15]:
civil[Clabels].describe()

Unnamed: 0,toxicity,severe_toxicity,obscene,threat,insult,identity_attack
count,1999516.0,1999516.0,1999516.0,1999516.0,1999516.0,1999516.0
mean,0.1029241,0.004580563,0.01389045,0.009280932,0.08113795,0.02254899
std,0.1970386,0.0228576,0.06466376,0.04933784,0.1760993,0.07854022
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.1666667,0.0,0.0,0.0,0.08456672,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
for l in Clabels:
    print(l, (civil.loc[:, l] != 0).value_counts(), "\n")

toxicity False    1401762
True      597754
Name: toxicity, dtype: int64 

severe_toxicity False    1883681
True      115835
Name: severe_toxicity, dtype: int64 

obscene False    1836473
True      163043
Name: obscene, dtype: int64 

threat False    1881335
True      118181
Name: threat, dtype: int64 

insult False    1496268
True      503248
Name: insult, dtype: int64 

identity_attack False    1757998
True      241518
Name: identity_attack, dtype: int64 



In [23]:
all_Cembs = []
for l in tqdm(Clabels):
    all_Cembs.append(getAvgEmb(civil, l, model, "comment_text", ratio=0.1))

100%|██████████| 6/6 [02:37<00:00, 26.28s/it]


In [25]:
all_Cembs

[array([-6.42740075e-03, -1.77238304e-02,  3.05103697e-03, -3.11351493e-02,
         1.36434156e-02, -1.25212157e-02, -9.12306458e-03,  4.12886264e-03,
         3.15915644e-02, -2.81091034e-02, -1.54072912e-02, -1.81439612e-02,
        -1.57131944e-02, -2.06058025e-02, -1.46123832e-02,  3.36038484e-03,
        -1.73929911e-02, -1.56463664e-02,  7.54034705e-03,  3.56705510e-03,
        -3.26230656e-03,  1.49954632e-02, -2.59975865e-02, -4.64248151e-04,
        -1.40125314e-02,  1.83458629e-04,  1.07668415e-02,  6.70027104e-04,
        -1.73645373e-02, -8.30014031e-08, -4.41372534e-03, -1.47053301e-02,
         3.61936982e-03,  6.78155757e-03, -1.33835082e-03,  1.00589301e-02,
        -4.50910721e-03,  1.49129434e-02,  2.03502420e-02, -4.89658304e-03,
        -1.25348568e-02,  2.56195050e-02,  4.06369055e-03,  6.22715987e-03,
         5.10890922e-03,  2.75201770e-03, -2.68455762e-02, -7.21417091e-05,
        -7.30940746e-03, -4.69527813e-03,  8.08863994e-03,  8.17161892e-03,
        -1.9

In [40]:
for i in range(len(labels)):
    sim = cosine_similarity([all_embs[i]], [all_Cembs[i]]) # good enough sim
    print(labels[i], sim) 

toxic [[0.6282433]]
severe_toxic [[0.42028534]]
obscene [[0.6086144]]
threat [[0.58637583]]
insult [[0.5944496]]
identity_hate [[0.5822398]]


In [27]:
# civil.drop_duplicates(subset=["comment_text"], inplace=True) # duplicates in commment
# civil = civil[civil["comment_text"].str.len().gt(50)] # should have atleast 50 chars

# civil["txt"] = civil["comment_text"].apply(lambda x: preprocess(x)) # preprocess from below
# weights = [1, 2, 1, 1, 1, 1, 1] # toxicity is function of other labels in civil
# civil["score"] = 0
# for i, label in enumerate(Clabels):
#     # many labels are zeros # average might not be the best i think 
#     civil["score"] += (weights[i] * civil[label])

# civil["score"] = civil["score"]/sum(weights)

# nontoxic_df = civil[civil["score"] == 0]
# toxic_df =  civil[civil["score"] != 0]
# print(nontoxic_df.shape, toxic_df.shape)
# # civil.shape

# downsample = pd.concat([nontoxic_df.sample(frac=0.45), toxic_df], ignore_index=True)
# downsample.shape
# downsample = downsample.sample(frac=1).reset_index(drop=True) # shuffle
# downsample[["comment_text", "score"]].to_csv("./train/civil_with_downsample.csv", index=False) # txt


(1195716, 47) (571706, 47)


In [26]:
# print(civil["comment_text"][2])
# print(preprocess(civil["comment_text"][2])) # dont preprocess!!!!
# translateAbuseWords(civil["comment_text"][2])

And Trump continues his lifelong cowardice by not making this announcement himself.

What an awful human being .....
and trump continues his lifelong cowardice by not making this announcement himsel fuck an awful human being .


'And Trump continues his lifelong cowardice by not making this announcement himsel  fuck an awful human being .'

In [7]:
# downsample = pd.read_csv("./train/clean_civil.csv")
# downsample.head()
# idx = downsample["txt"].str.split().str.len().idxmax()
# len(downsample["txt"][idx].split())
# asd = downsample["txt"].values#
asd[[0,1,2]]
# del asd
# gc.collect()

array(['you are now sounding confused  you should of stopped w the one comment x ',
       'denver passed the law it did because the  good illegals  and their families and friends were afraid to turn the  bad illegals  in to police out of fear their cooperation with police would lead to their own deportation  that means crime goes unreported and justice is denied to denver residents who are victims of their crime ',
       'tell us  melania  what was it that first attracted you to millionaire donald trump '],
      dtype=object)

## Testing SentenceTransformer

In [None]:
wiki.loc[6]

id                                           0002bcb3da6cb337
comment_text     COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
toxic                                                       1
severe_toxic                                                1
obscene                                                     1
threat                                                      0
insult                                                      1
identity_hate                                               0
Name: 6, dtype: object

In [None]:
model.encode(wiki.loc[6, "comment_text"]) # NO  gpu

array([-2.54649129e-02, -8.34596067e-05, -4.50810418e-02, -5.03016748e-02,
        3.64921018e-02, -6.14717193e-02,  4.10313010e-02,  2.73257550e-02,
       -1.55926608e-02,  2.41791457e-03, -6.99439496e-02,  8.62122606e-03,
       -4.95885732e-03,  5.00570983e-02, -5.98311238e-02, -6.28236532e-02,
       -1.69931892e-02, -3.59678157e-02,  3.00908368e-02,  3.59595902e-02,
        1.63833629e-02,  6.90566972e-02, -4.41116430e-02, -3.86650860e-02,
       -8.71962309e-03,  3.46070603e-02,  1.63976569e-02, -2.74999440e-02,
       -4.92036901e-02,  1.18618626e-02, -1.24798110e-03, -2.64517907e-02,
       -1.85119510e-02,  2.07509305e-02,  5.81298210e-03, -3.92441861e-02,
        3.19686085e-02, -1.94483541e-03, -2.42674053e-02,  4.41726185e-02,
        2.66867112e-02, -4.39015515e-02, -2.97297575e-02, -1.16809399e-03,
       -2.36776546e-02, -1.75332446e-02,  4.08588499e-02,  2.78872699e-02,
        3.87955233e-02,  1.69089679e-02,  1.48447221e-02, -4.47656587e-02,
       -3.58563699e-02, -

## Ruddit

In [2]:
ruddit = pd.read_csv("./train/ruddit_with_text.csv")
ruddit.shape

(5838, 5)

In [3]:
ruddit.head()

Unnamed: 0,post_id,comment_id,txt,url,score
0,42g75o,cza1q49,> The difference in average earnings between m...,https://www.reddit.com/r/changemyview/comments...,-0.083
1,42g75o,cza1wdh,"The myth is that the ""gap"" is entirely based o...",https://www.reddit.com/r/changemyview/comments...,-0.022
2,42g75o,cza23qx,[deleted],https://www.reddit.com/r/changemyview/comments...,0.167
3,42g75o,cza2bw8,The assertion is that women get paid less for ...,https://www.reddit.com/r/changemyview/comments...,-0.146
4,42g75o,cza2iji,You said in the OP that's not what they're mea...,https://www.reddit.com/r/changemyview/comments...,-0.083


In [4]:
ruddit["txt"].value_counts() # remove [deleted], [removed]

[deleted]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      116
[removed]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

In [5]:
ruddit = ruddit[ruddit["txt"] != "[removed]"]
ruddit = ruddit[ruddit["txt"] != "[deleted]"]
ruddit.shape

(5710, 5)

In [9]:
# ruddit["comment_id"].value_counts() # comment_id is unique
ruddit[["comment_id", "txt", "score"]].to_csv("./train/removed_redundant_ruddit_with_text.csv", index=False)

In [4]:
print(ruddit.loc[0, "txt"])
# ruddit["txt"].apply(lambda x: x.strip("\n")).value_counts()

> The difference in average earnings between men and women can be explained by taking into account relevant factors.

So it isn't a myth, you just feel that you can explain it.


In [8]:
#----------------------------------- Preprocessing-------------------------------------#
SYMBOLS_TO_ISOLATE = '.,?!-;*"…:—()%#$&_/@＼・ω+=”“[]^–>\\°<~•≠™ˈʊɒ∞§{}·τα❤☺ɡ|¢→̶`❥━┣┫┗Ｏ►★©―ɪ✔®\x96\x92●£♥➤´¹☕≈÷♡◐║▬′ɔː€۩۞†μ✒➥═☆ˌ◄½ʻπδηλσερνʃ✬ＳＵＰＥＲＩＴ☻±♍µº¾✓◾؟．⬅℅»Вав❣⋅¿¬♫ＣＭβ█▓▒░⇒⭐›¡₂₃❧▰▔◞▀▂▃▄▅▆▇↙γ̄″☹➡«φ⅓„✋：¥̲̅́∙‛◇✏▷❓❗¶˚˙）сиʿ✨。ɑ\x80◕！％¯−ﬂﬁ₁²ʌ¼⁴⁄₄⌠♭✘╪▶☭✭♪☔☠♂☃☎✈✌✰❆☙○‣⚓年∎ℒ▪▙☏⅛ｃａｓǀ℮¸ｗ‚∼‖ℳ❄←☼⋆ʒ⊂、⅔¨͡๏⚾⚽Φ×θ￦？（℃⏩☮⚠月✊❌⭕▸■⇌☐☑⚡☄ǫ╭∩╮，例＞ʕɐ̣Δ₀✞┈╱╲▏▕┃╰▊▋╯┳┊≥☒↑☝ɹ✅☛♩☞ＡＪＢ◔◡↓♀⬆̱ℏ\x91⠀ˤ╚↺⇤∏✾◦♬³の｜／∵∴√Ω¤☜▲↳▫‿⬇✧ｏｖｍ－２０８＇‰≤∕ˆ⚜☁'
SYMBOLS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—\n🍕\r🐵\xa0\ue014\t\uf818\uf04a\xad😢🐶️\uf0e0😜😎👊\u200b\u200e😁عدويهصقأناخلىبمغر😍💖💵Е👎😀😂\u202a\u202c🔥😄🏻💥ᴍʏʀᴇɴᴅᴏᴀᴋʜᴜʟᴛᴄᴘʙғᴊᴡɢ😋👏שלוםבי😱‼\x81エンジ故障\u2009🚌ᴵ͞🌟😊😳😧🙀😐😕\u200f👍😮😃😘אעכח💩💯⛽🚄🏼ஜ😖ᴠ🚲‐😟😈💪🙏🎯🌹😇💔😡\x7f👌ἐὶήιὲκἀίῃἴξ🙄Ｈ😠\ufeff\u2028😉😤⛺🙂\u3000تحكسة👮💙فزط😏🍾🎉😞\u2008🏾😅😭👻😥😔😓🏽🎆🍻🍽🎶🌺🤔😪\x08‑🐰🐇🐱🙆😨🙃💕𝘊𝘦𝘳𝘢𝘵𝘰𝘤𝘺𝘴𝘪𝘧𝘮𝘣💗💚地獄谷улкнПоАН🐾🐕😆ה🔗🚽歌舞伎🙈😴🏿🤗🇺🇸мυтѕ⤵🏆🎃😩\u200a🌠🐟💫💰💎эпрд\x95🖐🙅⛲🍰🤐👆🙌\u2002💛🙁👀🙊🙉\u2004ˢᵒʳʸᴼᴷᴺʷᵗʰᵉᵘ\x13🚬🤓\ue602😵άοόςέὸתמדףנרךצט😒͝🆕👅👥👄🔄🔤👉👤👶👲🔛🎓\uf0b7\uf04c\x9f\x10成都😣⏺😌🤑🌏😯ех😲Ἰᾶὁ💞🚓🔔📚🏀👐\u202d💤🍇\ue613小土豆🏡❔⁉\u202f👠》कर्मा🇹🇼🌸蔡英文🌞🎲レクサス😛外国人关系Сб💋💀🎄💜🤢َِьыгя不是\x9c\x9d🗑\u2005💃📣👿༼つ༽😰ḷЗз▱ц￼🤣卖温哥华议会下降你失去所有的钱加拿大坏税骗子🐝ツ🎅\x85🍺آإشء🎵🌎͟ἔ油别克🤡🤥😬🤧й\u2003🚀🤴ʲшчИОРФДЯМюж😝🖑ὐύύ特殊作戦群щ💨圆明园קℐ🏈😺🌍⏏ệ🍔🐮🍁🍆🍑🌮🌯🤦\u200d𝓒𝓲𝓿𝓵안영하세요ЖљКћ🍀😫🤤ῦ我出生在了可以说普通话汉语好极🎼🕺🍸🥂🗽🎇🎊🆘🤠👩🖒🚪天一家⚲\u2006⚭⚆⬭⬯⏖新✀╌🇫🇷🇩🇪🇮🇬🇧😷🇨🇦ХШ🌐\x1f杀鸡给猴看ʁ𝗪𝗵𝗲𝗻𝘆𝗼𝘂𝗿𝗮𝗹𝗶𝘇𝗯𝘁𝗰𝘀𝘅𝗽𝘄𝗱📺ϖ\u2000үսᴦᎥһͺ\u2007հ\u2001ɩｙｅ൦ｌƽｈ𝐓𝐡𝐞𝐫𝐮𝐝𝐚𝐃𝐜𝐩𝐭𝐢𝐨𝐧Ƅᴨןᑯ໐ΤᏧ௦Іᴑ܁𝐬𝐰𝐲𝐛𝐦𝐯𝐑𝐙𝐣𝐇𝐂𝐘𝟎ԜТᗞ౦〔Ꭻ𝐳𝐔𝐱𝟔𝟓𝐅🐋ﬃ💘💓ё𝘥𝘯𝘶💐🌋🌄🌅𝙬𝙖𝙨𝙤𝙣𝙡𝙮𝙘𝙠𝙚𝙙𝙜𝙧𝙥𝙩𝙪𝙗𝙞𝙝𝙛👺🐷ℋ𝐀𝐥𝐪🚶𝙢Ἱ🤘ͦ💸ج패티Ｗ𝙇ᵻ👂👃ɜ🎫\uf0a7БУі🚢🚂ગુજરાતીῆ🏃𝓬𝓻𝓴𝓮𝓽𝓼☘﴾̯﴿₽\ue807𝑻𝒆𝒍𝒕𝒉𝒓𝒖𝒂𝒏𝒅𝒔𝒎𝒗𝒊👽😙\u200cЛ‒🎾👹⎌🏒⛸公寓养宠物吗🏄🐀🚑🤷操美𝒑𝒚𝒐𝑴🤙🐒欢迎来到阿拉斯ספ𝙫🐈𝒌𝙊𝙭𝙆𝙋𝙍𝘼𝙅ﷻ🦄巨收赢得白鬼愤怒要买额ẽ🚗🐳𝟏𝐟𝟖𝟑𝟕𝒄𝟗𝐠𝙄𝙃👇锟斤拷𝗢𝟳𝟱𝟬⦁マルハニチロ株式社⛷한국어ㄸㅓ니͜ʖ𝘿𝙔₵𝒩ℯ𝒾𝓁𝒶𝓉𝓇𝓊𝓃𝓈𝓅ℴ𝒻𝒽𝓀𝓌𝒸𝓎𝙏ζ𝙟𝘃𝗺𝟮𝟭𝟯𝟲👋🦊多伦🐽🎻🎹⛓🏹🍷🦆为和中友谊祝贺与其想象对法如直接问用自己猜本传教士没积唯认识基督徒曾经让相信耶稣复活死怪他但当们聊些政治题时候战胜因圣把全堂结婚孩恐惧且栗谓这样还♾🎸🤕🤒⛑🎁批判检讨🏝🦁🙋😶쥐스탱트뤼도석유가격인상이경제황을렵게만들지않록잘관리해야합다캐나에서대마초와화약금의품런성분갈때는반드시허된사용🔫👁凸ὰ💲🗯𝙈Ἄ𝒇𝒈𝒘𝒃𝑬𝑶𝕾𝖙𝖗𝖆𝖎𝖌𝖍𝖕𝖊𝖔𝖑𝖉𝖓𝖐𝖜𝖞𝖚𝖇𝕿𝖘𝖄𝖛𝖒𝖋𝖂𝕴𝖟𝖈𝕸👑🚿💡知彼百\uf005𝙀𝒛𝑲𝑳𝑾𝒋𝟒😦𝙒𝘾𝘽🏐𝘩𝘨ὼṑ𝑱𝑹𝑫𝑵𝑪🇰🇵👾ᓇᒧᔭᐃᐧᐦᑳᐨᓃᓂᑲᐸᑭᑎᓀᐣ🐄🎈🔨🐎🤞🐸💟🎰🌝🛳点击查版🍭𝑥𝑦𝑧ＮＧ👣\uf020っ🏉ф💭🎥Ξ🐴👨🤳🦍\x0b🍩𝑯𝒒😗𝟐🏂👳🍗🕉🐲چی𝑮𝗕𝗴🍒ꜥⲣⲏ🐑⏰鉄リ事件ї💊「」\uf203\uf09a\uf222\ue608\uf202\uf099\uf469\ue607\uf410\ue600燻製シ虚偽屁理屈Г𝑩𝑰𝒀𝑺🌤𝗳𝗜𝗙𝗦𝗧🍊ὺἈἡχῖΛ⤏🇳𝒙ψՁմեռայինրւդձ冬至ὀ𝒁🔹🤚🍎𝑷🐂💅𝘬𝘱𝘸𝘷𝘐𝘭𝘓𝘖𝘹𝘲𝘫کΒώ💢ΜΟΝΑΕ🇱♲𝝈↴💒⊘Ȼ🚴🖕🖤🥘📍👈➕🚫🎨🌑🐻𝐎𝐍𝐊𝑭🤖🎎😼🕷ｇｒｎｔｉｄｕｆｂｋ𝟰🇴🇭🇻🇲𝗞𝗭𝗘𝗤👼📉🍟🍦🌈🔭《🐊🐍\uf10aლڡ🐦\U0001f92f\U0001f92a🐡💳ἱ🙇𝗸𝗟𝗠𝗷🥜さようなら🔼'
ISOLATE_DICT = {ord(c):f' {c} ' for c in SYMBOLS_TO_ISOLATE}
REMOVE_DICT = {ord(c):f' ' for c in SYMBOLS_TO_REMOVE}
CONTRACTION_MAPPING = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

In [9]:
### not the best processing # eg lossing emojis info ###
def handle_punctuation(text):
    text = text.translate(REMOVE_DICT)
    text = text.translate(ISOLATE_DICT)
    return text

def clean_contractions(text, mapping=CONTRACTION_MAPPING):
    '''
    Expand contractions
    '''
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text

def preprocess(x):
    x = handle_punctuation(x)
    x = clean_contractions(x)
    return x

In [53]:
# ">" in REMOVE_DICT
# ruddit.loc[0, "txt"].translate(REMOVE_DICT)
ruddit["txt"] = ruddit["txt"].apply(lambda x: preprocess(x))

In [6]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # "GroNLP/hateBERT"

In [8]:
# tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do") # hate bert from transfomers doesnt do emoji conversion as in paper
line = "Don't you love 🤗 Transformers? We sure do. @Tejas https://arxiv.org/pdf/2010.12472.pdf"
print(line)
full_line = re.sub(r'#([^ ]*)', r'\1', line)
full_line = re.sub(r'https.*[^ ]', 'URL', full_line)
full_line = re.sub(r'http.*[^ ]', 'URL', full_line)
full_line = re.sub(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9_]+)', '@USER', full_line)
full_line = emoji.demojize(full_line)
full_line = re.sub(r'(:.*?:)', r' \1 ', full_line)
full_line = re.sub(' +', ' ', full_line)
print(full_line)
print(tokenizer.tokenize(line))
print(tokenizer.tokenize(full_line))

print(tokenizer.encode(full_line))

Don't you love 🤗 Transformers? We sure do. @Tejas https://arxiv.org/pdf/2010.12472.pdf
Don't you love :smiling_face_with_open_hands: Transformers? We sure do. @USER URL
['don', "'", 't', 'you', 'love', '[UNK]', 'transformers', '?', 'we', 'sure', 'do', '.', '@', 'te', '##jas', 'https', ':', '/', '/', 'ar', '##xi', '##v', '.', 'org', '/', 'pdf', '/', '2010', '.', '124', '##7', '##2', '.', 'pdf']
['don', "'", 't', 'you', 'love', ':', 'smiling', '_', 'face', '_', 'with', '_', 'open', '_', 'hands', ':', 'transformers', '?', 'we', 'sure', 'do', '.', '@', 'user', 'ur', '##l']
[101, 2123, 1005, 1056, 2017, 2293, 1024, 5629, 1035, 2227, 1035, 2007, 1035, 2330, 1035, 2398, 1024, 19081, 1029, 2057, 2469, 2079, 1012, 1030, 5310, 24471, 2140, 102]


In [9]:
tokenizer.convert_ids_to_tokens(tokenizer.encode("piss off")) # ruddit.loc[0, "txt"]

['[CLS]', 'piss', 'off', '[SEP]']

In [24]:
RE_PATTERNS = {
    ' american ':
        [
            'amerikan'
        ],
    ' adolf ':
        [
            'adolf'
        ],
    ' hitler ':
        [
            'hitler'
        ],
    ' fuck':
        [
            '(f)(u|[^a-z0-9 ])(c|[^a-z0-9 ])(k|[^a-z0-9 ])([^ ])*',
            '(f)([^a-z]*)(u)([^a-z]*)(c)([^a-z]*)(k)',
            ' f[!@#\$%\^\&\*]*u[!@#\$%\^&\*]*k', 'f u u c',
            '(f)(c|[^a-z ])(u|[^a-z ])(k)', #r'f\*',
            'feck ', ' fux ', 'f\*\*', 
            'f\-ing', 'f\.u\.', 'f###', ' fu ', 'f@ck', 'f u c k', 'f uck', 'f ck'
        ],
    ' ass ':
        [
            '[^a-z]ass ', '[^a-z]azz ', 'arrse', ' arse ', '@\$\$'
                                                           '[^a-z]anus', ' a\*s\*s', '[^a-z]ass[^a-z ]',
            'a[@#\$%\^&\*][@#\$%\^&\*]', '[^a-z]anal ', 'a s s'
        ],
    ' ass hole ':
        [
            ' a[s|z]*wipe', 'a[s|z]*[w]*h[o|0]+[l]*e', '@\$\$hole'
        ],
    ' bitch ':
        [
            'b[w]*i[t]*ch', 'b!tch',
            'bi\+ch', 'b!\+ch', '(b)([^a-z]*)(i)([^a-z]*)(t)([^a-z]*)(c)([^a-z]*)(h)',
            'biatch', 'bi\*\*h', 'bytch', 'b i t c h'
        ],
    ' bastard ':
        [
            'ba[s|z]+t[e|a]+rd'
        ],
    ' trans gender':
        [
            'transgender'
        ],
    ' gay ':
        [
            'gay'
        ],
    ' cock ':
        [
            '[^a-z]cock', 'c0ck', '[^a-z]cok ', 'c0k', '[^a-z]cok[^aeiou]', ' cawk',
            '(c)([^a-z ])(o)([^a-z ]*)(c)([^a-z ]*)(k)', 'c o c k'
        ],
    ' dick ':
        [
            ' dick[^aeiou]', 'deek', 'd i c k'
        ],
    ' suck ':
        [
            'sucker', '(s)([^a-z ]*)(u)([^a-z ]*)(c)([^a-z ]*)(k)', 'sucks', '5uck', 's u c k'
        ],
    ' cunt ':
        [
            'cunt', 'c u n t'
        ],
    ' bull shit ':
        [
            'bullsh\*t', 'bull\$hit'
        ],
    ' homo sex ual':
        [
            'homosexual'
        ],
    ' jerk ':
        [
            'jerk'
        ],
    ' idiot ':
        [
            'i[d]+io[t]+', '(i)([^a-z ]*)(d)([^a-z ]*)(i)([^a-z ]*)(o)([^a-z ]*)(t)', 'idiots'
                                                                                      'i d i o t'
        ],
    ' dumb ':
        [
            '(d)([^a-z ]*)(u)([^a-z ]*)(m)([^a-z ]*)(b)'
        ],
    ' shit ':
        [
            'shitty', '(s)([^a-z ]*)(h)([^a-z ]*)(i)([^a-z ]*)(t)', 'shite', '\$hit', 's h i t'
        ],
    ' shit hole ':
        [
            'shythole'
        ],
    ' retard ':
        [
            'returd', 'retad', 'retard', 'wiktard', 'wikitud'
        ],
    ' rape ':
        [
            ' raped'
        ],
    ' dumb ass':
        [
            'dumbass', 'dubass'
        ],
    ' ass head':
        [
            'butthead'
        ],
    ' sex ':
        [
            'sexy', 's3x', 'sexuality'
        ],
    ' nigger ':
        [
            'nigger', 'ni[g]+a', ' nigr ', 'negrito', 'niguh', 'n3gr', 'n i g g e r'
        ],
    ' shut the fuck up':
        [
            'stfu'
        ],
    ' pussy ':
        [
            'pussy[^c]', 'pusy', 'pussi[^l]', 'pusses'
        ],
    ' faggot ':
        [
            'faggot', ' fa[g]+[s]*[^a-z ]', 'fagot', 'f a g g o t', 'faggit',
            '(f)([^a-z ]*)(a)([^a-z ]*)([g]+)([^a-z ]*)(o)([^a-z ]*)(t)', 'fau[g]+ot', 'fae[g]+ot',
        ],
    ' mother fucker':
        [
            ' motha ', ' motha f', ' mother f', 'motherucker',
        ],
    ' whore ':
        [
            'wh\*\*\*', 'w h o r e'
        ],
}

In [25]:
CONTRACTION_MAPPING = {"ain't": "is not", "'cause": "because", "could've": "could have", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

def translateAbuseWords(text, patterns=RE_PATTERNS):
    text = re.sub(r"(.)\1{2,}", r'\1', text)
    for target, patterns in patterns.items():
        for pat in patterns:
            text = re.sub(pat, target, text)
    return text


def clean_contractions(text, mapping=CONTRACTION_MAPPING):
    '''
    Expand contractions
    '''
     
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text

def social_media_clean(full_line):
    full_line = re.sub(r'#([^ ]*)', r'\1', full_line) # #BanTrump -> BanTrump
    full_line = re.sub(r'https?://\S+|www\.\S+', ' ', full_line) # URL -> " "
    full_line = re.sub(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9_]+)', ' ', full_line) # @user -> " "
    full_line = emoji.demojize(full_line) # emoji -> text
    full_line = re.sub(r'(:.*?:)', r' \1 ', full_line) # :emoji-desc: -> emoji-desc
    full_line = re.sub(' +', ' ', full_line) #  extra blank spaces have been replaced with a single space.

    # repattern = re.compile(r"(.)\1{2,}", re.DOTALL) 
    # ds = ds.str.replace(repattern, r"\1")
    return full_line


def preprocess(full_line):
    full_line = full_line.lower()
    full_line = clean_contractions(full_line)
    full_line = translateAbuseWords(full_line)
    full_line = social_media_clean(full_line)
    # full_line = re.sub(r"[^a-zA-Z\d]", " ", full_line) # messes with emoji
    return full_line

In [104]:
full_line = "Don't you love 🤗 Transformers? We sure do. @Tejas https://arxiv.org/pdf/2010.12472.pdf.  Fuuuuuuuuuck"
# translateAbuseWords(full_line)
# print(emoji.demojize(full_line))
# full_line = clean_contractions(full_line)
# full_line = translateAbuseWords(full_line)
# full_line = social_media_clean(full_line)
full_line = preprocess(full_line)
print(full_line)
# print(tokenizer.tokenize(full_line))
print(tokenizer.convert_ids_to_tokens(tokenizer.encode(full_line)))

don t you love  smiling face with open hands  transformers  we sure do  
['[CLS]', 'don', 't', 'you', 'love', 'smiling', 'face', 'with', 'open', 'hands', 'transformers', 'we', 'sure', 'do', '[SEP]']


In [106]:
ruddit["txt"] = ruddit["txt"].apply(lambda x: preprocess(x)) # rest redandunt symbols are taken care by tokenizers 
ruddit["txt"].head(), ruddit.shape

(0      the difference in average earnings between m...
 1    the myth is that the  gap  is entirely based o...
 3    the assertion is that women get paid less for ...
 4    you said in the op that is not what they are m...
 5     men and women are not payed less for the same...
 Name: txt, dtype: object,
 (5710, 5))

In [84]:
print(tokenizer.tokenize(ruddit.loc[0, "txt"]))
print(tokenizer.encode(ruddit.loc[0, "txt"]))

['>', 'the', 'difference', 'in', 'average', 'earnings', 'between', 'men', 'and', 'women', 'can', 'be', 'explained', 'by', 'taking', 'into', 'account', 'relevant', 'factors', '.', 'so', 'it', 'is', 'not', 'a', 'myth', ',', 'you', 'just', 'feel', 'that', 'you', 'can', 'explain', 'it', '.']
[101, 1028, 1996, 4489, 1999, 2779, 16565, 2090, 2273, 1998, 2308, 2064, 2022, 4541, 2011, 2635, 2046, 4070, 7882, 5876, 1012, 2061, 2009, 2003, 2025, 1037, 10661, 1010, 2017, 2074, 2514, 2008, 2017, 2064, 4863, 2009, 1012, 102]


In [105]:
gc.collect()

13

In [107]:
ruddit.columns

Index(['post_id', 'comment_id', 'txt', 'url', 'offensiveness_score'], dtype='object')

In [110]:
ruddit[["txt", "offensiveness_score"]].to_csv("./train/clean_ruddit_with_text.csv", index=False)