In [1]:
from collections import defaultdict
import gensim
%matplotlib inline
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import random
import re
from nltk.corpus import sentiwordnet as swn
from tqdm import tqdm_notebook as tqdm

# 乱数の固定
SEED = 0
# word2vec を学習するか
train_word2vec = False
# text の前処理をするか
text_preprocessing = True
# MIN_COUNT に到達しない単語にも情報があれば付与したり，UNDEFINED などにしたほうが後々楽
MIN_COUNT = 10
debug = True
input_dir = os.path.join("..", "input")
jigsaw_path = "jigsaw-unintended-bias-in-toxicity-classification"



In [2]:
train_df = pd.read_csv(os.path.join(input_dir, jigsaw_path, "train.csv"))
if debug:
    print("train columns", train_df.columns)
    print("train shape", train_df.shape)
train_df.head()

train columns Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count'],
      dtype='object')
train shape (1804874, 45)


Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:41.987077+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:42.870083+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:45.222647+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:47.601894+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-09-29 10:50:48.488476+00,2,,2006,rejected,0,0,0,1,0,0.0,4,47


In [3]:
test_df = pd.read_csv(os.path.join(input_dir, jigsaw_path, "test.csv"))
if debug:
    print("test columns", test_df.columns)
    print("test shape", test_df.shape)
test_df.head()

test columns Index(['id', 'comment_text'], dtype='object')
test shape (97320, 2)


Unnamed: 0,id,comment_text
0,7000000,Jeff Sessions is another one of Trump's Orwell...
1,7000001,I actually inspected the infrastructure on Gra...
2,7000002,No it won't . That's just wishful thinking on ...
3,7000003,Instead of wringing our hands and nibbling the...
4,7000004,how many of you commenters have garbage piled ...


In [4]:
all_document = train_df["comment_text"].tolist() + test_df["comment_text"].tolist()

In [5]:
# .The などが分かれないので，とりあえず .と大文字に対する分割
prog = re.compile('\.[A-Z]')
# 先頭大文字用
Head_check = re.compile('^[A-Z][^A-Z]+$')

In [6]:
URL_slash = re.compile('^//[A-Za-z/.]+')
URL_www = re.compile('^www[A-Za-z/.]+')

In [7]:
# 先頭大文字への対処．固有名詞には基本触れない
lemmatizer = nltk.stem.WordNetLemmatizer()
def word_compile(word, lemmatize = True):
    # 先頭のみ大文字
    if Head_check.match(word) is not None:
        word = word[0].lower() + word[1:]
    # URL の 統一
    elif URL_slash.match(word) is not None or URL_www.match(word):
        word = "URL_text"
    # 見出し語化
    if lemmatize:
        word = lemmatizer.lemmatize(word)
    return word

if debug:
    print(word_compile("They"))
    print(word_compile("TRUMP"))
    print(word_compile("WordNet"))

they
TRUMP
WordNet


In [8]:
# .が繋がっているとわかれないので
def splitter(text, split_words, include = True):
    split_words = [i.span()[0] for i in split_words]
    word_list = []
    word = ""
    for i in range(len(text)):
        word += text[i]
        if i in split_words:
            word_list.append(word_compile(word[:-1]))
            if include:
                word_list.append(".")
            word = ""
    word_list.append(word_compile(word))
    return word_list

if debug:
    print(splitter("borrow.Incoming", prog.finditer("borrow.Incoming")))

['borrow', '.', 'incoming']


In [9]:
# sentence.split() はnltk.word_torknize(sentence) でも可
def tokenizer(sentence):
    global text_preprocessing
    if text_preprocessing:
        return [word_compile(word) if len(prog.findall(word)) == 0 else splitter(word, prog.finditer(word))[0] 
                      for word in nltk.word_tokenize(sentence)]
    else:
        return sentence.split()

sentences = [tokenizer(sentence) for sentence in all_document]
if debug:
    print(sentences[:2])

[['this', 'is', 'so', 'cool', '.', 'it', "'s", 'like', ',', "'would", 'you', 'want', 'your', 'mother', 'to', 'read', 'this', '?', '?', "'", 'really', 'great', 'idea', ',', 'well', 'done', '!'], ['thank', 'you', '!', '!', 'this', 'would', 'make', 'my', 'life', 'a', 'lot', 'le', 'anxiety-inducing', '.', 'keep', 'it', 'up', ',', 'and', 'do', "n't", 'let', 'anyone', 'get', 'in', 'your', 'way', '!']]


In [10]:
# 出現回数の少ない順にソート（確認のため）
word_count = defaultdict(lambda: 0)
if text_preprocessing:
    for sentence in sentences:
        for word in sentence:
            word_count[word] += 1
    word_count = dict(sorted(word_count.items(), key = lambda kv: kv[1]))

# タイプミスが非常に多い
ネットスラングの方がまずいかなと思ったが，タイプミスや連続する文字の方がまずそう

全部は対応が厳しいので，一度 fasttext は試すべきかも

In [11]:
word_count

{'hahahahahahahahhha': 1,
 'FFFFUUUUUUUUUUUUUUU': 1,
 'better—it': 1,
 'biased—I': 1,
 'debate—we': 1,
 'unpopular—so': 1,
 'works—following': 1,
 'Troll-In-Training': 1,
 'insane-': 1,
 'read*': 1,
 'DIDENT': 1,
 'DOWNHILL': 1,
 'FLOURAIDE': 1,
 'wmcelha': 1,
 'tostones': 1,
 'panopticomments': 1,
 'bundycon': 1,
 'troll-driven': 1,
 'portland_hipster15': 1,
 'mail.app': 1,
 'x-webdoc': 1,
 'bax': 1,
 '.cowboys': 1,
 'browsers..': 1,
 'kigurmi': 1,
 'sazac': 1,
 'onzie': 1,
 'appropiating': 1,
 'vibers': 1,
 'kumoricon': 1,
 '.vary': 1,
 'pizzicato': 1,
 'commenter-facing': 1,
 'Mr_Whiskers': 1,
 'MrWhiskers1': 1,
 '＼': 1,
 '・ω・': 1,
 'gluten-shame': 1,
 'not-awfulness': 1,
 'finGEred': 1,
 'i-pAd': 1,
 'top-most': 1,
 'disruptively-steep': 1,
 'floorpans': 1,
 'burn/char': 1,
 'founder/architect': 1,
 'ell-oh-ell': 1,
 'art-focused': 1,
 'hungry…': 1,
 'cibo': 1,
 '🍕🍕🍕': 1,
 'jail/etc': 1,
 'IV-e': 1,
 'people—either': 1,
 'debate—even': 1,
 'inche': 1,
 'kabin': 1,
 'email-leaking':

In [12]:
!pip list

Package                            Version                 Location      
---------------------------------- ----------------------- --------------
absl-py                            0.7.1                   
alabaster                          0.7.10                  
alembic                            1.0.9                   
algopy                             0.5.7                   
altair                             2.4.1                   
anaconda-client                    1.6.14                  
anaconda-navigator                 1.8.7                   
anaconda-project                   0.8.2                   
annoy                              1.15.2                  
appdirs                            1.4.3                   
arrow                              0.13.1                  
asn1crypto                         0.24.0                  
astor                              0.7.1                   
astroid                            1.6.3                 

wcwidth                            0.1.7                   
webencodings                       0.5.1                   
websocket-client                   0.56.0                  
Werkzeug                           0.14.1                  
wfdb                               2.2.1                   
wheel                              0.31.1                  
widgetsnbextension                 3.2.1                   
Wordbatch                          1.3.8                   
wordcloud                          1.5.0                   
wordsegment                        1.3.1                   
wrapt                              1.10.11                 
xarray                             0.12.1                  
xgboost                            0.82                    
xlrd                               1.1.0                   
XlsxWriter                         1.0.4                   
xlwt                               1.3.0                   
xvfbwrapper             