# このノートについて

## 目的
* とりあえず提出してみて課題の感じをつかむ

## 流れ
* 必要なライブラリをインポート
* 学習・予測に使うデータを取得
* 特徴エンジニアリング(テキストクリーニング)
* 学習・予測・提出

## 参考

* データの分析およびクリーニングについて
    * *jagan, Stop the S@#$ - Toxic Comments EDA, https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda*
        * ほとんどこの人のやり方をパクった
* モデル構築
    * Bojan Tunguz, Logistic regression with words and char n-grams, https://www.kaggle.com/tunguz/logistic-regression-with-words-and-char-n-grams/code
    
    

# 必要なライブラリをインポート
## nltk
* nltkがインポートエラーならpipで
``` $ pip install nltk ```

In [23]:
# いつもの
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
from tqdm import tqdm

# NLP
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer 

# FeatureEngineering
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# データのパスなど
home_dir="/home/ubuntu/notebooks/uema/kaggle_toxic/"
data_dir=home_dir+"data/"
result_dir=home_dir+"result/"

pd.set_option("display.max_rows", 300)

## コーパスをダウンロード
* wordnetはレンマ化のため, stopwordsはストップワードのためにダウンロードする

In [5]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# 学習・予測に使うデータを取得

In [2]:
train_path = data_dir+"train.csv" 
train_df = pd.read_csv(train_path)
print(train_df.isnull().sum())
train_df

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [3]:
test_path = data_dir+"test.csv" 
test_df = pd.read_csv(test_path)
print(test_df.isnull().sum())
test_df

id              0
comment_text    0
dtype: int64


Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
5,0001ea8717f6de06,Thank you for understanding. I think very high...
6,00024115d4cbde0f,Please do not add nonsense to Wikipedia. Such ...
7,000247e83dcc1211,:Dear god this site is horrible.
8,00025358d4737918,""" \n Only a fool can believe in such numbers. ..."
9,00026d1092fe71cc,== Double Redirects == \n\n When fixing double...


In [31]:
submit_path = data_dir+"sample_submission.csv"
submit_df = pd.read_csv(submit_path)
submit_df

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5
5,0001ea8717f6de06,0.5,0.5,0.5,0.5,0.5,0.5
6,00024115d4cbde0f,0.5,0.5,0.5,0.5,0.5,0.5
7,000247e83dcc1211,0.5,0.5,0.5,0.5,0.5,0.5
8,00025358d4737918,0.5,0.5,0.5,0.5,0.5,0.5
9,00026d1092fe71cc,0.5,0.5,0.5,0.5,0.5,0.5


# 特徴エンジニアリング
## テキストクリーニング 

In [6]:
#https://drive.google.com/file/d/0B1yuv8YaUVlZZ1RzMFJmc1ZsQmM/view
# Aphost lookup dict
APPO = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not",
"tryin'":"trying"
}
tokenizer=TweetTokenizer()
lem = WordNetLemmatizer()
eng_stopwords = set(stopwords.words("english"))

In [7]:
def clean(comment):
    # 大文字->小文字に変換
    comment = comment.lower()
    # \nを削除
    comment = re.sub("\\n","",comment)
    # (必要なら)いろいろ記号を削除
    #comment = re.sub("\"||\(||\)||\;||\:||\|\=","",comment)
    # ipとユーザを除く
    comment = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.d{,3}", "", comment)
    comment = re.sub("\[\[.\]]", "", comment)
    # 文章を単語の配列にする
    words = tokenizer.tokenize(comment)
    # 略語を戻す
    words = [APPO[word] if word in APPO else word for word in words]
    # レンマ化(例:databases -> database等)
    words = [lem.lemmatize(word, "v") for word in words]
    # ストップワードを除去(例 are, by 等)
    wordf = [w for w in words if not w in eng_stopwords]
    # 配列になった単語列を繋げ直して文章化
    clean_sent=" ".join(words)
    return clean_sent

In [8]:
corpus_train = train_df["comment_text"]
corpus_train = corpus_train.apply(lambda x:clean(x))
corpus_train

0         explanationwhy the edit make under my username...
1         d'aww ! he match this background colour I am s...
2         hey man , I am really not try to edit war . it...
3         " morei cannot make any real suggestions on im...
4         you , sir , be my hero . any chance you rememb...
5         " congratulations from me as well , use the to...
6              cocksucker before you piss around on my work
7         your vandalism to the matt shirvington article...
8         sorry if the word ' nonsense ' be offensive to...
9         alignment on this subject and which be contrar...
10        " fair use rationale for image : wonju.jpgthan...
11        bbq be a man and let discuss it-maybe over the...
12        hey ... what be it .. @ | talk . what be it .....
13        before you start throw accusations and warn at...
14        oh , and the girl above start her arguments wi...
15        " juelz santanas agein 2002 , juelz santana be...
16        bye ! do not look , come or th

In [10]:
corpus_test = test_df["comment_text"]
corpus_test = corpus_test.apply(lambda x:clean(x))

## テキストをBoW化

In [35]:
#vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    max_features=15000)
X_train = vectorizer.fit_transform(corpus_train)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [38]:
print(len(vectorizer.get_feature_names()))
vectorizer.get_feature_names()

15000


['00',
 '000',
 '0000',
 '000000',
 '001',
 '004',
 '007',
 '01',
 '02',
 '03',
 '04',
 '0422',
 '05',
 '06',
 '07',
 '08',
 '084080',
 '09',
 '0ll',
 '0px',
 '10',
 '100',
 '1000',
 '10000',
 '1006',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '10th',
 '11',
 '110',
 '111',
 '11111111111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '1185',
 '119',
 '11th',
 '12',
 '120',
 '1200',
 '121',
 '122',
 '123',
 '1234',
 '12345678',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '12th',
 '13',
 '130',
 '1300',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138',
 '139',
 '13th',
 '14',
 '140',
 '1400',
 '141',
 '142',
 '143',
 '144',
 '145',
 '146',
 '147',
 '148',
 '149',
 '14th',
 '15',
 '150',
 '1500',
 '151',
 '152',
 '153',
 '154',
 '155',
 '156',
 '157',
 '158',
 '159',
 '15th',
 '16',
 '160',
 '1600',
 '161',
 '162',
 '163',
 '164',
 '165',
 '166',
 '167',
 '168',
 '169',
 '16th',
 '17',
 '170',
 '171',
 '172',
 '173',
 '174',
 '1

In [39]:
X_test = vectorizer.transform(corpus_test)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


# 学習
## 

In [40]:
class_names = train_df.columns[2:].tolist()
class_names

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [43]:
losses = []
for class_name in tqdm(class_names):
    y_train = train_df[class_name]
    classifier = LogisticRegression()

    cv_loss = np.mean(cross_val_score(classifier, X_train, y_train, cv=5, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))

    classifier.fit(X_train, y_train)
    submit_df[class_name] = classifier.predict_proba(X_test)[:, 1]

  0%|          | 0/6 [00:00<?, ?it/s]

CV score for class toxic is 0.9727862057898304


 17%|█▋        | 1/6 [00:10<00:50, 10.15s/it]

CV score for class severe_toxic is 0.9846810227133572


 33%|███▎      | 2/6 [00:19<00:39,  9.87s/it]

CV score for class obscene is 0.9850281080475917


 50%|█████     | 3/6 [00:27<00:28,  9.46s/it]

CV score for class threat is 0.9883128867609863


 67%|██████▋   | 4/6 [00:35<00:17,  8.98s/it]

CV score for class insult is 0.9782374956955826


 83%|████████▎ | 5/6 [00:45<00:09,  9.14s/it]

CV score for class identity_hate is 0.9739188928329039


100%|██████████| 6/6 [00:54<00:00,  9.10s/it]


In [42]:
submit_df.to_csv(result_dir+"tdidf_logistic.csv", index=False) 

submit_df

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999092,0.223625,0.995722,0.059600,0.962986,0.311212
1,0000247867823ef7,0.004477,0.001474,0.002425,0.000542,0.004048,0.002482
2,00013b17ad220c46,0.007112,0.001697,0.007116,0.000482,0.004910,0.001013
3,00017563c3f7919a,0.003179,0.001727,0.002305,0.001108,0.003911,0.000805
4,00017695ad8997eb,0.014663,0.002504,0.006503,0.000682,0.006399,0.001617
5,0001ea8717f6de06,0.005681,0.001191,0.002890,0.001415,0.007545,0.001040
6,00024115d4cbde0f,0.007302,0.000587,0.004122,0.000526,0.006319,0.000967
7,000247e83dcc1211,0.350123,0.004812,0.048984,0.002471,0.076616,0.005595
8,00025358d4737918,0.009183,0.001752,0.007571,0.000920,0.007930,0.002773
9,00026d1092fe71cc,0.002195,0.000550,0.002805,0.000459,0.003299,0.000746
