In [26]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('words')
from utils import *
from sklearn.utils import shuffle

[nltk_data] Downloading package punkt to /home/yoonwoo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/yoonwoo/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
train_org = pd.read_csv("train.csv")
test_org = pd.read_csv("test.csv")

In [3]:
train = train_org[[
    'comment_text',
    'target',
    'physical_disability',
    'intellectual_or_learning_disability',
    'psychiatric_or_mental_illness',
    'other_disability'
    ]]

### Add column : toxic_class - Target
  - Very Toxic (a very hateful, aggressive, or disrespectful comment that is very likely to make you leave a discussion or give up on sharing your perspective)
  - Toxic (a rude, disrespectful, or unreasonable comment that is somewhat likely to make you leave a discussion or give up on sharing your perspective)
  - Hard to Say
  - Not Toxic

In [6]:
train['toxic_class'] = np.where(
    train['target'] >= 0.75, "1VeryToxic",
    np.where(train['target'] >= 0.5, "2Toxic",
             np.where(train['target'] >= 0.25, "3HardtoSay", "4NotToxic"))
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['toxic_class'] = np.where(


### Add column : Disability binary

In [7]:
train['disability_bin'] = np.where(
    (train['physical_disability'] > 0) | (train['intellectual_or_learning_disability'] > 0) | (train['psychiatric_or_mental_illness'] > 0) | (train['other_disability'] > 0), 'Yes', 'No'
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['disability_bin'] = np.where(


### NULL -> 0

In [8]:
train.isnull().sum()

comment_text                                 0
target                                       0
physical_disability                    1399744
intellectual_or_learning_disability    1399744
psychiatric_or_mental_illness          1399744
other_disability                       1399744
toxic_class                                  0
disability_bin                               0
dtype: int64

In [9]:
train = train.fillna(0)
train.isnull().sum()

comment_text                           0
target                                 0
physical_disability                    0
intellectual_or_learning_disability    0
psychiatric_or_mental_illness          0
other_disability                       0
toxic_class                            0
disability_bin                         0
dtype: int64

### Add column : Disability Category
- physical_cat, il_cat, pm_cat, other_cat
- >=0.75 : 1, >=0.5 : 2, >= 0.25 : 3, else 4

In [10]:
train['physical_cat'] = np.where(
    train['physical_disability'] >= 0.75, "1",
    np.where(train['physical_disability'] >= 0.5, "2",
             np.where(train['physical_disability'] >= 0.25, "3", "4"))
)

In [11]:
train['il_cat'] = np.where(
    train['intellectual_or_learning_disability'] >= 0.75, "1",
    np.where(train['intellectual_or_learning_disability'] >= 0.5, "2",
             np.where(train['intellectual_or_learning_disability'] >= 0.25, "3", "4"))
)

In [12]:
train['pm_cat'] = np.where(
    train['psychiatric_or_mental_illness'] >= 0.75, "1",
    np.where(train['psychiatric_or_mental_illness'] >= 0.5, "2",
             np.where(train['psychiatric_or_mental_illness'] >= 0.25, "3", "4"))
)

In [13]:
train['other_cat'] = np.where(
    train['other_disability'] >= 0.75, "1",
    np.where(train['other_disability'] >= 0.5, "2",
             np.where(train['other_disability'] >= 0.25, "3", "4"))
)

## Part 3. Cleansing
- remove unnecessary words, emoji...
- sampling
- tokenize
- save : v1.csv

In [15]:
train['tokenized'] = train['comment_text'].apply(lambda x: clean_text(x))

In [17]:
train.head()

Unnamed: 0,comment_text,target,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,toxic_class,disability_bin,physical_cat,il_cat,pm_cat,other_cat,tokenized
0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,4NotToxic,No,4,4,4,4,"this is so cool. it is like, 'would you want y..."
1,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,4NotToxic,No,4,4,4,4,thank you!! this would make my life a lot less...
2,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,4NotToxic,No,4,4,4,4,this is such an urgent design problem; kudos t...
3,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,4NotToxic,No,4,4,4,4,is this something i will be able to install on...
4,haha you guys are a bunch of losers.,0.893617,0.0,0.25,0.0,0.0,1VeryToxic,Yes,4,3,4,4,haha you guys are a bunch of losers.


In [20]:
a = int(len(train)/10)
train_1 = train[   :a*1]
train_2 = train[a*1:a*2]
train_3 = train[a*2:a*3]
train_4 = train[a*3:a*4]
train_5 = train[a*4:a*5]
train_6 = train[a*5:a*6]
train_7 = train[a*6:a*7]
train_8 = train[a*7:a*8]
train_9 = train[a*8:a*9]
train_10 = train[a*9:]

In [23]:
print('train_1')
train_1['tokenized'] = train_1['tokenized'].apply(lambda x: nltk.word_tokenize(x))
print('train_2')
train_2['tokenized'] = train_2['tokenized'].apply(lambda x: nltk.word_tokenize(x))
print('train_3')
train_3['tokenized'] = train_3['tokenized'].apply(lambda x: nltk.word_tokenize(x))
print('train_4')
train_4['tokenized'] = train_4['tokenized'].apply(lambda x: nltk.word_tokenize(x))
print('train_5')
train_5['tokenized'] = train_5['tokenized'].apply(lambda x: nltk.word_tokenize(x))
print('train_6')
train_6['tokenized'] = train_6['tokenized'].apply(lambda x: nltk.word_tokenize(x))
print('train_7')
train_7['tokenized'] = train_7['tokenized'].apply(lambda x: nltk.word_tokenize(x))
print('train_8')
train_8['tokenized'] = train_8['tokenized'].apply(lambda x: nltk.word_tokenize(x))
print('train_9')
train_9['tokenized'] = train_9['tokenized'].apply(lambda x: nltk.word_tokenize(x))
print('train_10')
train_10['tokenized'] = train_10['tokenized'].apply(lambda x: nltk.word_tokenize(x))

train_1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_1['tokenized'] = train_1['tokenized'].apply(lambda x: nltk.word_tokenize(x))


train_2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_2['tokenized'] = train_2['tokenized'].apply(lambda x: nltk.word_tokenize(x))


train_3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_3['tokenized'] = train_3['tokenized'].apply(lambda x: nltk.word_tokenize(x))


train_4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_4['tokenized'] = train_4['tokenized'].apply(lambda x: nltk.word_tokenize(x))


train_5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_5['tokenized'] = train_5['tokenized'].apply(lambda x: nltk.word_tokenize(x))


train_6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_6['tokenized'] = train_6['tokenized'].apply(lambda x: nltk.word_tokenize(x))


train_7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_7['tokenized'] = train_7['tokenized'].apply(lambda x: nltk.word_tokenize(x))


train_8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_8['tokenized'] = train_8['tokenized'].apply(lambda x: nltk.word_tokenize(x))


train_9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_9['tokenized'] = train_9['tokenized'].apply(lambda x: nltk.word_tokenize(x))


train_10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_10['tokenized'] = train_10['tokenized'].apply(lambda x: nltk.word_tokenize(x))


In [24]:
train = pd.concat([train_1, train_2, train_3, train_4, train_5, train_6, train_7, train_8, train_9, train_10])

In [27]:
train = shuffle(train)

In [29]:
train.to_pickle("train.pkl")

In [30]:
unpickled_train = pd.read_pickle("train.pkl")  