In [None]:
import pandas as pd
import numpy as np
from keras.models import Model,Sequential,load_model
from keras.layers import Dense, Embedding, Input,LSTM,Bidirectional, GlobalMaxPool1D,GlobalMaxPooling1D,\
GlobalAveragePooling1D,Dropout,CuDNNLSTM,SpatialDropout1D,CuDNNGRU,BatchNormalization, Activation,merge, Lambda
from keras.preprocessing import text, sequence
from keras.layers.merge import concatenate
from keras import optimizers
from keras.callbacks import TensorBoard, ModelCheckpoint
import time
from collections import defaultdict
import re
import os

from keras import backend as K
import matplotlib.pyplot as plt

# Known locations of the project directory (a macOS path and a Windows path).
DIR_MAC= '/Users/haipengwu/Codings/Machine Learning/HW6'
DIR_1080="C:\\Users\\WIN10\\Codings\\Machine Learning\\HW6"

# Project directory that holds the CSV files, cleanwords.txt and the saved models.
# Set the HW6_DIR environment variable to run the notebook from another location
# without editing this cell; when it is unset the Windows path is used, as before.
DIR = os.environ.get("HW6_DIR", DIR_1080)

# Length every tokenized comment is padded/truncated to before it is fed to the model.
MAX_LEN = 200



In [None]:
# Load the raw train and test CSVs (in that order) from the project directory.
train, test = (
    pd.read_csv(os.path.join(DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

## text cleaning

In [None]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# NLTK stemmer, English stop-word list and lemmatizer.
stemmer = SnowballStemmer('english')
words = stopwords.words('english')
lmtzr = WordNetLemmatizer()


word_count_dict = defaultdict(int)
replace_numbers=re.compile(r'\d+',re.IGNORECASE)

# Typo -> correction table read from cleanwords.txt, one "typo,correct" pair per line.
clean_word_dict = {}
with open(os.path.join(DIR,'cleanwords.txt'), 'r', encoding='utf-8') as cl:
    for line in cl:
        line = line.strip('\n')
        if not line.strip():
            # A blank (e.g. trailing) line has no comma and would crash the
            # two-value unpacking below, so skip it.
            continue
        typo, correct = line.split(',')
        clean_word_dict[typo] = correct



# Ordered (pattern, replacement) rewrites that expand contractions and re-join a
# few split-up abbreviations. Order matters: "what's" must run before "'s" and
# "can't" before "n't", otherwise the more general rule consumes the match.
_CLEAN_TEXT_RULES = (
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # NOTE(review): `\0` is parsed by `re` as the NUL character, so this pattern
    # only matches "\x00s"; it was presumably meant for something like "0s".
    # Kept unchanged -- confirm the intent before editing.
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
)


def clean_text(text, typo_map=None):
    """Normalize one comment into lower-case, space-separated Latin-letter words.

    Steps, in order:
      1. expand contractions / abbreviations (case-insensitively, so "I'm" and
         "What's" are handled just like "i'm" and "what's");
      2. apply the typo corrections, each key being used as a regex pattern;
      3. replace every character outside a-z / A-Z with a space;
      4. lower-case and collapse runs of whitespace.

    Args:
        text: the raw comment string.
        typo_map: optional mapping of regex pattern -> replacement word.
            Defaults to the module-level ``clean_word_dict``.

    Returns:
        The cleaned comment; an empty string if no letters remain.
    """
    if typo_map is None:
        typo_map = clean_word_dict

    # The patterns are lower-case but the text has not been lower-cased yet,
    # so match case-insensitively (the result is lower-cased further down).
    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    for typo, correct in typo_map.items():
        text = re.sub(typo, " " + correct + " ", text)

    text = re.sub("[^a-zA-Z]", " ", text)  # drop everything that is not a Latin letter
    text = text.lower()

    # Stop-word removal, lemmatizing and stemming are deliberately not applied.
    return " ".join(text.split())



# The cleaned CSVs act as an on-disk cache: cleaning only runs when at least
# one of the two files is missing.
cleaned_train_csv = os.path.join(DIR,'cleaned_train.csv')
cleaned_test_csv = os.path.join(DIR,'cleaned_test.csv')

if not os.path.exists(cleaned_train_csv) or not os.path.exists(cleaned_test_csv):

    print("Text Cleaning Start...")

    # Missing comments are replaced by a placeholder so clean_text always gets a string.
    list_sentences_train = train["comment_text"].fillna("no comment").values
    list_sentences_test = test["comment_text"].fillna("no comment").values

    comments = list(map(clean_text, list_sentences_train))
    test_comments = list(map(clean_text, list_sentences_test))

    # Overwrite the raw text in place, then persist both frames.
    train['comment_text'] = comments
    test['comment_text'] = test_comments

    train.to_csv(cleaned_train_csv, index=False)
    test.to_csv(cleaned_test_csv, index=False)

    print("Text Cleaning Finished...")

In [None]:
# Reload both frames from the cleaned CSVs so later cells always work on cleaned text.
train, test = (
    pd.read_csv(os.path.join(DIR, csv_name))
    for csv_name in ('cleaned_train.csv', 'cleaned_test.csv')
)

In [None]:
# Comment texts as arrays; missing comments become the placeholder "no comment".
list_sentence_train, list_sentence_test = (
    frame["comment_text"].fillna("no comment").values
    for frame in (train, test)
)

## get y_train

In [None]:
# The six label columns, in the order used for both training targets and the submission.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
# One row per training comment, one column per label.
y_train = train[list_classes].values

## Tokenize the data: map each word to a unique integer index (the tokenizer is fit on the training comments only)

In [None]:
# Passed to the tokenizer as `num_words`; overwritten below with the real vocabulary size.
VOCAB_SIZE = 200000

# The vocabulary is learned from the training comments only, then applied to both sets.
tokenizer = text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(list(list_sentence_train))
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentence_train, list_sentence_test)
)

# From here on VOCAB_SIZE is the number of rows needed by the embedding matrix:
# one per entry of word_index, plus one because Keras never assigns index 0 to a word.
VOCAB_SIZE = len(tokenizer.word_index) + 1
print(VOCAB_SIZE)

## Read the pretrained word vectors into the `embeddings_index` dictionary: the keys are words and the values are their pretrained word vectors.

In [None]:
# word -> float32 vector, filled from the GloVe text file (one "word v1 v2 ..." entry per line).
embeddings_index = {}
with open('E:\\Downloads\\glove.840B.300d\\glove.840B.300d.txt', encoding='utf-8') as glove_file:
# with open(DIR+'\\glove\\glove.6B.300d.txt', encoding='utf-8') as f:
    for raw_line in glove_file:
        fields = raw_line.split()
        token = fields[0]
        try:
            embeddings_index[token] = np.asarray(fields[1:], dtype='float32')
        except ValueError:
            # Lines whose remaining fields are not all numeric are skipped.
            continue
    print('Loaded %s word vectors.' % len(embeddings_index))

        

## Create a weight matrix for words in the training data

In [None]:
# Dimension of each pretrained word vector.
WORD_VECTOR_LEN = 300

# Row i holds the pretrained vector of the word whose tokenizer index is i;
# words without a pretrained vector keep an all-zero row.
embedding_matrix = np.zeros((VOCAB_SIZE, WORD_VECTOR_LEN))
for token, row in tokenizer.word_index.items():
    if token in embeddings_index:
        embedding_matrix[row] = embeddings_index[token]

## Pad each tokenized sequence to make each input X_train the same length

In [None]:
# Bring every tokenized comment to exactly MAX_LEN entries.
X_train, X_test = (
    sequence.pad_sequences(tokenized, maxlen=MAX_LEN)
    for tokenized in (list_tokenized_train, list_tokenized_test)
)

## Pseudo-label process: split X_train so that the first 500 entries are held out for validation and the rest are used for training. The validation set stays fixed while the pseudo-labelled test data is later added to the training set.

![title](pl_capture.png)

In [None]:
# Hold out the first 500 rows for validation and train on the remainder.
# Note: this cell shrinks X_train / y_train in place, so running it twice
# drops a further 500 rows.
X_val, X_train = X_train[:500], X_train[500:]
y_val, y_train = y_train[:500], y_train[500:]

## Define the toxic comments classification training model

In [None]:
# Number of units in each direction of every recurrent layer.
RECURRENT_UNITS = 10

def tcc_model():
    """Build the toxic-comment classifier as a plain layer stack.

    Frozen pretrained embedding -> spatial dropout -> two bidirectional
    CuDNNGRU layers -> global max pooling -> dropout -> Dense(144, relu)
    -> Dense(6, sigmoid). The model summary is printed as a side effect.

    Returns:
        The uncompiled Sequential model.
    """
    layer_stack = [
        Embedding(VOCAB_SIZE, WORD_VECTOR_LEN, weights=[embedding_matrix],
                  input_shape=(MAX_LEN, ), trainable=False),
        SpatialDropout1D(0.1),
        Bidirectional(CuDNNGRU(RECURRENT_UNITS, return_sequences=True)),
        Bidirectional(CuDNNGRU(RECURRENT_UNITS, return_sequences=True)),
        GlobalMaxPool1D(),
        Dropout(0.1),
        Dense(144, activation="relu"),
        Dense(6, activation="sigmoid"),
    ]
    model = Sequential(layer_stack)
    model.summary()

    return model


### An alternative model that concatenates the outputs of both RNN layers and several pooled views
def tcc_model_av_rnn():
    """Build the alternative classifier with the functional API.

    The outputs of two stacked bidirectional CuDNNGRU layers are concatenated
    along the feature axis; three views of that sequence (last time step,
    max pool, average pool) are concatenated and fed through dropout,
    Dense(144, relu) and Dense(6, sigmoid). The model summary is printed as
    a side effect.

    Returns:
        The uncompiled Model.
    """
    tokens_in = Input(shape=(MAX_LEN,))

    # Frozen pretrained embedding followed by spatial dropout.
    embedded = Embedding(VOCAB_SIZE,
                         WORD_VECTOR_LEN,
                         weights=[embedding_matrix],
                         input_length=MAX_LEN,
                         trainable=False)(tokens_in)
    embedded = SpatialDropout1D(0.25)(embedded)

    # Two stacked recurrent layers; both outputs are kept and joined per time step.
    rnn_1 = Bidirectional(CuDNNGRU(RECURRENT_UNITS, return_sequences=True))(embedded)
    rnn_2 = Bidirectional(CuDNNGRU(RECURRENT_UNITS, return_sequences=True))(rnn_1)
    rnn_features = concatenate([rnn_1, rnn_2], axis=2)

    # Three summaries of the sequence.
    last = Lambda(lambda t: t[:, -1], name='last')(rnn_features)
    maxpool = GlobalMaxPooling1D()(rnn_features)
#     attn = AttentionWeightedAverage()(x)
    average = GlobalAveragePooling1D()(rnn_features)

    all_views = concatenate([last, maxpool, average], axis=1)
    hidden = Dropout(0.5)(all_views)
    hidden = Dense(144, activation="relu")(hidden)
    output_layer = Dense(6, activation="sigmoid")(hidden)

    model = Model(inputs=tokens_in, outputs=output_layer)
    model.summary()

    return model

## Training the model

In [None]:
BATCH_SIZE = 320
EPOCHS = 10

model = tcc_model()
# model = tcc_model_av_rnn()

adam_optimizer = optimizers.Adam(lr=1e-3, decay=1e-6, clipvalue=5)
model.compile(loss='binary_crossentropy',optimizer=adam_optimizer,metrics=['accuracy'])

# TensorBoard logs go to a per-run directory named after the start time.
NAME = "toxic_comments_classification-{}".format(int(time.time()))
tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))

# Both checkpoints live in <DIR>/model. Each full path is built exactly once and
# used for saving AND loading: previously the files were written under DIR (or a
# hard-coded absolute path) but loaded from a different, relative location.
MODEL_DIR = os.path.join(DIR, "model")
os.makedirs(MODEL_DIR, exist_ok=True)  # ModelCheckpoint needs the directory to exist

# ---- Stage 1: train on the labelled data, keeping only the best epoch ----
# NOTE(review): 'val_acc' is the metric name for the Keras version this notebook
# targets; newer releases call it 'val_accuracy' -- confirm when upgrading.
MODELSAVENAME = os.path.join(MODEL_DIR, "TCC_Best.h5")
checkpoint = ModelCheckpoint(MODELSAVENAME, monitor='val_acc',
                             verbose=1, save_best_only=True, mode='max')

model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, shuffle=True, validation_split=0,validation_data=(X_val,y_val), callbacks=[tensorboard,checkpoint])

# Continue from the best checkpoint rather than from the last epoch.
model = load_model(MODELSAVENAME)

# ---- Stage 2: pseudo-labelling ----
# The model's own (soft, un-rounded) predictions on the test set are used as labels.
print("predict and use the pseudo labels...")
y_test_pl = model.predict(X_test)
# y_test_pl = np.around(y_test_pl)
print("...finished")

# Training set for the second pass = labelled training rows + pseudo-labelled test rows.
X_train_2nd = np.concatenate((X_train,X_test),axis = 0)
y_train_2nd = np.concatenate((y_train,y_test_pl),axis = 0)
print(X_train_2nd.shape, y_train_2nd.shape)
print("training the model with the testing data + pseudo labels...")
print("loading the best model...")

MODELSAVENAME_PL = os.path.join(MODEL_DIR, "TCC_Best_Pseudo_Labeling.h5")
checkpoint_pl = ModelCheckpoint(MODELSAVENAME_PL, monitor='val_acc',
                                verbose=1, save_best_only=True, mode='max')

# Validation still uses the same held-out rows as stage 1.
model.fit(X_train_2nd, y_train_2nd, batch_size=BATCH_SIZE, epochs=EPOCHS, shuffle=True, validation_split=0,validation_data=(X_val,y_val), callbacks=[tensorboard,checkpoint_pl])

# Final model = best epoch of the pseudo-label pass.
model = load_model(MODELSAVENAME_PL)



## Make prediction

In [None]:
# Class probabilities for every test comment, one column per label in list_classes.
y_test = model.predict(
    X_test,
    batch_size=BATCH_SIZE,
)

In [None]:
# Start from the provided template and overwrite its six label columns with the predictions.
submission_template_path = os.path.join(DIR, 'sample_submission.csv')
sample_submission = pd.read_csv(submission_template_path)
sample_submission[list_classes] = y_test

In [None]:
# Write the submission file into the current working directory, without the index column.
sample_submission.to_csv(
    "submission_wu.csv",
    index=False,
)

# Conclusion
## 1. Pretrained word embeddings are very important: they give +0.6 accuracy with an otherwise identical configuration. The best result comes from glove.840B.300d.txt, which can be found at https://nlp.stanford.edu/projects/glove/

## 2. Text cleaning is important: use regular expressions to remove everything that is not an English (Latin-letter) word.

## 3. The model does not need to be complicated: Embedding, BiGRU x2, GlobalMaxPool1D x1, Dense x2, and some dropout in between, that's it.

## 4. Pseudo-labelling helps a little; this needs further investigation.

## 5. The model overfits easily, so early stopping (in both the normal training and the PL training) is important. How to do it automatically needs further investigation.