In [203]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 13 00:16:15 2020

@author: hongjunyu
"""
import os
import pandas as pd
import numpy as np
import nltk
import string
import joblib
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV, train_test_split
import xgboost as xgb
from xgboost import plot_importance
import keras
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Flatten, Input, Dropout, LSTM, Activation, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
# Directory containing train.csv, test.csv, sample_submission.csv and the GloVe
# file, all of which are opened by relative path in the cells below.
# Set the SPOOKY_DATA_DIR environment variable to run on another machine;
# the original author's path is kept as the default.
DATA_DIR = os.environ.get('SPOOKY_DATA_DIR',
                          '/Users/hongjunyu/desktop/spooky author identification')
os.chdir(DATA_DIR)

In [204]:
_ENGLISH_STOPWORDS = None  # populated lazily on the first text_cleaning call


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_cleaning(text):
    """Normalize one sentence for downstream vectorization.

    Steps, in order: tokenize, drop English stop words, drop punctuation
    tokens, lemmatize, stem, and re-join with single spaces.

    Relies on the module-level ``lemmatizer`` and ``stemmer`` objects, which
    must exist before this function is called.

    Parameters
    ----------
    text : str
        Raw sentence.

    Returns
    -------
    str
        Space-separated stemmed tokens.
    """
    # The stop-word list used to be re-read for every single token; fetch the
    # cached set once so the membership test is O(1).
    stop_words = _english_stopwords()

    # Stop-word matching is case-sensitive, exactly as before (tokens are not
    # lower-cased first).
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    # `in string.punctuation` is a substring test on the punctuation string;
    # kept as-is so the set of removed tokens does not change.
    tokens = [token for token in tokens if token not in string.punctuation]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    tokens = [stemmer.stem(token) for token in tokens]

    return ' '.join(tokens)

In [205]:
#read the data
# Load the three competition CSVs from the current working directory,
# in the order train -> test -> sample submission.
train_data, test_data, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
# Add an 'author' column filled with None to the test frame.
test_data['author'] = None

In [206]:
# Shared NLTK normalizers; text_cleaning reads both of these as globals,
# so this cell must run before text_cleaning is called.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [207]:
# Clean every training sentence in place (overwrites the raw 'text' column).
train_data['text'] = train_data['text'].apply(text_cleaning)

In [208]:
#get embedding matrix
# Maps each GloVe token (first field of a line) to its float32 coefficient vector.
embedding_matrix = {}
# Filename casing now matches the read_glove_vecs call below, so both loads
# work on case-sensitive filesystems. The context manager guarantees the file
# is closed even if parsing fails, and the explicit encoding avoids
# depending on the platform default.
with open('glove.6B.200d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coef = np.asarray(values[1:], dtype='float32')
        embedding_matrix[word] = coef

In [209]:
# ASCII punctuation characters; sentences_to_indices skips tokens found in this string.
puncts = string.punctuation

In [210]:
def read_glove_vecs(glove_file):
    """Parse a GloVe text file into lookup tables.

    Each line is expected to be ``<token> <coef> <coef> ...`` separated by
    whitespace.

    Parameters
    ----------
    glove_file : str
        Path to the UTF-8 encoded GloVe file.

    Returns
    -------
    words_to_index : dict
        Token -> 1-based index, assigned in sorted token order.
    index_to_words : dict
        Inverse of ``words_to_index``.
    word_to_vec_map : dict
        Token -> float64 numpy vector.
    """
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()
            token = fields[0]
            word_to_vec_map[token] = np.array(fields[1:], dtype=np.float64)

    # Indices start at 1; index 0 is left unassigned.
    words_to_index = {
        token: position
        for position, token in enumerate(sorted(word_to_vec_map), start=1)
    }
    index_to_words = {position: token for token, position in words_to_index.items()}

    return words_to_index, index_to_words, word_to_vec_map

def sentences_to_indices(X, word_to_index, max_len):
    """Convert sentences to a zero-padded matrix of vocabulary indices.

    Each sentence is tokenized, punctuation tokens (per the module-level
    ``puncts`` string) are dropped, and the remaining tokens are lower-cased
    and looked up in ``word_to_index``.

    Parameters
    ----------
    X : array-like of str with a ``.shape`` attribute
        The sentences (e.g. a numpy array or pandas Series).
    word_to_index : dict
        Token -> integer index.
    max_len : int
        Number of columns in the output; longer sentences are truncated.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(X), max_len)``. Out-of-vocabulary tokens
        and padding positions hold 0.
    """
    m = X.shape[0]  # number of samples

    # Positional access: `X[i]` on a pandas Series is label-based and breaks
    # when the index is not 0..m-1 (e.g. after filtering or shuffling).
    sentences = np.asarray(X)

    X_indices = np.zeros((m, max_len))

    for i in range(m):
        # Split the sentence into tokens, skip punctuation, lower-case the rest.
        sentence_words = [w.lower() for w in word_tokenize(sentences[i]) if w not in puncts]

        # Truncate explicitly and treat unknown words as 0. Previously both
        # cases were handled by swallowing every exception, which could also
        # hide unrelated errors. An unknown word still occupies its position.
        for j, w in enumerate(sentence_words[:max_len]):
            X_indices[i, j] = word_to_index.get(w, 0)

    return X_indices

In [211]:
# Build the token<->index tables and the token->vector map from the 200-d GloVe file.
words_to_index, index_to_words, word_to_vec_map = read_glove_vecs('glove.6B.200d.txt')

In [212]:
# Sequence length (in tokens) used as the model's input shape.
maxLen = 256

In [213]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Create a frozen Keras Embedding layer initialized with GloVe vectors.

    Parameters
    ----------
    word_to_vec_map : dict
        Token -> numpy vector; all vectors are expected to share one length.
    word_to_index : dict
        Token -> row index in the embedding matrix.

    Returns
    -------
    Embedding
        Non-trainable layer whose weight matrix has ``len(word_to_index) + 1``
        rows; rows not referenced by ``word_to_index`` stay all-zero.

    Raises
    ------
    ValueError
        If ``word_to_vec_map`` is empty.
    KeyError
        If a token in ``word_to_index`` has no vector in ``word_to_vec_map``.
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty; cannot infer embedding dimension")

    # One extra row because the indices in word_to_index start at 1 here and
    # Keras needs input_dim to exceed the largest index.
    vocab_len = len(word_to_index) + 1
    # Read the vector length from any entry rather than requiring the specific
    # word "is" to be in the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Embedding matrix of shape (number of indices, GloVe dimension).
    emb_matrix = np.zeros((vocab_len, emb_dim))

    # Fill each index's row with the corresponding GloVe vector.
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # trainable=False keeps the pretrained vectors fixed during training.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
    # The layer must be built before its weights can be assigned.
    embedding_layer.build((None,))

    # Initialize the layer with the pretrained matrix.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [233]:
def MODEL(input_shape, word_to_vec_map, word_to_index):
    """Build the pooled-embedding classifier.

    Architecture: frozen pretrained embedding -> global max pooling and global
    average pooling (concatenated) -> four Dense(512, relu) + Dropout(0.2)
    blocks -> Dense(3) -> softmax.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, e.g. ``(maxLen,)``.
    word_to_vec_map : dict
        Token -> vector, forwarded to ``pretrained_embedding_layer``.
    word_to_index : dict
        Token -> index, forwarded to ``pretrained_embedding_layer``.

    Returns
    -------
    Model
        Uncompiled Keras model mapping index sequences to 3 class probabilities.
    """
    # The model input: a sequence of int32 vocabulary indices.
    sentence_indices = Input(input_shape, dtype='int32')

    # Embedding layer initialized from the pretrained vectors.
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

    # Look up the embedding of every index in the input sequence.
    embeddings = embedding_layer(sentence_indices)

    # Collapse the sequence axis in two ways and join the results.
    ap = GlobalAveragePooling1D()(embeddings)
    gp = GlobalMaxPooling1D()(embeddings)
    stack = concatenate([gp, ap], axis=1)

    # Dense() defaults to a linear activation, so the original stack of four
    # activation-free Dense layers had no non-linearity between them. ReLU
    # makes the depth actually add capacity.
    X = stack
    for _ in range(4):
        X = Dense(512, activation='relu')(X)
        X = Dropout(0.2)(X)

    X = Dense(3)(X)
    X = Activation('softmax')(X)

    model = Model(inputs=sentence_indices, outputs=X)

    return model

In [234]:
# Instantiate the classifier for sequences of maxLen token indices.
model = MODEL((maxLen,), word_to_vec_map, words_to_index)

In [235]:
# Print the layer-by-layer structure and parameter counts.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_28 (InputLayer)           (None, 256)          0                                            
__________________________________________________________________________________________________
embedding_28 (Embedding)        (None, 256, 200)     80000200    input_28[0][0]                   
__________________________________________________________________________________________________
global_max_pooling1d_19 (Global (None, 200)          0           embedding_28[0][0]               
__________________________________________________________________________________________________
global_average_pooling1d_23 (Gl (None, 200)          0           embedding_28[0][0]               
__________________________________________________________________________________________________
concatenat

In [236]:
# Categorical cross-entropy pairs with the model's softmax output; accuracy is tracked as a metric.
model.compile(loss='categorical_crossentropy', 
              optimizer=keras.optimizers.Adam(), 
              metrics=['accuracy'])

In [218]:
# Model input: the 'text' column of the training frame.
X = train_data['text']

In [219]:
# Use the shared maxLen constant (instead of a second hard-coded 256) so the
# index matrix width always matches the model's input shape.
X_indices = sentences_to_indices(X, words_to_index, max_len=maxLen)

In [220]:
# Encoder that maps author labels to integer class ids.
label_encoder = LabelEncoder()

In [221]:
# Fit the encoder on the 'author' column and get one integer class id per row.
Y = label_encoder.fit_transform(train_data['author'])

In [222]:
# One-hot encode the integer author ids into an (n_samples, 3) matrix.
# Computed from the fitted label_encoder rather than from Y itself so the cell
# can be re-run safely, and written with np.eye directly because the
# convert_to_one_hot helper is only defined in a later cell (a fresh
# top-to-bottom run would otherwise hit a NameError here).
Y = np.eye(3)[label_encoder.transform(train_data['author'])]

In [223]:
# Hold out 10% of the rows for validation. A fixed random_state makes the
# split (and therefore the reported validation metrics) reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_indices, Y, test_size=0.1, random_state=42
)

In [225]:
# Sanity check: (number of held-out rows, sequence length).
X_test.shape

(1958, 256)

In [226]:
def convert_to_one_hot(Y, C):
    """One-hot encode integer class ids.

    Parameters
    ----------
    Y : numpy.ndarray of int
        Class ids in ``[0, C)``; any shape, flattened before encoding.
    C : int
        Number of classes.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(Y.size, C)`` with a single 1.0 per row.
    """
    flat_ids = Y.reshape(-1)
    identity = np.eye(C)
    # Row k of the identity matrix is the one-hot vector for class k.
    return identity[flat_ids]

In [237]:
# Early stopping watches validation loss with a patience of 150 epochs.
early_stopping = EarlyStopping(patience=150, monitor='val_loss')

# Train for up to 400 epochs in mini-batches of 64, validating on the held-out split.
model.fit(
    X_train,
    Y_train,
    epochs=400,
    batch_size=64,
    validation_data=(X_test, Y_test),
    callbacks=[early_stopping],
)

Train on 17621 samples, validate on 1958 samples
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/40

KeyboardInterrupt: 