In [1]:
import pandas as pd
import numpy as np
# tensorflow.keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from tqdm import tqdm
import os
import gc
import time
tqdm.pandas()


In [2]:
# Sizes shared by the tokenizer, the padding step and the embedding layer.
max_features = 20_000  # tokenizer `num_words` and number of rows in the embedding table
maxlen = 200           # length every encoded text is padded to
embed_size = 300       # width of each word vector

In [3]:
# Read the first 1000 rows of each split and stack them into a single frame
# with a fresh 0..n-1 index.
train = pd.read_csv("./input/train.csv", encoding='gbk', nrows=1000)
test = pd.read_csv("./input/test.csv", encoding='gbk', nrows=1000)
data = pd.concat([train, test], ignore_index=True)

In [4]:
# Load the training data
train = pd.read_csv("./input/train_chi.csv", encoding='gbk')

# Build the tokenizer; its vocabulary is fitted on the texts in `data`.
# NOTE(review): `train` is re-read here from a different file than the one that
# was concatenated into `data` — confirm the vocabulary source is intended.
tokenizer = Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(data['text'].tolist())

# Encode the training texts as integer sequences, then pad them to `maxlen`
x_train = pad_sequences(
    tokenizer.texts_to_sequences(train['text'].tolist()),
    maxlen=maxlen,
)
y_train = train['target'].tolist()

# Hold out 30% of the rows for validation (fixed seed for repeatability)
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.3,
    random_state=0,
)


In [5]:
# Our Deeds are the Reason of this earthquake May ALLAH Forgive us all

In [6]:
# Scratch cell: a hand-written list mixing two word tokens with zeros.
# NOTE(review): presumably a sketch of a zero-padded token sequence — confirm or remove.
['Our','Deeds',0,0,0]

['Our', 'Deeds', 0, 0, 0]

In [7]:
# Encode the test texts with the already-fitted tokenizer and pad them to `maxlen`
x_test = pad_sequences(
    tokenizer.texts_to_sequences(test['text'].tolist()),
    maxlen=maxlen,
)

In [8]:
def build_model(embedding_matrix=None):
    """Build and compile the stacked bidirectional-LSTM binary classifier.

    Parameters
    ----------
    embedding_matrix : array-like or None, optional
        Initial values for the embedding table. When ``None`` the embedding
        layer keeps its default initialisation. The layer is created with
        ``max_features`` rows and ``embed_size`` columns, so the matrix is
        expected to have that shape (not validated here).

    Returns
    -------
    Model
        Model taking ``(maxlen,)`` integer sequences and emitting one sigmoid
        output, compiled with binary cross-entropy, Adam and accuracy.
    """
    embedding_kwargs = {}
    if embedding_matrix is not None:
        # Seed the table through the documented `embeddings_initializer` hook
        # rather than the legacy `weights=[...]` constructor argument, which is
        # not accepted by every Keras release. The starting weights are the same.
        embedding_kwargs["embeddings_initializer"] = initializers.Constant(embedding_matrix)

    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size, **embedding_kwargs)(inp)
    x = Bidirectional(LSTM(64, return_sequences=True))(x)
    x = Bidirectional(LSTM(64, return_sequences=True))(x)
    # Collapse the time axis by taking the per-feature maximum.
    # (Mean pooling would be GlobalAveragePooling1D; there is no GlobalMeanPool1D layer.)
    x = GlobalMaxPool1D()(x)
    x = Dense(128, activation="relu")(x)
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [9]:
from tensorflow import keras

In [14]:
# Turn both label containers into NumPy arrays before they are passed to `fit`
y_train, y_val = np.array(y_train), np.array(y_val)

In [15]:
# Baseline run: embedding layer with its default initialisation (no matrix passed)
model = build_model()
history = model.fit(
    x_train,
    y_train,
    batch_size=512,
    epochs=5,
    validation_data=(x_val, y_val),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [16]:
def build_matrix(embeddings_index, word_index, num_words=None, embed_dim=None):
    """Assemble an embedding table whose row ``i`` holds the vector of word index ``i``.

    Parameters
    ----------
    embeddings_index : mapping
        Word -> vector lookup. Missing words must raise ``KeyError``
        (as a plain ``dict`` does).
    word_index : dict
        Word -> integer row index.
    num_words : int, optional
        Number of rows in the table; words whose index is ``>= num_words`` are
        skipped. Defaults to the notebook-level ``max_features``.
    embed_dim : int, optional
        Width of each row. Defaults to the notebook-level ``embed_size``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_words, embed_dim)``. Words missing from
        ``embeddings_index`` receive the vector stored under ``"unknown"``;
        if that entry is missing too, the row is left as zeros.
    """
    if num_words is None:
        num_words = max_features
    if embed_dim is None:
        embed_dim = embed_size

    embedding_matrix = np.zeros((num_words, embed_dim))

    # Resolve the out-of-vocabulary fallback once instead of on every miss, and
    # tolerate its absence instead of raising from inside the loop.
    try:
        fallback_vector = embeddings_index["unknown"]
    except KeyError:
        fallback_vector = None

    for word, i in tqdm(word_index.items()):
        if i >= num_words:
            continue
        try:
            embedding_vector = embeddings_index[word]
        except KeyError:
            # Only a missing word triggers the fallback; other errors propagate.
            embedding_vector = fallback_vector
        if embedding_vector is not None:
            # Keep each row of embedding_matrix aligned with the word's index in word_index
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


In [17]:
#coding=utf-8
# Path of the pretrained word-vector text file
EMBEDDING_FILE = './pretrain/glove.840B.300d.txt'


def get_coefs(word, *arr):
    """Pair a token with its vector.

    ``word`` is returned unchanged; every remaining positional argument is
    collected and converted into a single float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

# Parse the embedding file into a {token: vector} dict, splitting each line on
# single spaces. The file is opened in a `with` block so the handle is closed
# as soon as parsing finishes (or fails) instead of being left open.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(get_coefs(*line.split(" ")) for line in embedding_file)

# Build the table aligned with the tokenizer's word indices
glove_embedding_matrix = build_matrix(embeddings_index, tokenizer.word_index)


100%|██████████████████████████████████████████████████████████████████████████| 8221/8221 [00:00<00:00, 183177.72it/s]


In [18]:
# Same architecture, with the embedding layer seeded from the GloVe matrix
model = build_model(glove_embedding_matrix)
history = model.fit(
    x_train,
    y_train,
    batch_size=512,
    epochs=5,
    validation_data=(x_val, y_val),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


#### LSTM 的五折交叉验证（5-fold cross-validation）

In [19]:
from sklearn.model_selection import KFold

# 5-fold cross-validation over the training split: each fold trains a fresh
# model, fills its slice of the out-of-fold predictions, and contributes 1/5 of
# the averaged test prediction.
skf = KFold(n_splits=5, random_state=1017, shuffle=True)
score = []
count = 0

# Convert once up front instead of rebuilding the arrays on every fold.
x_all = np.asarray(x_train)
y_all = np.asarray(y_train)

oof_pred = np.zeros((x_all.shape[0], 1))  # out-of-fold predictions, one row per x_train row
sub = np.zeros((x_test.shape[0], 1))      # fold-averaged predictions for x_test

os.makedirs("model", exist_ok=True)

for i, (train_index, test_index) in enumerate(skf.split(x_all, y_all)):

    gc.collect()

    x1_tr, x1_va = x_all[train_index], x_all[test_index]
    y_tr, y_va = y_all[train_index], y_all[test_index]
    # No `train['id']` lookup here: the fold indices are positions within the
    # shuffled x_train split, not rows of the `train` frame, so ids selected
    # with them would be misaligned (and they were never used).
    model = build_model(glove_embedding_matrix)
    model.fit(x1_tr, y_tr, batch_size=512, epochs=5, validation_data=(x1_va, y_va))
    oof_pred[test_index] = model.predict(x1_va, batch_size=512, verbose=1)
    sub += model.predict(x_test, batch_size=512, verbose=1) / skf.n_splits

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
# Store the fold-averaged predictions as a column; `sub` has a single column,
# so flatten it to one value per test row.
test['target'] = sub.ravel()

In [21]:
# Preview the first five rows, including the new `target` column
test.head(5)

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,0.654061
1,2,,,"Heard about #earthquake is different cities, s...",0.813033
2,3,,,"there is a forest fire at spot pond, geese are...",0.903884
3,9,,,Apocalypse lighting. #Spokane #wildfires,0.780918
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,0.958763
