In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_colwidth = 150

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import os

In [2]:
data_dir = '/home/wenceslai/Documents/dissasters_kaggle'

train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
test = pd.read_csv(os.path.join(data_dir, 'test.csv'))

In [3]:
train.isnull().sum() / train.shape[0] * 100
test.isnull().sum() / test.shape[0] * 100

sid          0.000000
keyword      0.796813
location    33.864542
text         0.000000
dtype: float64

In [4]:
train = train.drop('location', axis=1)
test = test.drop('location', axis=1)
train = train.drop(train[train['keyword'].isnull()].index)

In [5]:
train.nunique()

id         7552
keyword     221
text       7447
target        2
dtype: int64

In [6]:
max_word_cnt = 0

for tweet in pd.concat([test['text'], train['text']]):

    words = len(tweet.split())
    if words > max_word_cnt:
        max_word_cnt = words

max_word_cnt

31

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_words = 10000
max_len = max_word_cnt
tokenizer = Tokenizer(num_words=max_words, )
tokenizer.fit_on_texts(train['text'])
sequences = tokenizer.texts_to_sequences(train['text'])

word_index = tokenizer.word_index
print("unique tokens: ", len(word_index))

data = pad_sequences(sequences, maxlen=max_len)

#shuffling
indicies = np.arange(data.shape[0])
np.random.shuffle(indicies)
data = data[indicies]

labels = train['target'].values
labels = labels[indicies]

Using TensorFlow backend.
unique tokens:  22653


In [10]:
from keras import models, layers, regularizers

def get_model():
    model = models.Sequential()

    model.add(layers.Embedding(max_words, 20, input_length=max_len))
    model.add(layers.Flatten())

    #model.add(layers.LSTM(32))
    #model.add(layers.LSTM(32))

    model.add(layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(1e-2)))
    model.add(layers.Dense(16, activation='relu', kernel_regularizer=regularizers.l2(1e-2)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

In [12]:
from sklearn.model_selection import StratifiedKFold

n_splits = 3
epochs = 5

skf = StratifiedKFold(n_splits=n_splits)
skf.get_n_splits(data, labels)

val_acc = train_acc = top_val_acc = 0 

for train_i, val_i in skf.split(data, labels):
    X_train = data[train_i]
    X_val = data[val_i]

    y_train = labels[train_i]
    y_val = labels[val_i]

    model = get_model()

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        batch_size=32,
        epochs=epochs,
        verbose=2
        )

    val_acc += sum(history.history['val_acc'])
    train_acc += sum(history.history['acc']) 
    top_val_acc += max(history.history['val_acc'])
    
print("train_acc:", train_acc / (n_splits * epochs), "val_acc:", val_acc / (n_splits * epochs))
print("\ntop_val_acc:", top_val_acc / n_splits)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 6041 samples, validate on 1511 samples
Epoch 1/5





 - 2s - loss: 0.8425 - acc: 0.5982 - val_loss: 0.6828 - val_acc: 0.6353
Epoch 2/5
 - 1s - loss: 0.6403 - acc: 0.7204 - val_loss: 0.5930 - val_acc: 0.7657
Epoch 3/5
 - 1s - loss: 0.5574 - acc: 0.7967 - val_loss: 0.5307 - val_acc: 0.7968
Epoch 4/5
 - 1s - loss: 0.5023 - acc: 0.8219 - val_loss: 0.4990 - val_acc: 0.8154
Epoch 5/5
 - 1s - loss: 0.4579 - acc: 0.8436 - val_loss: 0.4856 - val_acc: 0.8187
Train on 6041 samples, validate on 1511 samples
Epoch 1/5
 - 2s - loss: 0.8357 - acc: 0.5723 - val_loss: 0.6857 - val_acc: 0.5725
Epoch 2/5
 - 1s - loss: 0.6517 - acc: 0.6727 - val_loss: 0.6250 - val_acc: 0.6929
Epoch 3/5
 - 1s - loss: 0.6015 - acc: 0.7777 - val_loss: 0.5812 - val_acc: 0.7743
Epoch 4/5
 - 1s - loss: 0

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(data, labels, train_size=0.8)

print("train samples:", X_train.shape[0], "test_samples:", X_val.shape[0])

train samples: 7544 test_samples: 8


In [87]:
epochs = 10 #val_acc: 0.8021

model = get_model()
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=epochs,
    verbose=2
)

Train on 3776 samples, validate on 3776 samples
Epoch 1/10
 - 1s - loss: 0.2439 - acc: 0.9555 - val_loss: 0.1756 - val_acc: 0.9688
Epoch 2/10
 - 1s - loss: 0.2150 - acc: 0.9627 - val_loss: 0.1807 - val_acc: 0.9653
Epoch 3/10
 - 1s - loss: 0.2086 - acc: 0.9635 - val_loss: 0.1877 - val_acc: 0.9624
Epoch 4/10
 - 1s - loss: 0.1939 - acc: 0.9661 - val_loss: 0.1935 - val_acc: 0.9605
Epoch 5/10
 - 1s - loss: 0.1888 - acc: 0.9672 - val_loss: 0.2018 - val_acc: 0.9603
Epoch 6/10
 - 1s - loss: 0.1829 - acc: 0.9711 - val_loss: 0.2071 - val_acc: 0.9560
Epoch 7/10
 - 1s - loss: 0.1751 - acc: 0.9717 - val_loss: 0.2213 - val_acc: 0.9507
Epoch 8/10
 - 1s - loss: 0.1633 - acc: 0.9725 - val_loss: 0.2249 - val_acc: 0.9444
Epoch 9/10
 - 1s - loss: 0.1618 - acc: 0.9751 - val_loss: 0.2243 - val_acc: 0.9478
Epoch 10/10
 - 1s - loss: 0.1513 - acc: 0.9754 - val_loss: 0.2461 - val_acc: 0.9359


In [21]:
#tokenizer.fit_on_texts(train['text'])
sequences = tokenizer.texts_to_sequences(test['text'])
X_test = pad_sequences(sequences, maxlen=max_len)

preds = model.predict(X_test)

In [22]:
submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
submission['target'] = np.round(preds).astype(int)

submission.to_csv(os.path.join(data_dir, 'sub3.csv'), index=False)