In [1]:
import pandas as pd
import numpy as np
from keras import regularizers
from keras.models import Model
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from keras.layers import Dense, Embedding, Input, LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
# Base directory for data files; a later cell prefixes it to a file name.
PATH = '../../data/'
# NLTK's English stop-word list, held as a set for fast membership tests.
stopword = set(stopwords.words("english"))
# NLTK tokenizer designed for tweet-style text.
tok = TweetTokenizer()

# NOTE(review): train and test are both read from the same 'a.csv' and PATH is
# not used for these reads — this looks like a scratch fixture; confirm the
# intended files before a real run.
train = pd.read_csv('a.csv')
test = pd.read_csv('a.csv')

# Column 'A' supplies the raw sentences that later cells tokenize and pad.
train_sentence = train['A']
test_sentence = test['A']

def clean(comment, tokenizer=None, stop_words=None):
    """Tokenize ``comment`` and return it with stop words removed.

    Stop-word matching is case-insensitive: each token is lowercased before
    the membership test, so capitalised stop words ("The", "IS") are dropped
    too. Kept tokens retain their original casing.

    Args:
        comment: The text to clean.
        tokenizer: Object exposing ``tokenize(str) -> list of str``. Defaults
            to the notebook-level ``tok``.
        stop_words: Collection of lowercase words to drop. Defaults to the
            notebook-level ``stopword`` set.

    Returns:
        The surviving tokens joined by single spaces.
    """
    if tokenizer is None:
        tokenizer = tok
    if stop_words is None:
        stop_words = stopword
    kept = [token for token in tokenizer.tokenize(comment)
            if token.lower() not in stop_words]
    return ' '.join(kept)

def _count_words(sentence):
    """Return the number of whitespace-separated tokens in ``sentence``."""
    return len(sentence.split())


# Word counts for every train sentence followed by every test sentence.
text_length = pd.concat(
    [sentences.apply(_count_words) for sentences in (train_sentence, test_sentence)]
)

mean_length, std_length = text_length.mean(), text_length.std()

# Frame shapes first, then the length statistics, one value per line.
print(train.shape, test.shape, mean_length, std_length, sep='\n')

(3, 7)
(3, 7)
1.33333333333
0.516397779494


In [3]:
# Configuration constants shared by the tokenizing, embedding and model cells.
# Vocabulary cap: passed to the Keras tokenizer as num_words and to the
# embedding-matrix builder as max_features.
MAX_FEATURES = 100000 # max num of words
# Sequence length = mean word count + 3 standard deviations, rounded to an int;
# used as the padding length and as the model's input length.
MAX_LEN = np.round(mean_length + 3*std_length).astype(int) # max sequence length
# NOTE(review): the embedding file name below suggests 50-dimensional vectors,
# while EMBED_SIZE is 6 — confirm which one is intended.
EMBED_SIZE = 6 # embedding size
# NOTE(review): LSTM_UNITS, DENSE_UNITS, DROPOUT, BATCH_SIZE and EPOCHS are not
# referenced by the cells visible in this notebook; the model cell hard-codes
# its layer sizes and calls fit() with defaults.
LSTM_UNITS = 50 # LSTM hidden layer unit number
DENSE_UNITS = 50
DROPOUT = 0.3 # dropout rate
BATCH_SIZE = 32
EPOCHS = 2
# Path of the pretrained word-vector text file read by get_embedding_matrix.
EMBEDDING_FILE = 'glove.6B.50d.txt'

# Columns of `train` that form the multi-label target matrix.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
print(MAX_LEN)

3


In [4]:
# Fit a single vocabulary over the train and test sentences together.
tokenizer = text.Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(pd.concat([train_sentence, test_sentence]).values)

# Encode each split as lists of word indices.
tokenized_train, tokenized_test = (
    tokenizer.texts_to_sequences(split.values)
    for split in (train_sentence, test_sentence)
)

# Pad/truncate every encoded sentence to MAX_LEN.
X_train, X_test = (
    sequence.pad_sequences(encoded, maxlen=MAX_LEN)
    for encoded in (tokenized_train, tokenized_test)
)

# Target matrix: one column per entry of label_cols.
y = train[label_cols].values

print(tokenized_train)

[[2, 1], [1], [1]]


In [10]:
def get_coefs(word, *arr):
    """Pair ``word`` with its remaining fields converted to a float32 array.

    Args:
        word: The token (first field of an embedding-file line).
        *arr: The vector components, as numbers or numeric strings.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

def get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer):
    """Build an embedding matrix whose rows line up with ``tokenizer.word_index``.

    Each line of ``embedding_file`` is expected to be a word followed by its
    whitespace-separated vector components. Rows for words found in the file
    are copied from it; every other row (including row 0) is drawn from a
    normal distribution with the mean and standard deviation of all
    pretrained values.

    Args:
        embedding_file: Path to the UTF-8 text file of pretrained vectors.
        embed_size: Number of columns of the returned matrix; must equal the
            dimensionality of the pretrained vectors that get copied in.
        max_features: Only words with index strictly below this are filled
            from the file.
        tokenizer: Object with a ``word_index`` mapping of word -> integer index.

    Returns:
        A float array of shape
        ``(min(max_features, len(word_index)) + 1, embed_size)``.

    Raises:
        ValueError: If a pretrained vector that should be copied does not have
            exactly ``embed_size`` components, or the file holds no vectors.
    """
    embeddings_index = {}
    # Context manager closes the handle; explicit UTF-8 avoids depending on
    # the platform's default encoding.
    with open(embedding_file, encoding='utf-8') as handle:
        for line in handle:
            fields = line.strip().split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

    # np.stack requires a real sequence; a dict view is rejected by newer NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(
        all_embs.mean(), all_embs.std(), (nb_words + 1, embed_size)
    )
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            continue
        if embedding_vector.shape != (embed_size,):
            # Fail with a clear message instead of an opaque broadcast error
            # (or a silent broadcast of a length-1 vector).
            raise ValueError(
                'embedding for %r has %d dimensions, expected embed_size=%d'
                % (word, embedding_vector.shape[0], embed_size)
            )
        embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Progress marker: printed once the helper definitions in this cell have run.
print('done')

done


In [17]:
from keras import backend as K

# NOTE(review): neither of these two paths is used anywhere in this cell.
file_path = '../../model/lstm_best.hdf5'
sample_submission_file_path = PATH + 'sample_submission.csv'

print('getting model')
embedding_matrix = get_embedding_matrix(EMBEDDING_FILE, EMBED_SIZE, MAX_FEATURES, tokenizer)

# Size the Embedding layer from the matrix itself so the layer and its weights
# cannot disagree when the vocabulary or embedding size changes.
vocab_rows, embed_dim = embedding_matrix.shape

# Named `sequence_input` rather than `input` to avoid shadowing the builtin.
sequence_input = Input(shape=(MAX_LEN, ))
x = Embedding(vocab_rows, embed_dim, weights=[embedding_matrix], trainable=False, name='ss')(sequence_input)
# NOTE(review): the LSTM width is hard-coded to 2 here although the config cell
# defines LSTM_UNITS; likewise fit() below runs with default batch size/epochs.
x = Bidirectional(LSTM(2, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
# One output per label, squashed to [0, 1]: binary_crossentropy expects
# probabilities, so the output layer needs a sigmoid activation.
x = Dense(len(label_cols), activation='sigmoid')(x)
model = Model(inputs=sequence_input, outputs=x)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(X_train, y)
model.fit(X_train, y)

# Sub-model that stops at the embedding layer ('ss') so the looked-up vectors
# for each padded sequence can be inspected.
dense1_layer_model = Model(inputs=model.input, outputs=model.get_layer('ss').output)
dense1_output = dense1_layer_model.predict(X_train)

print(X_train)
print(dense1_output)

print('done')

getting model
[[0 2 1]
 [0 0 1]
 [0 0 1]] [[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
Epoch 1/1
[[0 2 1]
 [0 0 1]
 [0 0 1]]
[[[  7.44309902e-01   1.32505846e+00   2.24079713e-01   3.40721667e-01
     5.98348677e-01   1.12650180e+00]
  [  9.99879907e-04   1.00099993e+00   9.99000013e-01   9.99000013e-01
     9.99000013e-01   9.99000013e-01]
  [  9.99990269e-04   9.99990734e-04  -9.99995391e-04   9.99975484e-04
    -9.99986776e-04  -9.99992248e-04]]

 [[  7.44309902e-01   1.32505846e+00   2.24079713e-01   3.40721667e-01
     5.98348677e-01   1.12650180e+00]
  [  7.44309902e-01   1.32505846e+00   2.24079713e-01   3.40721667e-01
     5.98348677e-01   1.12650180e+00]
  [  9.99990269e-04   9.99990734e-04  -9.99995391e-04   9.99975484e-04
    -9.99986776e-04  -9.99992248e-04]]

 [[  7.44309902e-01   1.32505846e+00   2.24079713e-01   3.40721667e-01
     5.98348677e-01   1.12650180e+00]
  [  7.44309902e-01   1.32505846e+00   2.24079713e-01   3.40721667e-01
     5.98348677e-01   1.12650180e+00

In [117]:
from keras import backend as K
# Wrap the padded training matrix in a float32 Keras backend variable.
kvar = K.variable(X_train, dtype='float32')
# Evaluate it back to a NumPy array; as the cell's last expression it is displayed.
K.eval(kvar)

array([[ 0.,  2.,  1.],
       [ 0.,  0.,  1.]], dtype=float32)

In [10]:
from textblob import TextBlob

# Scratch check of TextBlob's spelling correction on a misspelled sample phrase.
b = TextBlob('i am fuckking')
# Correct the text, lowercase it, and split on whitespace into a list of words.
print(str(b.correct()).lower().split())

['i', 'am', 'fucking']


In [2]:
class f:
    """Scratch class exposing two class-level integer attributes, ``a`` and ``b``."""
    a, b = 1, 2
    
# Class attributes are readable on the class itself; no instance is created.
print(f.a)

1


In [2]:
import pandas as pd
import numpy as np

# NOTE: this rebinds `x` and `y`, which earlier cells use for the model tensor
# and the label matrix respectively.
# 2x3 integer array holding 1..6 in row-major order.
x = np.arange(1, 7).reshape(2, 3)
# Wrap it in a DataFrame with one named column per array column.
y = pd.DataFrame(x, columns=list('abc'))
print(y)

   a  b  c
0  1  2  3
1  4  5  6
