In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
categories = ['alt.atheism', 'soc.religion.christian',
             'comp.graphics', 'sci.med']
from sklearn.datasets import fetch_20newsgroups
df = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

In [3]:
print('size of data: %s' % (len(df['data'])))

size of data: 3759


In [4]:
X = df.data
y = df.target

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

NUM_WORDS = 20000

tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Found 48309 unique tokens.


In [6]:
X = pad_sequences(sequences, maxlen=30000)

In [7]:
print('Shape of data tensor:', X.shape)
print('Shape of label tensor:', y.shape)

Shape of data tensor: (3759, 30000)
Shape of label tensor: (3759,)


In [8]:
from keras.utils import np_utils

y = np_utils.to_categorical(y)

In [9]:
from sklearn.model_selection import train_test_split
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size = 0.2)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size = 0.25)

print (len(X_train))
print (len(X_val))
print (len(X_test))
# 60, 20, 20 split

2255
752
752


In [28]:
pickle_in = open("../HW 3/pretrained_word2vec_model","rb")
pretrained_model = pickle.load(pickle_in)

In [36]:
EMBEDDING_DIM = 300
vocabulary_size = min(len(word_index)+1, NUM_WORDS)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))


for word, i in word_index.items():
    if i>=NUM_WORDS:
        continue
        
    try:
        embedding_vector = pretrained_model[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

del(pretrained_model)

In [76]:
preprocessed_data = [X_train, y_train, X_val, y_val, X_test, y_test, embedding_matrix]

pickle_out = open("preprocessed_CNN", "wb")
pickle.dump(preprocessed_data, pickle_out)
pickle_out.close()

In [48]:
from keras.layers import Embedding
EMBEDDING_DIM=300
vocabulary_size=min(len(word_index)+1,NUM_WORDS)

In [56]:
# from keras.layers import Dense, Input, GlobalMaxPooling1D
# from keras.layers import Conv1D, MaxPooling1D, Embedding
# from keras.models import Model
# from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
# from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers


from keras.models import Sequential, Model, load_model
from keras.layers.embeddings import Embedding
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Conv1D
from keras.layers import GlobalMaxPooling1D

sequence_length = X_train.shape[1]
filterSize = 3
num_filters = 100
dropout = 0.5
units = 32

In [68]:
model = Sequential()

model.add(Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True))
          
model.add(Conv1D(num_filters, filterSize, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(dropout))
model.add(Dense(units, activation='relu'))
model.add(Dense(4, activation='sigmoid', kernel_regularizer=regularizers.l2(0.01)))


model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 300)         6000000   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, None, 100)         90100     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 100)               0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 32)                3232      
_________________________________________________________________
dense_8 (Dense)              (None, 4)                 132       
Total params: 6,093,464
Trainable params: 6,093,464
Non-trainable params: 0
____________________________________________

In [70]:
adam = Adam(lr=1e-3)

model.compile(loss='categorical_crossentropy', # sparse_
              optimizer=adam,
              metrics=['acc'])
callbacks = [EarlyStopping(monitor='val_loss')]


In [71]:
model.fit(X_train, dummy_y_train, batch_size=100, epochs=10, verbose=1, validation_data=(X_val, dummy_y_val),
         callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 2255 samples, validate on 752 samples
Epoch 1/10
 300/2255 [==>...........................] - ETA: 22:23 - loss: 1.5550 - acc: 0.2000

KeyboardInterrupt: 