In [1]:
# Import the libraries
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing import sequence
from keras import Sequential
from keras.utils import to_categorical
from keras.layers import Embedding, LSTM, CuDNNLSTM, Dense, Dropout, TimeDistributed, Activation, Conv1D, MaxPooling1D
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = stopwords.words('german')

Using TensorFlow backend.
[nltk_data] Downloading package stopwords to /home/yannic/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# load the word list, for that no fastText vectors are possible to create
no_vector_possible = np.load('no_vector_possible.npy')

In [3]:
# Load the Dataset
df = pd.read_csv("dataset.csv")
# remove rows without label
df = df[df.label != 0]
df = df.dropna()

# Extract text and according labels
text = df['text_lower'].tolist()
labels = df['label'].tolist()

# Show number of training examples
print("Number of texts:",len(labels),"\n")

# Tokenize - Vocab to Int mapping dictionary
all_text = ' '.join(text)
# Create a list of words
words = all_text.split()
# Count all the words using Counter Method
count_words = Counter(words)

# Create the actuall mapping
total_words = len(words)
sorted_words = count_words.most_common(total_words)
vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}

print("Number of words:",len(vocab_to_int))


# Remove Stopwords
set_words = set(words)
stopwords_int = [vocab_to_int[w] for w in stopwords if w in set_words]


text_int = []
for paragraph in text:
    words_that_have_vec = [word for word in paragraph.split() if word not in no_vector_possible]
    p = [vocab_to_int[w]-1 for w in words_that_have_vec]# if w not in stopwords_int]
    text_int.append(p)
    
    
# Encode labels from -1(Negativ), 0(Positiv), 1(Positiv)
labels = [int(x)-1 for x in labels]    

# One Hot encoding
encoded_labels = to_categorical(labels)


# Pad the features into constant length lists
max_words = 200
features = sequence.pad_sequences(text_int, maxlen = max_words)

# Split the set into training (8/10) and testing data (2/10)
X_train, X_test, y_train, y_test = train_test_split(
                                features, encoded_labels, test_size=0.2, shuffle= True)


# Check how many traing/ testing samples there are
print("\n\nTraining Examples:",len(X_train),"\nTesting Examples:",len(X_test))

Number of texts: 5740 

Number of words: 37191


Training Examples: 4592 
Testing Examples: 1148


In [4]:
# create embedding matrix
embed_dim = 300 
MAX_NB_WORDS = 100000
words_not_found = []

#Crteating the EmbeddingMatrixFrame
print('preparing embedding matrix...')
nb_words = min(MAX_NB_WORDS, len(vocab_to_int))
# load the word embeddings
embedding_matrix = np.load('embedding_matrix.npy')

preparing embedding matrix...


In [5]:
from keras.regularizers import L1L2
from sklearn.model_selection import cross_val_score, cross_validate
from keras.wrappers.scikit_learn import KerasClassifier


# Utilities to make kears use the GPU (somehow nessecary on my machine)
from keras import backend as K
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True
K.set_session(K.tf.Session(config=cfg))


# Create a class to build the classifier
def build_classifier(cnn,number_of_cells,two_layers,bias,fastText,dropout):
    """
    cnn : [Boolean] True if the Model has the CNN-LSTM architecture, False for LSTM arch.
    number_of_cells : [int] number of cells for each LSTM layer
    two_layers : [Boolean] True if two LSTM layers are used, False if only one is used
    bias : [Boolean] True if L1 reghularization of 0.01 is used, false otherwise
    fastText : [Boolean] True if the fastText word embeddings are used, False if custom word embeddings are used
    dropout : [float] either 0.5 or 0.7 , specifies the dropout rate used in the model
    """
    
    # create a Sequential object
    model=Sequential()
    
    # Add the fast text embeddings
    if fastText:
        model.add(Embedding(nb_words, embed_dim,
              weights=[embedding_matrix], input_length=max_words, trainable=False))
    # Add custom embedding layer
    else:
        model.add(Embedding(40000, 100, input_length=200))
    
    # Add the CNN layer infront of the LSTM
    if cnn:
        model.add(Conv1D(32,5,
                         padding='valid',
                         activation='relu',
                         strides=1))
        model.add(MaxPooling1D(pool_size=3))
        model.add(Dropout(dropout))
    
    # Add a second LSTM layer that has a return sequence for the next LSTM layer
    if two_layers:
        model.add(CuDNNLSTM(number_of_cells,return_sequences=True))
        model.add(Dropout(dropout))
    # if bias is true, add a LSTM layer with L1 regularization
    if bias:
        model.add(CuDNNLSTM(number_of_cells,bias_regularizer=L1L2(l1=0.01,l2=0.0)))
    # otherwise add a normal LSTM layer
    else:
        model.add(CuDNNLSTM(number_of_cells))
    # Add a Dense output layer with a sigmoid activation function
    model.add(Dense(3))
    model.add(Activation('sigmoid'))
  
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# list if the differnet model configurations and the configuration key
parameter_lists = []
parameter_lists.append([False,100,False,False,False,0.5,'11'])
parameter_lists.append([False,100,False,False,False,0.7,'12'])
parameter_lists.append([False,100,True,False,False,0.5,'21'])
parameter_lists.append([False,100,True,False,False,0.7,'22'])
parameter_lists.append([False,100,True,True,True,0.5,'31'])
parameter_lists.append([False,100,True,True,True,0.7,'32'])
parameter_lists.append([False,30,True,True,True,0.5,'41'])
parameter_lists.append([False,30,True,True,True,0.7,'42'])
parameter_lists.append([True,100,False,True,False,0.5,'51'])
parameter_lists.append([True,100,False,True,False,0.7,'52'])
parameter_lists.append([True,30,True,True,False,0.5,'61'])
parameter_lists.append([True,30,True,True,False,0.7,'62'])
parameter_lists.append([True,30,True,True,True,0.5,'71'])
parameter_lists.append([True,30,True,True,True,0.7,'72'])
parameter_lists.append([True,30,False,True,True,0.5,'81'])
parameter_lists.append([True,30,False,True,True,0.7,'82'])

# list in which we will store the accutacy of the differnet configurations
accuracies_list = {}
# iterate over the parameter configurations
for param in parameter_lists:
    # build a classifier object with the specific partameters
    classifier = KerasClassifier(build_fn=build_classifier,
                                cnn = param[0],
                                number_of_cells = param[1],
                                two_layers = param[2],
                                bias = param[3],
                                fastText = param[4],
                                dropout = param[5],
                                batch_size =16, epochs=6)
    
    # use cross validation to evaluate the models
    accuracies = cross_validate(estimator = classifier, X = X_train, y = y_train, cv = 10)
    #calculate the mean accuracy and the standard deviation
    mean = accuracies['test_score'].mean()
    variance = accuracies['test_score'].std()
    
    # user output that displays the Mean acc+std
    print("\n",param[6],":")
    print('  Mean Accuracy:{} (+-{})'.format(round(mean,2),round(variance,2)))
    
    # Add the accuracy to the param list
    accuracies_list[param[6]] = accuracies
    
    # Delete the model to save memeory
    K.clear_session()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6

 11 :
  Mean Accuracy:0.51 (+-0.02)
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epo

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6

 21 :
  Mean Accuracy:0.52 (+-0.02)
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6

 2

Epoch 5/6
Epoch 6/6

 31 :
  Mean Accuracy:0.54 (+-0.02)
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6

 32 :
  Mean Accuracy:0.53 (+-0.02)
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6


Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6

 41 :
  Mean Accuracy:0.53 (+-0.03)
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epo

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6

 51 :
  Mean Accuracy:0.51 (+-0.03)
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6

 52 :
  Mean Accuracy:0.51 (+-0.02)
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 

Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6

 62 :
  Mean Accuracy:0.51 (+-0.02)
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epo

Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6

 72 :
  Mean Accuracy:0.52 (+-0.02)
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6

 81 :
  Mean Accuracy:0.54 (+-0.03)
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 

In [6]:
# print out all of the accuracys + std
for accuracies, param in zip(accuracies_list.values(),parameter_lists):
    mean = accuracies['test_score'].mean()
    variance = accuracies['test_score'].std()
    print("\n",param[6],":")
    print('  Mean Accuracy:{} (+-{})'.format(round(mean,2),round(variance,2)))


 11 :
  Mean Accuracy:0.51 (+-0.02)

 12 :
  Mean Accuracy:0.51 (+-0.02)

 21 :
  Mean Accuracy:0.52 (+-0.02)

 22 :
  Mean Accuracy:0.52 (+-0.02)

 31 :
  Mean Accuracy:0.54 (+-0.02)

 32 :
  Mean Accuracy:0.53 (+-0.02)

 41 :
  Mean Accuracy:0.53 (+-0.03)

 42 :
  Mean Accuracy:0.54 (+-0.02)

 51 :
  Mean Accuracy:0.51 (+-0.03)

 52 :
  Mean Accuracy:0.51 (+-0.02)

 61 :
  Mean Accuracy:0.51 (+-0.02)

 62 :
  Mean Accuracy:0.51 (+-0.02)

 71 :
  Mean Accuracy:0.54 (+-0.02)

 72 :
  Mean Accuracy:0.52 (+-0.02)

 81 :
  Mean Accuracy:0.54 (+-0.03)

 82 :
  Mean Accuracy:0.53 (+-0.02)
