In [None]:
# Import the libraries
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing import sequence
from keras import Sequential
from keras.utils import to_categorical
from keras.layers import Embedding, LSTM, CuDNNLSTM, Dense, Dropout, TimeDistributed, Activation, Conv1D, MaxPooling1D
import nltk
from nltk.corpus import stopwords
# Download the NLTK stopword corpus.
nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` from the imported nltk.corpus module
# to the German stopword collection returned by `words('german')`. The module
# is no longer reachable under this name afterwards unless the import above
# is executed again.
stopwords = stopwords.words('german')

In [None]:
# Load the array stored in 'no_vector_possible.npy'. The preprocessing cell
# below drops every word contained in it before the texts are integer-encoded
# (presumably these are words without a pre-trained word vector - TODO confirm).
no_vector_possible = np.load('no_vector_possible.npy')

# Preprocessing

In [None]:
# Load the dataset
df = pd.read_csv("dataset.csv")
# Remove rows without a label (label == 0) and rows with missing values
df = df[df.label != 0]
df = df.dropna()

# Extract the texts and their labels
text = df['text_lower'].tolist()
labels = df['label'].tolist()

# Show number of training examples
print("Number of texts:",len(labels),"\n")

# Tokenize - build the vocab-to-int mapping dictionary
all_text = ' '.join(text)
# Create a list of words (whitespace tokenisation)
words = all_text.split()
# Count how often each word occurs
count_words = Counter(words)

# Create the actual mapping: the most frequent word gets id 1, the next id 2, ...
total_words = len(words)
sorted_words = count_words.most_common(total_words)
vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}

print("Number of words:",len(vocab_to_int))


# Integer ids of the stopwords that actually occur in the corpus.
# NOTE(review): stopword removal is currently disabled (see the encoding loop
# below); `stopwords_int` is computed but not applied.
set_words = set(words)
stopwords_int = [vocab_to_int[w] for w in stopwords if w in set_words]


# Build the exclusion lookup once as a set: `word in <ndarray>` scans the whole
# array for every single token, a set lookup is O(1) and gives the same answer.
no_vector_words = set(no_vector_possible.ravel().tolist())

text_int = []
for paragraph in text:
    words_that_have_vec = [word for word in paragraph.split() if word not in no_vector_words]
    # NOTE(review): the "-1" shifts the ids to start at 0, so the most frequent
    # word shares id 0 with the padding value used by pad_sequences below.
    # Left unchanged because the saved embedding matrix may rely on this
    # indexing - confirm before changing.
    # NOTE(review): the previously disabled filter `if w not in stopwords_int`
    # would compare word strings against integer ids and never match; filter
    # on the `stopwords` words themselves if stopword removal is re-enabled.
    p = [vocab_to_int[w]-1 for w in words_that_have_vec]
    text_int.append(p)


# Shift the labels down by one so that the class ids start at 0, as required
# for one-hot encoding (assumes the remaining raw labels are 1, 2, 3 - TODO confirm)
labels = [int(x)-1 for x in labels]

# One-hot encoding
encoded_labels = to_categorical(labels)


# Pad/truncate the features into constant-length sequences
max_words = 200
features = sequence.pad_sequences(text_int, maxlen = max_words)

# Split the set into training (8/10) and testing data (2/10).
# A fixed seed makes the split - and therefore every reported score -
# reproducible across kernel restarts.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
                                features, encoded_labels, test_size=0.2, shuffle= True,
                                random_state=SPLIT_SEED)


# Check how many training/testing samples there are
print("\n\nTraining Examples:",len(X_train),"\nTesting Examples:",len(X_test))

# Load word vectors

In [None]:
# Embedding settings
MAX_NB_WORDS = 100000    # upper bound applied to the vocabulary size below
embed_dim = 300          # embedding dimension setting (not used in this cell)
words_not_found = []     # starts empty; nothing is appended in this cell

# Creating the embedding matrix: the matrix itself is read from disk
print('preparing embedding matrix...')
nb_words = min(len(vocab_to_int), MAX_NB_WORDS)
embedding_matrix = np.load('embedding_matrix.npy')

# Build the model

In [None]:
from keras.regularizers import L1L2
from keras import backend as K
# Create a TensorFlow session configuration with the GPU `allow_growth`
# option enabled (GPU memory is then claimed incrementally instead of all
# up front) and register a session using it as the Keras backend session.
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True
K.set_session(K.tf.Session(config=cfg))

# Model architecture
def create_model(dropout, vocab_size=40000, embedding_dim=100,
                 input_length=200, lstm_units=200, n_classes=3):
    """Build the uncompiled Embedding -> CuDNNLSTM -> Dropout -> softmax model.

    Args:
        dropout: Dropout rate applied to the LSTM output.
        vocab_size: Number of rows of the embedding table (input ids must be
            smaller than this value).
        embedding_dim: Size of each learned embedding vector.
        input_length: Length of the padded input sequences.
        lstm_units: Number of units of the CuDNNLSTM layer.
        n_classes: Number of output classes of the softmax layer.

    Returns:
        The Sequential model; the caller is responsible for compiling it.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=input_length))
    model.add(CuDNNLSTM(lstm_units))
    model.add(Dropout(dropout))
    model.add(Dense(n_classes, activation='softmax'))
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()
    return model

# save the hist created by the different models
def save_hist(hist,key):
    """Persist the training curves of one run as a NumPy file.

    The rows of the saved array are, in order: training accuracy, validation
    accuracy, training loss, validation loss. The file is written to
    "<key>_init_model" (np.save appends the ".npy" extension).

    Args:
        hist: Object with a `history` dict of per-epoch metric lists, as
            returned by `model.fit`.
        key: Value used as the file name prefix (e.g. the dropout rate).

    Raises:
        KeyError: If a required metric is missing from `hist.history`.
    """
    history = hist.history

    def _metric(legacy_name, current_name):
        # Older Keras releases report accuracy as 'acc'/'val_acc', newer ones
        # as 'accuracy'/'val_accuracy'; accept either spelling.
        if legacy_name in history:
            return history[legacy_name]
        return history[current_name]

    curves = [
        _metric('acc', 'accuracy'),
        _metric('val_acc', 'val_accuracy'),
        history['loss'],
        history['val_loss'],
    ]
    np.save("{}_init_model".format(key), np.array(curves))


# Train and evaluate one model per dropout rate, collecting every History
# object so the curves can be compared afterwards.
hist_list = []
for dropout in (0.0, 0.2, 0.5, 0.7):
    # Fresh network for this dropout rate, compiled for 3-class classification
    model = create_model(dropout)
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    # Train; 10% of the training data is held out for validation
    hist = model.fit(
        X_train,
        y_train,
        batch_size=16,
        epochs=30,
        validation_split=0.10,
    )

    # Keep the history in memory and persist it to disk
    hist_list.append(hist)
    save_hist(hist, dropout)

    # Score the trained model on the test split
    print("Calculating Accuracy...")
    score, acc = model.evaluate(X_test, y_test, verbose=1)
    print("\nAccuracy: {} - ({})".format(round(acc, 2), acc))

    # Reset the Keras backend session before the next model is built
    K.clear_session()

# Plot the accuracies and model loss

In [None]:
# Creating the dropout graphs: one smoothed validation curve per dropout rate

def _smoothed_curve(values, degree=3, num_points=50):
    """Fit a polynomial to a per-epoch metric and sample it over the trained epochs.

    Args:
        values: Metric value per epoch (epoch indices are 0 .. len(values)-1).
        degree: Maximum degree of the fitted polynomial.
        num_points: Number of sample points of the returned curve.

    Returns:
        Tuple (x, y) of the sample positions and the fitted values.
    """
    last_epoch = len(values) - 1
    # Never ask for a higher degree than the number of points can determine.
    fit_degree = min(degree, max(last_epoch, 0))
    poly = np.poly1d(np.polyfit(range(len(values)), values, fit_degree))
    # Sample only inside the fitted range [0, last epoch]. A hard-coded upper
    # bound of 30 extrapolated the polynomial past the data and silently went
    # stale whenever the number of epochs changed.
    x_new = np.linspace(0, last_epoch, num_points)
    return x_new, poly(x_new)


fig, ax = plt.subplots(1,2,figsize=(13,5))
ax0 = ax[0]
ax1 = ax[1]

for hist_obj in hist_list:
    # Left panel: smoothed validation accuracy
    ax0.plot(*_smoothed_curve(hist_obj.history['val_acc']))
    # Right panel: smoothed validation loss
    ax1.plot(*_smoothed_curve(hist_obj.history['val_loss']))


ax0.set_title('model accuracy')
ax0.set_ylabel('accuracy')
ax0.set_xlabel('epoch')
ax0.legend(["0.0", "0.2", "0.5", "0.7"], title="Dropout rate")


ax1.set_title('model loss')
ax1.set_ylabel('loss')
ax1.set_xlabel('epoch')
ax1.legend(["0.0", "0.2", "0.5", "0.7"], title="Dropout rate")

plt.show()