In [1]:
# connect to upload data
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import tensorflow as tf


# Get the GPU device name:
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  #raise SystemError('GPU device not found')
  print('GPU device not found')

Found GPU at: /device:GPU:0


In [3]:
!pip install emoji



In [4]:
import pandas as pd
import emoji
import re
import numpy as np


path = '' # ADAPT PATH

# manually curate split
train = pd.read_csv(path + 'DALC2_train_full.csv', delimiter='\t', header=0)

dev = pd.read_csv(path + 'DALC2_dev_full.csv', delimiter='\t', header=0)

test = pd.read_csv(path + 'DALC2_test_full.csv', delimiter='\t', header=0)


print('Number of training sentences: {:,}\n'.format(train.shape[0]))
print('Number of dev sentences: {:,}\n'.format(dev.shape[0]))
print('Number of test sentences: {:,}\n'.format(test.shape[0]))
#print(train[['text', 'explicitness', 'target']].head())
print(train[['text', 'target_aggregated']])


Number of training sentences: 6,817

Number of dev sentences: 1,205

Number of test sentences: 3,270

                                                   text  ... target_aggregated
0                                     STEM LIJST 13 URL  ...               NaN
1     @USER @USER @USER en dan nu achterste voren sp...  ...             GROUP
2                    Als je neukt voor geld alle ja URL  ...        INDIVIDUAL
3     @USER Ze doen net of ze zo schijnheilig zijn,h...  ...             GROUP
4     @USER @USER Nu is het wel zo dat zwarte mensen...  ...             GROUP
...                                                 ...  ...               ...
6812                              @USER @USER Nee hoor.  ...               NaN
6813                      @USER @USER Het is een topper  ...               NaN
6814                     Weer een ontzettend leuke klus  ...               NaN
6815  Speciaal voor @USER nog maar eens in de herhal...  ...             OTHER
6816  Filmmakers naar rechter

In [5]:
# drop NaN for TARGET
train.dropna(subset=['target_aggregated'], inplace=True)
dev.dropna(subset=['target_aggregated'], inplace=True)
test.dropna(subset=['target_aggregated'], inplace=True)

print(train.head())

                    id  ... target_a4
1  1267087494941679617  ...       NaN
2   838904123558883328  ...       NOT
3  1267575421014609920  ...       NaN
4  1062112759255576582  ...       NaN
5  1266354740021133318  ...       NaN

[5 rows x 13 columns]


In [6]:
train_labels = train.target_aggregated.values
dev_labels = dev.target_aggregated.values
test_labels = test.target_aggregated.values

# Reformatting the labels binary, 0 = not abusive, 1 = abusive
def reformat_labels(labels):
  b_labels = []
  not_count = 0
  individual_count = 0
  group_count = 0
  other_count = 0
  
  for label in labels:
    if label == 'NOT':
      not_count += 1
      b_labels.append(0)
    elif label == 'INDIVIDUAL':
      individual_count += 1
      b_labels.append(1)
    elif label == "GROUP":
      group_count += 1
      b_labels.append(2)
    else:
      other_count += 1
      b_labels.append(3)


  return b_labels, individual_count, group_count, other_count, not_count # ternary

# binary
print('Formatting train labels:')
train_labels, ind_count, group_count, other_cnt, not_count = reformat_labels(train_labels) 
print('IND: {} | GRP: {} | OTH: {} | NOT: {}'.format(ind_count, group_count, other_cnt, not_count)) 

print('Formatting dev labels:')
dev_labels, ind_count, group_count, other_cnt, not_count = reformat_labels(dev_labels) 
print('IND: {} | GRP: {} | OTH: {} | NOT: {}'.format(ind_count, group_count, other_cnt, not_count)) 

print('Formatting test labels:')
test_labels, ind_count, group_count, other_cnt, not_count = reformat_labels(test_labels) 
print('IND: {} | GRP: {} | OTH: {} | NOT: {}'.format(ind_count, group_count, other_cnt, not_count)) 


Formatting train labels:
IND: 1147 | GRP: 705 | OTH: 489 | NOT: 136
Formatting dev labels:
IND: 191 | GRP: 133 | OTH: 93 | NOT: 22
Formatting test labels:
IND: 361 | GRP: 244 | OTH: 157 | NOT: 105


In [8]:
def clean_samples(data):

  new_samples = []
  #print(data.head())

  content = list(data['text'].values)
  for tweet_message in content:
      tweet_message = tweet_message.lower()
      tweet_message = re.sub(r'(@\w+)','MENTION', tweet_message)
      tweet_message = re.sub(r'(https\S+)','URL', tweet_message)
      tweet_message = re.sub(r'[0-9]+', 'NUMBER', tweet_message)
      tweet_message = emoji.demojize(tweet_message)
      tweet_message = re.sub(r'#', '', tweet_message)
      tweet_message = re.sub(r'[(#.,\/?!@$%^&*)]', '', tweet_message)
      new_samples.append(tweet_message)

  return new_samples

## Formatting other dataframes as well
train_clean = clean_samples(train) # list
dev_clean = clean_samples(dev) # list
test_clean = clean_samples(test) # list

print(dev_clean[0:3])

['MENTION van enige zelfreflectie is bij moslims nooit enige spraken het ligt altijd aan die ander de jood of de christenhond', 'MENTION de lafbek wil medicijnen alleen voor zijn eigen', 'ik vind overigens dat protesten over de dood van die zwarte jongen geheel legitiem zijn ik keur alleen het plunderen af en de riots die erdoor werden aangezwengeld door antifa door dat gegeven krijgt de politie mijn steun weer om dit keihard aan te pakken dom dom dom van ze']


In [9]:
import nltk
nltk.download('punkt')
#from nltk import word_tokenize, sent_tokenize
from sklearn.preprocessing import LabelEncoder

#tokenizer = nltk.data.load('tokenizers/punkt/dutch.pickle')

# Tokenize tweet into words
def tokenize(text):
    return nltk.word_tokenize(text, language='dutch')
# check the function
#sample_text = 'he did not say anything  about what is going to  happen'
sample_text = 'Ik liep naar huis. Dat deed ik gisteren'
print("tokenize results :", tokenize(sample_text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
tokenize results : ['Ik', 'liep', 'naar', 'huis', '.', 'Dat', 'deed', 'ik', 'gisteren']


In [10]:
def text_prepare(text):
    text_join = ' '.join([x for x in tokenize(text)])
    text_split = [x for x in tokenize(text)]
    return text_join, text_split

text_train, text_tokenzied_train = [text_prepare(x)[0] for x in train_clean], [text_prepare(x)[1] for x in train_clean]
text_dev, text_tokenzied_dev = [text_prepare(x)[0] for x in dev_clean], [text_prepare(x)[1] for x in dev_clean],
text_test, text_tokenzied_test = [text_prepare(x)[0] for x in test_clean], [text_prepare(x)[1] for x in test_clean]

msg_lenght = [len(x) for x in text_tokenzied_train]
MAX_SEQUENCE_LENGTH = max(msg_lenght)

print(MAX_SEQUENCE_LENGTH)
##print(text_tokenzied_dev)

le = LabelEncoder()
train_label_enc = le.fit_transform(train_labels)
dev_label_enc = le.fit_transform(dev_labels)
test_label_enc = le.fit_transform(test_labels)

d_train = {'tweet':text_train,'label':train_label_enc}
d_dev = {'tweet':text_dev,'label':dev_label_enc}
d_test = {'tweet':text_test,'label':test_label_enc}

df_train = pd.DataFrame(d_train, columns=['tweet','label'])
df_dev = pd.DataFrame(d_dev, columns=['tweet','label'])
df_test = pd.DataFrame(d_test, columns=['tweet','label'])

# tokenized tweets
df_train['tweet_tokenized'] = text_tokenzied_train
df_dev['tweet_tokenized'] = text_tokenzied_dev
df_test['tweet_tokenized'] = text_tokenzied_test

# shuffle entries df
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
df_dev = df_dev.sample(frac=1, random_state=42).reset_index(drop=True)
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)

print(df_train.head())



97
                                               tweet  ...                                    tweet_tokenized
0  MENTION tja als ze die pyramide wel in hadden ...  ...  [MENTION, tja, als, ze, die, pyramide, wel, in...
1  pvdd en groenlinks spnl pvv eigenschuld en ik ...  ...  [pvdd, en, groenlinks, spnl, pvv, eigenschuld,...
2  gewone burgers worden gelijk van twitter geple...  ...  [gewone, burgers, worden, gelijk, van, twitter...
3  deze invalide vrouw zou negers hebbben neerges...  ...  [deze, invalide, vrouw, zou, negers, hebbben, ...
4  MENTION deze akwasi snapt het zelf niet eens e...  ...  [MENTION, deze, akwasi, snapt, het, zelf, niet...

[5 rows x 3 columns]


In [11]:
from keras.layers import Dropout, Dense, Embedding, LSTM, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from sklearn.metrics import matthews_corrcoef, confusion_matrix
from sklearn import metrics
from sklearn.utils import shuffle
import numpy as np
import matplotlib.pyplot as plt
import warnings
import logging
logging.basicConfig(level=logging.INFO)

In [12]:
# downloading the coosto model
import requests
WE_FILE = 'coosto.bin'

def download_word_embeddings():
    wordembeddings_url = 'https://github.com/coosto/dutch-word-embeddings/releases/download/v1.0/model.bin'

    r = requests.get(wordembeddings_url)

    if r.status_code == 200:
        with open(WE_FILE, 'wb') as wordembeddings_file:
            wordembeddings_file.write(
                r.content
            )
    
download_word_embeddings()

In [13]:
# loading coosto/word2vec:
import gensim

def load_word_embeddings():
    return gensim.models.KeyedVectors.load_word2vec_format(WE_FILE, binary=True)

word_model = load_word_embeddings()
#print(word_model['huis'])

INFO:summarizer.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English
INFO:gensim.models.utils_any2vec:loading projection weights from coosto.bin
INFO:gensim.models.utils_any2vec:loaded (250479, 300) matrix from coosto.bin


In [14]:
X_train_text, X_train_tokenized, X_dev_text, X_dev_tokenized, X_test_text, X_test_tokenized, y_train, y_dev, y_test = df_train.tweet, df_train.tweet_tokenized, df_dev.tweet, df_dev.tweet_tokenized, df_test.tweet, df_test.tweet_tokenized, df_train.label, df_dev.label, df_test.label

In [15]:
# vocab == COOSTO vocab
def _vectorize_data(data, wm):
    # turn the tokens into coosto vocab indices
    # these will be converted to embeddings in the Embedding layer
    vocab = wm.vocab
    keys = list(vocab.keys())

    final = []
    for tweet in data:
        final.append([keys.index(word) for word in tweet if vocab.get(word, None) is not None])
    return final

def vectorize_data(all_tweets, model):
    # pad so each message has equal max lenght train set.
    return pad_sequences(
        sequences=_vectorize_data(all_tweets, model),
        maxlen = MAX_SEQUENCE_LENGTH,
        padding='post')

In [None]:
def build_bilstm_target(embedding_matrix, nclasses=4, MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH, EMBEDDING_DIM=300, dropout=0.5, hidden_layer = 0, lstm_node = 50):
    # Initialize a sequential model
    model = Sequential()            
    # Add embedding layer
    model.add(Embedding(
                        input_dim = embedding_matrix.shape[0],
                        output_dim = embedding_matrix.shape[1], 
                        input_length = MAX_SEQUENCE_LENGTH,
                        weights = [embedding_matrix],
                        trainable=False)
                        )

    # Add hidden layers 
    for i in range(0,hidden_layer):
        # Add a bidirectional lstm layer
        model.add(Bidirectional(LSTM(lstm_node, return_sequences=True, recurrent_dropout=0.1)))
        # Add a dropout layer after each lstm layer
        model.add(Dropout(dropout))
    model.add(Bidirectional(LSTM(lstm_node, recurrent_dropout=0.1)))
    model.add(Dropout(dropout))
    # Add the fully connected layer and relu activation
    model.add(Dense(64, activation='relu')) # 128 # 64
    # Add the output layer with softmax activation ternary
    model.add(Dense(nclasses, activation='softmax')) # ternary
#    model.add(Dense(nclasses, activation='sigmoid')) #binary


    # Compile the model using sparse_categorical_crossentropy
    model.compile(loss='sparse_categorical_crossentropy', #sparse_categorical_crossentropy
                      optimizer='adam',
                      metrics=['accuracy'])
#    # Compile the model using binary_crossentropy
#    model.compile(loss='binary_crossentropy', #sparse_categorical_crossentropy
#                      optimizer='adam',
#                      metrics=['accuracy'])

    return model


def build_model(word_model):
    return build_bilstm_target(
                        embedding_matrix=word_model.vectors
                        )

In [19]:
# vectorization 
df_train_tokenized = df_train["tweet_tokenized"].to_numpy()
df_dev_tokenized = df_dev["tweet_tokenized"].to_numpy()
df_test_tokenized = df_test["tweet_tokenized"].to_numpy()

X_train_ = vectorize_data(df_train_tokenized, word_model)
X_dev_ = vectorize_data(df_dev_tokenized, word_model)
X_test_ = vectorize_data(df_test_tokenized, word_model)
#
print(df_train_tokenized[0])
print(X_dev_[0])
print(len(X_dev_[0]))

['MENTION', 'tja', 'als', 'ze', 'die', 'pyramide', 'wel', 'in', 'hadden', 'gewild', 'hadden', 'ze', 'gisteren', 'wel', 'met', 'de', 'reserves', 'gespeeld', 'raar', 'verhaal', 'MENTION', 'schaam', 'je']
[ 25904  16122     10     42   1203    139      1    215    643   1665
    778     16     34   4033 143608   9614   3491   4033    815    198
  13785      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0]
97


In [20]:
# version 2021/12/13 - binary
model = build_model(word_model) 
model.summary()














Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 97, 300)           75143700  
                                                                 
 bidirectional (Bidirectiona  (None, 100)              140400    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 64)                6464      
                                                                 
 dense_1 (Dense)             (None, 4)                 260       
                                                                 
Total params: 75,290,824
Trainable params: 147,124
Non-trainable params: 75,143,700
______________________________________

In [21]:
# fit model into data
from keras.callbacks import EarlyStopping, ModelCheckpoint 
es = EarlyStopping(monitor='val_loss', mode='auto', verbose=1, patience=3)
mc = ModelCheckpoint('path2savemodel', monitor='val_loss', mode='max', verbose=1, save_best_only=True)


history = model.fit(X_train_, y_train,
                           validation_data=(X_dev_, y_dev),
                           epochs=100,
                           batch_size=32,
                           verbose=1,
                           callbacks=[es,mc])

Epoch 1/100
Epoch 00001: val_loss improved from -inf to 1.00936, saving model to /content/gdrive/MyDrive/Teaching/20 21/IK-BA Thesis/Offensive_Language/models/target_offensive_bilstm/dalc_offensive-target_2.h5
Epoch 2/100
Epoch 00002: val_loss did not improve from 1.00936
Epoch 3/100
Epoch 00003: val_loss did not improve from 1.00936
Epoch 4/100
Epoch 00004: val_loss did not improve from 1.00936
Epoch 5/100
Epoch 00005: val_loss did not improve from 1.00936
Epoch 6/100
Epoch 00006: val_loss did not improve from 1.00936
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.00936
Epoch 8/100
Epoch 00008: val_loss did not improve from 1.00936
Epoch 9/100
Epoch 00009: val_loss did not improve from 1.00936
Epoch 00009: early stopping


In [22]:
print(y_test)
print(model.predict(X_test_))

0      2
1      3
2      2
3      1
4      1
      ..
862    2
863    0
864    1
865    3
866    3
Name: label, Length: 867, dtype: int64
[[2.62873899e-03 1.65285483e-01 8.22428346e-01 9.65747144e-03]
 [2.55950801e-02 1.27450945e-02 8.04196477e-01 1.57463372e-01]
 [6.04041945e-03 8.89720954e-03 4.51843917e-01 5.33218443e-01]
 ...
 [6.36407232e-04 9.98282671e-01 7.17235089e-04 3.63738858e-04]
 [4.91250038e-01 3.23871970e-01 1.01935275e-01 8.29427764e-02]
 [5.17121796e-03 1.59798760e-03 8.82649660e-01 1.10581078e-01]]


In [24]:
# categorical_cross entropy - ternary
print("\n Evaluating Model ... \n")
predicted = y_predict = np.argmax(model.predict(X_test_), axis=-1)
print(metrics.classification_report(y_test, predicted, digits=4))
print("\n")


 Evaluating Model ... 

              precision    recall  f1-score   support

           0     0.3784    0.1333    0.1972       105
           1     0.6308    0.8615    0.7283       361
           2     0.6296    0.6270    0.6283       244
           3     0.5000    0.2994    0.3745       157

    accuracy                         0.6055       867
   macro avg     0.5347    0.4803    0.4821       867
weighted avg     0.5762    0.6055    0.5718       867



