# Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
import json

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Mount drive - if you are using Google Colab

In [14]:
# Choose where the notebook runs; the same flag selects the data path when the data is loaded.
run = 'local' # colab or local
if run == 'colab':
    # google.colab is imported only on this branch, so a local run never needs it
    from google.colab import drive
    drive.mount('/content/drive')

# Load data

In [2]:
# Root folder for data, tokenizer and model files: the mounted Drive folder
# on Colab, the current working directory otherwise
base = '/content/drive/My Drive/Hatefulle Ytringer Models/' if run == 'colab' else './'

# Read the unlabeled comments and preview the first rows
unlabel = pd.read_csv(base + 'data/data.csv', encoding="UTF-8")
unlabel.head()

In [3]:
# Keep only the identifier columns and the text (this discards the answer
# column), then discard every row that has a missing value in what remains
unlabel = unlabel[['post_id', 'comment_id', 'author_id', 'text']].dropna()
unlabel

In [17]:
def process_tweet(df):
    '''
    Clean the raw tweet text and store the result in a new column.

    The steps run in a fixed order: strip URLs, strip retweet markers and
    @-mentions, decode a few HTML entities, collapse repeated spaces, put a
    space between word characters and adjacent punctuation, lower-case, and
    trim surrounding whitespace.

    Input:
        df: a dataframe containing a column 'text' of strings of tweets
    Output:
        the same dataframe object, with a column 'tweet_proc' added (or
        overwritten) that holds the cleaned text; 'text' is left untouched

    '''
    # Function-scope import: `re` is not among the notebook's top-level imports.
    import re

    # (pattern, replacement) pairs, applied top to bottom.
    rules = [
        # remove URLs, then any 'http' fragments that are left over
        (r'http(\S)+', r''),
        # NOTE(review): the dots are regex wildcards, so this removes 'http '
        # plus the next three characters - confirm that is the intent.
        (r'http ...', r''),
        (r'http', r''),
        # remove retweet markers ("RT @user") and any remaining @-mentions
        (r'(RT|rt)[ ]*@[ ]*[\S]+', r''),
        (r'@[\S]+', r''),
        # decode HTML entities; the ampersand is written out as the word 'og'
        (r'&amp;?', r'og'),
        (r'&lt;', r'<'),
        (r'&gt;', r'>'),
        # collapse runs of two or more spaces (a space inside the braces
        # would turn the quantifier into literal text)
        (r'[ ]{2,}', r' '),
        # insert a space between word characters and adjacent punctuation
        (r'([\w\d]+)([^\w\d ]+)', r'\1 \2'),
        (r'([^\w\d ]+)([\w\d]+)', r'\1 \2'),
    ]

    cleaned = df['text']
    for pattern, replacement in rules:
        # Patterns are pre-compiled and regex=True is explicit, so they are
        # always applied as Python regular expressions no matter what the
        # installed pandas uses as its default for `regex`.
        cleaned = cleaned.str.replace(re.compile(pattern), replacement, regex=True)

    # lower case and strip white spaces at both ends
    df['tweet_proc'] = cleaned.str.lower().str.strip()

    return df

# Clean text

In [18]:
# Adds the cleaned 'tweet_proc' column that the following cells operate on
unlabel = process_tweet(unlabel)

# Convert emoji into words and remove non-alphabetic characters

In [19]:
# Replace the two handled emoticons with words: ':-)' -> 'smile', ':-(' -> 'trist'.
# Raw strings keep the backslash escapes intact, and regex=True is explicit
# because newer pandas versions treat the pattern as literal text by default.
unlabel['tweet_proc'] = unlabel['tweet_proc'].str.replace(r':-\)', 'smile', regex=True)
unlabel['tweet_proc'] = unlabel['tweet_proc'].str.replace(r':-\(', 'trist', regex=True)
# Drop every character that is not a lower-case letter a-z, å, ø, æ or a space
unlabel['tweet_proc'] = unlabel['tweet_proc'].str.replace(r'[^a-zåøæ ]', '', regex=True)

# Remove stop words

In [20]:
# NLTK's Norwegian stop words, minus the negations 'ikke' and 'ikkje',
# which therefore stay in the text
stop_words = stopwords.words('norwegian')
stop_words.remove('ikke')
stop_words.remove('ikkje')

# Frozen set copy: the filter below tests membership once per token, and a
# set lookup does not scan the whole list each time
stop_word_set = frozenset(stop_words)

# Split on whitespace, drop stop words, and re-join with single spaces
unlabel['tweet_proc'] = unlabel['tweet_proc'].apply(
    lambda x: ' '.join(w for w in x.split() if w not in stop_word_set)
)

# Load trained tokenizer

In [4]:
# Read the saved tokenizer definition. The encoding is stated explicitly so
# the vocabulary is decoded the same way on every platform, matching the
# UTF-8 read of the data CSV.
# NOTE(review): assumes the file was written as UTF-8 - confirm against the training notebook.
with open(base +'tokenizer/tokenizer.json_16102021_v1', encoding='utf-8') as f:
    data = json.load(f)

# The file handle is no longer needed once the JSON has been parsed
tokenizer_trained = tokenizer_from_json(data)

vocab_size = len(tokenizer_trained.word_index) + 1  # Adding 1 because of reserved 0 index
print('vocab_size: ', vocab_size)

# Convert words into sequences

In [23]:
# Encode each cleaned text with the trained tokenizer, then pad or cut every
# sequence at the end so that all of them have length 128
unseen = pad_sequences(
    tokenizer_trained.texts_to_sequences(unlabel['tweet_proc'].values),
    maxlen=128,
    padding='post',
    truncating='post',
)

# Load trained model

In [24]:
def create_ann_model():
    '''
    Build and compile the feed-forward text classifier.

    Layers, in order: an embedding of size 28 over `vocab_size` tokens,
    global average pooling over the sequence, two ReLU dense layers
    (64 and 16 units) and a single sigmoid output unit. The model is
    compiled with the Adam optimizer, binary cross-entropy loss and
    accuracy as metric.

    Reads the notebook-level variable `vocab_size`, which must be defined
    before this function is called.

    Output:
        the compiled Keras model (no weights are loaded here)
    '''
    # The Keras classes are reached through the already imported `tf`
    # module; the bare names Sequential/Embedding/... are not imported
    # anywhere in this notebook and would raise NameError on a fresh kernel.
    layers = tf.keras.layers
    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, 28),
        layers.GlobalAveragePooling1D(),
        layers.Dense(64, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Build the network with freshly initialised weights
model = create_ann_model()

# Replace them with the weights stored in the checkpoint file under `base`
checkpoint_filepath = base+'model/final_model.hdf5'
model.load_weights(checkpoint_filepath)

# Classify unlabeled data using trained model

In [5]:
# Sigmoid output of the model for every padded sequence
yhat = model.predict(unseen)
# Label a row 1 when its score is above 0.55, otherwise 0, and flatten the
# result into a plain list of ints
yhat = (yhat.ravel() > 0.55).astype(int).tolist()
unlabel['Result'] = yhat
unlabel

In [6]:
# Number of rows predicted in each class (values of the 'Result' column)
unlabel['Result'].value_counts()

In [None]:
# Save the classified rows as CSV under `base`, without writing the index column
unlabel.to_csv(base+'data/data_classified.csv', index=False)