## Import related libraries

In [None]:
# ALL IMPORTS 
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
import nltk

import numpy as np
# # Keras
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences

# NLTK
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Other
import re
import string
import numpy as np

# Import data and preprocess
import pandas as pd
import os
from os import listdir
from sklearn.preprocessing import MinMaxScaler

import random
import tensorflow as tf
import inflect as inflect
import pandas_profiling
from sklearn.utils import resample
from sklearn.model_selection import KFold
from heapq import heappush, heappushpop
from sklearn.metrics import *

# One-time runtime configuration for the notebook.

# Fetch the NLTK stop-word corpus that clean_text reads.
nltk.download('stopwords')

# Never elide columns when a DataFrame is printed.
pd.set_option('display.max_columns', None)

# Execute tf.function-decorated code eagerly. The un-prefixed name is the
# current API; the `experimental_` spelling is kept as a fallback for older
# TensorFlow 2 releases that only provide that one.
if hasattr(tf.config, 'run_functions_eagerly'):
    tf.config.run_functions_eagerly(True)
else:
    tf.config.experimental_run_functions_eagerly(True)

## Download Sentencepiece models

In [None]:
# Download the three multilingual BPEmb artefacts (embedding archive, model,
# vocab) into ../sentencepiece/, then unpack the embedding archive in place.
_bpemb_prefix = 'https://nlp.h-its.org/bpemb/multi/multi.wiki.bpe.vs1000000'
for _artefact_suffix in ('.d300.w2v.bin.tar.gz', '.model', '.vocab'):
    os.system('wget -P ../sentencepiece/ ' + _bpemb_prefix + _artefact_suffix)

os.system('tar -xzvf ../sentencepiece/multi.wiki.bpe.vs1000000.d300.w2v.bin.tar.gz -C ../sentencepiece/')

## Import Sentencepiece model and generate embedding matrix

In [None]:
import sentencepiece as spm
from gensim.models import KeyedVectors

# Load the SentencePiece model used to turn text into piece ids.
sp = spm.SentencePieceProcessor()
sp.Load("../sentencepiece/multi.wiki.bpe.vs1000000.model")

print('Indexing word vectors.')
EMBEDDING_DIM = 300  # width of every embedding row
max_len = 64         # sequence length used when padding encoded text

# Load the pre-trained vectors that accompany the SentencePiece model.
word2vec_file = "../sentencepiece/multi.wiki.bpe.vs1000000.d300.w2v.bin"
vecs = KeyedVectors.load_word2vec_format(word2vec_file, binary=True)

# gensim 4 replaced `KeyedVectors.vocab` with `key_to_index`; support both so
# the cell runs regardless of the installed gensim version.
try:
    pieces = vecs.key_to_index
except AttributeError:
    pieces = vecs.vocab

# One row per piece plus one spare row; the row index is the SentencePiece id
# of the piece, so the matrix can be looked up directly with encoded ids.
embedding_matrix = np.zeros((len(pieces) + 1, EMBEDDING_DIM))
for piece in pieces:
    embedding_matrix[sp.PieceToId(piece)] = vecs[piece]
print(embedding_matrix.shape)

## Define preprocessing methods

In [None]:
# CLEAN TEXT METHOD
def replace_numbers(text):
    """Spell out every all-digit token of a string.

    The string is split on whitespace; each token for which ``str.isdigit``
    is true is replaced by the result of ``inflect``'s ``number_to_words``,
    every other token is kept unchanged, and the tokens are joined back with
    single spaces.
    """
    engine = inflect.engine()
    return ' '.join(
        engine.number_to_words(token) if token.isdigit() else token
        for token in text.split()
    )
    
# Translation table turning every ASCII punctuation character into a space.
# (Passing `string.punctuation` itself to `str.translate` does NOT strip
# punctuation: the string is indexed by code point, so it only rewrites
# control characters such as "\n" into punctuation marks.)
_PUNCTUATION_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# Lower-case the dotless/dotted capital I pair explicitly before `str.lower`.
_UPPER_I_MAP = {
    ord(u'I'): u'ı',
    ord(u'İ'): u'i',
}

# Lazily built English + Turkish stop-word set (see _get_stop_words).
_STOP_WORDS = None


def _get_stop_words():
    """Return the combined English and Turkish NLTK stop words, built once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words("english")).union(stopwords.words("turkish"))
    return _STOP_WORDS


def clean_text(text):
    """Normalise a free-text field for tokenisation.

    Steps, in order:
      1. map ``I`` -> ``ı`` and ``İ`` -> ``i``;
      2. replace ASCII punctuation with spaces, so the filters below see bare
         words instead of tokens with punctuation glued to them;
      3. lower-case and split on whitespace;
      4. drop English/Turkish stop words and tokens shorter than 3 characters;
      5. re-join with single spaces and blank out any remaining character
         that is neither a word character nor whitespace.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.translate(_UPPER_I_MAP)
    text = text.translate(_PUNCTUATION_TO_SPACE)

    tokens = text.lower().split()

    stops = _get_stop_words()
    kept = [w for w in tokens if w not in stops and len(w) >= 3]

    # split()/join() already removed every newline, so only the non-ASCII
    # punctuation that the table above does not cover is left to blank out.
    return re.sub(r'[^\w\s]', ' ', " ".join(kept))


## Define Statistical Analysis Method for dataset features

In [None]:
# CALCULATE STATS ABOUT DATASET
def calc_seq_stats(data):
    """Print descriptive statistics for the text and numeric profile columns.

    For each of the text columns ``bio``, ``name``, ``username``,
    ``location`` and ``url`` the number of whitespace-separated tokens per
    value is counted and its mean, population standard deviation, average
    and maximum are printed. For the numeric columns ``tweets``,
    ``following``, ``followers``, ``likes`` and ``media`` the max, min, mean
    and sample standard deviation (``ddof=1``) are printed.

    Args:
        data: DataFrame holding the ten columns named above; every text
            value must be a string.

    Returns:
        None. Output goes to stdout. As a side effect the pandas display
        options ``display.max_colwidth``, ``display.notebook_repr_html`` and
        ``display.precision`` are changed globally.
    """
    text_columns = ['bio', 'name', 'username', 'location', 'url']
    numeric_columns = ['tweets', 'following', 'followers', 'likes', 'media']

    mean_std_lines, average_lines, max_lines = [], [], []
    for column in text_columns:
        label = column.capitalize()
        lengths = [len(value.split()) for value in data[column]]
        counts = np.asarray(lengths)

        # Guard the empty case so an empty frame reports instead of crashing.
        mean = np.mean(counts) if lengths else float('nan')
        std = np.std(counts) if lengths else float('nan')
        average = sum(lengths) / len(lengths) if lengths else 0.0
        longest = max(lengths, default=0)

        mean_std_lines.append('{} -> mean: {}, std: {}\n'.format(label, mean, std))
        average_lines.append('{}: {}\n'.format(label, average))
        max_lines.append('{}: {}\n'.format(label, longest))

    print(''.join(mean_std_lines))
    print('Average Sequence Lengths:\n' + ''.join(average_lines))
    print('Max Sequence Lengths:\n' + ''.join(max_lines))

    print('---------------Numeric Features Analysis------------')
    # `None` is the supported spelling of "no column-width limit" (-1 is
    # rejected by current pandas), and the fully qualified key is required
    # because the bare 'precision' pattern matches more than one option.
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.notebook_repr_html', True)
    pd.set_option('display.precision', 2)

    numeric = data[numeric_columns]
    print('---------Max values-----------')
    print(numeric.max())
    print('---------Min values-----------')
    print(numeric.min())
    print('---------Average values-----------')
    print(numeric.mean())
    print('---------Standard Deviation values-----------')
    print(numeric.std(ddof=1))
    print('-----------------------------------------------------------')


## Import Data and Preprocess

In [None]:

# IMPORT DATA INTO PANDAS DATAFRAME
dataset_path = '../dataset/dataset.csv'

data = pd.read_csv(dataset_path)
print('Data shape: ', data.shape)

# Keep only rows labelled '0' or '1'; this discards unlabelled rows and rows
# whose label is corrupted.
# NOTE(review): labels are compared as strings, which assumes the column is
# read with object dtype — confirm against the CSV.
data = data.loc[data['label'].isin(['0', '1'])]
# Replace null values with empty string
data = data.replace(np.nan, '', regex=True)
# (The label filter is not repeated here: replacing NaN cannot turn a label
# into anything other than the '0'/'1' values already selected. The stray
# debug print of an undefined name that used to sit here raised NameError.)

#################################################
# looking at stats
# useful to inspect and explain dataset
# pandas_profiling.ProfileReport(data)
#################################################
print(data['label'].value_counts())

# Eliminate duplicate rows using username column
usernames = data["username"]
# One representative row (the first occurrence) per username that appears
# more than once. Built from a boolean mask so that a dataset without any
# duplicate simply yields an empty frame instead of failing in pd.concat.
duplicate_usernames = (
    data.loc[data.duplicated(subset='username', keep=False)]
    .drop_duplicates(subset='username', keep='first')
    .sort_values('username')
)

data = data.drop_duplicates(subset='username', keep='first')
print('------After Duplicate Elimination------')
print('Duplicated account number: ', duplicate_usernames.shape[0])
print(':::Data Values:::')
print('Data shape: ', data.shape)
print(data['label'].value_counts())
# Snapshot taken before cleaning/scaling; used later for statistics and for
# reporting the raw profile next to each prediction.
orig_data = data.copy()
print(orig_data.shape)

# Clean Text Features (username is left as-is; its cleaning step is disabled)
for text_column in ('bio', 'name', 'location'):
    data[text_column] = data[text_column].map(lambda x: replace_numbers(clean_text(x)))
# Only the part of the url before the first 'com' is kept.
# NOTE(review): 'com' is matched anywhere in the string, not just as a TLD —
# confirm this is intended.
data['url'] = data['url'].map(lambda x: replace_numbers(clean_text(x.split('com')[0])))

# Interpolate(Scale) Number Features: each column is min-max scaled on its own.
scaler = MinMaxScaler()
for numeric_column in ('tweets', 'following', 'followers', 'likes', 'media'):
    data[[numeric_column]] = scaler.fit_transform(data[[numeric_column]])

data['label'] = data['label'].astype('int32')


In [None]:
# PRINT STATS
# Statistics are reported on orig_data, the de-duplicated snapshot copied
# before text cleaning and min-max scaling were applied to `data`.
calc_seq_stats(orig_data)

## Define postprocess method

In [None]:
def postProcessDataset(input_data):
    """Turn a slice of the dataset into the two model inputs.

    Returns a pair ``(text_data_vec, number_data_vec)``:
      * ``text_data_vec`` - the bio, name, location and url of every row
        joined with single spaces, encoded to SentencePiece ids with ``sp``
        and padded by Keras ``pad_sequences`` with ``maxlen=max_len``;
      * ``number_data_vec`` - one ``[tweets, following, followers, likes,
        media]`` list per row, gathered into a numpy array.
    """
    text_fields = zip(input_data['bio'], input_data['name'],
                      input_data['location'], input_data['url'])
    combined_text_features = np.asarray(['{} {} {} {}'.format(*fields) for fields in text_fields])
    encoded_rows = [sp.EncodeAsIds(sentence) for sentence in combined_text_features]
    text_data_vec = tf.keras.preprocessing.sequence.pad_sequences(encoded_rows, maxlen=max_len)

    numeric_fields = zip(input_data['tweets'], input_data['following'],
                         input_data['followers'], input_data['likes'],
                         input_data['media'])
    number_data_vec = np.asarray([list(values) for values in numeric_fields])

    return text_data_vec, number_data_vec

## Define shortcut variables for Tensorflow units

In [None]:
# TENSORFLOW CODES
# Module-level short names for the Keras classes used by the notebook.
# Model containers (Sequential ends up bound to tf.keras.Sequential).
Model, Sequential = tf.keras.Model, tf.keras.Sequential
# Feed-forward building blocks.
Dense, Dropout, Flatten = tf.keras.layers.Dense, tf.keras.layers.Dropout, tf.keras.layers.Flatten
Conv2D = tf.keras.layers.Conv2D
# Sequence building blocks.
Embedding, Bidirectional, LSTM = tf.keras.layers.Embedding, tf.keras.layers.Bidirectional, tf.keras.layers.LSTM

In [None]:
# MULTILAYER PERCEPTRON
def create_mlp(dim):
    """Return the dense stack Dense(16, relu) -> Dropout(0.5) -> Dense(32, relu).

    ``dim`` is forwarded as ``input_dim`` of the first Dense layer.
    """
    return Sequential([
        Dense(16, input_dim=dim, activation="relu"),
        Dropout(rate=0.5),
        Dense(32, activation="relu"),
    ])

## Define the model

In [None]:
#BUILD MODEL
class MyModel(Model):
    """Two-branch binary classifier over text ids and numeric profile features.

    Text branch: frozen embedding initialised from ``embedding_matrix`` ->
    bidirectional LSTM(64) -> dropout(0.5) -> Dense(16, relu).
    Numeric branch: the MLP returned by ``create_mlp``.
    Both branch outputs are concatenated on the last axis and passed through
    a single sigmoid unit.

    Args:
        num_numeric_features: width of the numeric input. Defaults to 5, the
            number of columns ``postProcessDataset`` emits (tweets,
            following, followers, likes, media). The previous hard-coded
            value of 10 did not match that input.
    """

    def __init__(self, num_numeric_features=5):
        super(MyModel, self).__init__()
        # The embedding table must have exactly as many rows as the
        # pre-trained matrix it is initialised from, so take the size from
        # the matrix itself.
        self.embedding1 = Embedding(embedding_matrix.shape[0], EMBEDDING_DIM, input_length=max_len,
                                    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                                    trainable=False)
        self.lstm1 = Bidirectional(LSTM(64))
        self.dropout = Dropout(rate=0.5)
        self.mlp = create_mlp(num_numeric_features)
        self.d1 = Dense(16, activation='relu')
        self.d3 = Dense(1, activation='sigmoid')

    def call(self, x, y, training=None):
        """Compute one probability per example.

        Args:
            x: padded SentencePiece id sequences.
            y: numeric feature vectors.
            training: optional flag forwarded to the dropout layers. Left at
                ``None`` Keras decides the mode itself, which is what the
                two-argument call did before this parameter existed.
                NOTE(review): the existing call sites do not pass it, so
                dropout presumably runs in inference mode during training —
                confirm and pass ``training=True`` from the train step if
                dropout is meant to be active.

        Returns:
            The output of the final sigmoid Dense(1) layer.
        """
        x = self.embedding1(x)
        x = self.lstm1(x)
        x = self.dropout(x, training=training)
        x = self.d1(x)

        y = self.mlp(y, training=training)

        z = tf.concat([x, y], -1)

        return self.d3(z)

## Define train and test step methods

In [None]:
@tf.function
def train_step(data1, data2, labels):
    """Apply one optimiser update for a batch and log the training metrics.

    Relies on the module-level ``model``, ``loss_object``, ``optimizer``,
    ``train_loss`` and ``train_accuracy``. ``data1`` and ``data2`` are passed
    to the model as its first and second input. The running loss and accuracy
    are written as 'loss' / 'accuracy' summaries at step
    ``optimizer.iterations``.
    """
    # Only the forward pass and the loss have to be recorded on the tape.
    with tf.GradientTape() as tape:
        predictions = model(data1, data2)
        loss = loss_object(labels, predictions)

    weights = model.trainable_variables
    grads = tape.gradient(loss, weights)
    optimizer.apply_gradients(grads_and_vars=list(zip(grads, weights)))

    # Update the running metrics before reporting them.
    train_loss(loss)
    train_accuracy(labels, predictions)

    step = optimizer.iterations
    tf.summary.scalar('loss', train_loss.result(), step=step)
    tf.summary.scalar('accuracy', train_accuracy.result(), step=step)
    
@tf.function
def test_step(data1, data2, labels, step_num):
    """Evaluate one batch without updating weights and log the metrics.

    Relies on the module-level ``model``, ``loss_object``, ``test_loss`` and
    ``test_accuracy``. The running loss and accuracy are written as 'loss' /
    'accuracy' summaries at step ``step_num``.
    """
    batch_predictions = model(data1, data2)
    batch_loss = loss_object(labels, batch_predictions)

    # Fold this batch into the running evaluation metrics.
    test_loss(batch_loss)
    test_accuracy(labels, batch_predictions)

    for tag, metric in (('loss', test_loss), ('accuracy', test_accuracy)):
        tf.summary.scalar(tag, metric.result(), step=step_num)
  

## Train model with k-fold where k=7

In [None]:
# 7-fold cross-validation. For every fold: train a fresh model, keep up to
# five checkpoints with the best validation accuracy, then evaluate each kept
# checkpoint on the fold's held-out test split and export predictions/metrics.
model_name = 'adaptive_adam_128_batch_100_epoch_equal_feature_weights'
kf = KFold(n_splits=7, random_state=None, shuffle=True)
k_count = 0
for train_index, test_index in kf.split(range(len(data.index))):
    # CREATE AN INSTANCE OF THE MODEL
    model = MyModel()

    # Setting learning rate with learning rate decay function
    # NOTE(review): global_step is never incremented in this cell, so the
    # schedule presumably stays at starter_learning_rate — confirm whether
    # the decay was meant to advance per step or per epoch.
    global_step = tf.Variable(0, trainable=False)
    starter_learning_rate = 0.001
    end_learning_rate = 0.0001
    decay_steps = 25
    learning_rate = tf.compat.v1.train.polynomial_decay(starter_learning_rate, global_step, decay_steps, end_learning_rate, power=0.5)

    loss_object = tf.keras.losses.BinaryCrossentropy()
    optimizer = tf.keras.optimizers.Adam(learning_rate)

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_accuracy = tf.keras.metrics.BinaryAccuracy(name='test_accuracy')
    # -------------------------------------------------------------------
    # Five checkpoint slots: best validation accuracies and the epochs that
    # produced them (0 marks an unused slot; epoch 0 is never saved).
    best_accuracies, best_accuracies_epoch = [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]
    k_count += 1
    train_data = data.iloc[list(train_index), :]
    test_data = data.iloc[list(test_index), :]

    models_path = '../models/model_{}/k_{}'.format(model_name, k_count)
    tf.io.gfile.makedirs(models_path)

    # Shuffle the fold's training rows, then hold out the last 20% for
    # validation (same split np.split produced, without relying on the
    # deprecated DataFrame support in np.split).
    shuffled_train = train_data.sample(frac=1)
    split_at = int(.8 * len(shuffled_train))
    train_data, validation_data = shuffled_train.iloc[:split_at], shuffled_train.iloc[split_at:]

    train_labels = tf.constant(train_data['label'].to_numpy().reshape(train_data['label'].shape[0], 1))
    test_labels = tf.constant(test_data['label'].to_numpy().reshape(test_data['label'].shape[0], 1))
    validation_labels = tf.constant(validation_data['label'].to_numpy().reshape(validation_data['label'].shape[0], 1))

    train_text_data_vec, train_number_data_vec = postProcessDataset(train_data)
    validation_text_data_vec, validation_number_data_vec = postProcessDataset(validation_data)
    test_text_data_vec, test_number_data_vec = postProcessDataset(test_data)

    train_ds = tf.data.Dataset.from_tensor_slices((train_text_data_vec, train_number_data_vec, train_labels)).shuffle(len(train_labels)).batch(128)
    validation_ds = tf.data.Dataset.from_tensor_slices((validation_text_data_vec, validation_number_data_vec, validation_labels)).shuffle(len(validation_labels)).batch(128)
    # Unshuffled, one sample per batch: the i-th batch is the i-th test row.
    test_ds = tf.data.Dataset.from_tensor_slices((test_text_data_vec, test_number_data_vec, test_labels)).batch(1)

    local_logdir = '../summaries/model_{}/k_{}'.format(model_name, k_count)
    train_summary_writer = tf.summary.create_file_writer('{}/train'.format(local_logdir))
    test_summary_writer = tf.summary.create_file_writer('{}/validation'.format(local_logdir))

    template = 'Epoch {}, Loss: {}, Accuracy: {}, validation Loss: {}, validation Accuracy: {}'
    EPOCHS = 101
    for epoch in range(EPOCHS):
        for text, number, labels in train_ds:
            with train_summary_writer.as_default():
                train_step(text, number, labels)
        for text, number, labels in validation_ds:
            with test_summary_writer.as_default():
                test_step(text, number, labels, optimizer.iterations)

        # Everything below belongs to the epoch: previously it sat outside
        # this loop, so metrics were never reset between epochs and only the
        # final epoch could ever be reported or checkpointed.
        print(template.format(epoch + 1,
                              train_loss.result(),
                              train_accuracy.result() * 100,
                              test_loss.result(),
                              test_accuracy.result() * 100))

        # Every 5th epoch, replace the weakest kept checkpoint when this
        # epoch's validation accuracy beats it.
        validation_accuracy = float(test_accuracy.result()) * 100
        if epoch != 0 and epoch % 5 == 0 and min(best_accuracies) < validation_accuracy:
            index = best_accuracies.index(min(best_accuracies))
            best_accuracies[index] = validation_accuracy
            if best_accuracies_epoch[index] != 0:
                tf.io.gfile.remove('{}/model_{}'.format(models_path, best_accuracies_epoch[index]))
            best_accuracies_epoch[index] = epoch
            model.save_weights('{}/model_{}'.format(models_path, epoch), save_format='h5')

        # Reset the metrics for the next epoch
        train_loss.reset_states()
        train_accuracy.reset_states()
        test_loss.reset_states()
        test_accuracy.reset_states()

    # Evaluate the checkpoints saved for this fold.
    # First create folder to save outputs.
    fold_evaluation_path = '../models/model_{}_evaluation/k_{}'.format(model_name, k_count)
    tf.io.gfile.makedirs(fold_evaluation_path)

    # Generate metric sheet for fold models
    out_columns = ['Probabilities', 'Prediction', 'Label', 'OrigLabel', 'Name', 'Username', 'Bio', 'Location', 'Url', 'Tweets', 'Following', 'Followers',
                   'Likes', 'Media']
    metric_columns = ['modelname', 'TP', 'FP', 'FN', 'TN', 'Precision', 'Recall', 'F1Score']
    # Raw-profile fields exported with each prediction; falsy values are
    # written as '' exactly as before.
    optional_fields = ('bio', 'location', 'url', 'tweets', 'following', 'followers', 'likes', 'media')
    test_positions = list(test_index)
    metric_rows = []

    for test_model_name in tf.io.gfile.listdir(models_path):
        model.load_weights('{}/{}'.format(models_path, test_model_name))
        model_preds, true_labels, output_rows = [], [], []

        # Pair every test sample with its row position in orig_data. The
        # sample counter used to be advanced outside this loop, so every
        # prediction was matched with (and overwrote) the first test row.
        for position, (text, number, labels) in zip(test_positions, test_ds):
            predictions = model(text, number)
            prediction = int(tf.math.round(predictions).numpy()[0, 0])
            label = int(labels.numpy()[0, 0])
            model_preds.append(prediction)
            true_labels.append(label)

            row = orig_data.iloc[position, :]
            # Report the probability of the predicted class.
            proba = predictions.numpy()[0, 0]
            proba = 1 - proba if proba < 0.5 else proba
            output_rows.append(
                [proba, prediction, label, row['label'], row['name'], row['username']]
                + [row[field] if row[field] else '' for field in optional_fields]
            )

        # Per-checkpoint metrics and export (previously executed only once
        # per fold, i.e. for the last checkpoint listed).
        model_outputs = pd.DataFrame(output_rows, columns=out_columns)
        confusion_numpy = tf.math.confusion_matrix(labels=true_labels, predictions=model_preds, num_classes=2).numpy()
        pres_recall = precision_recall_fscore_support(true_labels, model_preds, average='weighted')

        # Rows are true labels, columns are predictions, and class 1 is the
        # positive class: [0,0]=TN, [0,1]=FP, [1,0]=FN, [1,1]=TP.
        tn, fp, fn, tp = confusion_numpy[0, 0], confusion_numpy[0, 1], confusion_numpy[1, 0], confusion_numpy[1, 1]
        precision, recall, f1 = pres_recall[0], pres_recall[1], pres_recall[2]

        current_model = 'model_{}_k_{}_{}'.format(model_name, k_count, test_model_name)
        metric_rows.append([current_model, tp, fp, fn, tn, precision, recall, f1])
        print('model_{}_k_{}_{} done ...'.format(model_name, k_count, test_model_name))

        # Write locally, then copy next to the fold's other evaluation files;
        # overwrite so that re-running the cell does not fail on old output.
        preds_csv = '{}_fold{}_{}_preds.csv'.format(model_name, k_count, test_model_name)
        model_outputs.to_csv(preds_csv, encoding='utf-8')
        tf.io.gfile.copy(preds_csv, '{}/{}'.format(fold_evaluation_path, preds_csv), overwrite=True)

    # One metrics sheet per fold, one row per evaluated checkpoint.
    model_metrics = pd.DataFrame(metric_rows, columns=metric_columns)
    metrics_csv = '{}_fold{}_evalutation.csv'.format(model_name, k_count)
    model_metrics.to_csv(metrics_csv, encoding='utf-8')
    tf.io.gfile.copy(metrics_csv, '{}/{}'.format(fold_evaluation_path, metrics_csv), overwrite=True)

    
  