<a href="https://colab.research.google.com/github/thiagomotax/nlp_dl_user_profiles/blob/main/deep_learning_mypersonality_tcc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab line magic requesting the TensorFlow 1.x runtime; run it before anything imports TensorFlow/Keras.
%tensorflow_version 1.x

In [None]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
# Display options: never truncate rows or cell text so whole status updates stay readable.
pd.set_option('display.max_rows', None)
# None is the supported "no limit" value; the old -1 sentinel has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)


# Load the myPersonality corpus (the CSV is Latin-1 encoded).
df = pd.read_csv('/content/drive/My Drive/tcc/datasets/mypersonality/mypersonality_final.csv', encoding="ISO-8859-1")
# Drop the author id, the continuous trait scores, the date and the network features;
# later cells only use STATUS and the five c* flags.
df = df.drop(['#AUTHID',  'sEXT', 'sNEU', 'sAGR', 'sCON', 'sOPN', 'DATE', 'NETWORKSIZE', 'BETWEENNESS', 'NBETWEENNESS','DENSITY','BROKERAGE','NBROKERAGE','TRANSITIVITY'], axis = 1)
# Encode the 'n'/'y' flags as 0/1. The result is assigned back to the frame instead of
# calling replace(inplace=True) on an attribute-accessed column, which operates on an
# intermediate Series and is not guaranteed to update `df` on newer pandas versions.
for trait in ['cNEU', 'cEXT', 'cAGR', 'cCON', 'cOPN']:
    df[trait] = df[trait].replace(to_replace=['n', 'y'], value=[0, 1])

In [None]:
# Preview the first rows to check the remaining columns and the 0/1 encoding.
df.head()

In [None]:
#pre-processing

import nltk
import string
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words plus corpus-specific noise tokens.
stop = stopwords.words('english')
# Every entry needs its own comma: adjacent string literals are concatenated by Python,
# so 'propnames' '1' used to produce the single (useless) token 'propnames1'.
newStopWords = ['propname', 'im', 'propnames', '1', '2', '3', '4', '5', '6', '7', '8', '9']
stop.extend(newStopWords)
# Set lookup is O(1); the list above is kept because it is a notebook-level name.
stop_lookup = frozenset(stop)

# re.escape keeps ']', '\', '^' and '-' literal inside the character class, so every
# punctuation character (including the backslash) is actually removed.
punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))


def _remove_stopwords(text):
    """Return `text` without stop words; non-string values (e.g. NaN) pass through unchanged."""
    if not isinstance(text, str):
        return text
    return ' '.join(token for token in text.split() if token not in stop_lookup)


# regex=True is spelled out because pandas >= 2.0 treats str.replace patterns as
# literal text by default, which would silently disable these three substitutions.
df['STATUS'] = (
    df['STATUS']
    .str.lower()                                        # lowercase
    .str.replace(punctuation_pattern, '', regex=True)   # punctuation
    .str.replace(r'\d+', '', regex=True)                # numbers
    .str.replace(' +', ' ', regex=True)                 # collapse runs of spaces
    .str.strip()                                        # trim both ends
    .apply(_remove_stopwords)                           # stop words
)

# Statuses that ended up empty become NaN, then every row containing a NaN is dropped.
df = df.replace("", np.nan).dropna(how='any')

In [None]:
#prepare EDA input to process on a local computer (due to incompatibilities of the library with google colab)
label_cols = ["cEXT","cNEU","cAGR","cCON","cOPN"]

# Concatenate the five flags into one string per row (e.g. "01101"), in label_cols order.
one_hot_labels = df[label_cols[0]].map(str)
for trait in label_cols[1:]:
    one_hot_labels = one_hot_labels + df[trait].map(str)

# Build the export frame without touching `df`, so this cell can be re-run safely
# (the previous version dropped the label columns and failed on a second run).
frame = pd.DataFrame({'one_hot_labels': one_hot_labels, 'STATUS': df['STATUS']})
# mode='w' overwrites the file; appending duplicated every row on each re-run.
frame.to_csv('input_mypersonalityEDA.txt', header=False, index=False, sep='\t', mode='w')

In [None]:
#receives input processed by EDA (output)
# sep="\f" is used so that each whole line lands in the single 'content' column
# (presumably form feeds never occur in the file - verify if the input changes).
new_df = pd.read_csv('/content/output_mypersonalityEDA.txt', names=['content'], sep="\f", header=None)

trait_names = ["cEXT","cNEU","cAGR","cCON","cOPN"]
tab_run = re.compile(r'\t+')

rows_list = []
for content in new_df['content']:
    # Field 0 is the 5-character label string, field 1 the status text.
    props = tab_run.split(content)
    labels, status_text = props[0], props[1]
    # Store the flags as integers: as characters ('0'/'1') any later .sum() would
    # concatenate strings instead of counting positives.
    record = {trait: int(labels[position]) for position, trait in enumerate(trait_names)}
    record['STATUS'] = status_text
    rows_list.append(record)

df = pd.DataFrame(rows_list, columns=trait_names + ["STATUS"])

In [None]:
# Make sure every status is a Python string before tokenization.
df['STATUS'] = df['STATUS'].astype(str)

In [None]:
# Show a sample only: display.max_rows is None, so the bare column would print every row.
df.STATUS.head(10)

In [None]:
# Column dtypes and non-null counts of the frame rebuilt from the EDA output.
df.info()

In [None]:
#split training and test sets
from sklearn.model_selection import train_test_split
list_classes = ["cEXT","cNEU","cAGR","cCON","cOPN"]
# NOTE(review): the split happens after EDA augmentation, so augmented variants of one
# status can presumably land on both sides of the split - confirm this is intended.
train, test = train_test_split(df, random_state=42, test_size=0.15, shuffle=True)

list_sentences_train = train.STATUS
list_sentences_test = test.STATUS

# The labels parsed from the EDA file may be strings ('0'/'1'); cast explicitly so the
# targets are a numeric array rather than an object array.
y_train = train[list_classes].values.astype('float32')
y_test = test[list_classes].values.astype('float32')

In [None]:
#tokenize data
max_features = 5000

# num_words caps the ids emitted by texts_to_sequences to the most frequent words.
# filters='' switches off the Tokenizer's own character filtering (punctuation was
# already stripped during pre-processing).
tokenizer = Tokenizer(
    num_words=max_features,
    lower=True,
    filters='',
)

# The vocabulary is learned from the training sentences only.
tokenizer.fit_on_texts(list(list_sentences_train))

list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentences_train, list_sentences_test)
)

In [None]:
# pad data: every sequence is padded/truncated to exactly `maxlen` token ids
maxlen = 20
X_train, X_test = (
    pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (list_tokenized_train, list_tokenized_test)
)


In [None]:
import numpy as np
import gc

In [None]:
#function to load pretrained GloVe, Word2vec or FastText vectors and build the embedding matrix
def loadEmbeddingMatrix(typeToLoad, embedding_file=None, word_index=None):
    """Build an embedding matrix aligned with the tokenizer vocabulary.

    Args:
        typeToLoad: "glove", "word2vec" or "fasttext".
        embedding_file: optional path overriding the default file for that type.
        word_index: optional {word: index} mapping (1-based indices, 0 reserved for
            padding). Defaults to the notebook-level `tokenizer.word_index`.

    Returns:
        numpy array of shape (len(word_index), embed_size). Row i holds the pretrained
        vector of the word whose index is i; rows without a pretrained vector (including
        row 0, the padding id) keep random values drawn from a normal distribution with
        the mean/std of the pretrained weights.

    Raises:
        ValueError: if `typeToLoad` is unknown or no vector could be loaded.
    """
    default_sources = {
        "glove": ('/content/drive/MyDrive/tcc/glove.6B.200d.txt', 200),
        "word2vec": ("../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin", 300),
        "fasttext": ('../input/fasttext/wiki.simple.vec', 300),
    }
    # Fail loudly on a typo instead of falling into the word2vec branch with undefined names.
    if typeToLoad not in default_sources:
        raise ValueError("unknown embedding type %r; expected one of %s"
                         % (typeToLoad, sorted(default_sources)))
    EMBEDDING_FILE, embed_size = default_sources[typeToLoad]
    if embedding_file is not None:
        EMBEDDING_FILE = embedding_file
    if word_index is None:
        word_index = tokenizer.word_index

    embeddings_index = dict()
    if typeToLoad == "word2vec":
        # NOTE(review): `word2vec` is not imported anywhere in the visible notebook; this
        # branch presumably needs `from gensim.models import word2vec` in the import cell.
        # `.wv.vocab` / `.word_vec` are also older gensim APIs - confirm the installed version.
        word2vecDict = word2vec.KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)
        for word in word2vecDict.wv.vocab:
            embeddings_index[word] = word2vecDict.word_vec(word)
    else:
        # Text format: one "<word> <v1> ... <vN>" entry per line.
        with open(EMBEDDING_FILE, encoding='utf-8') as f:
            for line in f:
                values = line.split()
                # Skip anything that is not word + embed_size numbers (blank lines, the
                # "<count> <dim>" header of .vec files); ragged rows would break np.stack.
                if len(values) != embed_size + 1:
                    continue
                embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Loaded %s word vectors.' % len(embeddings_index))
    if not embeddings_index:
        raise ValueError("no %d-dimensional vectors found in %r" % (embed_size, EMBEDDING_FILE))

    gc.collect()
    # Mean and standard deviation of the pretrained weights, so that randomly generated
    # rows follow the same statistics.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    del all_embs

    nb_words = len(word_index)
    # Matrix size is Number of Words in Vocab X Embedding Size.
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    gc.collect()

    # Copy the pretrained vector of every vocabulary word that has one.
    embeddedCount = 0
    for word, i in word_index.items():
        # The Embedding layer looks up row == token id, so the vector is stored at row i
        # itself (the previous `i -= 1` shifted every word onto its neighbour's id).
        # The largest index does not fit in an nb_words-row matrix and is skipped.
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            embeddedCount+=1
    print('total embedded:',embeddedCount,'common words')

    del(embeddings_index)
    gc.collect()

    #finally, return the embedding matrix
    return embedding_matrix

In [None]:
# Build the embedding matrix from the GloVe file configured in loadEmbeddingMatrix.
embedding_matrix = loadEmbeddingMatrix('glove')

In [None]:
# Sanity check: (vocabulary size, embedding dimension).
embedding_matrix.shape

(19597, 200)

In [None]:
# Input layer: one padded sequence of `maxlen` token ids per sample.
inp = Input(shape=(maxlen, )) # maxlen=20, as set in the padding cell

In [None]:
#embed layer with glove
# (a trainable Embedding(max_features, 128) learned from scratch was the earlier default)

vocab_rows = len(tokenizer.word_index)
vector_dim = embedding_matrix.shape[1]

# Frozen layer initialised with the pretrained matrix.
pretrained_embedding = Embedding(
    vocab_rows,
    vector_dim,
    weights=[embedding_matrix],
    trainable=False,
)
x = pretrained_embedding(inp)

In [None]:
#base hidden and output layers

import keras
import tensorflow as tf
from keras.callbacks import EarlyStopping

# `x` (the embedding output) is left untouched and the stack is built on a separate
# name, so this cell can be re-run without re-running the embedding cell first.
hidden = Bidirectional(LSTM(60, return_sequences=True,name='lstm_layer',dropout=0.1,recurrent_dropout=0.1))(x)

# Collapse the time axis: keep the maximum activation of every feature.
hidden = GlobalMaxPool1D()(hidden)

# Four identical Dropout(0.2) -> Dense(50, relu) stages, then a final dropout.
for _ in range(4):
    hidden = Dropout(0.2)(hidden)
    hidden = Dense(50, activation="relu")(hidden)
hidden = Dropout(0.2)(hidden)

# One sigmoid unit per trait (multi-label output).
outputs = Dense(5, activation="sigmoid")(hidden)

model = Model(inputs=inp, outputs=outputs)
# Explicit metric names keep the History keys stable ('precision', 'auc', 'recall').
# Without them a re-run of this cell yields auto-suffixed names such as 'auc_1',
# which breaks the metric plots further down.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy',
                       tf.keras.metrics.Precision(name='precision'),
                       tf.keras.metrics.AUC(name='auc'),
                       tf.keras.metrics.Recall(name='recall')])

In [None]:
#training and validation parameters
from keras.callbacks import ModelCheckpoint

# No checkpoint or early-stopping callback is attached: training always runs all epochs.
batch_size = 32  # number of training examples per gradient update
epochs = 500

fit_options = dict(
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.10,  # fraction of the training data held out for validation
)
history = model.fit(X_train, y_train, **fit_options)


In [None]:
# Inspect the training targets (one row per status, one column per trait).
y_train

In [None]:
#tokenizer vocabulary, highest index first (useful to define ANN parameters)
# word_index maps word -> rank index (not a count), so the first entry shows the
# vocabulary size. A dedicated name is used so the notebook-level `x` is not clobbered.
word_index_desc = sorted(tokenizer.word_index.items(), key=lambda item: item[1], reverse=True)
# Show a slice only; the full list has one entry per vocabulary word.
word_index_desc[:20]

In [None]:
#plot
# Number of positive labels per trait. astype(int) makes the sum a real count even
# when the flags were parsed as the strings '0'/'1' (summing strings concatenates them).
df_status = df.drop(['STATUS'], axis=1).astype(int)
categories = list(df_status.columns.values)
counts = [(category, df_status[category].sum()) for category in categories]
df_stats = pd.DataFrame(counts, columns=['category', 'number_of_comments'])

df_stats.plot(x='category', y='number_of_comments', kind='bar', legend=False, grid=True, figsize=(8, 5))
plt.title("Número de atualizações de status em cada fator")
plt.ylabel('# ocorrências', fontsize=12)
plt.xlabel('fator', fontsize=12)

In [None]:
#plot
import seaborn as sns

# Count how many traits are positive for each status. The trait columns are selected
# by name: the previous positional slice iloc[:, 2:] skipped cEXT/cNEU and pulled in
# the STATUS text column.
trait_columns = ["cEXT", "cNEU", "cAGR", "cCON", "cOPN"]
rowsums = df[trait_columns].astype(int).sum(axis=1)
x=rowsums.value_counts()
#plot
plt.figure(figsize=(8,5))
# Keyword arguments: newer seaborn releases no longer accept positional x/y.
ax = sns.barplot(x=x.index, y=x.values)
plt.title("Número de atualizações de status que possuem multiplos fatores")
plt.ylabel('# de ocorrências', fontsize=12)
plt.xlabel('# de fatores', fontsize=12)

In [None]:
from keras.utils.vis_utils import plot_model

# Write a diagram of the model (with shapes and layer names) to model_plot.png.
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
#plot: distribution of tokens per training sentence
totalNumWords = list(map(len, list_tokenized_train))
length_bins = np.arange(0,410,10)  # bin edges 0, 10, ..., 400

plt.hist(totalNumWords, bins=length_bins)
plt.title("Número de palavras por sentença")
plt.xlabel('# de palavras', fontsize=12)
plt.ylabel('# de sentenças', fontsize=12)
plt.show()

In [None]:
#wordcloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

# Built-in wordcloud stop words plus the corpus placeholder token. A dedicated name is
# used so nltk's `stopwords` module (imported in the pre-processing cell) is not shadowed.
cloud_stopwords = set(STOPWORDS)
cloud_stopwords.update(["propname"])

text = " ".join(review for review in df['STATUS'])

# The set has to be passed explicitly; otherwise WordCloud falls back to its default
# list and the extra "propname" entry has no effect.
wordcloud = WordCloud(background_color="black",
                      width=1600, height=800,
                      stopwords=cloud_stopwords).generate(text)

# Display the generated image:
# the matplotlib way:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

wordcloud.to_file("wordcloud.png")

In [None]:
#metric plots
import matplotlib.pyplot as plt


def plot_training_metric(history, metric):
    """Plot the per-epoch training and validation curves of one metric.

    Args:
        history: the History object returned by model.fit.
        metric: key in history.history (e.g. 'loss'); the validation series is read
            from 'val_' + metric.

    Raises:
        KeyError: if either series is missing from the history.
    """
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])

    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    # The second curve comes from validation_split of model.fit, not from the test set.
    plt.legend(['train','validation'], loc='upper left')
    plt.show()


for metric_name in ('accuracy', 'loss', 'auc', 'precision', 'recall'):
    plot_training_metric(history, metric_name)

In [None]:
# Save the trained model to model.h5 for later use.
from keras.models import load_model
model.save("model.h5")

In [None]:
# Reload the model from model.h5, replacing the in-memory `model`.
model = load_model('model.h5')

In [None]:
#plot
# Cast to int so the column sums are counts even when the flags are the strings '0'/'1'.
traits_labels = df[["cEXT", "cNEU", "cAGR", "cCON", "cOPN"]].astype(int)


# Default figure size (width, height in inches) for this and later figures.
fig_size = [10, 8]
plt.rcParams["figure.figsize"] = fig_size

# Number of positive labels per trait.
traits_labels.sum(axis=0).plot.bar()

In [None]:
# predictions = model.predict(np.expand_dims(X_test[500], 0))

# print(tokenizer.sequences_to_texts([X_test[500]]))
# print(y_test[500])
# print(predictions)