<a href="https://colab.research.google.com/github/tejaspradhan/Graph-Neural-Networks/blob/main/personality-analysis-project/Personality_Analysis_GNN_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
! pip install -U tf_geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import os
from tf_geometric.utils import tf_utils

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import numpy as np
import pandas as pd
from collections import Counter

import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
# # In case of any corpus are missing 
# download all-nltk
nltk.download('stopwords')
stop_words = stopwords.words("english")

from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow import keras
import tensorflow as tf
import tf_geometric as tfg
import pickle
import re

  formatvalue=lambda value: "")[1:-1]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
data = pd.read_csv('/content/mbti_cleaned.csv')


In [8]:
data.head()

Unnamed: 0.1,Unnamed: 0,type,Number of posts,Posts
0,0,INFJ,50,intj moments sportscenter plays pra...
1,1,ENTP,50,finding lack these posts very alarmingsex...
2,2,INTP,50,good course which know thats blessi...
3,3,INTJ,50,dear intp enjoyed conversation other es...
4,4,ENTJ,50,youre firedthats another silly misconception t...


In [9]:
posts = np.array(data['Posts'])
posts.astype(np.dtype('str'))
data['Posts']

0         intj moments    sportscenter    plays    pra...
1        finding  lack    these posts very alarmingsex...
2       good       course  which    know thats  blessi...
3       dear intp    enjoyed  conversation  other   es...
4       youre firedthats another silly misconception t...
                              ...                        
7582     just because  always think  cats   doms  some...
7583    soif this thread already exists someplace else...
7584     many questions when   these things   would ta...
7585      very conflicted right  when  comes  wanting ...
7586      been  long since  have been  personalitycafe...
Name: Posts, Length: 7587, dtype: object

In [10]:
data['type'].unique().shape[0] # 16 different personalities

16

In [11]:
data.isna().sum()

Unnamed: 0          0
type                0
Number of posts     0
Posts              42
dtype: int64

In [12]:
data.dropna(inplace=True)

## Approach 1 : 16 class classification

In [13]:
texts = data['Posts']

In [14]:
encoder = LabelEncoder()
labels = encoder.fit_transform(data['type'])

In [15]:
set(labels)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}

In [16]:
texts = texts.astype('string')

In [17]:
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.3)

In [18]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts[:254])
tokenizer.fit_on_texts(train_texts[256:])

In [19]:
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

In [20]:
class PMIModel(object):

    def __init__(self):
        self.word_counter = None
        self.pair_counter = None

    def get_pair_id(self, word0, word1):
        pair_id = tuple(sorted([word0, word1]))
        return pair_id

    def fit(self, sequences, window_size):

        self.word_counter = Counter()
        self.pair_counter = Counter()
        num_windows = 0
        for sequence in tqdm(sequences):
            for offset in range(len(sequence) - window_size):
                window = sequence[offset:offset + window_size]
                num_windows += 1
                for i, word0 in enumerate(window):
                    self.word_counter[word0] += 1
                    for j, word1 in enumerate(window[i + 1:]):
                        pair_id = self.get_pair_id(word0, word1)
                        self.pair_counter[pair_id] += 1

        for word, count in self.word_counter.items():
            self.word_counter[word] = count / num_windows
        for pair_id, count in self.pair_counter.items():
            self.pair_counter[pair_id] = count / num_windows

    def transform(self, word0, word1):
        prob_a = self.word_counter[word0]
        prob_b = self.word_counter[word1]
        pair_id = self.get_pair_id(word0, word1)
        prob_pair = self.pair_counter[pair_id]

        if prob_a == 0 or prob_b == 0 or prob_pair == 0:
            return 0

        pmi = np.log(prob_pair / (prob_a * prob_b))
        # print(word0, word1, pmi)
        pmi = np.maximum(pmi, 0.0)
        # print(pmi)
        return pmi

In [21]:
def build_word_graph(num_words, pmi_model, embedding_size):
    x = tf.Variable(tf.random.truncated_normal([num_words, embedding_size], stddev=1 / np.sqrt(embedding_size)),
                    dtype=tf.float32)
    edges = []
    edge_weight = []
    for (word0, word1) in pmi_model.pair_counter.keys():
        pmi = pmi_model.transform(word0, word1)
        if pmi > 0:
            edges.append([word0, word1])
            edge_weight.append(pmi)
            edges.append([word1, word0])
            edge_weight.append(pmi)
    edge_index = np.array(edges).T
    return tfg.Graph(x=x, edge_index=edge_index, edge_weight=edge_weight)

In [22]:
def build_combined_graph(word_graph, sequences, embedding_size):
    num_words = word_graph.num_nodes
    x = tf.zeros([len(sequences), embedding_size], dtype=tf.float32)
    edges = []
    edge_weight = []
    for i, sequence in enumerate(sequences):
        doc_node_index = num_words + i
        for word in sequence:
            edges.append([doc_node_index, word])  # only directed edge
            edge_weight.append(1.0)  # use BOW instaead of TF-IDF

    edge_index = np.array(edges).T
    x = tf.concat([word_graph.x, x], axis=0)
    edge_index = np.concatenate([word_graph.edge_index, edge_index], axis=1)
    edge_weight = np.concatenate([word_graph.edge_weight, edge_weight], axis=0)
    return tfg.Graph(x=x, edge_index=edge_index, edge_weight=edge_weight)

In [49]:
pmi_cache_path = "cached_pmi_model.p"
if os.path.exists(pmi_cache_path):
    with open(pmi_cache_path, "rb") as f:
        pmi_model = pickle.load(f)
else:
    pmi_model = PMIModel()
    pmi_model.fit(train_sequences, window_size=6)
    with open(pmi_cache_path, "wb") as f:
        pickle.dump(pmi_model, f)

embedding_size = 150
num_words = len(tokenizer.word_index) + 1
word_graph = build_word_graph(num_words, pmi_model, embedding_size)
train_combined_graph = build_combined_graph(word_graph, train_sequences, embedding_size)
test_combined_graph = build_combined_graph(word_graph, test_sequences, embedding_size)

print(word_graph)
print(train_combined_graph)
print(test_combined_graph)

num_classes = 16

Graph Shape: x => (165430, 150)	edge_index => (2, 8508496)	y => None
Graph Shape: x => (170711, 150)	edge_index => (2, 11226894)	y => None
Graph Shape: x => (167694, 150)	edge_index => (2, 9630760)	y => None


In [42]:
print(train_combined_graph)
print(test_combined_graph)

Graph Shape: x => (167694, 150)	edge_index => (2, 9630760)	y => None
Graph Shape: x => (167694, 150)	edge_index => (2, 9630760)	y => None


In [50]:
class GCNModel(tf.keras.Model):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.gcn0 = tfg.layers.GCN(32, activation=tf.nn.relu)
        self.gcn1 = tfg.layers.GCN(32,activation = tf.nn.relu)
        self.gcn2 = tfg.layers.GCN(64,activation = tf.nn.relu)
        self.gcn3 = tfg.layers.GCN(num_classes)
        self.dropout = keras.layers.Dropout(0.5)

    def call(self, inputs, training=None, mask=None, cache=None):
        x, edge_index, edge_weight = inputs
        h = self.gcn0([x, edge_index, edge_weight], cache=cache)
        h = self.gcn1([h, edge_index, edge_weight],cache=cache)
        h = self.dropout(h, training=training);
        h = self.gcn2([h, edge_index, edge_weight],cache=cache)
        h = self.gcn3([h, edge_index, edge_weight], cache=cache)
        return h

In [51]:
model = GCNModel()
model.gcn0.cache_normed_edge(train_combined_graph)
model.gcn0.cache_normed_edge(test_combined_graph)



In [52]:
@tf_utils.function
def forward(graph, training=False):
    logits = model([graph.x, graph.edge_index, graph.edge_weight], cache=graph.cache, training=training)
    logits = logits[num_words:]
    return logits


In [53]:
def compute_loss(logits, labels):
    losses = tf.nn.softmax_cross_entropy_with_logits(
        logits=logits,
        labels=tf.one_hot(labels, depth=num_classes)
    )
    # print("Transformed labels", tf.one_hot(labels, depth=num_classes)[0])
    mean_loss = tf.reduce_mean(losses)
    return mean_loss

In [54]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.05)

In [56]:
for step in range(1000):
    with tf.GradientTape() as tape:
        logits = forward(train_combined_graph, training=True)
        # print("logits" ,logits[0],"Shape",logits[0].shape)
        mean_loss = compute_loss(logits, train_labels)

    vars = tape.watched_variables()
    grads = tape.gradient(mean_loss, vars)
    optimizer.apply_gradients(zip(grads, vars))

    if step % 10 == 0:
        # train accuracytf.one_hot(labels, depth=num_classes)
        preds = tf.argmax(logits, axis=-1)
        corrects = tf.cast(tf.equal(preds, train_labels), tf.float32)
        train_accuracy = tf.reduce_mean(corrects)

        logits = forward(test_combined_graph)
        preds = tf.argmax(logits, axis=-1)
        corrects = tf.cast(tf.equal(preds, test_labels), tf.float32)
        accuracy = tf.reduce_mean(corrects)
        print("step = {}\tloss = {}\ttrain_accuracy = {}\ttest_accuracy = {}".format(step, mean_loss, train_accuracy,accuracy))

step = 0	loss = 1.7703341245651245	train_accuracy = 0.4145048260688782	test_accuracy = 0.20406360924243927
step = 10	loss = 1.7787028551101685	train_accuracy = 0.4054156541824341	test_accuracy = 0.1992049515247345
step = 20	loss = 1.7721985578536987	train_accuracy = 0.4139367640018463	test_accuracy = 0.208480566740036
step = 30	loss = 1.7809900045394897	train_accuracy = 0.41128572821617126	test_accuracy = 0.19964663684368134
step = 40	loss = 1.7856276035308838	train_accuracy = 0.4124218821525574	test_accuracy = 0.2031802088022232
step = 50	loss = 1.7928729057312012	train_accuracy = 0.40825602412223816	test_accuracy = 0.2005300372838974
step = 60	loss = 1.769852876663208	train_accuracy = 0.4199962019920349	test_accuracy = 0.20008833706378937
step = 70	loss = 1.8132067918777466	train_accuracy = 0.4093921482563019	test_accuracy = 0.2018551230430603
step = 80	loss = 1.7808493375778198	train_accuracy = 0.41620904207229614	test_accuracy = 0.19832155108451843
step = 90	loss = 1.78851509094238

## Approach 2 - 4 binary classifiers - one hot encoding