In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# data path
train_filename = 'mbti_1.csv'

'''or by the following'''
data = pd.read_csv(train_filename,  encoding='utf-8')
# fillna prefer not to be done when import data
#data = data.fillna('')
data

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
5,INTJ,'18/37 @.@|||Science is not perfect. No scien...
6,INFJ,"'No, I can't draw on my own nails (haha). Thos..."
7,INTJ,'I tend to build up a collection of things on ...
8,INFJ,"I'm not sure, that's a good question. The dist..."
9,INTP,'https://www.youtube.com/watch?v=w8-egj0y8Qs||...


In [3]:
import re
import nltk
import string
import igraph
import itertools
import unicodedata
from nltk.corpus import stopwords
from nltk import pos_tag

In [4]:
def clean_text(text, my_stopwords, punct, remove_stopwords=True, lower_case=True):
    #print(text)

    if lower_case:
        text = text.lower()
    text = ''.join(l for l in text if l not in punct)  # remove punctuation (preserving intra-word dashes)
    text = re.sub(' +', ' ', text)  # strip extra white space
    text = text.strip()  # strip leading and trailing white space
    # tokenize (split based on whitespace)
    tokens = text.split(' ')

    if remove_stopwords:
        # remove stopwords
        tokens = [token for token in tokens if token not in my_stopwords]

    return tokens


def pos_filter(tokens):
    # POS tag and retain only nouns and adjectives
    tagged_tokens = pos_tag(tokens)
    tokens_keep = []
    for item in tagged_tokens:
        if (
            item[1] == 'NN' or
            item[1] == 'NNS' or
            item[1] == 'NNP' or
            item[1] == 'NNPS' or
            item[1] == 'JJ' or
            item[1] == 'JJS' or
            item[1] == 'JJR' or
            item[1] == 'WP' or
            item[1] == 'WRB' or
            item[1] == 'WDT' or
            item[1] == 'PRP' or
            item[1] == 'CD' or
            item[1] == 'VBP' or # is are ..
            item[1] == 'VB' or
            item[1] == 'VBZ' or
            item[1] == 'VBD' or
            item[1] == 'VBN' or
            item[1] == 'RB' # verb
        ):
            # keep some kinds of tags
            tokens_keep.append(item[0])

    tokens = tokens_keep

    return tokens, tagged_tokens

def strip_accents_unicode(s):
    s = unicodedata.normalize('NFD', s)
    s = s.encode('ascii', 'ignore')
    s = s.decode("utf-8")
    return str(s)

def stem_words(tokens):
    # apply Porter's stemmer
    stemmer = nltk.stem.PorterStemmer()
    tokens_stemmed = list()
    for token in tokens:
        tokens_stemmed.append(stemmer.stem(token))
    tokens = list(map(lambda x : strip_accents_unicode(x), tokens_stemmed))

    return (tokens)


def terms_to_graph(terms, w):
    '''This function returns a directed, weighted igraph from a list of terms
    (the tokens from the pre-processed text) e.g., ['quick','brown','fox'].
    Edges are weighted based on term co-occurence
    within a sliding window of fixed size 'w'.
    '''
    print(terms)

    if w > len(terms):
        w = len(terms)

    from_to = {}

    # create initial complete graph (first w terms)
    terms_temp = terms[0:w]
    indexes = list(itertools.combinations(range(w), r=2))

    new_edges = []

    for my_tuple in indexes:
        #print(my_tuple, terms_temp)
        new_edges.append(tuple([terms_temp[i] for i in my_tuple]))

    for new_edge in new_edges:
        if new_edge in from_to:
            from_to[new_edge] += 1
        else:
            from_to[new_edge] = 1

    # then iterate over the remaining terms
    for i in range(w, len(terms)):
        considered_term = terms[i]  # term to consider
        terms_temp = terms[(i - w + 1):(i + 1)]  # all terms within sliding window

        # edges to try
        candidate_edges = []
        for p in range(w - 1):
            candidate_edges.append((terms_temp[p], considered_term))

        for try_edge in candidate_edges:
            if try_edge[1] != try_edge[0]:
                # if not self-edge

                # if edge has already been seen, update its weight
                if try_edge in from_to:
                    from_to[try_edge] += 1

                # if edge has never been seen, create it and assign it a unit weight
                else:
                    from_to[try_edge] = 1

    # create empty graph
    g = igraph.Graph(directed=True)

    # add vertices
    sorted_terms = sorted(set(terms))
    g.add_vertices(sorted_terms)
    g.vs["label"] = sorted_terms

    # add edges, direction is preserved since the graph is directed
    g.add_edges(from_to.keys())

    # set edge and vertex weights
    g.es['weight'] = from_to.values()  # based on co-occurence within sliding window
    #g.vs['weight'] = g.strength(weights=from_to.values())  # weighted degree

    return (g)

In [5]:
mystopwords = stopwords.words('english')
clt = clean_text(data.iloc[3, 1], my_stopwords=mystopwords, punct=string.punctuation)
tokens, tagged_tokens = pos_filter(tokens=clt)
token_stem = stem_words(tokens)
print(token_stem)

['dear', 'intp', 'enjoy', 'convers', 'day', 'esoter', 'gab', 'natur', 'univers', 'idea', 'rule', 'social', 'code', 'arbitrari', 'construct', 'createddear', 'entj', 'sub', 'long', 'time', 'see', 'sincer', 'alphanon', 'type', 'hurt', 'deep', 'existenti', 'way', 'want', 'part', 'ofprob', 'scale', 'depend', 'individu', 'prefer', 'everyth', 'humanitydraco', 'malfoy', 'also', 'id', 'say', 'he', 'either', '358', '368im', 'either', '358', '385', 'somewhat', 'arbitrari', 'distinct', 'make', 'believ', 'core', 'indic', 'primari', 'motiv', 'hand', 'action', 'therefor', 'aim', 'particularli', 'introvert', 'extravert', 'person', 'said', 'say', 'im', 'somewhat', 'unphas', 'social', 'interact', 'alon', 'id', 'say', 'crave', 'anyth', 'isdear', 'type', '9', 'infp', 'absolut', 'admir', 'your', 'great', 'girlfriend', 'wish', 'didnt', 'busi', 'schedul', 'around', 'one', 'often', 'keep2', 'still', 'mean', '150', 'peopl', 'ive', 'probabl', 'seen', '12', 'other', 'today', 'never', 'understood', 'fascin', 'vir

In [None]:
import string
import nltk

mypunct = string.punctuation

def words2graph(texts, stopwords=mystopwords, punct=mypunct, window=4):
    '''
    :param texts:
    :param stopwords:
    :param punct:
    :param window:
    :return:
    '''
    graphs = {}
    for key in texts.index:
        tokens = clean_text(texts.iloc[key, 1], my_stopwords=stopwords, punct=punct, remove_stopwords=True)
        tokens, tagged_tokens = pos_filter(tokens=tokens)
        token_stem = stem_words(tokens)

        gra = terms_to_graph(token_stem, w=window)
        graphs[key] = gra

    return graphs


def graph_model():
    pass



In [None]:
words2graph(data)

['httpwwwyoutubecomwatchvqsxhcwe3krwhttp41mediatumblrcomtumblrlfouy03pma1qa1rooo1500jpgenfp', 'intj', 'moment', 'httpswwwyoutubecomwatchviz7le1g4xm4', 'sportscent', 'top', 'ten', 'play', 'httpswwwyoutubecomwatchvucdfze1etec', 'experi', 'lifehttpwwwyoutubecomwatchvvxzeywwrdw8', 'httpwwwyoutubecomwatchvu8ejam5dp3', 'repeat', 'todaymay', 'perc', 'experi', 'immers', 'youth', 'last', 'thing', 'infj', 'friend', 'post', 'facebook', 'suicid', 'next', 'day', 'rest', 'peac', 'httpvimeocom22842206hello', 'enfj7', 'sorri', 'hear', 'distress', 'natur', 'relationship', 'perfect', 'time', 'moment', 'exist', 'tri', 'figur', 'hard', 'time', 'time', 'growth', 'as84389', '84390', 'httpwallpaperpassioncomupload23700friendshipboyandgirlwallpaperjpg', 'httpassetsdornobcomwpcontentuploads201004roundhomedesignjpg', 'welcom', 'stuffhttpplayeressencecomwpcontentuploads201308redredthepokemonmaster32560474450338jpg', 'game', 'set', 'matchprozac', 'wellbrutin', 'least', 'thirti', 'minut', 'leg', 'dont', 'mean', 'd