In [1]:
import logging

from collections import defaultdict
from itertools import combinations

from gensim import corpora
from gensim.corpora import Dictionary
from nltk.corpus import wordnet as wn

In [2]:
# Load the SPA vocabulary and the corpus that were serialized to ../data.
dictionary = Dictionary.load('../data/spa.dict')
corpus = corpora.MmCorpus('../data/spa.mm')

# Empty vocabulary that the following cells fill with synonym names.
syn_dict = Dictionary()

In [3]:
# syn_dict token id -> set of SPA terms that have a WordNet synset with that name
syn_map = defaultdict(set)

# word_count ends up as the number of SPA terms visited (0 if there are none).
word_count = 0
for word_count, term in enumerate(dictionary.values(), start=1):
    for synset in wn.synsets(term):
        # A synset name looks like "<lemma>.<pos>.<nn>"; keep only the lemma part.
        syn_name = synset.name().split('.')[0]

        # Register the synonym the first time it is seen so it gets an id.
        if syn_name not in syn_dict.token2id:
            syn_dict.add_documents([[syn_name]])

        syn_map[syn_dict.token2id[syn_name]].add(term)

## Triangles
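Every synonym ("syn") found through WordNet is associated with at least one SPA term, and sometimes with several. When several SPA terms share the same syn, the terms and the syn together form a triangle.
The cell below is a proof of concept (POC): it prints every syn that is shared by more than one SPA term.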

From these groupings, you can create new synonym nodes on the pre-existing graph and link each one to the SPA terms that share it.

In [6]:
# Report every synonym that is shared by more than one SPA term.
for syn_id, terms in syn_map.items():
    if len(terms) <= 1:
        continue

    syn_line = f"SYN --> {syn_dict[syn_id]}"
    spa_line = f"SPA --> {terms}"
    # The leading '\n' item yields the two blank lines that precede each entry.
    print('\n', syn_line, spa_line, sep='\n')



SYN --> buy
SPA --> {'buying', 'bought'}


SYN --> bribe
SPA --> {'corrupt', 'corrupted', 'bribe', 'bought', 'bribes', 'buying'}


SYN --> breast
SPA --> {'chest', 'bosoms', 'breast', 'bosom', 'chests', 'breasts'}


SYN --> summit
SPA --> {'breast', 'breasts'}


SYN --> front
SPA --> {'looked', 'faced', 'breast', 'looking', 'front', 'breasts', 'facing', 'face', 'looks', 'look', 'fronts', 'faces'}


SYN --> brow
SPA --> {'brows', 'brow'}


SYN --> eyebrow
SPA --> {'brows', 'brow'}


SYN --> hilltop
SPA --> {'brows', 'brow'}


SYN --> care
SPA --> {'cares', 'care', 'attention', 'caring', 'cared', 'charge'}


SYN --> caution
SPA --> {'cares', 'care'}


SYN --> concern
SPA --> {'fears', 'interest', 'cares', 'worried', 'fear', 'care', 'concerned'}


SYN --> wish
SPA --> {'wants', 'bade', 'cares', 'wished', 'want', 'care', 'wishing', 'caring', 'like', 'cared', 'wish', 'bidding'}


SYN --> manage
SPA --> {'deal', 'managed', 'cares', 'care', 'handling', 'caring', 'cared'}


SYN --> worry
SPA

SPA --> {'forced', 'pushing', 'forcing', 'crowds', 'force', 'push', 'thrust', 'crowding', 'forces'}


SYN --> thrust
SPA --> {'thrust', 'stuff'}


SYN --> lunge
SPA --> {'thrust', 'hurled'}


SYN --> pierce
SPA --> {'pierced', 'thrust', 'piercing', 'pierce'}


SYN --> throw
SPA --> {'switching', 'thrown', 'thrust', 'throw', 'threw', 'strokes'}


SYN --> time
SPA --> {'time', 'times'}


SYN --> clock_time
SPA --> {'time', 'times'}


SYN --> fourth_dimension
SPA --> {'time', 'times'}


SYN --> meter
SPA --> {'time', 'beats', 'times', 'beat'}


SYN --> prison_term
SPA --> {'sentence', 'time', 'times'}


SYN --> clock
SPA --> {'clocks', 'time', 'times', 'clock'}


SYN --> very
SPA --> {'real', 'really', 'very'}


SYN --> merit
SPA --> {'merit', 'virtue'}


SYN --> wife
SPA --> {'wives', 'wife'}


SYN --> word
SPA --> {'word', 'words'}


SYN --> news
SPA --> {'word', 'intelligence', 'words'}


SYN --> discussion
SPA --> {'word', 'words'}


SYN --> parole
SPA --> {'word', 'words'}


SYN --> 

SYN --> language
SPA --> {'language', 'speech', 'speeches'}


SYN --> speech
SPA --> {'language', 'speech', 'speeches'}


SYN --> lyric
SPA --> {'language', 'lyric', 'words'}


SYN --> terminology
SPA --> {'nomenclature', 'language'}


SYN --> leader
SPA --> {'leader', 'leaders'}


SYN --> drawing_card
SPA --> {'leaders', 'draws', 'attraction', 'leader', 'draw'}


SYN --> member
SPA --> {'members', 'member'}


SYN --> extremity
SPA --> {'members', 'member'}


SYN --> penis
SPA --> {'members', 'member'}


SYN --> mother
SPA --> {'mother', 'mothers'}


SYN --> service
SPA --> {'served', 'serve', 'service', 'serving'}


SYN --> avail
SPA --> {'help', 'helping', 'service', 'helped'}


SYN --> state_of_matter
SPA --> {'states', 'state'}


SYN --> department_of_state
SPA --> {'states', 'state'}


SYN --> submit
SPA --> {'states', 'subject', 'submit', 'state', 'bowed'}


SYN --> volition
SPA --> {'will', 'wills'}


SYN --> will
SPA --> {'will', 'wills'}


SYN --> bequeath
SPA --> {'leaves', '

SYN --> godforsaken
SPA --> {'wildest', 'wild', 'waste'}


SYN --> break
SPA --> {'break', 'stop', 'broke', 'worn', 'wore', 'separate', 'bursted', 'wear', 'burst', 'breaks', 'stopping', 'wearing', 'bust', 'stopped', 'broken', 'stops'}


SYN --> transgress
SPA --> {'broken', 'break', 'breach', 'broke', 'transgressed', 'breaks'}


SYN --> break_in
SPA --> {'broke', 'breaks', 'break', 'broken'}


SYN --> violate
SPA --> {'broke', 'breaks', 'break', 'broken'}


SYN --> unwrap
SPA --> {'break', 'expose', 'exposed', 'broke', 'breaks', 'discovered', 'broken'}


SYN --> dampen
SPA --> {'softened', 'broken', 'break', 'damps', 'broke', 'breaks'}


SYN --> demote
SPA --> {'broken', 'break', 'broke', 'bump', 'breaks'}


SYN --> break_dance
SPA --> {'broke', 'breaks', 'break', 'broken'}


SYN --> crack
SPA --> {'checked', 'pass', 'break', 'broke', 'check', 'cracks', 'crack', 'whirls', 'passes', 'breaks', 'offer', 'fling', 'broken', 'goes'}


SYN --> fracture
SPA --> {'break', 'broke', 'cracks', 'cr


SYN --> irons
SPA --> {'chains', 'irons'}


SYN --> chain
SPA --> {'chains', 'strings', 'chain'}


SYN --> escape
SPA --> {'escaped', 'flight', 'escape'}


SYN --> get_off
SPA --> {'escaped', 'escape'}


SYN --> elude
SPA --> {'escaped', 'escape'}


SYN --> asleep
SPA --> {'numb', 'gone', 'departed', 'asleep', 'deceased'}


SYN --> bygone
SPA --> {'departed', 'gone'}


SYN --> cold
SPA --> {'coldness', 'inhuman', 'cold'}


SYN --> meadowlark
SPA --> {'larks', 'lark'}


SYN --> pipit
SPA --> {'larks', 'lark'}


SYN --> lark
SPA --> {'larks', 'lark'}


SYN --> escapade
SPA --> {'larks', 'lark'}


SYN --> frolic
SPA --> {'sport', 'larks', 'lark'}


SYN --> marriage
SPA --> {'marriage', 'wedding', 'union', 'matrimony'}


SYN --> hideous
SPA --> {'hideous', 'outrageous'}


SYN --> leave_office
SPA --> {'quit', 'resign'}


SYN --> resign
SPA --> {'submit', 'resign'}


SYN --> believe
SPA --> {'trusted', 'believing', 'believe', 'believed'}


SYN --> careworn
SPA --> {'drawn', 'haggard', 'wor

SPA --> {'wildest', 'violent', 'wild'}


SYN --> baseless
SPA --> {'wildest', 'wild', 'idle'}


SYN --> raving_mad
SPA --> {'wildest', 'wild'}


SYN --> hazardous
SPA --> {'wildest', 'wild'}


SYN --> crazy
SPA --> {'wildest', 'wild'}


SYN --> barbarian
SPA --> {'wildest', 'wild'}


SYN --> angry
SPA --> {'wildest', 'furious', 'wild'}


SYN --> defeated
SPA --> {'defeated', 'thwarted'}


SYN --> discredit
SPA --> {'disgrace', 'discredited', 'disgraced'}


SYN --> discredited
SPA --> {'discredited', 'disgraced'}


SYN --> fasten
SPA --> {'fixed', 'fasten'}


SYN --> tighten
SPA --> {'tightened', 'fasten'}


SYN --> ignored
SPA --> {'neglected', 'ignored'}


SYN --> lecherousness
SPA --> {'lusts', 'lust'}


SYN --> lust
SPA --> {'lusts', 'lust'}


SYN --> insurgent
SPA --> {'rebel', 'rebels'}


SYN --> maverick
SPA --> {'rebel', 'rebels'}


SYN --> enjoy
SPA --> {'enjoying', 'savor', 'enjoy'}


SYN --> bally
SPA --> {'flaming', 'bloody'}


SYN --> broke
SPA --> {'bust', 'broke'}


SYN -

In [14]:
import networkx as nx

# Load the previously pickled graph from disk.
# NOTE(review): this unpickles the file, so only load graphs from a trusted source.
graph_path = "../logic/test.gpickle"
pickled_graph = nx.read_gpickle(graph_path)

In [15]:
# Tag every node currently in the graph as an original SPA term node.
for ind, node in pickled_graph.nodes(data=True):
    # `node` is the node's own attribute dict, so writing to it updates the
    # graph directly. This avoids `Graph.node`, which newer networkx releases
    # no longer provide.
    node['type'] = 'SPA'

In [20]:
# Spot check: display the attribute dict stored on node 1.
pickled_graph.node[1]

{'vector_ind': 0, 'term_id': 20, 'freq_per_doc': 1.0, 'type': 'SPA'}

In [21]:
# Resolve node 1's term id back to its text through the SPA dictionary.
node_term_id = pickled_graph.node[1]['term_id']
dictionary[node_term_id]

'hands'

In [None]:
# Link every SPA term node in the graph to one SYN node per WordNet synonym
# name, creating the SYN nodes on demand.

# term text -> ids of the SYN graph nodes that belong to that term
syns_per_term = defaultdict(set)

syn_dict = Dictionary()
# syn_dict token id -> id of the graph node created for that synonym
syn_to_node_map = dict()

# Next unused node id; keep updating this as we add in the synonyms.
# Using max + 1 rather than the node count avoids overwriting an existing node
# when ids do not start at 0.
# NOTE(review): assumes node ids are integers (node 1 is looked up by integer
# id earlier in this notebook) - confirm.
node_count = max(pickled_graph.nodes, default=-1) + 1

# Iterate over a snapshot: adding nodes while iterating the live node view
# raises "dictionary changed size during iteration".
for term_ind, node in list(pickled_graph.nodes(data=True)):
    # SYN nodes carry a syn_dict id in 'term_id', not an SPA dictionary id,
    # so they must not be treated as terms if this cell is run again.
    if node.get('type') == 'SYN':
        continue

    term = dictionary[node['term_id']]

    # Synonyms have not been resolved for this term yet.
    if term not in syns_per_term:
        for syn_obj in wn.synsets(term):
            # extracts the text value from the syn object
            syn = syn_obj.name().split('.')[0]

            if syn not in syn_dict.token2id:
                # add syn term to dictionary
                syn_dict.add_documents([[syn]])

                # add syn node to graph
                pickled_graph.add_node(
                    node_count,
                    type='SYN',
                    term_id=syn_dict.token2id[syn],
                    freq_per_doc=-1,
                    vector_ind=-1,
                )
                syn_to_node_map[syn_dict.token2id[syn]] = node_count

                # Keep track of values
                node_count += 1

            # A set, so several synsets with the same name yield one entry.
            syns_per_term[term].add(syn_to_node_map[syn_dict.token2id[syn]])

    # Connect this term node to each of its SYN nodes. For a term seen before,
    # the SYN nodes already exist and only the edges are added. The weight is
    # passed as a keyword so it is stored as the edge's 'weight' attribute.
    for syn_node_id in syns_per_term[term]:
        pickled_graph.add_edge(term_ind, syn_node_id, weight=0.5)