In [1]:
import os
import pickle
import networkx as nx

In [2]:
# Root folder of the pre-computed subgraphs: one sub-directory per first poem
# index, each holding one '<second poem index>.gpickle' file per poem pair.
SUBGRAPH_DIR = '../data/subgraphs_2/'
GRAPH_SUFFIX = '.gpickle'

# Collect every (directory, poem_ind) pair that has a subgraph on disk.
# Both elements are kept as strings, exactly as they appear in the file system.
path_map = list()
# os.listdir returns entries in arbitrary order; sorting makes the resulting
# list (and everything derived from it) reproducible between runs and machines.
for directory in sorted(os.listdir(SUBGRAPH_DIR)):
    directory_path = os.path.join(SUBGRAPH_DIR, directory)
    # Skip stray files (e.g. .DS_Store) that would make the inner listdir raise.
    if not os.path.isdir(directory_path):
        continue
    for file in sorted(os.listdir(directory_path)):
        # Only graph pickles describe a poem pair; ignore anything else.
        if not file.endswith(GRAPH_SUFFIX):
            continue
        poem_ind = file.split('.')[0]
        path_map.append((directory, poem_ind))

In [3]:
# Sanity check: display the first (directory, poem_ind) pair that was collected.
path_map[0]

('0', '1')

In [4]:
# Number of processed pairs between two progress lines.
PROGRESS_EVERY = 10
# Factor applied to every synonym-mediated contribution before it is added to the score.
SYNONYM_WEIGHT = 0.5
# 'vector_ind' value carried by synonym nodes (poem nodes carry their poem index instead).
SYNONYM_VECTOR_IND = -1


def pair_similarity(subgraph, base_p1_ind, base_p2_ind):
    """Compute the similarity score of one poem pair from its subgraph.

    Args:
        subgraph: networkx graph whose nodes carry a 'vector_ind' attribute
            (poem index, or SYNONYM_VECTOR_IND for synonym nodes) and a
            'tf-idf' attribute.
        base_p1_ind: integer index of the first poem.
        base_p2_ind: integer index of the second poem.

    Returns:
        The accumulated score: tf-idf sums for words shared directly by both
        poems, plus SYNONYM_WEIGHT times the synonym-mediated contributions.
    """
    # Build the attribute lookup from nodes(data=True) instead of the
    # Graph.node attribute, which was removed in networkx 2.4. The P1 node
    # list keeps the graph's own iteration order so sums are accumulated in
    # the same order as before.
    node_attrs = {}
    p1_nodes_data = []
    for node, data in subgraph.nodes(data=True):
        node_attrs[node] = data
        if data['vector_ind'] == base_p1_ind:
            p1_nodes_data.append((node, data))

    score = 0

    # Words shared verbatim by both poems (a P1 node adjacent to a P2 node):
    # add the tf-idf of the P1 node and of every adjacent P2 node.
    for p1_node, p1_data in p1_nodes_data:
        # This should only ever result in either 0 or 1 nodes
        p2_neighbors_tf_idf = [
            node_attrs[n]['tf-idf'] for n in subgraph.neighbors(p1_node)
            if node_attrs[n]['vector_ind'] == base_p2_ind
        ]

        if p2_neighbors_tf_idf:
            score += p1_data['tf-idf']
            for tf_idf in p2_neighbors_tf_idf:
                score += tf_idf

    # Words related through a synonym node.
    for p1_node, p1_data in p1_nodes_data:
        for syn_node in subgraph.neighbors(p1_node):
            if node_attrs[syn_node]['vector_ind'] != SYNONYM_VECTOR_IND:
                continue

            # All neighbours of the synonym that do not belong to P1.
            # NOTE(review): this filter is "not P1" rather than "is P2", so it
            # would also admit other synonym nodes if synonym nodes can be
            # adjacent to each other -- confirm against the graph builder.
            syn_p2_neighbors = [
                n for n in subgraph.neighbors(syn_node)
                if node_attrs[n]['vector_ind'] != base_p1_ind
            ]

            # Average of (P1 tf-idf + neighbour tf-idf) over those neighbours.
            p2_syn_scores = 0
            for p2_node in syn_p2_neighbors:
                p2_syn_scores += (p1_data['tf-idf'] + node_attrs[p2_node]['tf-idf']) / len(syn_p2_neighbors)

            score += (p2_syn_scores * SYNONYM_WEIGHT)

    return score


path_map_length = len(path_map)

# One (p1_ind, p2_ind, similarity_score) tuple per poem pair; indices stay strings.
similarity_scores = list()
for ind, (p1_ind, p2_ind) in enumerate(path_map):
    if ind % PROGRESS_EVERY == 0:
        print(f"{ind} -- {round((ind / path_map_length) * 100, 2)}%")

    # nx.read_gpickle was removed in networkx 3.0; it was a thin wrapper around
    # pickle.load, so load the file directly. Only unpickle files produced by
    # this project: pickle can execute arbitrary code on load.
    with open(f"../data/subgraphs_2/{p1_ind}/{p2_ind}.gpickle", 'rb') as graph_file:
        subgraph = pickle.load(graph_file)

    similarity_score = pair_similarity(subgraph, int(p1_ind), int(p2_ind))

    # Tack on the scores for this combination
    similarity_scores.append((p1_ind, p2_ind, similarity_score))

print('Done')

0 -- 0.0%
10 -- 0.03%
20 -- 0.07%
30 -- 0.1%
40 -- 0.13%
50 -- 0.17%
60 -- 0.2%
70 -- 0.23%
80 -- 0.27%
90 -- 0.3%
100 -- 0.33%
110 -- 0.37%
120 -- 0.4%
130 -- 0.43%
140 -- 0.46%
150 -- 0.5%
160 -- 0.53%
170 -- 0.56%
180 -- 0.6%
190 -- 0.63%
200 -- 0.66%
210 -- 0.7%
220 -- 0.73%
230 -- 0.76%
240 -- 0.8%
250 -- 0.83%
260 -- 0.86%
270 -- 0.9%
280 -- 0.93%
290 -- 0.96%
300 -- 1.0%
310 -- 1.03%
320 -- 1.06%
330 -- 1.1%
340 -- 1.13%
350 -- 1.16%
360 -- 1.19%
370 -- 1.23%
380 -- 1.26%
390 -- 1.29%
400 -- 1.33%
410 -- 1.36%
420 -- 1.39%
430 -- 1.43%
440 -- 1.46%
450 -- 1.49%
460 -- 1.53%
470 -- 1.56%
480 -- 1.59%
490 -- 1.63%
500 -- 1.66%
510 -- 1.69%
520 -- 1.73%
530 -- 1.76%
540 -- 1.79%
550 -- 1.83%
560 -- 1.86%
570 -- 1.89%
580 -- 1.92%
590 -- 1.96%
600 -- 1.99%
610 -- 2.02%
620 -- 2.06%
630 -- 2.09%
640 -- 2.12%
650 -- 2.16%
660 -- 2.19%
670 -- 2.22%
680 -- 2.26%
690 -- 2.29%
700 -- 2.32%
710 -- 2.36%
720 -- 2.39%
730 -- 2.42%
740 -- 2.46%
750 -- 2.49%
760 -- 2.52%
770 -- 2.56%
780 -- 2.

5850 -- 19.41%
5860 -- 19.45%
5870 -- 19.48%
5880 -- 19.51%
5890 -- 19.54%
5900 -- 19.58%
5910 -- 19.61%
5920 -- 19.64%
5930 -- 19.68%
5940 -- 19.71%
5950 -- 19.74%
5960 -- 19.78%
5970 -- 19.81%
5980 -- 19.84%
5990 -- 19.88%
6000 -- 19.91%
6010 -- 19.94%
6020 -- 19.98%
6030 -- 20.01%
6040 -- 20.04%
6050 -- 20.08%
6060 -- 20.11%
6070 -- 20.14%
6080 -- 20.18%
6090 -- 20.21%
6100 -- 20.24%
6110 -- 20.27%
6120 -- 20.31%
6130 -- 20.34%
6140 -- 20.37%
6150 -- 20.41%
6160 -- 20.44%
6170 -- 20.47%
6180 -- 20.51%
6190 -- 20.54%
6200 -- 20.57%
6210 -- 20.61%
6220 -- 20.64%
6230 -- 20.67%
6240 -- 20.71%
6250 -- 20.74%
6260 -- 20.77%
6270 -- 20.81%
6280 -- 20.84%
6290 -- 20.87%
6300 -- 20.91%
6310 -- 20.94%
6320 -- 20.97%
6330 -- 21.0%
6340 -- 21.04%
6350 -- 21.07%
6360 -- 21.1%
6370 -- 21.14%
6380 -- 21.17%
6390 -- 21.2%
6400 -- 21.24%
6410 -- 21.27%
6420 -- 21.3%
6430 -- 21.34%
6440 -- 21.37%
6450 -- 21.4%
6460 -- 21.44%
6470 -- 21.47%
6480 -- 21.5%
6490 -- 21.54%
6500 -- 21.57%
6510 -- 21.6%
65

11360 -- 37.7%
11370 -- 37.73%
11380 -- 37.76%
11390 -- 37.8%
11400 -- 37.83%
11410 -- 37.86%
11420 -- 37.89%
11430 -- 37.93%
11440 -- 37.96%
11450 -- 37.99%
11460 -- 38.03%
11470 -- 38.06%
11480 -- 38.09%
11490 -- 38.13%
11500 -- 38.16%
11510 -- 38.19%
11520 -- 38.23%
11530 -- 38.26%
11540 -- 38.29%
11550 -- 38.33%
11560 -- 38.36%
11570 -- 38.39%
11580 -- 38.43%
11590 -- 38.46%
11600 -- 38.49%
11610 -- 38.53%
11620 -- 38.56%
11630 -- 38.59%
11640 -- 38.62%
11650 -- 38.66%
11660 -- 38.69%
11670 -- 38.72%
11680 -- 38.76%
11690 -- 38.79%
11700 -- 38.82%
11710 -- 38.86%
11720 -- 38.89%
11730 -- 38.92%
11740 -- 38.96%
11750 -- 38.99%
11760 -- 39.02%
11770 -- 39.06%
11780 -- 39.09%
11790 -- 39.12%
11800 -- 39.16%
11810 -- 39.19%
11820 -- 39.22%
11830 -- 39.26%
11840 -- 39.29%
11850 -- 39.32%
11860 -- 39.35%
11870 -- 39.39%
11880 -- 39.42%
11890 -- 39.45%
11900 -- 39.49%
11910 -- 39.52%
11920 -- 39.55%
11930 -- 39.59%
11940 -- 39.62%
11950 -- 39.65%
11960 -- 39.69%
11970 -- 39.72%
11980 -- 3

16510 -- 54.78%
16520 -- 54.82%
16530 -- 54.85%
16540 -- 54.88%
16550 -- 54.92%
16560 -- 54.95%
16570 -- 54.98%
16580 -- 55.02%
16590 -- 55.05%
16600 -- 55.08%
16610 -- 55.12%
16620 -- 55.15%
16630 -- 55.18%
16640 -- 55.22%
16650 -- 55.25%
16660 -- 55.28%
16670 -- 55.32%
16680 -- 55.35%
16690 -- 55.38%
16700 -- 55.42%
16710 -- 55.45%
16720 -- 55.48%
16730 -- 55.51%
16740 -- 55.55%
16750 -- 55.58%
16760 -- 55.61%
16770 -- 55.65%
16780 -- 55.68%
16790 -- 55.71%
16800 -- 55.75%
16810 -- 55.78%
16820 -- 55.81%
16830 -- 55.85%
16840 -- 55.88%
16850 -- 55.91%
16860 -- 55.95%
16870 -- 55.98%
16880 -- 56.01%
16890 -- 56.05%
16900 -- 56.08%
16910 -- 56.11%
16920 -- 56.15%
16930 -- 56.18%
16940 -- 56.21%
16950 -- 56.25%
16960 -- 56.28%
16970 -- 56.31%
16980 -- 56.34%
16990 -- 56.38%
17000 -- 56.41%
17010 -- 56.44%
17020 -- 56.48%
17030 -- 56.51%
17040 -- 56.54%
17050 -- 56.58%
17060 -- 56.61%
17070 -- 56.64%
17080 -- 56.68%
17090 -- 56.71%
17100 -- 56.74%
17110 -- 56.78%
17120 -- 56.81%
17130 --

21710 -- 72.04%
21720 -- 72.07%
21730 -- 72.11%
21740 -- 72.14%
21750 -- 72.17%
21760 -- 72.21%
21770 -- 72.24%
21780 -- 72.27%
21790 -- 72.31%
21800 -- 72.34%
21810 -- 72.37%
21820 -- 72.41%
21830 -- 72.44%
21840 -- 72.47%
21850 -- 72.5%
21860 -- 72.54%
21870 -- 72.57%
21880 -- 72.6%
21890 -- 72.64%
21900 -- 72.67%
21910 -- 72.7%
21920 -- 72.74%
21930 -- 72.77%
21940 -- 72.8%
21950 -- 72.84%
21960 -- 72.87%
21970 -- 72.9%
21980 -- 72.94%
21990 -- 72.97%
22000 -- 73.0%
22010 -- 73.04%
22020 -- 73.07%
22030 -- 73.1%
22040 -- 73.14%
22050 -- 73.17%
22060 -- 73.2%
22070 -- 73.23%
22080 -- 73.27%
22090 -- 73.3%
22100 -- 73.33%
22110 -- 73.37%
22120 -- 73.4%
22130 -- 73.43%
22140 -- 73.47%
22150 -- 73.5%
22160 -- 73.53%
22170 -- 73.57%
22180 -- 73.6%
22190 -- 73.63%
22200 -- 73.67%
22210 -- 73.7%
22220 -- 73.73%
22230 -- 73.77%
22240 -- 73.8%
22250 -- 73.83%
22260 -- 73.87%
22270 -- 73.9%
22280 -- 73.93%
22290 -- 73.96%
22300 -- 74.0%
22310 -- 74.03%
22320 -- 74.06%
22330 -- 74.1%
22340 -- 

26930 -- 89.36%
26940 -- 89.39%
26950 -- 89.43%
26960 -- 89.46%
26970 -- 89.49%
26980 -- 89.53%
26990 -- 89.56%
27000 -- 89.59%
27010 -- 89.63%
27020 -- 89.66%
27030 -- 89.69%
27040 -- 89.73%
27050 -- 89.76%
27060 -- 89.79%
27070 -- 89.83%
27080 -- 89.86%
27090 -- 89.89%
27100 -- 89.93%
27110 -- 89.96%
27120 -- 89.99%
27130 -- 90.03%
27140 -- 90.06%
27150 -- 90.09%
27160 -- 90.12%
27170 -- 90.16%
27180 -- 90.19%
27190 -- 90.22%
27200 -- 90.26%
27210 -- 90.29%
27220 -- 90.32%
27230 -- 90.36%
27240 -- 90.39%
27250 -- 90.42%
27260 -- 90.46%
27270 -- 90.49%
27280 -- 90.52%
27290 -- 90.56%
27300 -- 90.59%
27310 -- 90.62%
27320 -- 90.66%
27330 -- 90.69%
27340 -- 90.72%
27350 -- 90.76%
27360 -- 90.79%
27370 -- 90.82%
27380 -- 90.85%
27390 -- 90.89%
27400 -- 90.92%
27410 -- 90.95%
27420 -- 90.99%
27430 -- 91.02%
27440 -- 91.05%
27450 -- 91.09%
27460 -- 91.12%
27470 -- 91.15%
27480 -- 91.19%
27490 -- 91.22%
27500 -- 91.25%
27510 -- 91.29%
27520 -- 91.32%
27530 -- 91.35%
27540 -- 91.39%
27550 --

In [5]:
# Echo the accumulated (p1_ind, p2_ind, similarity_score) tuples as the cell output.
# NOTE(review): this displays the entire list; a slice such as
# similarity_scores[:10] would keep the rendered output short.
similarity_scores

[('0', '1', 1.8784560524749474),
 ('0', '10', 0.6638867906477884),
 ('0', '100', 1.349734159845888),
 ('0', '101', 1.747209533431097),
 ('0', '102', 1.0012761580707712),
 ('0', '103', 0.18032989882318695),
 ('0', '104', 1.2843865797038805),
 ('0', '105', 0.9568718410823089),
 ('0', '106', 2.594673401653101),
 ('0', '107', 1.1372619716591452),
 ('0', '108', 1.4977513854749582),
 ('0', '109', 0.26011469351663974),
 ('0', '11', 1.1560477572919825),
 ('0', '110', 0.5937790868178779),
 ('0', '111', 1.6346257554673513),
 ('0', '112', 0.8981956332241324),
 ('0', '113', 0.32495812726646306),
 ('0', '114', 0.926097052451069),
 ('0', '115', 1.407473183387032),
 ('0', '116', 0.9268707517140184),
 ('0', '117', 0.49902552342466233),
 ('0', '118', 0.7438462289975992),
 ('0', '119', 0.9377957684148839),
 ('0', '12', 1.615276409848484),
 ('0', '120', 0.8107906901149329),
 ('0', '121', 0.3049806567824746),
 ('0', '122', 1.6883024110963547),
 ('0', '123', 1.4741498233238337),
 ('0', '124', 1.42682314636

In [6]:
# Persist the list of (p1_ind, p2_ind, similarity_score) tuples.
# The with-block guarantees the file is flushed and closed even if dump raises.
with open('../data/final_similarity_scores.p', 'wb') as scores_file:
    pickle.dump(similarity_scores, scores_file)