### Workflow & Task
* Calculate the centrality measures for each node.
* Add the centrality measures to the node list and convert it into a dataframe (assign to each node).
* Select an individual chapter, calculate all nodes connected to this node, and convert into a dataframe (make sure there are at least 2 connections).
* Combine the dataframes into a new dataframe that contains the connected nodes and their centrality measures.
* Sort by one of the centrality measures to create a basic recommendation.

In [3]:
from collections import Counter
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from networkx.algorithms.centrality import betweenness_centrality

In [5]:
# Load the tab-separated edge table; each row carries a `source` and a `target` node label.
# NOTE(review): the `Unnamed: 0` column in the output looks like a previously saved index —
# consider `index_col=0` if nothing else relies on that column.
df = pd.read_csv('edges.csv', delimiter='\t')
df

Unnamed: 0,user_id,number_of_ticks,date_created,chapter,book,numberofentriesofbook,numberofusersofbook,product_manning_id,user_owns_book,book_ch,source,target
0,-2146355313,8,2019-12-14 10:05:50,chapter-1,ramamurthy,715,202,1120,1,"('ramamurthy', 'chapter-1')","('fain3', 'chapter-8')","('ramamurthy', 'chapter-1')"
1,-2146355313,8,2019-12-14 10:05:50,chapter-1,thomas,445,136,816,1,"('thomas', 'chapter-1')","('ramamurthy', 'chapter-1')","('thomas', 'chapter-1')"
2,-2145886989,10,2019-12-10 13:41:25,chapter-4,mcnamara,2533,461,880,1,"('mcnamara', 'chapter-4')","('mcnamara', 'chapter-4')","('mcnamara', 'chapter-4')"
3,-2145886989,1,2019-12-10 19:37:20,chapter-1,mcnamara,2533,461,880,1,"('mcnamara', 'chapter-1')","('mcnamara', 'chapter-4')","('mcnamara', 'chapter-1')"
4,-2145886989,5,2019-12-10 19:37:50,chapter-1,mcnamara,2533,461,880,1,"('mcnamara', 'chapter-1')","('mcnamara', 'chapter-1')","('mcnamara', 'chapter-1')"
...,...,...,...,...,...,...,...,...,...,...,...,...
4353,-2019295583,10,2019-12-07 16:09:21,chapter-1,roestenburg,344,65,486,0,"('roestenburg', ' chapter-1')","('roestenburg', ' chapter-1')","('roestenburg', ' chapter-1')"
4354,-2019295583,102,2019-12-07 16:14:21,chapter-1,roestenburg,344,65,486,0,"('roestenburg', ' chapter-1')","('roestenburg', ' chapter-1')","('roestenburg', ' chapter-1')"
4355,-2019295583,107,2019-12-07 17:51:22,chapter-1,roestenburg,344,65,486,0,"('roestenburg', ' chapter-1')","('roestenburg', ' chapter-1')","('roestenburg', ' chapter-1')"
4356,-2019295583,98,2019-12-07 20:54:58,chapter-1,roestenburg,344,65,486,0,"('roestenburg', ' chapter-1')","('roestenburg', ' chapter-1')","('roestenburg', ' chapter-1')"


In [61]:
# Build the edge list as (source, target) pairs, one pair per row of `df`.
sources = df['source'].tolist()
targets = df['target'].tolist()
edges = list(zip(sources, targets))

# Attach the centrality score of each endpoint to its row.
# NOTE(review): `centrality` is not defined in any cell visible here (execution counts
# jump from 5 to 61); it must support lookup by node label. Make sure the cell that
# computes it runs before this one on a fresh kernel.
df['source_centrality'] = [centrality[node] for node in sources]
df['target_centrality'] = [centrality[node] for node in targets]
df.head()

Unnamed: 0,user_id,number_of_ticks,date_created,chapter,book,numberofentriesofbook,numberofusersofbook,product_manning_id,user_owns_book,book_ch,source,target,source_centrality,target_centrality
0,-2146355313,8,2019-12-14 10:05:50,chapter-1,ramamurthy,715,202,1120,1,"('ramamurthy', 'chapter-1')","('fain3', 'chapter-8')","('ramamurthy', 'chapter-1')",0.0,5.091126e-07
1,-2146355313,8,2019-12-14 10:05:50,chapter-1,thomas,445,136,816,1,"('thomas', 'chapter-1')","('ramamurthy', 'chapter-1')","('thomas', 'chapter-1')",5.091126e-07,0.0
2,-2145886989,10,2019-12-10 13:41:25,chapter-4,mcnamara,2533,461,880,1,"('mcnamara', 'chapter-4')","('mcnamara', 'chapter-4')","('mcnamara', 'chapter-4')",0.001095101,0.001095101
3,-2145886989,1,2019-12-10 19:37:20,chapter-1,mcnamara,2533,461,880,1,"('mcnamara', 'chapter-1')","('mcnamara', 'chapter-4')","('mcnamara', 'chapter-1')",0.001095101,0.001540575
4,-2145886989,5,2019-12-10 19:37:50,chapter-1,mcnamara,2533,461,880,1,"('mcnamara', 'chapter-1')","('mcnamara', 'chapter-1')","('mcnamara', 'chapter-1')",0.001540575,0.001540575


# Find the top 10 chapters/books people start with

In [65]:
# Count, for every user, the `source` node of the first row in which that user appears.
# NOTE(review): "first" means first in file order; this presumes each user's rows are
# stored chronologically — confirm against `date_created`.
first_rows = df.drop_duplicates(subset='user_id', keep='first')

# Every user id that contributed a starting node (kept so the name stays available).
seen = set(first_rows['user_id'])

# One vote per user for the node they started from; insertion order follows row order,
# so ties in `most_common` are broken by first appearance in the file.
cnt = Counter(first_rows['source'])
cnt.most_common(10)

[("('windmill', ' chapter-2')", 6),
 ("('lauret', ' about-this-book')", 4),
 ("('richardson3', ' about-this-book')", 4),
 ("('windmill', ' chapter-1')", 4),
 ("('davis4', ' welcome')", 4),
 ("('jain', ' chapter-1')", 4),
 ("('windmill', ' welcome')", 3),
 ("('love', ' welcome')", 3),
 ("('mcnamara', ' welcome')", 3),
 ("('gustedt', ' chapter-1')", 3)]

# Choose a starting book (here: the book of the second most common starting chapter)

In [66]:
def edge_to_book(edge_str):
    """
    Pull the book name out of a stringified ``(book, chapter)`` node label.

    The label is cut at the first ``', '`` separator; every ``('`` occurrence
    is then removed from the part that precedes it. If the separator is
    absent, the whole string is treated as that leading part.

    >>> edge_to_book("('windmill', ' chapter-2')")
    'windmill'
    """
    book_part, _sep, _rest = edge_str.partition("', '")
    return book_part.replace("('", "")

# Take the book of the entry ranked second (index 1) among the starting nodes.
# NOTE(review): the surrounding text says "most common", which would be index 0;
# confirm that skipping the top-ranked entry is intentional.
ranked_starts = cnt.most_common(10)
second_start_node = ranked_starts[1][0]
chosen_book = edge_to_book(second_start_node)

chosen_book

'lauret'

## Task
* Find the most common book people start with
* Find which books they most commonly transition to
* Sort the transitions by target centrality and use that ordering as the recommendation

In [74]:
# Tally transitions that leave `chosen_book` for a different book.
# Counter key: (target_book, source, target, source_centrality, target_centrality).
# NOTE(review): this rebinds `cnt`, replacing the starting-node counter built earlier;
# re-running the `chosen_book` cell after this one would read the wrong counter.
cnt = Counter()
transition_columns = zip(
    df['source'], df['target'], df['source_centrality'], df['target_centrality']
)
for source, target, source_centrality, target_centrality in transition_columns:
    source_book = edge_to_book(source)
    # Only transitions that start in the chosen book are of interest.
    if source_book != chosen_book:
        continue
    target_book = edge_to_book(target)
    # many source-target edges are links within a book;
    # which are good, but generate a self-referential recommendation
    if target_book == source_book:
        continue
    cnt[(target_book, source, target, source_centrality, target_centrality)] += 1

# Keep the 10 most frequent transitions, then order them by target centrality
# (the last field of the key), highest first.
results = sorted(cnt.most_common(10), key=lambda item: item[0][-1], reverse=True)
print(f"top book recommendations for: {chosen_book}")
results

top book recommendations for: lauret


[(('fain4',
   "('lauret', ' about-this-book')",
   "('fain4', ' chapter-1')",
   0.02976833206914405,
   0.017970803170325296),
  1),
 (('clinton3',
   "('lauret', ' chapter-1')",
   "('clinton3', ' chapter-1')",
   0.0320760256697409,
   0.014914561435989222),
  1),
 (('hurbans',
   "('lauret', ' about-this-book')",
   "('hurbans', ' chapter-1')",
   0.02976833206914405,
   0.008724122802748582),
  1),
 (('atencio3',
   "('lauret', ' about-this-book')",
   "('atencio3', ' chapter-2')",
   0.02976833206914405,
   0.006140067858335498),
  1),
 (('briggs',
   "('lauret', ' about-this-book')",
   "('briggs', ' about-this-book')",
   0.02976833206914405,
   0.003249050112318249),
  1),
 (('johnsson',
   "('lauret', ' chapter-3')",
   "('johnsson', ' chapter-6')",
   0.010123102660278613,
   0.000486711651856581),
  1),
 (('fain2',
   "('lauret', 'chapter-11')",
   "('fain2', 'about-this-book')",
   5.091126065445408e-07,
   0.0),
  1),
 (('palmer',
   "('lauret', ' chapter-1')",
   "('pal

## Source centrality - measures the quality of the current location in a book
## Target centrality - measures the quality of the recommendation, an analog of popularity and recorded importance