In [None]:
import gzip
from tqdm import tqdm
import json
import pandas as pd
import numpy as np
import networkx as nx
from networkx.algorithms import bipartite, community, betweenness_centrality
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import datetime
import random
import birankpy as br

In [None]:
with open('all_data.json' , 'r') as f:
    data = list(map(json.loads, f))

In [None]:
with open('all_data2.json' , 'r') as f:
    data2 = list(map(json.loads, f))

### convert into dataframe

In [None]:
def to_df(data):
    """Flatten an iterable of raw review records into a DataFrame.

    Each record is a mapping with the keys ``overall``, ``verified``,
    ``reviewerID``, ``asin``, ``unixReviewTime`` and ``category``; the keys
    ``reviewText`` and ``summary`` are optional and become NaN when absent.
    The last two characters of ``category`` are dropped.

    Returns a frame with the columns rating, verified, user, asin, review,
    summary, time and category (in that order), one row per record.
    """
    columns = {
        name: []
        for name in ('rating', 'verified', 'user', 'asin',
                     'review', 'summary', 'time', 'category')
    }
    for record in data:
        columns['rating'].append(record['overall'])
        columns['verified'].append(record['verified'])
        columns['user'].append(record['reviewerID'])
        columns['asin'].append(record['asin'])
        # Optional free-text fields: a missing key is recorded as NaN.
        columns['review'].append(record['reviewText'] if 'reviewText' in record else np.nan)
        columns['summary'].append(record['summary'] if 'summary' in record else np.nan)
        columns['time'].append(record['unixReviewTime'])
        columns['category'].append(record['category'][:-2])
    return pd.DataFrame(columns)
df = to_df(data)
df2 = to_df(data2)

In [None]:
df[df['review'].isnull() | df['summary'].isnull()]

In [None]:
df.sort_values(by = 'time')

In [None]:
item_in_cat = df.groupby('category')['asin'].agg(set)
item_in_cat

In [None]:
df2.sort_values(by = 'time')

In [None]:
class Simulation:
    """Build and draw bipartite review graphs from a review DataFrame.

    Two graphs are maintained:

    * ``model``     -- user <-> item (``asin``) edges; directed when ``di`` is True.
    * ``cat_model`` -- user <-> category edges (always undirected).

    Every edge carries ``{'time': ..., 'rating': ...}`` taken from the review
    row. Edges are added one time window at a time through ``add_node`` and
    ``cat_add_node``; ``last`` holds the latest timestamp added so far.

    The input frame must provide the columns ``user``, ``asin``, ``category``,
    ``time`` and ``rating``.
    """

    # Smallest review timestamp in the data (author's note: min t 1451692800,
    # max t 1538524800). Used as the origin of the vertical plot axis.
    _TIME_ORIGIN = 1451692800
    # Seconds per unit of plot height: 4 * 864000 s = 40 days.
    _HEIGHT_UNIT = 4 * 864000
    # Colour used for every user node.
    _USER_COLOR = 'xkcd:blue'

    def __init__(self, df, di = False):
        """Store the reviews sorted by time and prepare empty graphs.

        Args:
            df: review frame (see class docstring for required columns).
            di: build ``model`` as a ``DiGraph`` (user -> item) when True.
        """
        self.model = nx.DiGraph() if di else nx.Graph()
        self.df = df.sort_values(by = 'time')
        # asin -> category lookup; its index doubles as "is this node an item?".
        self.item_cat = df.groupby('asin')['category'].max()
        self.last = 0

        self.cat_model = nx.Graph()
        self.category = sorted(self.df['category'].unique())
        self.color = self.setup_color()
        self.pos = self.setup_pos()

    def setup_color(self):
        """Return a ``{category: 'xkcd:<colour>'}`` mapping.

        Colours are assigned in sorted-category order and wrap around when
        there are more categories than palette entries.
        """
        color_options = ['red', 'coral', 'crimson', 'cyan', 'beige', 'brown', 'chartreuse','banana', 'darkgreen',
                 'fuchsia', 'gold', 'green', 'grey', 'khaki', 'lavender', 'lime', 'olive',
                 'orange', 'orchid', 'purple', 'teal', 'wheat', 'yellow', 'aquamarine', 'goldenrod'
                ]
        palette = ['xkcd:' + name for name in color_options]
        return {cat: palette[idx % len(palette)] for idx, cat in enumerate(self.category)}

    def setup_pos(self):
        """Return ``{category: rank}`` where rank is the sorted-category index."""
        return {cat: idx for idx, cat in enumerate(self.category)}

    def _window(self, t_start, t_end):
        """Return the reviews with ``t_start <= time <= t_end`` (time-sorted)."""
        in_range = (self.df['time'] >= t_start) & (self.df['time'] <= t_end)
        return self.df[in_range]

    @staticmethod
    def _edges(window, target_col):
        """Build ``(user, target, {'time', 'rating'})`` edge tuples for a window."""
        return [
            (user, target, {'time': time, 'rating': rating})
            for user, target, time, rating in zip(
                window['user'], window[target_col], window['time'], window['rating'])
        ]

    def add_node(self, t_start: int, t_end: int):
        """Add user-item edges for reviews in ``[t_start, t_end]`` to ``model``.

        Afterwards every node of ``model`` is tagged with ``category`` (the
        item's category, or ``'user'``) and ``bipartite`` (1 for items, 0 for
        users). A window without reviews leaves the graph untouched.
        """
        window = self._window(t_start, t_end)
        if window.empty:
            return
        self.last = window['time'].iloc[-1]

        self.model.add_edges_from(self._edges(window, 'asin'))
        item_index = self.item_cat.index
        for node in self.model:
            attrs = self.model.nodes[node]
            if node in item_index:
                attrs['category'] = self.item_cat[node]
                attrs['bipartite'] = 1
            else:
                attrs['category'] = 'user'
                attrs['bipartite'] = 0

    def cat_add_node(self, t_start: int, t_end: int):
        """Add user-category edges for reviews in ``[t_start, t_end]`` to ``cat_model``.

        Afterwards every node of ``cat_model`` is tagged with ``bipartite``
        (1 for categories, 0 for users). A window without reviews leaves the
        graph untouched.
        """
        window = self._window(t_start, t_end)
        if window.empty:
            return
        self.last = window['time'].iloc[-1]

        self.cat_model.add_edges_from(self._edges(window, 'category'))
        known_categories = set(self.category)
        for node in self.cat_model:
            self.cat_model.nodes[node]['bipartite'] = 1 if node in known_categories else 0

    def get_cat_sets(self) -> tuple:
        """Return ``bipartite.sets`` of ``cat_model`` with the users as top nodes."""
        top_nodes = {n for n, d in self.cat_model.nodes(data=True) if d["bipartite"] == 0}
        return bipartite.sets(self.cat_model, top_nodes)

    def get_sets(self) -> tuple:
        """Return ``bipartite.sets`` of ``model`` with the users as top nodes."""
        top_nodes = {n for n, d in self.model.nodes(data=True) if d["bipartite"] == 0}
        return bipartite.sets(self.model, top_nodes)

    def _height(self, timestamp):
        """Convert a review timestamp into a vertical plot coordinate."""
        return (timestamp - self._TIME_ORIGIN) / self._HEIGHT_UNIT

    def get_cat_color_pos(self, model = None):
        """Return ``(color_map, pos)`` for drawing ``cat_model``.

        Users are placed at a random x in [0, 1] and at a height derived from
        the time of their first listed edge (+/- 0.5 jitter); categories sit
        in a column at x = 5. Positions are lists so callers can shift them.
        ``model`` is accepted for signature compatibility and is not used.
        """
        color_map = []
        pos = {}
        for node in self.cat_model:
            if self.cat_model.nodes[node]['bipartite'] == 0:
                color_map.append(self._USER_COLOR)
                first_edge = next(iter(self.cat_model[node].values()))
                height = self._height(first_edge['time'])
                pos[node] = [np.random.uniform(0,1), np.random.uniform(height-0.5, height + 0.5)]
            else:
                color_map.append(self.color[node])
                pos[node] = [5, self.pos[node] * 1.1]
        return color_map, pos

    def get_color_pos(self, model = None):
        """Return node colours (and positions) for drawing.

        With ``model`` given, only the colour list for that graph is returned.
        Without it, ``(color_map, pos)`` for ``self.model`` is returned: users
        get a random x in [0, 1], items a random x in [4, 5], and the height
        follows the time of the most recent user-item edge seen while walking
        the nodes in insertion order (+/- 0.5 jitter).
        """
        color_map = []
        pos = {}

        if model is not None:
            for node in model:
                cat = model.nodes[node]['category']
                color_map.append(self._USER_COLOR if cat == 'user' else self.color[cat])
            return color_map

        node_iter = iter(self.model)
        first = next(node_iter, None)
        second = next(node_iter, None)
        if first is None or second is None:
            # Nothing (or nothing connected) to lay out yet.
            return color_map, pos
        # The first two nodes are the endpoints of the first edge added.
        height = self._height(self.model[first][second]['time'])

        last_user = None
        for node in self.model:
            cat = self.model.nodes[node]['category']
            if cat == 'user':
                last_user = node
                color_map.append(self._USER_COLOR)
                x_low, x_high = 0, 1
            else:
                # Re-anchor the height when this item was reviewed by the
                # most recently visited user.
                if last_user is not None and node in self.model[last_user]:
                    height = self._height(self.model[last_user][node]['time'])
                color_map.append(self.color[cat])
                x_low, x_high = 4, 5
            pos[node] = (np.random.uniform(x_low, x_high), np.random.uniform(height-0.5, height + 0.5))

        return color_map, pos

    def draw(self, fig_size = (12,30), node_size = 2, width = 0.09, save = False, labels=None):
        """Draw ``model`` with users on the left and items on the right.

        Args:
            fig_size: matplotlib figure size.
            node_size: fixed node size; a falsy value sizes nodes by degree**0.3.
            width: edge line width.
            save: when True, title the plot with the date of ``last`` and save
                it to ``./plots/Amazon<date>.png``.
            labels: optional ``{node: text}`` mapping; no labels are drawn
                when it is None.
        """
        plt.figure(figsize = fig_size)
        color_map, pos = self.get_color_pos()

        if node_size:
            sizes = node_size
        else:
            sizes = [degree**0.3 for _, degree in self.model.degree]

        draw_kwargs = dict(node_size = sizes, pos = pos, width = width, node_color = color_map)
        # nx.draw enables with_labels as soon as a `labels` keyword is present,
        # even if it is None, which would label every node; only forward real labels.
        if labels is not None:
            draw_kwargs['labels'] = labels
        nx.draw(self.model, **draw_kwargs)

        if save:
            dt = datetime.date.fromtimestamp(self.last)
            plt.title("Until: {}".format(dt))
            plt.savefig('./plots/Amazon{}.png'.format(dt), bbox_inches='tight')

    def draw_sub(self, nodes = None, fig_size = (10,10), width = 0.8, node_size = 20, with_labels = False, labels = None,layout = nx.fruchterman_reingold_layout, **kwds):
        """Draw the subgraph of ``model`` induced by ``nodes``.

        Args:
            nodes: sequence of node ids; the first one is treated as the seed.
            fig_size: matplotlib figure size.
            width: edge line width.
            node_size: fixed node size; a falsy value sizes nodes by 80*degree**0.5.
            with_labels: forwarded to ``nx.draw``.
            labels: when truthy, only the seed node (``nodes[0]``) is labelled.
            layout: layout function called as ``layout(subgraph, **kwds)``.
            **kwds: forwarded to both ``layout`` and ``nx.draw``.
        """
        if nodes is None:
            nodes = []

        sub = nx.subgraph(self.model, nodes)
        color_map = self.get_color_pos(model = sub)
        pos = layout(sub, **kwds)

        if labels:
            labels = {nodes[0]: nodes[0]} if len(nodes) else None

        plt.figure(figsize = fig_size)
        if node_size:
            sizes = node_size
        else:
            sizes = [80*degree**0.5 for _, degree in sub.degree]
        nx.draw(sub, node_size = sizes, width = width, node_color=color_map, with_labels = with_labels, labels = labels, pos = pos, **kwds)

    def draw_cat(self, fig_size = (20,20), width = 0.004, save = False):
        """Draw ``cat_model`` with category names printed next to their nodes.

        Node size is degree / 5. When ``save`` is True the figure is titled
        with the date of ``last`` and written to
        ``./plots/Amazon_cat<date>.png``; otherwise it is shown.
        """
        plt.figure(figsize = fig_size)

        color_map, pos = self.get_cat_color_pos()
        nx.draw_networkx(
            self.cat_model,
            pos = pos,
            node_size = [degree/5 for _, degree in self.cat_model.degree],
            with_labels=False,
            width = width,
            node_color=color_map)

        # Shift category positions right so the text does not sit on the node.
        labels = {}
        for cat in self.category:
            if cat in pos:
                pos[cat][0] += 0.2
                labels[cat] = cat
        nx.draw_networkx_labels(self.cat_model,pos,labels,font_size=10,font_color='r')

        if save:
            dt = datetime.date.fromtimestamp(self.last)
            plt.title("Until: {}".format(dt))
            plt.savefig('./plots/Amazon_cat{}.png'.format(dt), bbox_inches='tight')
        else:
            plt.show()

    def get_projection(self, s: int,graph = 0) -> nx.classes.graph.Graph:
        """Return a one-mode projection of a bipartite graph.

        Args:
            s: which node set to project onto (0 = users, 1 = the other side).
            graph: 0 projects ``model``; any other value projects ``cat_model``.
        """
        if graph == 0:
            return bipartite.projected_graph(self.model, self.get_sets()[s])
        return bipartite.projected_graph(self.cat_model, self.get_cat_sets()[s])
        
    

In [None]:
# Scratch truthiness check. The original cell ended in a dangling `else`
# with no colon or body, which is a SyntaxError.
if 4:
    print(2)

In [None]:

GRAPH_COLUMNS = ['rating','user','asin','time','category']

# `s` (first data set) is referenced by later cells (edge listing, clustering,
# `s.df`), so it has to be built here for the notebook to run top to bottom.
s = Simulation(df[GRAPH_COLUMNS], di = False)
s.add_node(0, 1698524800)

# `s2` (second data set) drives the drawings and most of the analysis below.
s2 = Simulation(df2[GRAPH_COLUMNS], di = False)
s2.add_node(0, 1698524800)

## Draw

In [None]:

#labels = {i:i for i in labels}
s2.draw(width = 0.01, labels = None, node_size = 2)

In [None]:
for i in s2.model['B01FAHYO0O']:
    print(i)

In [None]:

def get_list(asin, ifm = False, graph = None):
    """Collect the neighbourhood of an item by walking item -> user -> item -> user.

    Args:
        asin: starting node.
        ifm: when True, walk one step further and include the items of the
            users reached last.
        graph: adjacency mapping supporting ``graph[node]`` -> iterable of
            neighbours; defaults to ``s2.model``.

    Returns:
        List of unique nodes in first-visit order, starting with ``asin``.
    """
    if graph is None:
        graph = s2.model

    nodes = [asin]
    # Set mirror of `nodes` so membership checks stay O(1) inside nested loops.
    seen = {asin}

    def visit(node):
        if node not in seen:
            seen.add(node)
            nodes.append(node)

    for user in graph[asin]:
        visit(user)
        for item in graph[user]:
            visit(item)
            for co_user in graph[item]:
                visit(co_user)
                if ifm:
                    for far_item in graph[co_user]:
                        visit(far_item)
    return nodes

        

In [None]:
print(nx.fruchterman_reingold_layout)

In [None]:
nodes = get_list('B01FAHYO0O', ifm = False)
s2.draw_sub(nodes, node_size = None, fig_size = (8,8), with_labels = True,labels = True, layout = nx.spring_layout)


In [None]:
nodes = get_list('B013L4ZYIY', ifm = False)
s2.draw_sub(nodes, node_size = 0, fig_size = (8,8), with_labels = True,labels = True)


In [None]:
nodes = get_list('B001IANIDM', ifm = False)
s2.draw_sub(nodes, node_size = 0, fig_size = (8,8), with_labels = True,labels = True)


In [None]:
df2[df2['asin'] == 'B01FAHYO0O']

In [None]:
df2[df2['user'] == 'A21A7LZUGD9OXR']['category'].value_counts()

## Category increase GIF

In [None]:
'''
s4 = Simulation(df2[['rating','user','asin','time','category']], di = False)
s4.model.nodes()
#s4.add_node(1527836400, 1698524800)
s4.cat_add_node(0, 1698524800)
'''

In [None]:
from zipfile import ZipFile
import os

# Bundle every file in ./plots into gif.zip. The context manager closes the
# archive even if one of the writes fails.
with ZipFile('gif.zip', 'w') as zipObj:
    for filename in sorted(os.listdir('./plots')):
        zipObj.write('./plots/' + filename)

In [None]:
df2[df2['category'] == 'Appliances']

## Get separate part

In [None]:
s.model.edges()

In [None]:
df[['rating','user','asin','time','category']][:10000][df[['rating','user','asin','time','category']][:10000]['asin'] == 'B000HCLLMM']


## Clustering Coefficient Problems

### Why important?
The clustering coefficient is an important feature of a network: it helps us understand the overall structure of the network and how densely its nodes are connected. \
One intuition applying clustering coefficient in our network might be this: imagine a case with high clustering coefficient for node A. Items bought by the users who bought A were also bought by other users who bought these items. In simple words, a group of customers are buying this group of items.
To exemplify this, I randomly choose a node with high clustering coefficient of 0.9. We can see the product is about some sewing and hand craft tools. Then when we check all other items the user bought, we can find they are from similar categories. To save time, we won't look at them one-by-one, but it's reasonable to assume that people who buy one of these items would probably buy another. 
Here is a random example of a low clustering coefficient of around 0.1. Based on our intuition, there is no specific label that we could confidently put on this purchase. People who bought these pants would probably buy other, very different items. 
Here is a random example of a clustering coefficient of around 0.5. My intuition tells me it makes sense because we could put more specific labels on people who buy a new bed. I could guess they might also buy a new pillow or a new sheet.
Therefore, one use of the clustering coefficient in our case could be as an indicator of nicheness: whether a product appeals to customers in general or to a specific target group.

The definition of the clustering coefficient cannot be extended to bipartite networks, because the connection of direct neighbors of the same type is prohibited, and triangles cannot exist.
# $c_u = \frac{2 T(u)}{deg(u)(deg(u)-1)}$
Here the $T(u)$ is interpreted as the number of links between neighbors of node u. And it's the same as the triangles mentioned above

To deal with this problem, we turn to two possible solutions:
1. Measure the clustering coefficient of projections separately. #####picture \
However, there are also some problems with projection.

### Projection Problems
1. Information in the bipartite structure may disappear after projection. 
For instance, consider two items bought by multiple people. In the projected graph, the two items are
simply linked together. However, we could probably solve this problem using a weighted bipartite graph (increase the weight if more people buy both items) 
The fact that there are many bipartite graphs which lead to the same projection can also show this loss of information
2. The projection couldn't capture the status other than the shared node. For example, C and D are linked because they both connect to 5. But the information of 4-C and 6-D and 7-D are not captured. \
We will show below that this information loss makes the unipartite analysis of the clustering coefficient even less informative when the relation is sparse, as in our dataset.
3. Some properties of the projection may be due to the projection process rather than the underlying data itself. For instance, it is shown in Newman et al. (2001a) and Guillaume and Latapy (2004a,b) that when considering the projection of a random bipartite graph, one observes high clustering coefficients. Therefore, high clustering coefficients in projections may not be viewed as significant properties: they are consequences of the bipartite nature of the underlying two-mode network.
4. There are also some computation cost problems in large networks. We won't address this problem here.

Therefore, we need to find out another way of measuring clustering coefficient.

2. Latapy et al. (2008) propose extending the local clustering coefficient to bipartite networks with a more abstract definition of overlapping neighborhoods. The overlap equals the fraction of joint neighbors in both neighborhoods. 

(In simple words, the overlap is represented by the similarity $c_{uv}$ between two sets )

 # $c_u = \frac{\sum_{v \in N(N(u))} c_{uv} }{|N(N(u))|}$
 where $N(N(u))$ are the second-order neighbors of u in G excluding u, and $c_{uv}$ is the pairwise clustering coefficient between nodes u and v. \
 The mode selects the function for $c_{uv}$ which can be:

 # $c_u = \frac{\sum_{v \in N(N(u))} c_{uv} }{|N(N(u))|}$
 where $N(N(u))$ are the second-order neighbors of u in G excluding u, and $c_{uv}$ is the pairwise clustering coefficient between nodes u and v. \
 The function for $c_{uv}$ which can be:
 
# $c_{uv}=\frac{|N(u)\cap N(v)|}{|N(u) \cup N(v)|}$ 
Jaccard Similarity

# $c_u = \frac{\sum_{v \in N(N(u))} c_{uv} }{|N(N(u))|}$
 where $N(N(u))$ are the second-order neighbors of u in G excluding u, and $c_{uv}$ is the pairwise clustering coefficient between nodes u and v. \
 The mode selects the function for $c_{uv}$ which can be:
 
# $c_{uv}=\frac{|N(u)\cap N(v)|}{|N(u) \cup N(v)|}$ 
Jaccard Similarity

# $c_{uv}=\frac{|N(u)\cap N(v)|}{min(|N(u)|,|N(v)|)}$
Min. which is similar to the one mentioned on lecture slides 9b page 65

# $c_u = \frac{\sum_{v \in N(N(u))} c_{uv} }{|N(N(u))|}$
 where $N(N(u))$ are the second-order neighbors of u in G excluding u, and $c_{uv}$ is the pairwise clustering coefficient between nodes u and v. \
 (In simple words, the overlap $c_{uv}$ is just the similarity between two sets.) \
 The function for $c_{uv}$ which can be:
 
# $c_{uv}=\frac{|N(u)\cap N(v)|}{|N(u) \cup N(v)|}$ 
Jaccard Similarity

# $c_{uv}=\frac{|N(u)\cap N(v)|}{min(|N(u)|,|N(v)|)}$
Min. which is similar to the one mentioned on lecture slides 9b page 65

# $c_{uv}=\frac{|N(u)\cap N(v)|}{max(|N(u)|,|N(v)|)}$
Max

In [None]:
item_g = s.get_projection(1)
bipartite_clusterCo = bipartite.clustering(s.model, s.get_sets()[1], mode = 'dot')

In [None]:
item_g['B002YX0GEQ']

In [None]:
item_g.degree()

In [None]:
item_g.edges()

In [None]:
df2[df2['asin'] == 'B00QYOF6CS'] 

In [None]:
df2[df2['asin'] == 'B002YX0GEQ'] 

In [None]:
# The item projection and the item node set do not depend on the clustering
# mode, so compute them once instead of once per mode.
item_g = s2.get_projection(1)
item_nodes = s2.get_sets()[1]


def coef_frame(coefs, mode):
    """Return a tidy (asin, coef, mode) frame for one {node: coefficient} dict."""
    return pd.DataFrame({
        'asin': list(coefs.keys()),
        'coef': list(coefs.values()),
        'mode': [mode] * len(coefs),
    })


# Bipartite clustering, 'dot' mode.
bipartite_clusterCo = bipartite.clustering(s2.model, item_nodes, mode = 'dot')
df_2 = coef_frame(bipartite_clusterCo, 'dot')
df_2.plot(kind='hist', y = 'coef', title = 'bipartite clustering (dot)')

# Ordinary clustering coefficient on the one-mode item projection.
clusterCo = nx.clustering(item_g)
df_3 = coef_frame(clusterCo, 'projection')
df_3.plot(kind='hist', y = 'coef', title = 'projection clustering')

# Bipartite clustering, 'min' mode.
bipartite_clusterCo = bipartite.clustering(s2.model, item_nodes, mode = 'min')
df_4 = coef_frame(bipartite_clusterCo, 'min')
df_4.plot(kind='hist', y = 'coef', title = 'bipartite clustering (min)')

# Bipartite clustering, 'max' mode; `bipartite_clusterCo` keeps this result
# for the cells below.
bipartite_clusterCo = bipartite.clustering(s2.model, item_nodes, mode = 'max')
df_5 = coef_frame(bipartite_clusterCo, 'max')
df_5.plot(kind='hist', y = 'coef', title = 'bipartite clustering (max)')



In [None]:
df_6 = pd.concat([df_2, df_3, df_4, df_5])
df_6

In [None]:
import plotly.express as px
fig = px.histogram(df_6[df_6['mode'].isin(['projection', 'dot'])], x = 'coef', color = 'mode', opacity = 0.35, nbins =50, barmode = 'overlay', title = 'clustering coefficient histogram')
fig.show()


In [None]:
#df2[df2['asin'] == 'B01FAHYO0O'].drop('review', axis = 1)
#df2[df2['asin'] == 'B01GHLWK58'].drop('review', axis = 1)
#df2[df2['asin'] == 'B0000DJUYR'].drop('review', axis = 1)
#df2[df2['asin'] == 'B001IANIDM'].drop('review', axis = 1)
df2[df2['asin'] == 'B013L4ZYIY'].drop('review', axis = 1)

In [None]:
df2[df2['user'] == 'A3OMBQCLXIYH82'][['asin','category']]

In [None]:
df2[df2['user'] == 'ARS8GNF3HMUGG'][['asin','category']]

In [None]:
df2[df2['asin'] == 'B01FAHYO0O']['review'].iloc[0]

In [None]:
df_2[df_2['asin'] == 'B013L4ZYIY']

In [None]:
# Items whose 'dot' clustering coefficient is roughly 0.6.
# NOTE(review): the original compared the string column 'asin' with 0.6 and
# so always returned an empty frame; the +/-0.05 band is an assumption.
df_2[df_2['coef'].between(0.55, 0.65)]

In [None]:

sorted(bipartite_clusterCo.items(), key=lambda item: item[1], reverse = True)[10000]


In [None]:
df2[df2['asin'] == 'B00XRDH9QK']

In [None]:
df2[df2['user'] == 'AXIY5ZEG2SJ53']

In [None]:
df2[df2['asin'] == 'B00XUYWYU2']

## Hubs and Authorities

In [None]:
s3 = Simulation(df2[['rating','user','asin','time','category']], di = True)
s3.model.nodes()
s3.add_node(0, 1698524800)

In [None]:
h, a = nx.hits(s3.model)

In [None]:
h = dict(sorted(h.items(), key=lambda item: item[1], reverse = True))
h

In [None]:
len(a)

In [None]:
a = dict(sorted(a.items(), key=lambda item: item[1], reverse = True))
a

In [None]:
for (i, j) in a.items():
    if i[0] == 'A':
        print(i,j)

In [None]:
h['A1ZKJOISGOBVI4']

In [None]:
a = dict(sorted(a.items(), key=lambda item: item[1], reverse = True))
a

In [None]:
df2[df2['user'] == 'A1ZKJOISGOBVI4']

In [None]:
df2[df2['asin'] == 'B0096TXQNE']

In [None]:
df.groupby('asin').count().sort_values(by = 'rating')

In [None]:
temp = np.array(list(h.values()))*1e20

In [None]:
plt.hist(x = temp[temp > 0] , bins = 100)

In [None]:
s.df

## k-clique communities

In [None]:
from networkx.algorithms.community import k_clique_communities
#clique_g = nx.find_cliques(s2.model)
items = s2.get_projection(1)
clique_g = nx.find_cliques(items)
list(k_clique_communities(items, 20, clique_g))

In [None]:
temp_df = df2[df2['asin'].isin([
    'B00002ST27',
            'B00005UK88',
            'B00012D0SG',
            'B0007M1U28',
            'B0009V1BDA',
            'B000C10E1W',
            'B000CEM3M2',
            'B000GGOKN6',
            'B000IZ9S2S',
            'B000P6G74M',
            'B000YTTFWQ',
            'B001CK0F0W',
            'B001CSPUF4',
            'B001DT32AC',
            'B001TICH08',
            'B001V5C75Y',
            'B002BRZ9G0',
            'B002LGAAQK',
            'B002SG7EMQ',
            'B0032HM6JG',
            'B003B0B10Y',
            'B004XOILSM',
            'B0050SVNSU',
            'B0053AGEJS',
            'B0053B5RGI',
            'B005DJOIHE',
            'B005H3I38G',
            'B005UWAG3Y',
            'B0089YGAQM',
            'B008A20KGY',
            'B0092GSNHK',
            'B009L0OUT2',
            'B00B2HP8AW',
            'B00CIJ3T1S',
            'B00CNUH50I',
            'B00CX1YX34',
            'B00E4BISE6',
            'B00EXKZ4SQ',
            'B00F3S5QJE',
            'B00FARSAVY',
            'B00GR5ZRA0',
            'B00HH4K6TC',
            'B00HSX6R72',
            'B00HWHOFRI',
            'B00JR3OOR6',
            'B00K9VXUHU',
            'B00L0YLRUW',
            'B00LM4ON3I',
            'B00MX7UHHO',
            'B00NQ7U982',
            'B00O65H6S4',
            'B00OJ5WBUE',
            'B00PGDN3NC',
            'B00RGGVG4K',
            'B00RN08584',
            'B00SXI0M9U',
            'B00U3GXAIA',
            'B00UR5XBMW',
            'B00VE6Z5XQ',
            'B00VHB3RQ0',
            'B00VTJHRNY',
            'B00W4R1FQK',
            'B00WL68ZJ4',
            'B00X4PXLGI',
            'B00X808VE6',
            'B00X9VQXUI',
            'B00YSMSKSU',
            'B00YXDGOYQ',
            'B00ZMGSUTU',
            'B01155YKF4',
            'B013B8Y6HA',
            'B013QPFI3O',
            'B0149L17FC',
            'B014P45YWA',
            'B015S9TTDG',
            'B016F3M7OM',
            'B017KU3QTE',
            'B0185MFA18',
            'B019P8U3C2',
            'B01B609JSQ',
            'B01C2N90LW',
            'B01EHPRY70',
            'B01EZ91VV8',
            'B01FDF6HEE',
            'B01FPG5BSE',
            'B01G9MVG82',
            'B01H2XD2DY',
            'B01HCL3THA'
            ])][['asin','review', 'category']] 
temp_df.groupby('asin').max().groupby('category').count()

In [None]:
items = s2.get_projection(1, graph = 0)
clique_g = nx.find_cliques(items)
list(k_clique_communities(items, 10, clique_g))

## Louvain 

In [None]:
items = s2.get_projection(1)
items

In [None]:
cluster1 = nx.community.louvain_communities(items)
cluster1

In [None]:
list(cluster1[0])

In [None]:
temp_df = df2[df2['asin'].isin(list(cluster1[20]))][['asin','review', 'category']] 
temp_df.groupby('asin').max().groupby('category').count()

## Closeness Centrality

In [None]:
s4 = Simulation(df2[['rating','user','asin','time','category']], di = False)

#s4.add_node(1527836400, 1698524800)
s4.cat_add_node(1527836400, 1698524800)

In [None]:
project0 = s2.get_projection(0)


## betweenness Centrality

We 

In [None]:
sets = s4.get_cat_sets()
bipartite.centrality.betweenness_centrality(s4.cat_model , sets[1])

In [None]:
s4.cat_model.order()

In [None]:
#bet_cent = bipartite.centrality.betweenness_centrality(s4.model, sets[0])
#bet_cent

## Co-Hits

In [None]:
bn = br.BipartiteNetwork()

In [None]:
bn.set_edgelist(
    df2,
    top_col='user', bottom_col='asin',
    weight_col=None
)

In [None]:
user_birank_df, ssn_birank_df = bn.generate_birank(normalizer='CoHITS')
user_birank_df.sort_values(by='user_birank', ascending=False).head()


In [None]:
ssn_birank_df.sort_values(by='asin_birank', ascending=False).head()

## BiRank

In [None]:
user_birank_df2, ssn_birank_df2 = bn.generate_birank(normalizer='BiRank')
user_birank_df2.sort_values(by='user_birank', ascending=False).head()

In [None]:
ssn_birank_df2.sort_values(by='asin_birank', ascending=False).head()

## Weighted Co-Hits

In [None]:
bn_w = br.BipartiteNetwork()
bn_w.set_edgelist(
    df2,
    top_col='user', bottom_col='asin',
    weight_col='rating'
)

In [None]:
user_birank_df, ssn_birank_df = bn_w.generate_birank(normalizer='CoHITS')
user_birank_df.sort_values(by='user_birank', ascending=False).head()

## Weighted BiRank

In [None]:
user_birank_df, ssn_birank_df = bn_w.generate_birank(normalizer='BiRank')
user_birank_df.sort_values(by='user_birank', ascending=False).head()

In [None]:
ssn_birank_df.sort_values(by='asin_birank', ascending=False).head()

In [None]:
# Item-level scores on the unweighted network `bn`.
# `ssn_birank_df` was last overwritten by the *weighted BiRank* run, so the
# CoHITS scores are recomputed here rather than read from that variable.
_, cohits_items_df = bn.generate_birank(normalizer='CoHITS')
birank_dic = dict(zip(cohits_items_df['asin'], cohits_items_df['asin_birank']))
birank_dic2 = dict(zip(ssn_birank_df2['asin'], ssn_birank_df2['asin_birank']))

# Work on a copy so adding columns does not write into a slice of df2.
df3 = df2[['asin', 'user', 'rating']].copy()
# Degree of an item = number of ratings it received.
degree_dic = df3.groupby('asin')['rating'].count().to_dict()

df3['cohits'] = df3['asin'].map(birank_dic)
df3['birank'] = df3['asin'].map(birank_dic2)
df3['degree'] = df3['asin'].map(degree_dic)
df3

In [None]:
df3.plot(kind='hist', y = 'birank')

In [None]:
df3.plot(kind='hist', y = 'cohits')

In [None]:
df3.plot(kind='hist', y = 'degree')

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
X_train, X_test, y_train, y_test = train_test_split(df3[['degree', 'birank', 'cohits']], df3['rating'], test_size = 0.25)
reg = LinearRegression().fit(X_train[['degree']], y_train)
rf = RandomForestRegressor().fit(X_train[['degree']], y_train)
pred_reg = reg.predict(X_test[['degree']])
pred_rf = rf.predict(X_test[['degree']])

In [None]:
np.mean((pred_reg - y_test)**2), np.mean((pred_rf - y_test)**2)

In [None]:
reg = LinearRegression().fit(X_train[['birank']], y_train)
rf = RandomForestRegressor().fit(X_train[['birank']], y_train)
pred_reg = reg.predict(X_test[['birank']])
pred_rf = rf.predict(X_test[['birank']])

In [None]:
np.mean((pred_reg - y_test)**2), np.mean((pred_rf - y_test)**2)

In [None]:
reg = LinearRegression().fit(X_train[['cohits']], y_train)
rf = RandomForestRegressor().fit(X_train[['cohits']], y_train)
pred_reg = reg.predict(X_test[['cohits']])
pred_rf = rf.predict(X_test[['cohits']])

In [None]:
np.mean((pred_reg - y_test)**2), np.mean((pred_rf - y_test)**2)

In [None]:
mse = []
for i in tqdm(range(100)):
    X_train, X_test, y_train, y_test = train_test_split(df3[['degree', 'birank', 'cohits']], df3['rating'], test_size = 0.25)
    rf = RandomForestRegressor().fit(X_train[['degree']], y_train)
    pred_rf = rf.predict(X_test[['degree']])
    mse.append(np.mean((pred_rf - y_test)**2))
    

In [None]:
np.mean((1.2603352060086486 < np.array(mse)))

In [None]:
#s.model.nodes(data = True)

In [None]:
s.model['B0068AD39Q']

In [None]:
#s.draw(fig_size = (20,100), width = 0.03, save = False)

In [None]:
np.random.normal(5 * 2,2,1)

In [None]:
a = df[['user','asin']]
a.itertuples(index=False, name=None)

In [None]:
B = nx.Graph()
# Add nodes with the node attribute "bipartite"
B.add_nodes_from(set(df['user']), bipartite=0)
B.add_nodes_from(set(df['asin']), bipartite=1)
# Add edges only between nodes of opposite node sets
B.add_weighted_edges_from(df[['user','asin', 'rating']].itertuples(index=False, name=None))
nx.set_node_attributes(B, 0, "grain")

## example 

Piepenbrink, Anke and Ajai Gaur (2013). Methodological Advances in the Analysis of
Bipartite Networks: An Illustration Using Board Interlocks in Indian Firms. SSRN
Scholarly Paper ID 2199111. Rochester, NY: Social Science Research Network. url:
http://papers.ssrn.com/abstract=2199111.

Latapy, Matthieu, Clémence Magnien, and Nathalie Vecchio (2008). “Basic notions for
the analysis of large two-mode networks”. In: Social Networks 30.1, pp. 31–48. issn:
03788733. doi: 10.1016/j.socnet.2007.04.006.