# Pandas dataframe accessor: Levi graph
## Transform between different graph data repressentations via a Levi graph

In [3]:
import numpy as np
import pandas as pd
from scipy import sparse
import networkx as nx
import matplotlib.pyplot as plt
import beartype as bt

plt.rcParams['figure.dpi'] = 60

### Create data to use in development:
- Token Co-occurrence matrix
- Document-term matrix (this is an example of a bipartite structure/incidence matrix)
- COO (format: multi-index pandas Series)


In [4]:
# Create sample dataset

doc_ids = [str(x) for x in range(5)]
texts = [
    "the quick brown fox jumped",
    "the fox jumped",
    "the dog jumped",
    "one brown fox",
    "ten brown dogs",
   ]

df = pd.DataFrame({'doc_id': doc_ids,
                    'text':texts,
                    })

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words=['the'])
cv.fit(df['text'])
tokens = cv.get_feature_names_out()  #alphabetical

In [6]:
# Create cooccurence matrix

results = cv.transform(df['text'])
coocc = results.T.dot(results)
# np.fill_diagonal(coocc.values, 0)
coocc = pd.DataFrame(results.T.dot(results).toarray(), index=tokens, columns=tokens)
np.fill_diagonal(coocc.values, 0)
coocc

Unnamed: 0,brown,dog,dogs,fox,jumped,one,quick,ten
brown,0,0,1,2,1,1,1,1
dog,0,0,0,0,1,0,0,0
dogs,1,0,0,0,0,0,0,1
fox,2,0,0,0,2,1,1,0
jumped,1,1,0,2,0,0,1,0
one,1,0,0,1,0,0,0,0
quick,1,0,0,1,1,0,0,0
ten,1,0,1,0,0,0,0,0


In [7]:
# Create document term matrix (BIPARTITE STRUCTURE)

results = cv.transform(df['text'])
features = cv.get_feature_names_out()
doc_term = pd.DataFrame(results.toarray(), columns=features)
doc_term = doc_term.reindex(columns=doc_term.columns.tolist()+['cat', 'bear', 'tree'], fill_value=0)
doc_term

Unnamed: 0,brown,dog,dogs,fox,jumped,one,quick,ten,cat,bear,tree
0,1,0,0,1,1,0,1,0,0,0,0
1,0,0,0,1,1,0,0,0,0,0,0
2,0,1,0,0,1,0,0,0,0,0,0
3,1,0,0,1,0,1,0,0,0,0,0
4,1,0,1,0,0,0,0,1,0,0,0


In [8]:
# Create COO Dataframe
# using scipy sparse module here, but we want to avoid it in production
coo_matrix = sparse.coo_matrix(doc_term.values)
coo_rows = coo_matrix.row
coo_cols = coo_matrix.col
coo_vals = coo_matrix.data

term_dict = dict(zip(range(0,11), doc_term.columns))
term_dict

{0: 'brown',
 1: 'dog',
 2: 'dogs',
 3: 'fox',
 4: 'jumped',
 5: 'one',
 6: 'quick',
 7: 'ten',
 8: 'cat',
 9: 'bear',
 10: 'tree'}

In [9]:
coo_cols = coo_cols.tolist()
coo_col_names = [term_dict[key] for key in coo_cols]

In [10]:
idx=pd.MultiIndex.from_arrays([coo_matrix.row, coo_col_names])
coo = pd.DataFrame(coo_matrix.data, index=idx, columns=["flag"])
coo

Unnamed: 0,Unnamed: 1,flag
0,brown,1
0,fox,1
0,jumped,1
0,quick,1
1,fox,1
1,jumped,1
2,dog,1
2,jumped,1
3,brown,1
3,fox,1


In [11]:
edgelist = coo.reset_index(names=["doc", "term"])
edgelist

Unnamed: 0,doc,term,flag
0,0,brown,1
1,0,fox,1
2,0,jumped,1
3,0,quick,1
4,1,fox,1
5,1,jumped,1
6,2,dog,1
7,2,jumped,1
8,3,brown,1
9,3,fox,1


In [12]:
coo.squeeze()

0  brown     1
   fox       1
   jumped    1
   quick     1
1  fox       1
   jumped    1
2  dog       1
   jumped    1
3  brown     1
   fox       1
   one       1
4  brown     1
   dogs      1
   ten       1
Name: flag, dtype: int64

In [13]:
print(coo_matrix)  #scipy sparse format

  (0, 0)	1
  (0, 3)	1
  (0, 4)	1
  (0, 6)	1
  (1, 3)	1
  (1, 4)	1
  (2, 1)	1
  (2, 4)	1
  (3, 0)	1
  (3, 3)	1
  (3, 5)	1
  (4, 0)	1
  (4, 2)	1
  (4, 7)	1


In [14]:
def edgelist_to_incidence(edgelist, node_colname, value_colname=None):
    """assume edgelist is indexed by edge number, not some edge set of names (for now)"""
    data = np.ones_like(edgelist.index.values) if value_colname == None else edgelist[value_colname].values
    return sparse.coo_array((data, (edgelist.index, edgelist[node_colname].cat.codes)), shape=(edgelist.shape[0], len(edgelist[node_colname].cat.categories)))  # coo_array((data, (row_idx, col_idx)))

In [15]:
edgelist.dtypes

doc      int64
term    object
flag     int64
dtype: object

In [16]:
edgelist['doc'] = pd.Categorical(edgelist.doc)
edgelist['term'] = pd.Categorical(edgelist.term)

In [17]:
edgelist

Unnamed: 0,doc,term,flag
0,0,brown,1
1,0,fox,1
2,0,jumped,1
3,0,quick,1
4,1,fox,1
5,1,jumped,1
6,2,dog,1
7,2,jumped,1
8,3,brown,1
9,3,fox,1


In [18]:
inc = edgelist_to_incidence(edgelist, node_colname='term')
print(inc)

  (0, 0)	1
  (1, 3)	1
  (2, 4)	1
  (3, 6)	1
  (4, 3)	1
  (5, 4)	1
  (6, 1)	1
  (7, 4)	1
  (8, 0)	1
  (9, 3)	1
  (10, 5)	1
  (11, 0)	1
  (12, 2)	1
  (13, 7)	1


In [19]:
inc_frame = pd.DataFrame(inc.data, index=edgelist)
inc_frame

Unnamed: 0,0
"(0, brown, 1)",1
"(0, fox, 1)",1
"(0, jumped, 1)",1
"(0, quick, 1)",1
"(1, fox, 1)",1
"(1, jumped, 1)",1
"(2, dog, 1)",1
"(2, jumped, 1)",1
"(3, brown, 1)",1
"(3, fox, 1)",1


In [20]:
from levi import dataframe

doc_term.levi.test_attr

  from tqdm.autonotebook import tqdm


brown     1
dog       0
dogs      0
fox       1
jumped    1
one       0
quick     1
ten       0
cat       0
bear      0
tree      0
Name: 0, dtype: int64

In [21]:
# tuple_list = list(zip(coo_rows, coo_col_names, coo_matrix.data))
tuple_list = list(zip(coo_rows, coo_col_names, ))
tuple_list_w_data = [(*t, 1) for t in tuple_list]

In [22]:
import itertools
row_index = dict(zip(tuple_list_w_data, itertools.count()))
row_index

{(0, 'brown', 1): 0,
 (0, 'fox', 1): 1,
 (0, 'jumped', 1): 2,
 (0, 'quick', 1): 3,
 (1, 'fox', 1): 4,
 (1, 'jumped', 1): 5,
 (2, 'dog', 1): 6,
 (2, 'jumped', 1): 7,
 (3, 'brown', 1): 8,
 (3, 'fox', 1): 9,
 (3, 'one', 1): 10,
 (4, 'brown', 1): 11,
 (4, 'dogs', 1): 12,
 (4, 'ten', 1): 13}

In [23]:
# array_x = nx.algorithms.bipartite.biadjacency_matrix(graph, row_order=tuple_list_w_data)
# array_x

In [24]:
# Make sure pandas categorical works with ints
#   - in nb, ss example has index as ints, not categoricals
#   - pandas_categoricals

In [25]:
from phantom import Phantom
from phantom.predicates import Predicate
import beartype.typing as bt
from beartype.door import is_bearable


In [26]:
def of_beartype(t: bt.Union[type, bt.Tuple[type, ...]]) -> Predicate[object]:
    """ From `phantom.predicates.generic.of_type`

    Create a new predicate that succeeds when its argument is bearable on ``t``.
    """

    def check(a: object) -> bool:
        return is_bearable(a, t)

    return check

In [27]:
# chck = of_beartype(coo)
# chck(coo)

In [28]:
coo.unstack(level=1, )

Unnamed: 0_level_0,flag,flag,flag,flag,flag,flag,flag,flag
Unnamed: 0_level_1,brown,dog,dogs,fox,jumped,one,quick,ten
0,1.0,,,1.0,1.0,,1.0,
1,,,,1.0,1.0,,,
2,,1.0,,,1.0,,,
3,1.0,,,1.0,,1.0,,
4,1.0,,1.0,,,,,1.0


In [29]:
type(coocc)

pandas.core.frame.DataFrame

In [30]:
import levi

coo.levi

<levi.dataframe.LeviAccessor at 0x7fe038539420>

In [31]:
def affinity_to_edge(source_name,  # type: str
    target_name,  # type: str
    affinity,  # type: DF
    value_name=None  # type: Optional[str]
    ):
# type: (...) -> DF
    """DEPRECATED"""
    return (affinity.reset_index().melt(source_name, value_name=value_name, var_name=target_name).query('weight>0'))


def biadjacency_to_edgelist(affinity, value_name='weight'):
    """assumes affinity/biadj. matrix with named, categorical dtype col/idx"""
    return (affinity.melt(ignore_index=False, value_name=value_name).reset_index().astype(dict(((i.name), (i.dtype)) for i in (affinity.index, affinity.columns))))


def edgelist_to_incidence(edgelist, node_colname, value_colname=None):
    """assume edgelist is indexed by edge number, not some edge set of names (for now)"""
    data = np.ones_like(edgelist.index.values) if value_colname == None else edgelist[value_colname].values
    return coo_array((data, (edgelist.index, edgelist[node_colname].cat.codes)), shape=(edgelist.shape[0], len(edgelist[node_colname].cat.categories)))  # coo_array((data, (row_idx, col_idx)))


def n_to_m_assignment(affinity, max_review, max_assign=None, bp_value_name='weight'):

    n_idx, n_col = affinity.shape
    max_assign = ((ident)((math.ceil)(n_idx * max_review / n_col), side_effect=lambda _=None: print('max assigments per reviewer: {_coconut_format_0}'.format(_coconut_format_0=(_))))) if max_assign is None else max_assign

    edgelist = (biadjacency_to_edgelist)(affinity, value_name=bp_value_name)
    Eidx = edgelist_to_incidence(edgelist, affinity.index.name).T
    Ecol = edgelist_to_incidence(edgelist, affinity.columns.name).T

    K = vstack((Eidx, Ecol))
    m = K.sum(axis=0) + 1e-3 * np.random.rand(edgelist.shape[0])
    d = np.hstack((np.ones((n_idx)) * max_review, np.ones((n_col)) * max_assign))
    return linprog(-m, A_ub=K, b_ub=d, bounds=(0., 1.), options=dict(disp=True))


def edge_to_affinity(source_name,  # type: str
    target_name,  # type: str
    edgelist,  # type: DF
    value_name=None  # type: Optional[str]
    ):
# type: (...) -> DF
    """DEPRECATED"""
    return (edgelist.pivot(index=source_name, columns=target_name, values=value_name).reindex(columns=edgelist[target_name].unique(), index=edgelist[source_name].unique()).astype(float).fillna(0))


# ## TODO ###


def edge_to_bp(source_name,  # type: str
    target_name,  # type: str
    edgelist,  # type: DF
    value_name=None  # type: Optional[str]
    ):
# type: (...) -> nx.Graph
    G = nx.from_pandas_edgelist(edgelist, source=source_name, target=target_name, edge_attr=value_name)
    source_partition = edgelist[source_name].unique().tolist()
    target_partition = edgelist[target_name].unique().tolist()
    nx.set_node_attributes(G, dup_zip(source_partition, {'bipartite': 0.}))
    nx.set_node_attributes(G, dup_zip(target_partition, {'bipartite': 1.}))
    return G


def incidence(source_name,  # type: str
    target_name,  # type: str
    G,  # type: nx.Graph
    edgelist,  # type: DF
    affinity,  # type: DF
    value_name=None  # type: _coconut.typing.Optional[str]
    ):
# type: (...) -> np.ndarray
    return nx.incidence_matrix(G, edgelist=(list)(edgelist[[source_name, target_name]].itertuples(index=False, name=None)), nodelist=affinity.index.to_list() + affinity.columns.to_list())


For now, assume "input" is a doc-term matrix

In [32]:
# from affinity-to-edge

def affinity_to_edge(source_name,  # type: str
    target_name,  # type: str
    affinity,  # type: DF
    value_name=None  # type: Optional[str]
    ):
# type: (...) -> DF
    """DEPRECATED"""
    return (affinity.reset_index().melt(source_name, value_name=value_name, var_name=target_name).query('weight>0'))


In [33]:
doc_term

Unnamed: 0,brown,dog,dogs,fox,jumped,one,quick,ten,cat,bear,tree
0,1,0,0,1,1,0,1,0,0,0,0
1,0,0,0,1,1,0,0,0,0,0,0
2,0,1,0,0,1,0,0,0,0,0,0
3,1,0,0,1,0,1,0,0,0,0,0
4,1,0,1,0,0,0,0,1,0,0,0


In [34]:
ids = ('id-student', 'id-project')

testing = doc_term.reset_index().melt(0, value_name="brown", ).query('weight>0')   #var_name=target_name
testing

  testing = doc_term.reset_index().melt(0, value_name="brown", ).query('weight>0')   #var_name=target_name


KeyError: 0

In [None]:
pd.DataFrame.melt

testing = doc_term.reset_index().melt(source_name, value_name=value_name, var_name=target_name).query('weight>0')
testing

NameError: name 'source_name' is not defined

In [None]:
# @pd.api.extensions.register_dataframe_accessor("levi")   #can also be df, series, or index
# class LeviAccessor:
#     def __init__(self, pandas_obj):
#         self._validate(pandas_obj)
#         self._obj = pandas_obj

#     @staticmethod
#     def _validate(obj):
#         # TODO: use beartype
#         if type(obj) is not pd.DataFrame:
#             raise AttributeError("Must be cooccurrence Dataframe")  #FIXME this is just filler to get accesor to run, need to update
        

#     @property
#     def from_dataframe(self):
#         test = self._obj()
#         return 


  

#     @property
#     def test_attr(self):        
#         return self._obj.iloc[0]
    
#     @property
#     def incidence_matrix(self):
#         return self._obj.unstack(level=1, fill_value=0)
    
    
    

    # def testing(self):
    #     # plot this array's data on a map, e.g., using Cartopy
    #     print(self.__str__)
    #     pass

In [None]:
coo.levi.test_attr

flag    1
Name: (0, brown), dtype: int64

In [36]:

coo.levi.to_biadjacency

AttributeError: 'LeviAccessor' object has no attribute 'to_biadjacency'

In [None]:
# @pd.api.extensions.register_series_accessor("levi")   
# class LeviAccessor:
#     def __init__(self, pandas_obj):
#         self._validate(pandas_obj)
#         self._obj = pandas_obj

#     @staticmethod
#     def _validate(obj):
#         # TODO: use beartype
#         # verify there is a column latitude and a column longitude
#         if len(obj.values) < 0:
#             raise AttributeError("Must have single column of flags")  #fixme: this is just filler to get accesor to run, need to update
#     #     if "latitude" not in obj.columns or "longitude" not in obj.columns:
#     #         raise AttributeError("Must have 'latitude' and 'longitude'.")

#     @property
#     def test_attr(self):        
#         return self._obj.iloc[0]
    
#     @property
#     def incidence_matrix(self):
#         return self._obj.unstack(level=1, fill_value=0)
    

#     # def testing(self):
#     #     # plot this array's data on a map, e.g., using Cartopy
#     #     print(self.__str__)
#     #     pass

In [None]:
# coo_series = coo.squeeze()
# coo_series

In [None]:
# coo_series.levi.incidence_matrix

In [None]:
graph = nx.algorithms.bipartite.from_biadjacency_matrix(coo_matrix)
sets = nx.get_node_attributes(graph, name="bipartite")
colors = {0: 'gold', 1: 'lightskyblue'}

nx.draw(graph, with_labels=True, node_color=[colors[node[1]['bipartite']] 
                    for node in graph.nodes(data=True)])


In [None]:
graph.edges()

In [None]:
# Plotting with HyperNetx
import hypernetx as hnx

In [None]:
term_doc = doc_term.T

In [None]:
# *HNX* hypergraphs can be built from networkx bipartite graph objects using `from_bipartite`
H = hnx.Hypergraph.from_dataframe(term_doc)

In [None]:
hnx.drawing.draw(H)