# Levi Pandas Accessor
## Transform between and among different graph data representations via a Levi graph

Helpful links:  
https://jendobson.com/2020/06/12/writing-custom-accessors-to-avoid-subclassing-pandas-dataframes/  
https://bpw1621.com/archive/extending-pandas/    
https://en.wikipedia.org/wiki/Levi_graph

In [36]:
import numpy as np
import pandas as pd
from scipy import sparse
import networkx as nx
import matplotlib.pyplot as plt
import beartype as bt

plt.rcParams['figure.dpi'] = 60


### Create data to use in development:
- Token Co-occurrence matrix
- Document-term matrix (this is an example of a bipartite structure/incidence matrix)
- COO (format: multi-index pandas Series)


In [37]:
# Create sample dataset

doc_ids = [str(x) for x in range(5)]
texts = [
    "the quick brown fox jumped",
    "the fox jumped",
    "the dog jumped",
    "one brown fox",
    "ten brown dogs",
]

df = pd.DataFrame({'doc_id': doc_ids,
                   'text': texts,
                   })


In [38]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words=['the'])
cv.fit(df['text'])
tokens = cv.get_feature_names_out()  # alphabetical


In [39]:
tokens


array(['brown', 'dog', 'dogs', 'fox', 'jumped', 'one', 'quick', 'ten'],
      dtype=object)

In [40]:
# Create co-occurrence matrix

results = cv.transform(df['text'])
# Zero the diagonal on the dense array *before* wrapping it in a DataFrame.
# `DataFrame.values` is not guaranteed to be a writable view of the frame's
# data (e.g. with pandas Copy-on-Write it is read-only), so filling it in
# place after construction is not reliable.
cooc_counts = results.T.dot(results).toarray()
np.fill_diagonal(cooc_counts, 0)
coocc = pd.DataFrame(cooc_counts,
                     index=pd.CategoricalIndex(tokens),
                     columns=pd.CategoricalIndex(tokens))
coocc


Unnamed: 0,brown,dog,dogs,fox,jumped,one,quick,ten
brown,0,0,1,2,1,1,1,1
dog,0,0,0,0,1,0,0,0
dogs,1,0,0,0,0,0,0,1
fox,2,0,0,0,2,1,1,0
jumped,1,1,0,2,0,0,1,0
one,1,0,0,1,0,0,0,0
quick,1,0,0,1,1,0,0,0
ten,1,0,1,0,0,0,0,0


In [41]:
# Create document term matrix (BIPARTITE STRUCTURE)

results = cv.transform(df['text'])
features = cv.get_feature_names_out()
doc_term = pd.DataFrame(results.toarray(), columns=features)
doc_term = doc_term.reindex(
    columns=doc_term.columns.tolist()+['cat', 'bear', 'tree'], fill_value=0)
doc_term


Unnamed: 0,brown,dog,dogs,fox,jumped,one,quick,ten,cat,bear,tree
0,1,0,0,1,1,0,1,0,0,0,0
1,0,0,0,1,1,0,0,0,0,0,0
2,0,1,0,0,1,0,0,0,0,0,0
3,1,0,0,1,0,1,0,0,0,0,0
4,1,0,1,0,0,0,0,1,0,0,0


In [42]:
# Create COO Dataframe
# using scipy sparse module here, but we want to avoid it in production
coo_matrix = sparse.coo_matrix(doc_term.values)
coo_rows = coo_matrix.row
coo_cols = coo_matrix.col
coo_vals = coo_matrix.data

# Map each column position to its term. enumerate() follows the actual number
# of columns instead of a hard-coded count, so the mapping stays complete if
# columns are added to or removed from `doc_term`.
term_dict = dict(enumerate(doc_term.columns))
term_dict


{0: 'brown',
 1: 'dog',
 2: 'dogs',
 3: 'fox',
 4: 'jumped',
 5: 'one',
 6: 'quick',
 7: 'ten',
 8: 'cat',
 9: 'bear',
 10: 'tree'}

In [43]:
print(coo_matrix)  # scipy sparse format


  (0, 0)	1
  (0, 3)	1
  (0, 4)	1
  (0, 6)	1
  (1, 3)	1
  (1, 4)	1
  (2, 1)	1
  (2, 4)	1
  (3, 0)	1
  (3, 3)	1
  (3, 5)	1
  (4, 0)	1
  (4, 2)	1
  (4, 7)	1


In [44]:
coo_cols = coo_cols.tolist()
coo_col_names = [term_dict[key] for key in coo_cols]


In [45]:
idx = pd.MultiIndex.from_arrays([coo_matrix.row, coo_col_names])
coo = pd.DataFrame(coo_matrix.data, index=idx, columns=["flag"])
coo


Unnamed: 0,Unnamed: 1,flag
0,brown,1
0,fox,1
0,jumped,1
0,quick,1
1,fox,1
1,jumped,1
2,dog,1
2,jumped,1
3,brown,1
3,fox,1


In [46]:
coo.index


MultiIndex([(0,  'brown'),
            (0,    'fox'),
            (0, 'jumped'),
            (0,  'quick'),
            (1,    'fox'),
            (1, 'jumped'),
            (2,    'dog'),
            (2, 'jumped'),
            (3,  'brown'),
            (3,    'fox'),
            (3,    'one'),
            (4,  'brown'),
            (4,   'dogs'),
            (4,    'ten')],
           )

In [47]:
# new = pd.CategoricalDtype(coo.index)
# new


In [48]:
levi_series = coo.squeeze()
levi_series.index


MultiIndex([(0,  'brown'),
            (0,    'fox'),
            (0, 'jumped'),
            (0,  'quick'),
            (1,    'fox'),
            (1, 'jumped'),
            (2,    'dog'),
            (2, 'jumped'),
            (3,  'brown'),
            (3,    'fox'),
            (3,    'one'),
            (4,  'brown'),
            (4,   'dogs'),
            (4,    'ten')],
           )

In [49]:
levi_series


0  brown     1
   fox       1
   jumped    1
   quick     1
1  fox       1
   jumped    1
2  dog       1
   jumped    1
3  brown     1
   fox       1
   one       1
4  brown     1
   dogs      1
   ten       1
Name: flag, dtype: int64

### Converting Levi series to other graph formats

This demonstrates `series.py`

In [50]:
from levi import series


Test the failing path of the validation function. The (commented-out) call below triggers the `test failed validation function` error because the series does not have a two-level MultiIndex.

In [51]:
# not_levi_series = pd.Series([0,7,8])
# not_levi_series.levi.to_incidence


In [52]:
levi_series


0  brown     1
   fox       1
   jumped    1
   quick     1
1  fox       1
   jumped    1
2  dog       1
   jumped    1
3  brown     1
   fox       1
   one       1
4  brown     1
   dogs      1
   ten       1
Name: flag, dtype: int64

**Convert Levi to edgelist, where first column is source, second is target, and third is the edge value**

In [53]:
el = levi_series.levi.to_edgelist(level_0="term_0", level_1="term_1")
el


*** test validator***


Unnamed: 0,term_0,term_1,0
0,brown,brown,0
1,brown,dog,0
2,brown,dogs,1
3,brown,fox,2
4,brown,jumped,1
...,...,...,...
59,ten,fox,0
60,ten,jumped,0
61,ten,one,0
62,ten,quick,0


**Convert Levi to adjacency matrix (both indices are the same)**

In [54]:
adjacency_matrix = levi_series.levi.to_adjacency()
adjacency_matrix


Unnamed: 0,brown,dog,dogs,fox,jumped,one,quick,ten
brown,0,0,1,2,1,1,1,1
dog,0,0,0,0,1,0,0,0
dogs,1,0,0,0,0,0,0,1
fox,2,0,0,0,2,1,1,0
jumped,1,1,0,2,0,0,1,0
one,1,0,0,1,0,0,0,0
quick,1,0,0,1,1,0,0,0
ten,1,0,1,0,0,0,0,0


**Convert Levi to biadjacency, where row index is one type of node (here, the document IDs) and column index is another type of node (tokens)**

In [55]:
biadjacency_matrix = levi_series.levi.to_biadjacency()
biadjacency_matrix


Unnamed: 0,brown,dog,dogs,fox,jumped,one,quick,ten
0,1,0,0,1,1,0,1,0
1,0,0,0,1,1,0,0,0
2,0,1,0,0,1,0,0,0
3,1,0,0,1,0,1,0,0
4,1,0,1,0,0,0,0,1


### Creating a new Levi graph: Key feature of Grabble

Taking an original Levi graph of two kinds of "things", we transform it into a new Levi graph in which those things have been combined into a single category. The second category in the new Levi graph is the set of connections between the things in the original Levi graph.
For instance, we combine documents and tokens into a single category. The second category comes from the indexing of the original Levi structure and represents a new "thing": an instance of a particular document-term connection. 

In [56]:
new_levi = levi_series.levi.to_new_levi()
new_levi


index  level_0
0      0.0        1
1      0.0        1
2      0.0        1
3      0.0        1
4      1.0        1
5      1.0        1
6      2.0        1
7      2.0        1
8      3.0        1
9      3.0        1
10     3.0        1
11     4.0        1
12     4.0        1
13     4.0        1
0      brown      1
1      fox        1
2      jumped     1
3      quick      1
4      fox        1
5      jumped     1
6      dog        1
7      jumped     1
8      brown      1
9      fox        1
10     one        1
11     brown      1
12     dogs       1
13     ten        1
Name: flag, dtype: object

In [57]:
new_levi.index.names


FrozenList(['index', 'level_0'])

In [58]:
new_levi.name


'flag'

### Converting new levi graph to another form:

Now that we have a new Levi graph, we can turn it into the other graph formats (adjacency, biadjacency, and edgelist), just as we did with the original Levi:

In [59]:
new_levi_edgelist = new_levi.levi.to_edgelist()
new_levi_edgelist.head(15)


*** test validator***


Unnamed: 0,level_0,level_1,0
0,0.0,0.0,0
1,0.0,1.0,0
2,0.0,2.0,0
3,0.0,3.0,0
4,0.0,4.0,0
5,0.0,brown,1
6,0.0,dog,0
7,0.0,dogs,0
8,0.0,fox,1
9,0.0,jumped,1


In [60]:
new_levi_adjacency = new_levi.levi.to_adjacency(
    level_0="index", level_1="level_0")
new_levi_adjacency


Unnamed: 0,0.0,1.0,2.0,3.0,4.0,brown,dog,dogs,fox,jumped,one,quick,ten
0.0,0,0,0,0,0,1,0,0,1,1,0,1,0
1.0,0,0,0,0,0,0,0,0,1,1,0,0,0
2.0,0,0,0,0,0,0,1,0,0,1,0,0,0
3.0,0,0,0,0,0,1,0,0,1,0,1,0,0
4.0,0,0,0,0,0,1,0,1,0,0,0,0,1
brown,1,0,0,1,1,0,0,0,0,0,0,0,0
dog,0,0,1,0,0,0,0,0,0,0,0,0,0
dogs,0,0,0,0,1,0,0,0,0,0,0,0,0
fox,1,1,0,1,0,0,0,0,0,0,0,0,0
jumped,1,1,1,0,0,0,0,0,0,0,0,0,0


In [61]:
new_levi_biadjacency = new_levi.levi.to_biadjacency()
new_levi_biadjacency


Unnamed: 0,0.0,1.0,2.0,3.0,4.0,brown,dog,dogs,fox,jumped,one,quick,ten
0,1,0,0,0,0,1,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,1,0
4,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,1,0,0,0,0,0,0,0,1,0,0,0
6,0,0,1,0,0,0,1,0,0,0,0,0,0
7,0,0,1,0,0,0,0,0,0,1,0,0,0
8,0,0,0,1,0,1,0,0,0,0,0,0,0
9,0,0,0,1,0,0,0,0,1,0,0,0,0


In [62]:
another_new_levi = new_levi.levi.to_new_levi()
another_new_levi


index  level_0
0      0.0        1
1      1.0        1
2      2.0        1
3      3.0        1
4      4.0        1
5      5.0        1
6      6.0        1
7      7.0        1
8      8.0        1
9      9.0        1
10     10.0       1
11     11.0       1
12     12.0       1
13     13.0       1
14     0.0        1
15     1.0        1
16     2.0        1
17     3.0        1
18     4.0        1
19     5.0        1
20     6.0        1
21     7.0        1
22     8.0        1
23     9.0        1
24     10.0       1
25     11.0       1
26     12.0       1
27     13.0       1
0      0.0        1
1      0.0        1
2      0.0        1
3      0.0        1
4      1.0        1
5      1.0        1
6      2.0        1
7      2.0        1
8      3.0        1
9      3.0        1
10     3.0        1
11     4.0        1
12     4.0        1
13     4.0        1
14     brown      1
15     fox        1
16     jumped     1
17     quick      1
18     fox        1
19     jumped     1
20     dog        1
21   

### testing out beartype definitions for our structures

In [63]:
from beartype import beartype
from beartype.vale import Is, IsAttr, IsEqual, IsSubclass, IsInstance
from typing import Annotated

import pandera as pa


In [64]:
# Type hint for a Levi series: a pandas Series whose index is a MultiIndex
# with exactly two levels. `Annotated[pd.Series, ...]` already enforces the
# Series check, so only the index constraints are spelled out as validators.
series_2Dindex = Annotated[pd.Series,
                           IsAttr['index',
                                  IsInstance[pd.MultiIndex] &
                                  IsAttr['nlevels', IsEqual[2]]]
                           ]


@beartype
def validate_levi(obj: series_2Dindex):
    """Print a confirmation message for a valid Levi series.

    The body only prints; the actual check is the ``series_2Dindex``
    annotation, which the ``@beartype`` decorator enforces on ``obj`` when
    the function is called.
    """
    print("passed Levi series validator")


validate_levi(levi_series)
validate_levi(new_levi)


passed Levi series validator
passed Levi series validator


In [65]:
# Type hint for an edge list: a DataFrame whose first two columns contain the
# same set of values, i.e. every label that appears in column 0 also appears
# in column 1 and vice versa. The lambda indexes columns 0 and 1 by position,
# so it expects at least two columns.
edgelist_dataframe = Annotated[pd.DataFrame,
                               IsInstance[pd.DataFrame],
                               Is[lambda el:
                                  set(el.iloc[:, 0]) == set(el.iloc[:, 1])]]


@beartype
def validate_el(obj: edgelist_dataframe):
    """Print a confirmation message for a valid edge list.

    The body only prints; ``obj`` is checked against ``edgelist_dataframe``
    by the ``@beartype`` decorator when the function is called.
    """
    print("passed edgelist validator")


validate_el(el)


passed edgelist validator


In [66]:
# Type hint for an adjacency matrix: a DataFrame whose row index equals its
# column index and whose main diagonal contains no truthy (non-zero) entry.
adjacency_dataframe = Annotated[pd.DataFrame,
                                IsInstance[pd.DataFrame],
                                Is[lambda adj:
                                    adj.index.equals(adj.columns)],
                                Is[lambda adj:
                                    not np.diag(adj).any()]]


@beartype
def validate_adj(obj: adjacency_dataframe):
    """Print a confirmation message for a valid adjacency matrix.

    The body only prints; ``obj`` is checked against ``adjacency_dataframe``
    by the ``@beartype`` decorator when the function is called.
    """
    print("passed adjacency matrix validator")


validate_adj(adjacency_matrix)


passed adjacency matrix validator


In [67]:
biadjacency_matrix


Unnamed: 0,brown,dog,dogs,fox,jumped,one,quick,ten
0,1,0,0,1,1,0,1,0
1,0,0,0,1,1,0,0,0
2,0,1,0,0,1,0,0,0
3,1,0,0,1,0,1,0,0
4,1,0,1,0,0,0,0,1


In [68]:
# Type hint for a biadjacency matrix. For now it only requires a DataFrame;
# no structural constraint on rows/columns is expressed yet.
biadjacency_dataframe = Annotated[pd.DataFrame,
                                  IsInstance[pd.DataFrame],
                                  ]


@beartype
def validate_biadj(obj: biadjacency_dataframe):
    """Print a confirmation message for a valid biadjacency matrix.

    The body only prints; ``obj`` is checked against
    ``biadjacency_dataframe`` by the ``@beartype`` decorator when the
    function is called.
    """
    print("passed biadjacency matrix validator")


validate_biadj(biadjacency_matrix)


passed biadjacency matrix validator


In [69]:
type(biadjacency_dataframe)


typing._AnnotatedAlias

In [70]:
import pandera as pa

biadj_inferred_schema = pa.infer_schema(biadjacency_matrix)

# NOTE: `to_script()` needs pandera's optional IO dependencies
# (`pip install pandera[io]`); without them it raises ImportError.
with open("inferred_schema.py", "w", encoding="utf-8") as file:
    file.write(biadj_inferred_schema.to_script())

# Validate the matrix the schema was inferred from. The previous
# `schema.validate(products, ...)` referred to names that are not defined in
# this notebook.
try:
    biadj_inferred_schema.validate(biadjacency_matrix, lazy=True)
except pa.errors.SchemaErrors as err:
    print(err)


ImportError: IO and formatting requires 'pyyaml', 'black' and 'frictionless'to be installed.
You can install pandera together with the IO dependencies with:
pip install pandera[io]


### Convert between graph (matrix) formats:

In [None]:
# import numpy as np
# import scipy.sparse as sp

# def cooccurrence_to_levi(cooccurrence_matrix):
#     """
#     Convert a co-occurrence matrix to Levi structure.
    
#     Parameters:
#         cooccurrence_matrix (numpy.ndarray or scipy.sparse.csr_matrix): Co-occurrence matrix.
    
#     Returns:
#         numpy.ndarray: Levi structure matrix.
#     """
#     if isinstance(cooccurrence_matrix, np.ndarray):
#         cooccurrence_matrix = sp.csr_matrix(cooccurrence_matrix)
        
#     N = cooccurrence_matrix.shape[0]
#     levi_matrix = np.zeros((N, N), dtype=int)
    
#     for i in range(N):
#         for j in range(i + 1, N):
#             if cooccurrence_matrix[i, j] > 0:
#                 levi_matrix[i, j] = 1
#                 levi_matrix[j, i] = 1
                
#     return levi_matrix

# def levi_to_edgelist(levi_matrix):
#     """
#     Convert Levi structure matrix to an edge list.
    
#     Parameters:
#         levi_matrix (numpy.ndarray): Levi structure matrix.
    
#     Returns:
#         list: List of edges.
#     """
#     edges = []
#     N = levi_matrix.shape[0]
    
#     for i in range(N):
#         for j in range(i + 1, N):
#             if levi_matrix[i, j] > 0:
#                 edges.append((i, j))
                
#     return edges

# # Example usage
# cooccurrence_matrix = np.array([[0, 2, 1], [2, 0, 0], [1, 0, 0]])
# levi_matrix = cooccurrence_to_levi(cooccurrence_matrix)
# edges = levi_to_edgelist(levi_matrix)

# print("Co-occurrence matrix:")
# print(cooccurrence_matrix)
# print("\nLevi structure matrix:")
# print(levi_matrix)
# print("\nEdge list:")
# for edge in edges:
#     print(edge)



In [None]:
# levi_cgpt = cooccurrence_to_levi(adjacency_matrix)
# levi_cgpt

In [None]:
def cooccurrence_to_edgelist(cooc_mat):
    """Return the non-zero entries of a co-occurrence matrix as an edge list.

    Parameters
    ----------
    cooc_mat : array-like
        Two-dimensional matrix: a NumPy array, a pandas DataFrame, a nested
        list, or an object exposing ``toarray()`` (e.g. a scipy sparse
        matrix). Labelled inputs are reduced to their values, so the
        returned edges use positional row/column numbers, not labels.

    Returns
    -------
    list of tuple
        ``(row, col, value)`` for every non-zero cell, in row-major order.
    """
    # Coerce once to a dense array: a DataFrame cannot be indexed positionally
    # with ``cooc_mat[i, j]`` (that is a column lookup and raises KeyError).
    if hasattr(cooc_mat, "toarray"):
        dense = cooc_mat.toarray()
    else:
        dense = np.asarray(cooc_mat)

    rows, cols = np.nonzero(dense)
    return [(i, j, dense[i, j]) for i, j in zip(rows, cols)]

cooccurrence_to_edgelist(adjacency_matrix)

### Converting graph formats into Levi

Say we have a biadjacency matrix, representing documents and terms.

In [None]:
# TODO: handle categorical data within functions
doc_term.index = doc_term.index.astype('category')
doc_term.columns = doc_term.columns.astype('category')
doc_term


...and we want to project this graph in a new way, where documents and terms are both the same "type" of thing, basically a new type of node which includes both. 

We want to create a **new levi graph**, where the nodes have been redefined.

In [None]:
# Convert to Levi format
levi_from_doc_term = doc_term.levi.biadjacency_to_levi()
levi_from_doc_term


In [None]:
def to_new_levi(lv, level_0="level_0",  level_1="level_1"):
    """Project a Levi series onto a new Levi series with merged node types.

    Every entry of ``lv`` (one value per pair of index labels) contributes
    two rows to the result: one keyed by its first-level label and one keyed
    by its second-level label. Both rows share the positional number of the
    original entry, which becomes the first level (``"index"``) of the
    returned MultiIndex.

    Parameters
    ----------
    lv : pandas.Series
        Series with a two-level MultiIndex. It is not modified.
    level_0, level_1 : str
        Column names used when merging the two label columns into one.
        NOTE(review): the final ``set_index`` hard-codes ``"index"`` and
        ``"level_0"``, so only the default names are known to work here.

    Returns
    -------
    pandas.Series
        Values of ``lv`` indexed by ``("index", "level_0")``.
    """
    # Drop the level names on a relabelled copy so that reset_index() yields
    # the default "level_0"/"level_1" columns. Assigning to `lv.index.names`
    # would rename the caller's index in place.
    levi_df = lv.rename_axis(index=[None, None]).reset_index()
    # Stack the (first label, value) and (second label, value) column pairs,
    # give both label columns the same name, then stack()/unstack() to merge
    # the two same-named, complementary (NaN-padded) columns into one.
    new_levi_df = (pd.concat([levi_df.iloc[:, [0, 2]], levi_df.iloc[:, [1, 2]]])
                   .reset_index()
                   .rename(columns={level_1: level_0})
                   .stack()
                   .unstack()
                   )
    new_levi = new_levi_df.set_index(["index", "level_0"]).squeeze()
    return new_levi


to_new_levi(levi_from_doc_term)


In [None]:
new_levi_from_doc_term = levi_from_doc_term.levi.to_new_levi(
    level_0="index",  level_1="variable")
new_levi_from_doc_term


### Trying out Plum

In [None]:
# from plum import dispatch


In [None]:
# df_edgelist = Annotated[pd.Dataframe,  # need this pd.Series?
#                         IsInstance[pd.Dataframe] &
#                         # IsAttr['index', IsInstance[pd.MultiIndex]]
#                                ]


**Scratchwork:**

In [None]:
# def to_new_levi(old_levi, level_0='level_0',  level_1='level_1'):
#     el = old_levi.levi.to_edgelist()
#     new_levi_el = (pd.concat([el[[level_0, "flag"]], el[[level_1, "flag"]]])
#                         .reset_index()
#                         .rename(columns={level_1: level_0})
#                         .stack()
#                         .unstack()
#                     )
#     new_levi = pd.Series(new_levi_el.flag)
#     new_levi.index = (pd.MultiIndex.from_arrays(new_levi_el.values.T))
#     return new_levi


In [None]:
# new_2 = pd.concat([el_1[["level_0", "flag"]], el_1[["level_1", "flag"]]]).reset_index().rename(columns={'level_1': 'level_0'}).stack().unstack()
# # index2 = pd.MultiIndex(new_2[["index", "level_0"]])
# # index2
# # new_2 = pd.Series(index=index2, data=new_2.flag)
# # # new_2 = pd.MultiIndex(new_2['level_0'].fillna(new_2['level_1']))

# foo = pd.Series(new_2.flag) #{"C":[100,200,300]}
# foo.index = pd.MultiIndex.from_arrays(new_2.values.T)
# foo
# new_2


In [None]:
# type(new_2)


In [None]:
# Notes/code scratchwork from meeting on 3/16

# edgelist
# pd.concat([
#     edgelist.set_index([edgelist.index, 'doc'])['flag'], #.notna(#.sum(axis=1))
#     edgelist.set_index([edgelist.index, 'term'])['flag']
# ]) #.index

# pd.CategoricalDtype


In [None]:
# edgelist.set_index([edgelist.index, 'doc'])['flag'], #.notna(#.sum(axis=1))


In [None]:

# new_thing = pd.concat([
#             edgelist.set_index([edgelist.index, 'doc']),
#             edgelist.set_index([edgelist.index, 'term'])
#             ]
#     )


# new_thing
# projected_levi = pd.Series(new_thing[[str(data)]])
# projected_levi


In [None]:
# def new_levi(og_levi, level_0 = 'level_0',  level_1 = 'level_1'):
#     el = og_levi.levi.to_edgelist(level_0=level_0, level_1=level_1)
#     print(el.columns)
#     # new_index = pd.MultiIndex.from_arrays([el.index, list(zip(el.level_0, el.level_1))])
#     new_levi = pd.concat([
#             el.set_index([el.index, level_0]),
#             el.set_index([el.index, level_1])
#             ]
#     )

#     # new_levi = pd.Series(data=el.flag.values, index=new_index)
#     new_levi = el.set_index([el.index, list(zip(el.level_0, el.level_1))])
#     # new_levi = el.set_index([el.index, el.level_0])
#     return new_levi

# test_new_levi = new_levi(levi_series, level_0 = 'doc',  level_1 = 'term')
# test_new_levi


In [None]:
# type(test_new_levi)


In [None]:
# test_new_levi.index


In [None]:
# import levi
# from pandas import DataFrame

# def to_adjacency(levi, level_0: str = "level_0", level_1: str = "level_1") -> DataFrame:
#         # similar to nx.from_pandas_edgelist()
#         # TODO: handle different names for level_0, level_1
#         df = levi.to_frame().reset_index() # moves multiindex into columns
#         A = pd.crosstab(df.level_0, df.level_1)
#         # df2 = A.T @ A
#         # np.fill_diagonal(df2.values, 0)
#         # df2.index.name = None
#         return A #df2

# test_adj = to_adjacency(test_new_levi)
# test_adj


In [None]:
# more notes
from scipy.sparse import coo_array

# A = coo_array(students@stuents.T)
A = coo_array(coocc@coocc.T)
# pd.Series.sparse.from_coo(A).unstack()
# pd.Series(index= pd.MultiIndex.from_arrays([A.row, A.col]))


In [None]:
levi_series


In [None]:
# new_levi = edgelist.set_index(edgelist.index, edgelist.columns[0]) #.unstack()
# # new_levi = edgelist.set_index(edgelist.index, edgelist.columns[0]) #.unstack()

# new_levi


### Converting different formats to Levi

In [None]:
edgelist.levi.edgelist_to_levi()


In [None]:
coocc.levi.adjacency_to_levi()


In [None]:
adjacency_matrix.levi.adjacency_to_levi()


In [None]:
coocc.melt(ignore_index=False).squeeze()


In [None]:
biadjacency_matrix.levi.biadjacency_to_levi()


In [None]:
doc_term.levi.biadjacency_to_levi()


In [None]:

def adjacency_to_levi(adj):
    """Melt an adjacency DataFrame to long form, keeping the row labels.

    ``melt(ignore_index=False)`` emits one row per cell with the column label
    and the cell value as columns, while the original row labels stay in the
    index. ``squeeze()`` then collapses any axis of length one.

    NOTE(review): the melted frame has two columns, so for inputs with more
    than one cell ``squeeze()`` leaves it a DataFrame rather than a
    MultiIndexed Series -- confirm whether that is the intended Levi form.
    """
    long_form = adj.melt(ignore_index=False)
    return long_form.squeeze()


adjacency_to_levi(adjacency_matrix)


In [None]:
el_og = edgelist
el_og


In [None]:
levi_series


In [None]:
el_og.groupby("doc").set_index()


In [None]:
def edgelist_to_levi(el):
    """Convert an edge-list DataFrame into a Levi series.

    This is the inverse of the Levi-to-edgelist conversion: the first two
    columns (source, target) become the two levels of a MultiIndex and the
    third column, when present, becomes the values.

    Parameters
    ----------
    el : pandas.DataFrame
        Edge list whose first column is the source, second the target and
        optional third the edge value. Any further columns are ignored.

    Returns
    -------
    pandas.Series
        Edge values indexed by ``(source, target)``; the index level names
        are the two column names. When ``el`` has no value column every edge
        gets the value ``1``.

    Raises
    ------
    ValueError
        If ``el`` has fewer than two columns.
    """
    if el.shape[1] < 2:
        raise ValueError(
            "edgelist needs at least a source and a target column")

    levi_index = pd.MultiIndex.from_frame(el.iloc[:, :2])
    if el.shape[1] > 2:
        values = el.iloc[:, 2].to_numpy()
        name = el.columns[2]
    else:
        # Unweighted edge list: flag every listed edge with 1.
        values = np.ones(len(el), dtype=int)
        name = None
    return pd.Series(values, index=levi_index, name=name)


### Converting between different formats, using Levi as common intermediate format

In [None]:
from levi import dataframe


In [None]:
adjacency_matrix.levi.adjacency_to_edgelist()


In [None]:
edgelist.levi.edgelist_to_biadjacency(
    source_name="doc_ids", target_name="tokens")


In [None]:
def edgelist_to_incidence(edgelist, node_colname, value_colname=None):
    """Build a sparse edge-by-node incidence matrix from an edge list.

    Assumes ``edgelist`` is indexed by edge number (0..n-1), not by some set
    of edge names (for now): the index values are used directly as row
    positions.

    Parameters
    ----------
    edgelist : pandas.DataFrame
        One row per edge.
    node_colname : str
        Column holding the node of each edge. A categorical column keeps its
        categories (including unused ones) as the matrix columns; any other
        column is converted to categorical first.
    value_colname : str, optional
        Column holding the entry for each edge. When omitted every entry
        is 1.

    Returns
    -------
    scipy.sparse.coo_array
        Array of shape ``(number of rows in edgelist, number of categories)``.
    """
    if value_colname is None:
        data = np.ones_like(edgelist.index.values)
    else:
        data = edgelist[value_colname].values

    # astype("category") is a no-op for columns that are already categorical
    # and lets plain string/int columns work instead of failing on `.cat`.
    nodes = edgelist[node_colname].astype("category")
    row_idx = edgelist.index.to_numpy()
    col_idx = nodes.cat.codes.to_numpy()
    shape = (edgelist.shape[0], len(nodes.cat.categories))
    return sparse.coo_array((data, (row_idx, col_idx)), shape=shape)


In [None]:
inc = edgelist_to_incidence(edgelist, node_colname='term')
print(inc)


In [None]:
# tuple_list = list(zip(coo_rows, coo_col_names, coo_matrix.data))
tuple_list = list(zip(coo_rows, coo_col_names, ))
tuple_list_w_data = [(*t, 1) for t in tuple_list]


In [None]:
import itertools
row_index = dict(zip(tuple_list_w_data, itertools.count()))
row_index


In [None]:
# array_x = nx.algorithms.bipartite.biadjacency_matrix(graph, row_order=tuple_list_w_data)
# array_x


In [None]:
# Make sure pandas categorical works with ints
#   - in nb, ss example has index as ints, not categoricals
#   - pandas_categoricals


In [None]:
coo.unstack(level=1, )


In [None]:
type(coocc)


In [None]:
import levi

coo.levi


For now, assume "input" is a doc-term matrix

In [None]:
# from affinity-to-edge

def affinity_to_edge(source_name: str,
                     target_name: str,
                     affinity: pd.DataFrame,
                     value_name=None,
                     ) -> pd.DataFrame:
    """DEPRECATED. Melt an affinity matrix into an edge list of positive entries.

    Parameters
    ----------
    source_name : str
        Name of the column produced by ``affinity.reset_index()`` that holds
        the row labels; it is used as the melt identifier column.
    target_name : str
        Name given to the column holding the former column labels.
    affinity : pandas.DataFrame
        Matrix with sources on the rows and targets on the columns.
    value_name : str, optional
        Name given to the value column. Defaults to pandas' ``"value"``.

    Returns
    -------
    pandas.DataFrame
        Long-form rows ``(source, target, value)`` restricted to values
        greater than zero.
    """
    value_col = "value" if value_name is None else value_name
    melted = affinity.reset_index().melt(
        source_name, value_name=value_col, var_name=target_name)
    # Filter on the actual value column; a hard-coded 'weight' only works
    # when the caller happens to pass value_name='weight'.
    return melted[melted[value_col] > 0]


In [None]:
doc_term.columns.dtype


### Testing dataframe accessor

In [None]:
# ba = edgelist.levi.edgelist_to_biadjacency()
# ba


## pandas-flavor

https://github.com/pyjanitor-devs/pandas_flavor   
another approach to extending pandas for levi?

In [None]:
# example code using pandas-flavor

# def add_method(key, val, fn_name=None):
#     def fn(df):
#         return df.loc[df[key] == val]

#     if fn_name is None:
#         fn_name = f'{key}_{val}'

#     fn.__name__ = fn_name
#     fn = pf.register_dataframe_method(fn)
#     return fn

# for name1 in ['john', 'lisa']:
#     add_method('name1', name1)

# for name2 in ['fay', 'meg', 'wil']:
#     add_method('name2', name2)


# # OR, another approach:

# @pf.register_dataframe_method
# def name1(df, val):
#     return df.loc[df['name1'] == val]

# @pf.register_dataframe_method
# def name2(df, val):
#     return df.loc[df['name2'] == val]

# test.name1('lisa')
# #   name1 name2  scoreA  scoreB
# # 1  lisa   wil    9.67    8.87
# # 2  lisa   fay    3.41    5.04
# # 3  lisa   wil    0.58    6.12

# test.name1('lisa').name2('wil')
# #   name1 name2  scoreA  scoreB
# # 1  lisa   wil    9.67    8.87
# # 3  lisa   wil    0.58    6.12


### Type validation, `beartype` stuff (mostly scratchwork)

In [None]:
from beartype import beartype
from beartype import typing as bt
from beartype.door import TypeHint as th

from typing_extensions import NamedTuple
# import static_frame as sf

from dataclasses import dataclass, InitVar
import networkx as nx
# from typing import NamedTuple
from IPython.display import display
import numpy as np
from numpy.typing import DTypeLike


In [None]:
from beartype import beartype
from beartype.vale import IsAttr, IsEqual, IsSubclass, IsInstance
from typing import Annotated


In [None]:
# Type hint matching only two-dimensional NumPy arrays of floats of
# arbitrary precision. This time, do it faster than anyone has ever
# type-checked NumPy arrays before. (Cue sonic boom, Chuck Yeager.)
import numpy as np
Numpy2DFloatArray = Annotated[np.ndarray,
                              IsAttr['ndim', IsEqual[2]] &
                              IsAttr['dtype', IsAttr['type',
                                                     IsSubclass[np.floating]]]
                              ]


In [None]:
levi_series.index


In [None]:
# Type hint for a Levi series: a pandas Series whose index is a MultiIndex
# with exactly two levels. `Annotated[pd.Series, ...]` already enforces the
# Series check, so only the index constraints are spelled out as validators.
series_2Dindex = Annotated[pd.Series,
                           IsAttr['index',
                                  IsInstance[pd.MultiIndex] &
                                  IsAttr['nlevels', IsEqual[2]]]
                           ]


In [None]:
@beartype
def test_series_stuff(levi: series_2Dindex):
    """Print a marker line; used to exercise the ``series_2Dindex`` hint.

    The body only prints. ``levi`` is checked against ``series_2Dindex`` by
    the ``@beartype`` decorator when the function is called.
    """
    print("*** TEST ***")


test_series_stuff(levi_series)


In [None]:
test_series_stuff(edgelist)


In [None]:
test_series_stuff(levi)


In [None]:
from phantom import Phantom
from phantom.predicates import Predicate
import beartype.typing as bt
from beartype.door import is_bearable


In [None]:
def of_beartype(t: bt.Union[type, bt.Tuple[type, ...]]) -> Predicate[object]:
    """Create a predicate that succeeds when its argument is bearable on ``t``.

    Adapted from ``phantom.predicates.generic.of_type``, with the type test
    delegated to ``beartype.door.is_bearable``.

    Parameters
    ----------
    t
        The hint each candidate object is tested against.

    Returns
    -------
    Predicate[object]
        A one-argument callable returning ``is_bearable(a, t)``.
    """

    # Closure over `t`: the hint is fixed once, the object varies per call.
    def check(a: object) -> bool:
        return is_bearable(a, t)

    return check


In [None]:
# chck = of_beartype(coo)
# chck(coo)


In [None]:
# beartype sample code

# Annotate @beartype-decorated callables with beartype validators.
@beartype
def polygon_area(polygon: Numpy2DFloatArray) -> float:
    """Return the area enclosed by a two-dimensional polygon.

    ``polygon`` holds one vertex per row; columns 0 and 1 are read as the
    two coordinates of each vertex. The area is half the absolute value of
    the summed cross products of consecutive vertices (the discrete form of
    Green's theorem).
    """
    # Pair every vertex with its successor; the last vertex wraps around to
    # the first.
    successor = np.roll(polygon, -1, axis=0)

    xs, ys = polygon[:, 0], polygon[:, 1]
    next_xs, next_ys = successor[:, 0], successor[:, 1]

    cross_terms = xs * next_ys - next_xs * ys
    signed_area = 0.5 * np.sum(cross_terms)
    return np.abs(signed_area)


In [None]:
from beartype.vale import IsInstance


# Marker subclass of `str`; adds no behavior of its own.
class Token(str):
    ...


# Experiment: subclass an `Annotated[str, IsInstance[Token]]` alias directly.
# NOTE(review): `Token` is not among the bases here, so instances of `Tester`
# are presumably plain `str` subclasses and not `Token` instances -- confirm
# with the `isinstance` check that follows this definition.
@beartype
class Tester(bt.Annotated[str, IsInstance[Token]]):
    ...


isinstance(Tester('abc'), Token)


In [None]:
# # Import the requisite machinery.
# from beartype.vale import Is
# from typing import Annotated   # <--------------- if Python ≥ 3.9.0

# # Type hint matching only strings with lengths ranging [4, 40].
# LengthyString = Annotated[str, Is[lambda text: 4 <= len(text) <= 40]]

# test_typing = Annotated[pd.Series, Is[pd.Series]]
# test_typing


In [None]:
# ids = ('id-student', 'id-project')

# testing = doc_term.reset_index().melt(0, value_name="brown", ).query('weight>0')   #var_name=target_name
# testing


In [None]:
# pd.DataFrame.melt

# testing = doc_term.reset_index().melt(source_name, value_name=value_name, var_name=target_name).query('weight>0')
# testing


### Plotting Graphs
Plan to eventually add functionality in Grabble to plug into plotting packages in a way that is compatible with the Levi-format workflow. Networkx and Hypernetx are two options.

In [None]:
# Call the accessor method: without the parentheses `new_graph` would be the
# bound method itself rather than the biadjacency DataFrame.
new_graph = new_levi.levi.to_biadjacency()

# networkx expects a scipy sparse matrix here, not a DataFrame. Row/column
# labels are dropped in the conversion, so nodes are numbered positionally.
# NOTE(review): assumes the biadjacency values are numeric -- confirm.
graph = nx.algorithms.bipartite.from_biadjacency_matrix(
    sparse.coo_matrix(new_graph.to_numpy()))
sets = nx.get_node_attributes(graph, name="bipartite")
colors = {0: 'gold', 1: 'lightskyblue'}

# Color each node by the bipartite set (0 or 1) stored in its attributes.
nx.draw(graph, with_labels=True, node_color=[colors[node[1]['bipartite']]
                                             for node in graph.nodes(data=True)])


In [None]:
coo_matrix


In [None]:
graph = nx.algorithms.bipartite.from_biadjacency_matrix(coo_matrix)
sets = nx.get_node_attributes(graph, name="bipartite")
colors = {0: 'gold', 1: 'lightskyblue'}

nx.draw(graph, with_labels=True, node_color=[colors[node[1]['bipartite']]
                                             for node in graph.nodes(data=True)])


In [None]:
graph.edges()


In [None]:
# Plotting with HyperNetx
import hypernetx as hnx


In [None]:
term_doc = doc_term.T


In [None]:
# Build the *HNX* hypergraph directly from the term-by-document DataFrame
# (the transpose of `doc_term`) using `from_dataframe`.
# (`from_bipartite`, which starts from a networkx bipartite graph, is an
# alternative constructor that is not used here.)
H = hnx.Hypergraph.from_dataframe(term_doc)


In [None]:
hnx.drawing.draw(H)
