# Pandas dataframe accessor: Levi graph
## COO &rarr; matrix

**Keeping track:**
- Create Levi graph (hyper graph version) with Incidence structure
  - Incidence matrix of a simple graph: each edge's entries sum to 2 --> this is always **sparse**
- ~~Bipartite structure: matrix with entity on one index and connections on another index~~
- Tidy graph: edgelist and edge metadata graph
- Make sure pandas categorical works with ints
  - in nb, ss example has index as ints, not categoricals
  - pandas_categoricals
- Levi pandas accessor:
  - accessor to series, checks for multiindex categorical (this is the input)
  - instead of pandas.sparse
  - can control results to work with networkx etc
- Grabble object: built out of data classes 

In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import networkx as nx
import matplotlib.pyplot as plt
import beartype as bt

### Create data to use in development:
- Token Co-occurrence matrix
- Document-term matrix (this is an example of a bipartite structure/incidence matrix)
- COO (format: multi-index pandas Series)


In [2]:
# Toy corpus for development: five short documents keyed by string ids.

texts = [
    "the quick brown fox jumped",
    "the fox jumped",
    "the dog jumped",
    "one brown fox",
    "ten brown dogs",
]
doc_ids = list(map(str, range(5)))

# One row per document.
df = pd.DataFrame({"doc_id": doc_ids, "text": texts})

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary of the corpus, treating 'the' as a stop word.
# fit() returns the fitted vectorizer, so construction and fitting can be chained.
cv = CountVectorizer(stop_words=['the']).fit(df['text'])

# Vocabulary terms in feature (column) order -- alphabetical per the original note.
tokens = cv.get_feature_names_out()

In [4]:
# Create co-occurrence matrix

results = cv.transform(df['text'])

# (terms x docs) . (docs x terms) -> term x term counts, computed once and
# densified so the diagonal can be edited on a plain, writable ndarray.
coocc_counts = results.T.dot(results).toarray()

# Zero the self-co-occurrence diagonal *before* wrapping in a DataFrame:
# writing through `DataFrame.values` only works while pandas hands back a
# writable view, which is not the case under Copy-on-Write.
np.fill_diagonal(coocc_counts, 0)

coocc = pd.DataFrame(coocc_counts, index=tokens, columns=tokens)
coocc

Unnamed: 0,brown,dog,dogs,fox,jumped,one,quick,ten
brown,0,0,1,2,1,1,1,1
dog,0,0,0,0,1,0,0,0
dogs,1,0,0,0,0,0,0,1
fox,2,0,0,0,2,1,1,0
jumped,1,1,0,2,0,0,1,0
one,1,0,0,1,0,0,0,0
quick,1,0,0,1,1,0,0,0
ten,1,0,1,0,0,0,0,0


In [5]:
# Document-term matrix (BIPARTITE STRUCTURE): one row per document, one column per term.

results = cv.transform(df['text'])
features = cv.get_feature_names_out()

# Append three extra term columns; reindex zero-fills any column that is not
# already present in the frame.
doc_term = pd.DataFrame(results.toarray(), columns=features).reindex(
    columns=[*features, 'cat', 'bear', 'tree'],
    fill_value=0,
)
doc_term

Unnamed: 0,brown,dog,dogs,fox,jumped,one,quick,ten,cat,bear,tree
0,1,0,0,1,1,0,1,0,0,0,0
1,0,0,0,1,1,0,0,0,0,0,0
2,0,1,0,0,1,0,0,0,0,0,0
3,1,0,0,1,0,1,0,0,0,0,0
4,1,0,1,0,0,0,0,1,0,0,0


In [6]:
# Create COO Series
# using scipy sparse module here, but we want to avoid it in production
coo_matrix = sparse.coo_matrix(doc_term.values)

# Parallel arrays describing each stored entry: row position, column position, value.
coo_rows = coo_matrix.row
coo_cols = coo_matrix.col
coo_vals = coo_matrix.data
print(coo_rows,
coo_cols,
coo_vals)
print(len(coo_rows),
len(coo_cols),
len(coo_vals))

# Column position -> term label. enumerate() follows the real number of
# columns, so no term is silently dropped if the vocabulary changes size.
term_dict = dict(enumerate(doc_term.columns))
term_dict

[0 0 0 0 1 1 2 2 3 3 3 4 4 4] [0 3 4 6 3 4 1 4 0 3 5 0 2 7] [1 1 1 1 1 1 1 1 1 1 1 1 1 1]
14 14 14


{0: 'brown',
 1: 'dog',
 2: 'dogs',
 3: 'fox',
 4: 'jumped',
 5: 'one',
 6: 'quick',
 7: 'ten',
 8: 'cat',
 9: 'bear',
 10: 'tree'}

In [8]:
# Column position -> term label, sized by the actual columns rather than a hard-coded count.
term_dict = dict(enumerate(doc_term.columns))

In [9]:
# Derive the list from coo_matrix.col instead of rebinding coo_cols from
# itself: the cell can then be re-run without failing on a list that has no
# .tolist() method.
coo_cols = coo_matrix.col.tolist()

# Translate every column position into its term label.
coo_col_names = [term_dict[key] for key in coo_cols]

In [14]:
# Tidy COO table: one row per stored entry of the document-term matrix.
coo = pd.DataFrame(
    {
        "doc": coo_rows,
        "term": coo_col_names,
        "flag": coo_vals,
    }
)
coo

Unnamed: 0,doc,term,flag
0,0,brown,1
1,0,fox,1
2,0,jumped,1
3,0,quick,1
4,1,fox,1
5,1,jumped,1
6,2,dog,1
7,2,jumped,1
8,3,brown,1
9,3,fox,1


In [15]:
# idx=pd.MultiIndex.from_arrays([coo_matrix.row, coo_col_names])
# coo2 = pd.DataFrame(coo_matrix.data, index=idx, columns=["flag"])
# coo2

In [None]:
# Make sure pandas categorical works with ints
#   - in nb, ss example has index as ints, not categoricals
#   - pandas_categoricals



In [None]:
# def of_beartype(t: bt.Union[type, bt.Tuple[type, ...]]) -> Predicate[object]:
#     """ From `phantom.predicates.generic.of_type`

#     Create a new predicate that succeeds when its argument is bearable on ``t``.
#     """

#     def check(a: object) -> bool:
#         return is_bearable(a, t)

#     return check

In [None]:
@pd.api.extensions.register_dataframe_accessor("levi")   #can also be df, series, or index
class LeviAccessor:
    """Experimental ``.levi`` accessor registered on pandas DataFrames.

    The accessor wraps the DataFrame it is accessed from and, for now, only
    exposes that frame's index plus a ``plot`` placeholder.

    NOTE(review): the intended input schema is not settled in this file.
    Validation only requires two or more columns, on the assumption that the
    frame is an edge-list / COO table with a row identifier and a column
    identifier -- confirm before relying on it.
    """

    def __init__(self, pandas_obj):
        """Validate ``pandas_obj`` and keep a reference to it.

        Raises:
            AttributeError: if ``pandas_obj`` fails ``_validate``.
        """
        self._validate(pandas_obj)
        self._obj = pandas_obj

    @staticmethod
    def _validate(obj):
        """Reject frames that cannot describe pairwise incidence.

        Raises:
            AttributeError: if ``obj`` has fewer than two columns. The
                exception type is unchanged from the previous check.
        """
        # TODO: use beartype
        # The previous check demanded 'latitude'/'longitude' columns, which
        # made the accessor unusable on the COO frame built in this notebook.
        if len(obj.columns) < 2:
            raise AttributeError(
                "Must have at least two columns (row and column identifiers)."
            )

    @property
    def index(self):
        """Return the index of the wrapped DataFrame."""
        # Must go through self._obj: returning self.index here would re-enter
        # this property and recurse until RecursionError.
        return self._obj.index

    def plot(self):
        """Placeholder for a future plot; currently does nothing and returns None."""
        pass

In [None]:
# Try the accessor out: read the `index` property through `.levi` on the COO frame.
coo.levi.index