In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the pickled frame, then keep only the rows whose
# 'NUMBER OF MONTHS REPORTED' value can be parsed as a number
# (unparseable values are coerced to NaN and dropped by the mask).
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
all_df = pd.read_pickle("DATA/BH.pkl")
all_df = all_df.loc[
    pd.to_numeric(all_df['NUMBER OF MONTHS REPORTED'], errors='coerce').notna()
]

In [3]:
# Display the column labels of the filtered frame.
all_df.columns

Index(['SEGMENT LEVEL', 'NUMERIC STATE CODE', 'ORI', 'DATE ORI WAS ADDED',
       'DATE ORI WENT NIBRS', 'CITY NAME', 'STATE ABBREVIATION',
       'POPULATION GROUP', 'COUNTRY DIVISION', 'COUNTRY REGION',
       'AGENCY INDICATOR', 'CORE CITY', 'COVERED-BY ORI', 'FBI FIELD OFFICE',
       'JUDICIAL DISTRICT', 'AGENCY NIBRS FLAG', 'AGENCY INACTIVE DATE',
       'CURRENT POPULATION 1', 'UCR COUNTY CODE 1', 'MSA CODE 1',
       'LAST POPULATION  1', 'CURRENT POPULATION 2', 'UCR COUNTY CODE 2',
       'MSA CODE 2', 'LAST POPULATION  2', 'CURRENT POPULATION 3',
       'UCR COUNTY CODE 3', 'MSA CODE 3', 'LAST POPULATION  3',
       'CURRENT POPULATION 4', 'UCR COUNTY CODE 4', 'MSA CODE 4',
       'LAST POPULATION  4', 'CURRENT POPULATION 5', 'UCR COUNTY CODE 5',
       'MSA CODE 5', 'LAST POPULATION 5', '01-06-12 INDICATOR',
       'NUMBER OF MONTHS REPORTED', 'MASTER FILE YEAR',
       'AGENCY ACTIVITY INDICATOR 1', 'AGENCY ACTIVITY INDICATOR 2',
       'AGENCY ACTIVITY INDICATOR 3', 'AGENC

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
from abc import ABC, abstractmethod
from functools import cached_property
import networkx as nx

class Similarity(ABC):
    """Interface for measures that turn a DataFrame into a similarity matrix."""

    @abstractmethod
    def find_similarity_matrix(self, df:pd.DataFrame):
        """Compute a similarity matrix from ``df``.

        Concrete subclasses must override this method; the base
        implementation does nothing and returns ``None``.
        """

class CosineSimilarity(Similarity):
    """Thresholded pairwise cosine similarity between the rows of a frame."""

    def __init__(self, threshold:float):
        # Scores must be strictly greater than this value to count as similar.
        self._th = threshold

    def find_similarity_matrix(self, df: pd.DataFrame):
        """Return the element-wise ``score > threshold`` comparison.

        The raw scores returned by ``cosine_similarity`` for the rows of
        ``df`` are also stored on ``self.similarity_matrix``.
        """
        row_vectors = df.to_numpy()
        scores = cosine_similarity(row_vectors)
        self.similarity_matrix = scores
        return scores > self._th


class SimilarityNetwork:
    """Graph over the rows of a frame, with edges given by a similarity measure.

    Parameters
    ----------
    df:
        Source frame; it is re-indexed by ``index_name`` and restricted to
        ``columns`` before being handed to the similarity measure.
    similarity_measure:
        Object whose ``find_similarity_matrix`` produces the adjacency matrix.
    index_name:
        Column of ``df`` whose values label the nodes of the graph.
    columns:
        Feature columns passed to the similarity measure.
    """

    def __init__(self, df:pd.DataFrame, similarity_measure: Similarity, index_name:str, columns:list[str]):
        self._df: pd.DataFrame = df.set_index(index_name)[columns] # type: ignore
        self.sm = similarity_measure
        # Holds the *name* of the index column (unchanged public attribute);
        # the actual node labels are the values of ``self._df.index``.
        self.nodes = index_name

    def fit_transform(self):
        """Compute, store and return the adjacency matrix.

        The result is kept on ``self.adjacency_matrix``. Any previously
        cached ``network`` is discarded so it is rebuilt from the new matrix.
        """
        self.adjacency_matrix = self.sm.find_similarity_matrix(self._df)
        # cached_property stores its value in the instance __dict__; drop it
        # so a re-fit does not leave a stale graph behind.
        self.__dict__.pop("network", None)
        return self.adjacency_matrix

    @cached_property
    def network(self):
        """Graph built from the adjacency matrix, with nodes labelled by index value.

        Raises
        ------
        AttributeError
            If ``fit_transform`` has not been called yet.
        """
        if not hasattr(self, "adjacency_matrix"):
            raise AttributeError(
                "adjacency_matrix is not set; call fit_transform() before accessing network"
            )
        # from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is
        # the supported constructor for a 2-D adjacency array.
        graph = nx.from_numpy_array(self.adjacency_matrix)
        # Map positional node ids to the index *values* (iterating the index
        # column name would yield its characters instead).
        # NOTE(review): duplicate index values would collapse into one node -- confirm uniqueness.
        return nx.relabel_nodes(graph, dict(enumerate(self._df.index)))

In [4]:
import numpy as np

# Toy input: every inner list below ends up as one column of the frame,
# because the array is transposed before being wrapped in a DataFrame.
# Swap in real data here.
data = np.transpose(
    np.array([
        [1, 2, 3],
        [4, 5, 6],
        [-3, 8, -9],
        [3, -6, 2],
    ])
)

# First column ("I") becomes the index; "a", "b", "c" are the features.
df = pd.DataFrame(data=data, columns=["I","a", "b", "c"])
df = df.set_index("I")

In [6]:
# Threshold the pairwise cosine similarity of the 1991 rows at 0.5,
# using months reported and current population as the two features.
cs1 = CosineSimilarity(0.5)
cs1.find_similarity_matrix(
    all_df.query("year == 1991").loc[
        :, ["NUMBER OF MONTHS REPORTED", "CURRENT POPULATION 1"]
    ]
)

array([[False, False, False, ..., False, False, False],
       [False,  True,  True, ..., False, False, False],
       [False,  True,  True, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [14]:
import networkx as nx
import numpy as np

# Directed adjacency matrix: a non-zero entry at [i, j] is an edge i -> j.
# A plain 2-D ndarray is used instead of np.matrix, which NumPy no longer
# recommends; from_numpy_array accepts the array directly.
m1 = np.array([
    [1, 0, 1, 0],
    [0, 0, 0, 0],
    [0, 1, 1, 1],
    [1, 1, 0, 1],
])
G1 = nx.from_numpy_array(m1, create_using=nx.DiGraph)

In [20]:
# Rename nodes 0..3 to 'a'..'d' in place (copy=False mutates G1 itself).
nx.relabel_nodes(G1, dict(enumerate("abcd")), copy=False)

<networkx.classes.digraph.DiGraph at 0x23b7f72cd90>

In [21]:
# Show the directed edges of G1 after relabelling.
G1.edges

OutEdgeView([('a', 'a'), ('a', 'c'), ('c', 'c'), ('c', 'b'), ('c', 'd'), ('d', 'd'), ('d', 'a'), ('d', 'b')])