## Louvain Community Analysis Generator
This notebook accepts arbitrary columnar data in CSV format, builds an undirected graph in which the values found in adjacent columns of each row are connected by an edge, and uses the Louvain community detection algorithm to find "communities" within that graph. From there, it identifies any communities whose size is at or below the first quartile of all detected community sizes, i.e. communities that are statistically smaller than the others. This is useful for identifying rare/abnormal tuples of data in your original table.

Source: https://python-louvain.readthedocs.io/en/latest/api.html#

In [None]:
# Install the Louvain/Community modules.  Restart kernel after install.
!pip install --upgrade community
!pip install python-louvain

In [None]:
# Imports
import pandas as pd
import plotly.express as px
import networkx as nx
import collections
from collections import Counter
from networkx.drawing.nx_agraph import write_dot
from networkx.algorithms import bipartite
import networkx.algorithms.community as nxcom
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import community
import os
from community import community_louvain

In [None]:
# Supply a CSV file.  This can consist of any data you like; you can save a lot of processing time if you pre-filter
# this to include only the columns you want to analyze.  The data should have a header row.
# The file name is resolved relative to the notebook's working directory; the file can be uploaded via the
# Files page in Jupyter Notebook.

path = os.getcwd()
file = input("Enter the location of a CSV file:")
sheetpath = os.path.join(path, file)

# Only the read itself is guarded, and each failure gets a message that matches its cause,
# so a malformed or unreadable file is no longer reported as "File not found".
try:
    df = pd.read_csv(sheetpath, header=0)
except FileNotFoundError:
    print('File not found, please try again')
except Exception as err:
    # Notebook-level boundary: report the real reason instead of raising a traceback.
    print('Could not read ' + sheetpath + ' as CSV: ' + str(err))
else:
    print('Ingested ' + str(df.shape[0]) + ' lines of data')

In [None]:
# Make an undirected NetworkX graph out of the input table: every cell value becomes a node,
# and the values found in adjacent columns of the same row are joined by an edge.
# We will then attempt to identify communities within the graph results.
network = nx.Graph()  # undirected graph

column_names = list(df.columns)

# Build graph nodes and weighted edges
for row in df.itertuples(index=False, name=None):
    for j in range(len(column_names) - 1):  # one edge per pair of adjacent columns
        value1, value2 = row[j], row[j + 1]

        # Register both endpoints explicitly.  add_edge() would create a missing node
        # implicitly, but without the type/size/weight attributes assigned here.
        for value, column_index in ((value1, j), (value2, j + 1)):
            if value in network:
                continue  # attributes are fixed the first time a value is seen
            if column_index == 0:  # first column, so this is the originator field
                network.add_node(
                    value, type=column_names[column_index], color="blue", size=2, weight=1
                )
            else:  # it's a downstream node
                network.add_node(value, type=column_names[column_index], size=1, weight=1)

        if network.has_edge(value1, value2):
            # Repeated pair: bump the edge weight and keep penwidth in step with it.
            edge = network[value1][value2]
            edge["weight"] += 1
            edge["penwidth"] = edge["weight"]
        else:
            network.add_edge(value1, value2, weight=1, arrows=True, penwidth=1)

# Render the graph
nx.draw_spring(network)

In [None]:
# Use the Louvain community detection algorithm to identify communities within the graph.
# Use Matplotlib to display the resultant graph, with each community in a different color.

# Maps each node of the graph to an integer community label.
partition = community_louvain.best_partition(network)

# generate the graph
pos = nx.spring_layout(network)
# Color the nodes according to their partition: one colormap entry per community label.
# plt.get_cmap is used because matplotlib.cm.get_cmap is deprecated and has been removed
# from recent Matplotlib releases.
cmap = plt.get_cmap("viridis", max(partition.values()) + 1)
nx.draw_networkx_nodes(
    network,
    pos,
    nodelist=list(partition.keys()),
    node_size=40,
    cmap=cmap,
    node_color=list(partition.values()),  # same order as nodelist
)
nx.draw_networkx_edges(network, pos, alpha=0.5)

plt.show()

In [None]:
# Turn the "partition" dict into a dataframe: one row per graph node (the node is the
# index) and a single "communitylabel" column holding the community it was assigned to.
histodf = pd.DataFrame.from_dict(
    partition,
    orient="index",
    columns=["communitylabel"],
)

# Count how many nodes carry each community label, i.e. the size of each community.
# The result has one row per label and a ("communitylabel", "count") column.
by_label = histodf.groupby(["communitylabel"])
newgrouped = by_label.agg({"communitylabel": ["count"]})

print(newgrouped)

In [None]:
# Next, identify the communities that are statistically smaller than the others,
# which marks them as the rarest.
# Approach: draw a boxplot of the community sizes and treat a community as "anomalous"
# when its size is at or below the first quartile.
community_sizes = newgrouped["communitylabel"]["count"]

fig2 = px.box(community_sizes, points="outliers")
fig2.update_traces(quartilemethod="exclusive")

# NOTE(review): the plot is drawn with quartilemethod="exclusive" while the threshold below
# comes from pandas' describe(); the two may not compute Q1 identically - confirm if the
# plotted quartile is expected to coincide with the threshold.
size_summary = community_sizes.describe()
threshold = size_summary["25%"]  # this is the first quartile

# The Inter-Quartile Range (IQR) is the 75th percentile minus the 25th percentile.
# To use the lower fence of the boxplot as the threshold instead of the first quartile
# (fewer false positives, at the cost of potentially more false negatives),
# uncomment the following line:
#threshold = size_summary["25%"] - 1.5 * (size_summary["75%"] - size_summary["25%"])

# Keep only the community labels whose size is at or below the threshold (the anomalous ones)
is_small = community_sizes <= threshold
anomalousvalues = newgrouped[is_small]

print(anomalousvalues)
fig2.show()

In [None]:
# For any outliers found, return the members of each anomalous community:
# the rows of histodf (node -> community label) whose label was flagged as anomalous,
# grouped in the order the labels appear in anomalousvalues.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
rare_parts = [
    histodf[histodf["communitylabel"] == communitylabel]
    for communitylabel in anomalousvalues.index
]

if rare_parts:
    rarecommunities = pd.concat(rare_parts)
else:
    # pd.concat rejects an empty list; fall back to an empty frame with the same columns.
    rarecommunities = histodf.iloc[0:0]

print(rarecommunities)

In [None]:
# Create an Output dataframe, which consists of every row of the original data that
# contains at least one value belonging to an anomalously small community.
# `output` is always defined, so the following cells also work when nothing was flagged.

if rarecommunities.shape[0] > 0:
    # The index of rarecommunities holds the graph nodes, i.e. the original cell values.
    rare_nodes = list(rarecommunities.index)

    # Match every member of every rare community against every column in one pass
    # (previously only the first member of each community was looked up).
    in_rare_community = df.isin(rare_nodes).any(axis=1)

    # Clean the data up a bit
    output = df[in_rare_community].drop_duplicates()
    rare_count = rarecommunities["communitylabel"].nunique()
else:
    output = df.iloc[0:0].copy()  # empty result with the original columns
    rare_count = 0

# Report communities and matched rows separately; they are different quantities.
print("Identified " + str(rare_count) + " anomalously small communities")
print("Matched " + str(output.shape[0]) + " rows of the original data")

In [None]:
# Display the original data from any small communities identified.
# The bare name is left as the last expression of the cell so the notebook renders it.
output

In [None]:
# Write the flagged rows to a CSV file (relative to the working directory)
# so they can be investigated further outside the notebook.
results_file = "rarecommunities.csv"
output.to_csv(results_file)