## Import libraries

In [1]:
import networkx as nx
import pandas as pd
import os
import glob
import matplotlib
import scipy
import time
from collections import Counter

## Build Network from CSVs

In [2]:
username_dict = {
    "samuel": "swang330",
    "alex": "alex.kalis",
    "alexandra": "alexandrapurdy_",
    "kabir": "kabir_aho",
    "kirin": "kirindanek",
    "liam": "liam_hochman",
    "noah": "noahpurow",
    "seoeun": "seoeunki.m",
    "will": "will.deley",
    "zach": "zach.annuik",
    "yota": "yota.katsikouli",
    "dis_abroad": "dis.copenhagen",
    "ely": "elybrayboy"
}


In [6]:
current_dir = os.path.dirname(os.getcwd())
csv_directory = os.path.join(current_dir, "final_project", "csv_files")

def build_network_from_csvs(csv_dir, exclude_user=None):
    G = nx.DiGraph()
    csv_files = glob.glob(os.path.join(csv_dir, "*_data.csv"))

    if not csv_files:
        print(f"error: no '*_data.csv' files found in directory: {csv_dir}")
        return None

    print(f"found {len(csv_files)} CSV files to process.")

    for csv_path in csv_files:
        filename = os.path.basename(csv_path)
        print(f"processing: {filename}...")

        parts = filename.replace("_data.csv", "").split('_')
        if len(parts) < 2:
            print(f"warning: skipping file with unexpected name format: {filename}")
            continue

        file_prefix = "_".join(parts[:-1])
        central_node = username_dict.get(file_prefix)

        #skip if excluding this user or if mapping failed
        if central_node == exclude_user or central_node is None:
            if central_node == exclude_user:
                 print(f"-> skipping file for excluded central node: {central_node}")
            else:
                print("warning: Skipping file - could not map prefix {} to a username: {}".format(file_prefix, filename))
            continue

        print(central_node) # Print the central node only if not skipped

        relationship_type = parts[-1]
        if relationship_type not in ["followers", "following"]:
            print(f"warning: Skipping file - cannot determine relationship type: {filename}")
            continue

        G.add_node(central_node)

        try:
            df = pd.read_csv(csv_path)
            if "username" not in df.columns:
                print(f"warning: Skipping file - 'username' column not found in {filename}")
                continue

            other_users = df["username"].dropna().unique()

            count = 0
            for user in other_users:
                #skip edges involving the excluded user
                if user == exclude_user:
                    continue

                G.add_node(user)

                if relationship_type == "followers":
                    G.add_edge(user, central_node)
                    count += 1
                elif relationship_type == "following":
                    G.add_edge(central_node, user)
                    count += 1

            print(f"-> added {count} edges for {central_node} ({relationship_type}).") # adjusted count

        except pd.errors.EmptyDataError:
             print(f"warning: Skipping empty file: {filename}")
        except Exception as e:
            print(f"error processing file {filename}: {e}")

    return G


In [7]:
network_graph = build_network_from_csvs(csv_directory)

found 26 CSV files to process.
processing: dis_abroad_followers_data.csv...
dis.copenhagen
-> added 4856 edges for dis.copenhagen (followers).
processing: samuel_following_data.csv...
swang330
-> added 1104 edges for swang330 (following).
processing: alex_following_data.csv...
alex.kalis
-> added 1675 edges for alex.kalis (following).
processing: zach_followers_data.csv...
zach.annuik
-> added 603 edges for zach.annuik (followers).
processing: liam_followers_data.csv...
liam_hochman
-> added 817 edges for liam_hochman (followers).
processing: dis_abroad_following_data.csv...
dis.copenhagen
-> added 240 edges for dis.copenhagen (following).
processing: samuel_followers_data.csv...
swang330
-> added 683 edges for swang330 (followers).
processing: liam_following_data.csv...
liam_hochman
-> added 1678 edges for liam_hochman (following).
processing: alex_followers_data.csv...
alex.kalis
-> added 1235 edges for alex.kalis (followers).
processing: zach_following_data.csv...
zach.annuik
-> add

### Degrees of class members

In [8]:
for username in username_dict.values():
    if network_graph.has_node(username):
        in_deg = network_graph.in_degree(username)
        out_deg = network_graph.out_degree(username)
        print(f"- {username}: In={in_deg}, Out={out_deg}")
    else:
        print(f"error: {username} not found in the full graph")

- swang330: In=685, Out=1105
- alex.kalis: In=1235, Out=1675
- alexandrapurdy_: In=711, Out=869
- kabir_aho: In=828, Out=2346
- kirindanek: In=825, Out=1535
- liam_hochman: In=817, Out=1678
- noahpurow: In=789, Out=1603
- seoeunki.m: In=1136, Out=1403
- will.deley: In=631, Out=704
- zach.annuik: In=604, Out=1076
- yota.katsikouli: In=147, Out=185
- dis.copenhagen: In=4857, Out=240
- elybrayboy: In=892, Out=1471


### Largest SCC

In [None]:
sccs = list(nx.strongly_connected_components(network_graph))
scc_sizes = [len(component) for component in sccs]

print(f"Found {len(scc_sizes)} strongly connected components.")

if scc_sizes:
    max_scc_size = max(scc_sizes)
    print(f"Size of the largest strongly connected component: {max_scc_size}")

    nodes_in_cycles = sum(size for size in scc_sizes if size > 1)
    print(f"Total number of nodes in non-trivial SCCs (size > 1): {nodes_in_cycles}")

    #find the largest SCC
    largest_scc_nodes = max(sccs, key=len)
    lscc_graph = network_graph.subgraph(largest_scc_nodes)
    print(f"LSCC Graph: Nodes={lscc_graph.number_of_nodes()}, Edges={lscc_graph.number_of_edges()}")


Found 14674 strongly connected components.
Size of the largest strongly connected component: 7278
Total number of nodes participating in non-trivial SCCs (size > 1): 7278
LSCC Graph: Nodes=7278, Edges=15067


In [13]:
class_members = list(username_dict.values())
individual_reciprocity = nx.reciprocity(network_graph, nodes=class_members)
for node, recip in individual_reciprocity.items():
        if network_graph.has_node(node) and network_graph.degree(node) > 0:
            print(f"- {node}: {recip:.4f}")
        else:
            print(f"- {node}: Error")

- swang330: 0.7385
- alex.kalis: 0.6460
- alexandrapurdy_: 0.7316
- kabir_aho: 0.4171
- kirindanek: 0.6415
- liam_hochman: 0.5796
- noahpurow: 0.5418
- seoeunki.m: 0.6010
- will.deley: 0.6172
- zach.annuik: 0.5571
- yota.katsikouli: 0.6084
- dis.copenhagen: 0.0094
- elybrayboy: 0.5942


### Diameter of directed graph

In [12]:
diameter_lscc = nx.diameter(lscc_graph)
print(f"Diameter of largest SCC (directed): {diameter_lscc}")


Diameter of largest SCC (directed): 6


Find actual path of diameter 6

In [18]:

eccentricities = nx.eccentricity(lscc_graph) #list max directions
source_node = None
for node, ecc in eccentricities.items():
    if ecc == diameter_lscc:
        source_node = node
        print(f"Found source node '{source_node}' with eccentricity {ecc}")
        break

# check if diameter matches path between nodes
if source_node:
    print(f"Calculating shortest paths from '{source_node}'...")
    path_lengths_from_source = nx.shortest_path_length(lscc_graph, source=source_node)
    target_node = None
    for node, length in path_lengths_from_source.items():
        if length == diameter_lscc:
            target_node = node
            print(f"Found target node '{target_node}' at distance {diameter_lscc}")
            break

    if target_node:
        actual_path = nx.shortest_path(lscc_graph, source=source_node, target=target_node)

        print(f"Length: {len(actual_path) - 1} (Matches diameter: {len(actual_path) - 1 == diameter_lscc})")
        print(f"Source: {source_node}")
        print(f"Target: {target_node}")
        print(f"Path: {actual_path}")

Found source node '__annamacdonald__' with eccentricity 6
Calculating shortest paths from '__annamacdonald__'...
Found target node 'miltiades_official' at distance 6
Length: 6 (Matches diameter: True)
Source: __annamacdonald__
Target: miltiades_official
Path: ['__annamacdonald__', 'dis.copenhagen', 'sally.bornhorst', 'zach.annuik', 'swang330', 'yota.katsikouli', 'miltiades_official']


In order for a node to be in a cycle of length 3 given the structure of our network and how it's built (outward from the core class members), in order for a node to be in a cycle of length 3 (asymetric or not) it must be in the largest SCC, therefore we're only to look at nodes in the LSCC to save on a lot of computation.

## Asymetric cycles of Length 3

In [19]:
asymmetric_cycles_len3 = []
print("\n--- Asymmetric 3-Cycle Search (Full LSCC) ---")
target_graph = lscc_graph
print(f"Starting search in 'lscc_graph' (Nodes: {target_graph.number_of_nodes()}, Edges: {target_graph.number_of_edges()})...")
start_time = time.time()

found_unique_cycles_set = set()
nodes_processed = 0
total_nodes = target_graph.number_of_nodes()

for u in target_graph:
    nodes_processed += 1
    for v in target_graph[u]:
        for w in target_graph[v]:
            #check for cycle of 3 and asymetry
            if w != u and target_graph.has_edge(w, u):
                is_symmetric = (target_graph.has_edge(v, u) and
                                target_graph.has_edge(w, v) and
                                target_graph.has_edge(u, w))
                if not is_symmetric:
                    cycle = [u, v, w]
                    #create unique identifier (sorted tuple of nodes)
                    sorted_cycle_tuple = tuple(sorted(cycle))
                    if sorted_cycle_tuple not in found_unique_cycles_set:
                        found_unique_cycles_set.add(sorted_cycle_tuple)
                        asymmetric_cycles_len3.append(cycle)

    if nodes_processed % 1000 == 0:
            print(f"  ... processed {nodes_processed}/{total_nodes} nodes...")

end_time = time.time()
found_count = len(asymmetric_cycles_len3)
print(f"\nSearch Complete. Found {found_count} unique asymmetric cycles of length 3.")
print(f"Time taken: {end_time - start_time:.2f} seconds.")

if asymmetric_cycles_len3:
    print(f"\nPrinting asymmetric length 3 cycles found:")
    for i, cycle in enumerate(asymmetric_cycles_len3):
        print(f"- Cycle {i+1}: {cycle}")



--- Asymmetric 3-Cycle Search (Full LSCC) ---
Starting search in 'lscc_graph' (Nodes: 7278, Edges: 15067)...
  ... processed 1000/7278 nodes...
  ... processed 2000/7278 nodes...
  ... processed 3000/7278 nodes...
  ... processed 4000/7278 nodes...
  ... processed 5000/7278 nodes...
  ... processed 6000/7278 nodes...
  ... processed 7000/7278 nodes...

Search Complete. Found 120 unique asymmetric cycles of length 3.
Time taken: 4.00 seconds.

Printing asymmetric length 3 cycles found:
- Cycle 1: ['anderss.k', 'zach.annuik', 'kirindanek']
- Cycle 2: ['anderss.k', 'zach.annuik', 'kabir_aho']
- Cycle 3: ['feyza_achilova', 'elybrayboy', 'noahpurow']
- Cycle 4: ['madigoeke', 'elybrayboy', 'noahpurow']
- Cycle 5: ['harshxk_', 'will.deley', 'alex.kalis']
- Cycle 6: ['cayeetanaaa', 'noahpurow', 'elybrayboy']
- Cycle 7: ['brownumemes', 'elybrayboy', 'noahpurow']
- Cycle 8: ['jake_levy61', 'will.deley', 'alex.kalis']
- Cycle 9: ['_annikasingh', 'noahpurow', 'elybrayboy']
- Cycle 10: ['libbydake

## All cycles of length 3

In [20]:
all_cycles_len3_distinct = []
if lscc_graph:
    print("\n--- All Simple 3-Cycle Search (Full LSCC) ---")
    target_graph = lscc_graph
    start_time = time.time()

    found_unique_cycles_set = set()

    for u in target_graph:
        for v in target_graph[u]:
            for w in target_graph[v]:
                # simple 3-cycle (distinct nodes)
                if w != u and v != w and target_graph.has_edge(w, u):
                    cycle = [u, v, w]
                    # identifier
                    sorted_cycle_tuple = tuple(sorted(cycle))
                    if sorted_cycle_tuple not in found_unique_cycles_set:
                        found_unique_cycles_set.add(sorted_cycle_tuple)
                        all_cycles_len3_distinct.append(cycle)

    end_time = time.time()
    found_count = len(all_cycles_len3_distinct)
    print(f"\nSearch Complete. Found {found_count} unique simple cycles of length 3.")
    print(f"Time taken: {end_time - start_time:.2f} seconds.")

    # Print a limited number
    PRINT_LIMIT = 20
    if all_cycles_len3_distinct:
        print(f"\nPrinting unique length 3 simple cycles found:")
        for i, cycle in enumerate(all_cycles_len3_distinct):
            print(f"-Cycle {i+1}: {cycle}")


--- All Simple 3-Cycle Search (Full LSCC) ---

Search Complete. Found 380 unique simple cycles of length 3.
Time taken: 4.01 seconds.

Printing unique length 3 simple cycles found:
-Cycle 1: ['samantha_klein__', 'noahpurow', 'elybrayboy']
-Cycle 2: ['anderss.k', 'zach.annuik', 'kirindanek']
-Cycle 3: ['anderss.k', 'zach.annuik', 'kabir_aho']
-Cycle 4: ['feyza_achilova', 'elybrayboy', 'noahpurow']
-Cycle 5: ['madigoeke', 'elybrayboy', 'noahpurow']
-Cycle 6: ['zabei.frank', 'liam_hochman', 'swang330']
-Cycle 7: ['harshxk_', 'will.deley', 'alex.kalis']
-Cycle 8: ['cayeetanaaa', 'noahpurow', 'elybrayboy']
-Cycle 9: ['savvyng', 'swang330', 'seoeunki.m']
-Cycle 10: ['brownumemes', 'elybrayboy', 'noahpurow']
-Cycle 11: ['jake_levy61', 'will.deley', 'alex.kalis']
-Cycle 12: ['_annikasingh', 'noahpurow', 'elybrayboy']
-Cycle 13: ['yota.katsikouli', 'swang330', 'kirindanek']
-Cycle 14: ['eledeff', 'liam_hochman', 'swang330']
-Cycle 15: ['_jackchin', 'liam_hochman', 'swang330']
-Cycle 16: ['libb

## Proportion Asymetric

In [21]:
proportional_asymmetric = {}
#use counters for easier edefinitions
asymmetric_cycles_counter = Counter(node for cycle in asymmetric_cycles_len3 for node in cycle)

cycles_counter_len3_total = Counter(node for cycle in all_cycles_len3_distinct for node in cycle)

print(f"Asymmetric counts calculated for {len(asymmetric_cycles_counter)} nodes.")
print(f"Total distinct counts calculated for {len(cycles_counter_len3_total)} nodes.")

#calculate proportion
for username, total_count in cycles_counter_len3_total.items():
    asymmetric_count = asymmetric_cycles_counter.get(username, 0)
    proportional_asymmetric[username] = asymmetric_count / total_count

# sorted results
print("\nProportion of node's 3-cycle participation that is asymmetric (Top 20):")
sorted_proportions = sorted(proportional_asymmetric.items(), key=lambda item: item[1] if item[1] is not None else -1, reverse=True)

PRINT_LIMIT = 20
for i, (node, proportion) in enumerate(sorted_proportions[:PRINT_LIMIT]):
        if proportion is not None:
            asym_count_disp = asymmetric_cycles_counter.get(node, 0)
            total_count_disp = cycles_counter_len3_total.get(node, 'Error')
            print(f"- {node}: {proportion:.4f} ({asym_count_disp}/{total_count_disp})")
        else:
            print(f"-{node}: error")


Asymmetric counts calculated for 123 nodes.
Total distinct counts calculated for 253 nodes.

Proportion of node's 3-cycle participation that is asymmetric (Top 20):
- anderss.k: 1.0000 (2/2)
- feyza_achilova: 1.0000 (1/1)
- madigoeke: 1.0000 (1/1)
- harshxk_: 1.0000 (1/1)
- cayeetanaaa: 1.0000 (1/1)
- brownumemes: 1.0000 (1/1)
- jake_levy61: 1.0000 (1/1)
- _annikasingh: 1.0000 (1/1)
- libbydakers: 1.0000 (1/1)
- myopiamilos: 1.0000 (1/1)
- katy_wales17: 1.0000 (1/1)
- baybater: 1.0000 (2/2)
- arsh_chow: 1.0000 (1/1)
- _harrison.brown_: 1.0000 (1/1)
- katrinatruong: 1.0000 (1/1)
- edgr_oh: 1.0000 (1/1)
- a.fitenko: 1.0000 (1/1)
- catherinejia_: 1.0000 (1/1)
- _kate.choi: 1.0000 (1/1)
- lucyc1ark: 1.0000 (1/1)


In [23]:
class_member_usernames = set(username_dict.values())
found_members = {}

for node, proportion in sorted_proportions:
    if proportion is not None and node in class_member_usernames:
        found_members[node] = proportion

if found_members:
    # sort class members
    for node, prop in sorted(found_members.items(), key=lambda item: item[1], reverse=True):
        in_deg = lscc_graph.in_degree(node) if lscc_graph.has_node(node) else 'N/A'
        out_deg = lscc_graph.out_degree(node) if lscc_graph.has_node(node) else 'N/A'
        print(f"- {node}: {prop:.4f} (LSCC In={in_deg}, Out={out_deg})")

- elybrayboy: 0.4214 (LSCC In=708, Out=723)
- noahpurow: 0.3709 (LSCC In=651, Out=684)
- liam_hochman: 0.3146 (LSCC In=724, Out=739)
- alex.kalis: 0.2787 (LSCC In=945, Out=951)
- swang330: 0.2376 (LSCC In=662, Out=674)
- kirindanek: 0.2340 (LSCC In=758, Out=768)
- zach.annuik: 0.2075 (LSCC In=475, Out=471)
- kabir_aho: 0.2000 (LSCC In=662, Out=670)
- will.deley: 0.1967 (LSCC In=414, Out=413)
- seoeunki.m: 0.1613 (LSCC In=764, Out=771)
- alexandrapurdy_: 0.1429 (LSCC In=579, Out=588)
- yota.katsikouli: 0.0000 (LSCC In=101, Out=101)
- dis.copenhagen: 0.0000 (LSCC In=156, Out=24)
