# Network analysis alt-tech all years Pushshift

## Import packages

In [2]:
# !pip install zstandard
# !pip install ujson
# !pip install stopwordsiso
# !pip install matplotlib
# !pip install nltk
# !pip install bs4

In [1]:
# Read data
import pickle
import pandas as pd
import zstandard as zstd
import json
import ujson
import io
import os

# SNA
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import re
from itertools import islice

# Domain study
from collections import Counter
# import urlexpander

import random
random.seed(2024)

In [2]:
# Error solving
np.float = float
np.int = int   #module 'numpy' has no attribute 'int'
np.object = object    #module 'numpy' has no attribute 'object'
np.bool = bool    #module 'numpy' has no attribute 'bool'
np. typeDict = np.sctypeDict

# NLP
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
import string
import stopwordsiso as stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\167266\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\167266\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\167266\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
#importing the libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

In [3]:
# Messages

alt_df = pd.read_excel("E:\\._PhD\\Publications\\SocSem_Telegram\\Code\\alt_df.xlsx")

In [4]:
text = list(alt_df['text'].unique())
len(text)

1460

## Network building

In [16]:
# make edges direction B -> A (message was forwarded from B to A)
edges = list(zip(alt_df['fwd_from'], alt_df['to_id']))
print("Total number of edges:", len(edges))

Total number of edges: 3818


In [17]:
# Delete None values
edges = [e for e in edges if None not in e]
edges = [e for e in edges if not any(np.isnan(x) for x in e)]
print("Total number of non-None edge:", len(edges))

# Set weights
weights = dict(Counter(edges)) # Counter of number of existing forwards between each pair of channels
e = [(k[0], k[1], v) for k, v in weights.items()]

# # Normalize weights in e by total number of messages forwarded from each node
# e = [(k[0], k[1], v/len([x for x in edges if x[0] == k[0]])) for k, v in weights.items()]


Total number of non-None edge: 3427


In [18]:
nodes = list(set([item for sublist in edges for item in sublist]))
print("Total number of nodes:", len(nodes))

# check no duplicates
print("Number of duplicates:", len([k for k, v in dict(Counter(nodes)).items() if v > 1]))

Total number of nodes: 356
Number of duplicates: 0


In [19]:
G=nx.DiGraph()
G.add_nodes_from(nodes)
G.add_weighted_edges_from(e)
print("Number of isolated nodes:", len(list(nx.isolates(G))))

Number of isolated nodes: 0


In [70]:
print(G)

DiGraph with 356 nodes and 607 edges


In [21]:
# Print head of list of edges with associated weights
print(list(G.edges(data=True))[:10])

[(1116499975.0, 1446718427, {'weight': 18}), (1398738967.0, 1271719213, {'weight': 5}), (1398738967.0, 1161666782, {'weight': 5}), (1398738967.0, 1362651760, {'weight': 5}), (1398738967.0, 1201072738, {'weight': 5}), (1398738967.0, 1277771372, {'weight': 29}), (1398738967.0, 1250324144, {'weight': 5}), (1398738967.0, 1392836102, {'weight': 6}), (1398738967.0, 1314300626, {'weight': 4}), (1398738967.0, 1200042196, {'weight': 4})]


## Basic community detetion

### Detect

In [31]:
# !pip install git+https://github.com/taynaud/python-louvain.git


In [41]:
# Detect Louvain communities

from community import community_louvain # https://github.com/taynaud/python-louvain
partition = community_louvain.best_partition(G.to_undirected()) # {node: community ID}
p_descending = {k: v for k, v in sorted(partition.items(), key=lambda item: item[1], reverse=True)}
dict(islice(p_descending.items(), 0, 10))

{1490308239: 45,
 1101645400.0: 45,
 1443613812.0: 44,
 1421391763: 44,
 1000787017.0: 43,
 1007269511.0: 43,
 1000611791: 43,
 1476086765: 42,
 1470334558.0: 42,
 1001403366.0: 41}

In [42]:
modularity = community_louvain.modularity(partition, G.to_undirected())
print(f"Modularity of the partition: {modularity}")


Modularity of the partition: 0.714214331652633


In [43]:
# Create dict {C: L} where C is the community label and L is the list of nodes (channels) belonging to that community

from collections import defaultdict

communities = defaultdict(list)

for key, value in sorted(partition.items()):
    communities[value].append(key)

# dict(islice(communities.items(), 0, 3))

In [44]:
# Community size

c_size = {k: len(v) for k,v in communities.items()}
# dict(islice(c_size.items(), 0, 10))

print("Minimum community size:", min(c_size.values()))
print("Maximum community size:", max(c_size.values()))
print("Average community size:", sum(c_size.values())/len(c_size))
print("Share of communities of size 2:", sum([v for k,v in c_size.items() if v == 2])/len(c_size))

Minimum community size: 2
Maximum community size: 45
Average community size: 7.739130434782608
Share of communities of size 2: 1.0869565217391304


In [45]:
# Remove communities of size inferior or equal to n
nodes_to_remove = []
for community_id, nodes_in_community in communities.items():
  if len(nodes_in_community) <= 2:
    nodes_to_remove.extend(nodes_in_community)

partition2 = {node: community_id for node, community_id in partition.items() if node not in nodes_to_remove}

In [49]:
# Create communities from partition2

communities2 = defaultdict(list)

for key, value in sorted(partition2.items()):
    communities2[value].append(key)

# Create dictionary of community sizes from communities2
c_size2 = {k: len(v) for k,v in communities2.items()}

print("Minimum community2 size:", min(c_size2.values()))
print("Maximum community2 size:", max(c_size2.values()))
print("Average community2 size:", sum(c_size2.values())/len(c_size2))

Minimum community2 size: 3
Maximum community2 size: 45
Average community2 size: 14.571428571428571


In [None]:
# Save partition as pickle

# with open('E:\._PhD\Publications\SocSem_Telegram\data\partition.pkl', 'wb') as f:
#     pickle.dump(partition, f)

# with open('partition2.pkl', 'wb') as f:
#     pickle.dump(partition2, f)

In [69]:
# Save graph

# Add the community label to nodes

for node, community_id in partition.items():
    G.nodes[node]['community'] = community_id

nx.write_gexf(G, "E:\._PhD\Publications\SocSem_Telegram\data\graph.gexf")

In [68]:
G.nodes[1314300626]

{'community': 4}