##  Integrating LLM-based topic modelling and user community detection on Telegram

### Import packages

In [1]:
# !pip install zstandard
# (json ships with the Python standard library -- no install needed)
# !pip install ujson
# !pip install networkx
# !pip install matplotlib
# !pip install nltk
# !pip install stopwordsiso
# !pip install emoji
# !pip install bs4
# !pip install requests

In [2]:
# Read data
import pickle
import pandas as pd
import zstandard as zstd
import json
import ujson
import io
import os

# SNA
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import re
from itertools import islice

# Domain study
from collections import Counter
# import urlexpander

import random
random.seed(2024)

In [3]:
# Compatibility shim: put the legacy NumPy alias names back on the module so that
# lookups such as np.int / np.object / np.bool stop failing with
# "module 'numpy' has no attribute ...".
for _alias, _builtin in (("float", float), ("int", int), ("object", object), ("bool", bool)):
    setattr(np, _alias, _builtin)
# Legacy name for the scalar-type dictionary
np.typeDict = np.sctypeDict

# NLP
import nltk
# Fetch the NLTK resources used by this notebook; the saved output shows both
# packages were already up to date on the author's machine.
# NOTE(review): the 'stopwords' corpus looks unused in this part of the notebook,
# since the nltk.corpus.stopwords import is commented out in favour of
# stopwordsiso -- confirm before dropping the download.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
import string
# stopwordsiso is bound to the name `stopwords` (instead of the NLTK corpus above)
import stopwordsiso as stopwords

import emoji

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\167266\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\167266\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
#importing the libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


# Shared HTTP session: connection errors are retried up to 3 times with a
# back-off factor of 0.5, for both http:// and https:// URLs.
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session = requests.Session()
for scheme in ('http://', 'https://'):
    session.mount(scheme, adapter)

In [5]:
# Create Zreader class because Zreader module will not load (https://github.com/pushshift/zreader)

class Zreader:
    '''Stream text lines out of a Zstandard-compressed, newline-delimited file.

    Local replacement for the pushshift `zreader` module. The instance can be
    used as a context manager; the underlying file is also closed automatically
    once `readlines()` has been consumed to the end.
    '''

    def __init__(self, file, chunk_size=16384):
        '''Open `file` for streaming decompression.

        file       -- path of the .zst file to read
        chunk_size -- number of decompressed bytes requested per read
        '''
        self.fh = open(file,'rb')
        self.chunk_size = chunk_size
        self.dctx = zstd.ZstdDecompressor()
        self.reader = self.dctx.stream_reader(self.fh)
        self.buffer = ''


    def close(self):
        '''Release the decompression stream and the underlying file handle.'''
        self.reader.close()
        self.fh.close()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def readlines(self):
        '''Generator method that yields each line of the file (without the newline).

        The final line is yielded even when the file does not end with a
        newline. Calling this again after the file has been closed yields nothing.
        '''
        import codecs

        if self.fh.closed:
            return

        # An incremental decoder keeps the leading bytes of a multi-byte UTF-8
        # character that is split across two chunks, instead of dropping it.
        decoder = codecs.getincrementaldecoder('utf-8')(errors='ignore')

        while True:
            raw = self.reader.read(self.chunk_size)
            # End of stream is decided on the raw bytes: a chunk that merely
            # decodes to '' must not stop the iteration early.
            if not raw:
                break
            lines = (self.buffer + decoder.decode(raw)).split("\n")

            for line in lines[:-1]:
                yield line

            # The last piece may be an incomplete line; keep it for the next chunk
            self.buffer = lines[-1]

        # Flush whatever is left: the last line of a file without trailing newline
        tail = self.buffer + decoder.decode(b'', final=True)
        self.buffer = ''
        if tail:
            yield tail

        self.close()

### Read data

In [6]:
# Channels

# Compressed, newline-delimited JSON dump of the channels
file = "E:/._PhD/Data/Telegram/PushshiftChannels/files-DO NOT DELETE/channels.ndjson.zst"
# chunk_size is left at the Zreader default (16,384); pass it explicitly to tune
reader = Zreader(file)

# One parsed JSON document per line; `i` holds the number of records read
channels = [json.loads(line) for line in reader.readlines()]
i = len(channels)

In [7]:
# Alt-tech platforms mapped to the domain used to recognise them, taken from
# Wikipedia (https://en.wikipedia.org/wiki/Alt-tech#List_of_alt-tech_platforms).
# Mainstream social media platforms, and inactive platforms for which no URL
# could be traced, are left out.
alt = {
    "Gab": "gab.com",
    "Gettr": "gettr.com",
    "Parler": "parler.com",
    "Truth Social": "truthsocial.com",
    "BitChute": "bitchute.com",
    "DLive": "dlive.tv",
    "DTube": "d.tube",
    "Odysee": "lbry.com",
    "PewTube": "pewtube.com",
    "Rumble": "rumble.com",
    "Triller": "triller.co",
    "GiveSendGo": "givesendgo.com",
    "Hatreon": "hatreon.net",
    "SubscribeStar": "subscribestar.com",
    "GoyFundMe": "goyfundme.com",
    "MeWe": "mewe.com",
    "Minds": "minds.com",
    "Thinkspot": "thinkspot.com",
    "Patriots.win": "patriots.win",
    "Infogalactic": "infogalactic.com",
    "Metapedia": "metapedia.org",
    "8kun": "8kun.top",
    "WASP Love": "wasplove.com",
    "JustPaste.it": "justpaste.it",
    "Epik": "epik.com",
}

In [8]:
# Function to extract domain from URL string

def extract_domain(url, full=False):
    '''Pull the domain out of a URL string.

    url  -- the URL (or URL-like text) to inspect
    full -- when True, return the whole regex match object instead of just
            the domain, so the caller can read the other groups

    Returns the text of the named group "domain" (scheme and a leading
    "www." / "www<digit>." are not part of it). If the pattern does not match
    at the start of `url`, the input is returned unchanged.
    '''
    url_regex = r"(https?://|http?://|http.?)?(www\d?\.)?(?P<domain>[\w\.-]+\.\w+)(/\S*)?"
    # re.match anchors at the beginning of the string
    found = re.match(url_regex, url)
    if not found:
        return url
    return found if full else found.group("domain")

In [None]:
# Create edge list

# Fields copied from every retained message
keep = ['date','to_id', 'fwd_from', 'id', 'message']

edges = []      # (destination channel id, source channel id) per forwarded message
messages = []   # the retained messages, reduced to the `keep` fields

runner = 0      # last edge count that was printed as progress

with open('E:\\._PhD\\Data\\Telegram\\PushshiftChannels\\files-DO NOT DELETE\\messages.ndjson.zst', 'rb') as dec:
    dctx = zstd.ZstdDecompressor()
    reader = dctx.stream_reader(dec)
    tr = io.TextIOWrapper(reader, encoding="utf-8")
    for line in tr:
        line_str = line.encode("utf-8")
        msg = ujson.loads(line_str)

        # A message is retained only if it is a forward (non-null 'fwd_from')
        # and its date string contains '2018-01'.
        is_retained = ('fwd_from' in msg
                       and msg['fwd_from'] != None
                       and '2018-01' in msg['date'])
        if is_retained:
            msg_f = {key: msg[key] for key in keep if key in msg}
        else:
            msg_f = {}

        if msg_f:
            edges.append((msg_f['to_id']['channel_id'],msg_f['fwd_from']['channel_id']))
            messages.append(msg_f)

        # Progress: print the edge count once each time it reaches a new multiple of 100
        if len(edges) != runner and len(edges)%100 == 0:
            runner = len(edges)
            print(runner)


100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500


In [10]:
# Number of messages retained by the extraction loop
len(messages)

1442

In [54]:
# Number of forwarding edges (destination channel id, source channel id).
# NOTE(review): the extraction loop appends to `edges` and `messages` together,
# so both lengths should be equal; the saved outputs (1442 vs 3525) suggest the
# cells were run against different kernel states -- re-run from a fresh kernel.
len(edges)

3525

In [55]:
# Persist the edge list for later sessions.
# NOTE: the payload is written with pickle (binary) even though the file name
# ends in .json -- it has to be read back with pickle.load, not a JSON parser.
with open('E:\\._PhD\\Data\\Telegram\\PushshiftChannels\\extractions\\altech_all_edges_extract_domain.json', 'wb') as f:
    pickle.dump(edges, f)

In [56]:
# Persist the retained messages for later sessions.
# NOTE: the payload is written with pickle (binary) even though the file name
# ends in .json -- it has to be read back with pickle.load, not a JSON parser.
with open('E:\\._PhD\\Data\\Telegram\\PushshiftChannels\\extractions\\altech_all_messages_extract_domain.json', 'wb') as f:
    pickle.dump(messages, f)

### Domain study

In [57]:
# Every channel id found at either end of an edge. Entries equal to None
# (edges whose channel id is missing) are dropped before de-duplicating.
endpoints = [channel for edge in edges for channel in edge if channel != None]

nodes = list(set(endpoints))
print("Number of nodes: ", len(nodes))

# # check no duplicates
# dup = {k: v for k, v in dict(Counter(nodes)).items() if v > 1}
# len(dup)
# del(dup)

Number of nodes:  672


In [58]:
# INCOMING URLS BY CHANNELS

# One entry per node; each retained message contributes the list of URLs found
# in its text to the channel it was posted to ('to_id').
in_urls = {node: [] for node in nodes}
for mm in messages:
    found = re.findall(r'https\S+|http\S+', mm['message'])
    in_urls[mm['to_id']['channel_id']].append(found)

In [59]:
# OUTGOING URLS BY CHANNELS

# One entry per node; each retained message contributes the list of URLs found
# in its text to the channel it was forwarded from ('fwd_from').
out_urls = {node: [] for node in nodes}
for mm in messages:
    found = re.findall(r'https\S+|http\S+', mm['message'])
    try:
        out_urls[mm['fwd_from']['channel_id']].append(found)
    except KeyError:
        # Source channel id is missing or not a known node: skip this message
        continue

In [60]:
# Create the flat list of all shared urls: first every URL recorded under
# in_urls, then every URL recorded under out_urls.
# NOTE(review): a message's URLs are stored under both its destination and its
# source channel, so most URLs are counted twice in the totals below -- confirm
# this is the intended definition of "shared".
incoming_flat = [url
                 for per_channel in in_urls.values()
                 for per_message in per_channel
                 for url in per_message]
outgoing_flat = [url
                 for per_channel in out_urls.values()
                 for per_message in per_channel
                 for url in per_message]
all_urls = incoming_flat + outgoing_flat
print("Total number of URLs shared: ", len(all_urls))
print("Number of unique URLs: ", len(set(all_urls)))

Total number of URLs shared:  12976
Number of unique URLs:  2304


In [61]:
# Share of URLs that point to a link shortener (bit.ly / tinyurl)
sum("bit.ly" in x or "tinyurl" in x for x in all_urls)/len(all_urls)

0.0012330456226880395

In [62]:
# Reduce every shared URL to its domain.
# Short links are not filtered out here: they are reduced to the shortener's domain.

domains = list(map(extract_domain, all_urls))

In [63]:
# Most shared domains: number of occurrences per domain

domain_tally = Counter(domains)
counter_domains = dict(domain_tally)
# For a ranked view, sort counter_domains.items() by count in descending order
# and slice the first entries (e.g. with islice).

In [64]:
# Counts for the alt-tech domains that were actually shared.
# Matching is exact on the extracted domain string, so a host with an extra
# subdomain (anything other than a stripped leading "www.") is not counted.
shared_alt_domains = list(set(alt.values()) & set(counter_domains.keys()))
counter_alt = {domain: counter_domains[domain] for domain in shared_alt_domains}
counter_alt

{'d.tube': 71,
 'minds.com': 204,
 'dlive.tv': 209,
 'pewtube.com': 2,
 'gab.com': 290,
 'epik.com': 6,
 'parler.com': 145,
 'hatreon.net': 2,
 'lbry.com': 8,
 'justpaste.it': 731,
 'subscribestar.com': 162,
 'bitchute.com': 5922,
 'thinkspot.com': 1,
 'mewe.com': 162}