## Import packages

In [None]:
# !pip install zstandard
# !pip install ujson
# !pip install stopwordsiso
# !pip install matplotlib
# !pip install nltk
# !pip install bs4

In [None]:
# Read data
import pickle
import pandas as pd
import zstandard as zstd
import json
import ujson
import io
import os

# SNA
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import re
from itertools import islice

# Domain study
from collections import Counter
# import urlexpander

import random
# Fix the seed of Python's built-in `random` module so that draws made through it
# are repeatable between runs.
# NOTE(review): only `random` is seeded here; NumPy's random generator is not
# seeded in this cell -- confirm whether later cells rely on it.
random.seed(2024)

In [None]:
# Error solving
# Re-create attribute names that some code still looks up on the `numpy` module.
# The int/object/bool entries answer errors of the form
# "module 'numpy' has no attribute '<name>'".
for _alias, _target in (('float', float), ('int', int), ('object', object), ('bool', bool)):
    setattr(np, _alias, _target)
del _alias, _target  # keep the loop helpers out of the notebook namespace
np.typeDict = np.sctypeDict  # typeDict becomes another name for the sctypeDict mapping

# NLP
import nltk
# Fetch the NLTK resources used for tokenisation and stop words; NLTK only
# reports "already up-to-date" for packages that are present locally.
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)
del _nltk_resource  # keep the loop helper out of the notebook namespace
from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
import string
import stopwordsiso as stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\167266\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\167266\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\167266\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
#importing the libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


# One shared HTTP session whose adapter retries connection errors
# (connect=3, backoff_factor=0.5); the same adapter serves both URL prefixes.
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
for _url_prefix in ('http://', 'https://'):
    session.mount(_url_prefix, adapter)
del _url_prefix  # keep the loop helper out of the notebook namespace

## Read the data and instantiate the text

In [None]:
# Messages
# NOTE(review): the file carries a .json extension but is read with pickle.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
with open("E:\\._PhD\\Data\\Telegram\\PushshiftChannels\\extractions\\all_messages_urls.json", 'rb') as pickle_file:
    messages = pickle.load(pickle_file)

# Raw count first, then the count of distinct message texts.
print("Total number of messages, including duplicates:", len(messages))
print("Total number of unique messages:", len({record['message'] for record in messages}))

Total number of messages, including duplicates: 2669449
Total number of unique messages: 2081743


In [None]:
# # Keep only messages with more than 5 space-separated tokens
# messages = [m for m in messages if len(m['message'].split(" ")) > 5]
# print("Total number of messages, excluding duplicates:", len(set([m['message'] for m in messages])))

Total number of messages, excluding duplicates: 1760623


## Pre-processing

In [None]:
# Build the working frame from the raw records, keeping five fields and
# replacing the two nested peer dicts by their channel ids:
#   to_id    -> id of the destination channel
#   fwd_from -> id of the channel of origin, stored as a string
msg_df = (
    pd.DataFrame(messages)[['date','id','message','to_id','fwd_from']]
    .assign(
        to_id=lambda frame: frame['to_id'].apply(lambda peer: peer['channel_id']),
        fwd_from=lambda frame: frame['fwd_from'].apply(lambda origin: str(origin['channel_id'])),
    )
)

# Sanity check: the frame must hold as many distinct message texts as the raw list.
print("Total number of messages, excluding duplicates:", msg_df['message'].nunique(dropna=False))
print("Is the number of unique messages in the df equal to the number of unique messages in the list?", len({record['message'] for record in messages}) == msg_df['message'].nunique(dropna=False))

Total number of messages, excluding duplicates: 2081743
Is the number of unique messages in the df equal to the number of unique messages in the list? True


In [None]:
import tldextract

# Function to extract domain names
def extract_url(message):
    """Return the host name of every URL-like token found in ``message``.

    The text is split on whitespace and each token is passed to
    ``tldextract.extract``. A token contributes one entry when both a domain
    and a suffix are recognised; the entry is ``subdomain.domain.suffix`` with
    surrounding dots stripped, so an empty subdomain leaves ``domain.suffix``
    (e.g. ``"www.infowars.com"`` or ``"t.me"``). Only these host parts are
    kept; the rest of the token is discarded.

    Parameters
    ----------
    message : str
        Message text. Any non-string value (for example ``None`` or ``NaN``
        standing in for a missing message) is treated as containing no URLs.

    Returns
    -------
    list of str
        Host names in order of appearance. Repeated hosts are kept, and the
        list is empty when no token qualifies.
    """
    # Missing messages arrive as None/NaN when this is applied to a DataFrame
    # column; calling .split() on them would raise AttributeError.
    if not isinstance(message, str):
        return []

    urls = []
    for token in message.split():
        ext = tldextract.extract(token)
        # Require both parts so that ordinary words and bare suffixes are skipped.
        if ext.domain and ext.suffix:
            urls.append(f"{ext.subdomain}.{ext.domain}.{ext.suffix}".strip("."))
    return urls

# Build the 'url' column: one list of extracted host names per message
msg_df['url'] = msg_df['message'].map(extract_url)

In [None]:
# Preview the new column: each row holds the list of host names found in that message
msg_df['url']

0                 [www.pewtrusts.org]
1                  [www.infowars.com]
2                   [siegekultur.biz]
3                       [twitter.com]
4                     [www.pakin.org]
                      ...            
2669444                        [t.me]
2669445                        [t.me]
2669446                        [t.me]
2669447    [telegram.me, telegram.me]
2669448                   [mixlr.com]
Name: url, Length: 2669449, dtype: object

In [None]:
# Write the frame, including the 'url' column, to disk without the index.
# Backslashes are doubled so that none of them is read as the start of an escape
# sequence (same convention as the input path used when loading the messages).
# NOTE: the 'url' column holds Python lists; CSV stores them as their text form.
msg_df.to_csv('E:\\._PhD\\Publications\\SocSem_Telegram\\Code\\msg_df_with_urls.csv', index=False)