In [2]:
from langdetect import detect
import re
from tqdm.auto import tqdm
import pickle
import pandas as pd
import unicodedata
import tmtoolkit
import numpy as np
from gensim.parsing.preprocessing import strip_punctuation
from TGDataset import db_utilities
import spacy


from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from multiprocessing import Pool
from transformers import BertTokenizer, BertModel
import torch
import os

  hasattr(torch, "has_mps")
  and torch.has_mps  # type: ignore[attr-defined]


In [3]:
def split_list(data, partition_size):
    return [data[i: i+partition_size] if i+partition_size< len(data) else data[i: len(data)] for i in range(0, len(data), partition_size)]


def get_corpus(channel):
    _id = channel['_id']
    discarded_messages = 0
    messages = channel['text_messages']
    len_messages = len(messages)
    messages = [messages[key]['message'] for key in messages if len(messages[key]['message']) > 25]
    discarded_messages += len_messages - len(messages)
    ok_messages = []
        
    for message in messages:
        if len(ok_messages) >= 100:
            break
        english = False
        if english == True:
            try:
                if detect(message)=='en': ok_messages.append(message)
                else: discarded_messages +=1
            except:
                pass
        else:
            ok_messages.append(message)
        
    single_corpus = ' '.join(ok_messages)

    return (single_corpus, _id, discarded_messages, ok_messages)


def save_as_pickle(text_list, outfile_name):
    with open('/media/teun/Hard Driver/TxMM/preprocessed_docs/'+outfile_name, 'wb') as fp:
        pickle.dump(text_list, fp)


def preprocess(channel):
    sp = spacy.load('en_core_web_sm',  disable=['parser', 'ner'])

    channel_tokens = []
    for message in channel:

        # Get lemma
        tokens = [token.lemma_ for token in sp(message)]

        # Normalize Unicode String and convert to lowercase
        tokens = [unicodedata.normalize('NFKD', token).lower() for token in tokens]

        #print('Removing all but chars and numbers...')
        tokens = [re.sub(r'[\W_]+', '',token) for token in tokens] 

        # Remove numbers, but not words that contain numbers.
        tokens = [token for token in tokens if not token.isnumeric()]

        # Remove words that are only one or two characters.
        tokens = [token for token in tokens if len(token) > 2]

        # Remove stopwords 
        stop_words = stopwords.words('english')
        stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'hi', 'ah', 'ha', 'joinchat', 'https', 'http', 'www', 'channel', 'join', 'bot', 'com', 'us'])
        tokens = [word for word in tokens if word not in stop_words]

        # Strip punctuation
        tokens = [strip_punctuation(token) for token in tokens] 

        channel_tokens += tokens
    
    return channel_tokens


def perform_preprocessing(portion_size=1, n_pool=2):
    print('Getting channels')
    df = pd.read_csv('TGDataset/labeled_data/channel_to_language_mapping.csv', sep='\t')
    channels = list(df)
    english = True
    if english == True:
        df_ = df[df['language']=='en']
        channels = list(df_['ch_id'])
    channels = db_utilities.get_channels_by_ids(channels, db_name='Telegram_test')

    portions = split_list(channels, portion_size)

    print('Starting preprocessing')
    for i, portion in tqdm.tqdm(enumerate(portions), total=len(portions)):
        corpus = []
        all_messages = []
        id_list = []
        discarded_messages = 0  
        with Pool(n_pool) as pool:
            for single_corpus, _id, s_discarded_messages, ok_messages in pool.map(get_corpus, portion):
                corpus.append(single_corpus)
                id_list.append(_id)
                all_messages.append(ok_messages)
                discarded_messages += s_discarded_messages
            
        save_as_pickle(id_list, f'ids_list_topic_modeling/n_gram_ids_list_topic_modeling_{i}')
        save_as_pickle(discarded_messages, f'discarded_messages_topic_modeling/n_gram_discarded_messages_topic_modeling_{i}')
        save_as_pickle(corpus, f'corpus/n_gram_corpus_{i}')
        save_as_pickle(all_messages, f'messages_per_channel/messages_{i}')

        docs = []
        with Pool(n_pool) as pool:
            for channel_tokens in tqdm.tqdm(pool.imap(preprocess, all_messages), total=len(all_messages)):
                docs.append(channel_tokens)
            
        texts = [' '.join(doc) for doc in docs]
            
        save_as_pickle(texts, f'texts_spacy/texts_topic_modeling_{i}')

# perform_preprocessing()

In [4]:
def open_pickle(filename):
    with open ('/media/teun/Hard Driver/TxMM/preprocessed_docs/'+filename, 'rb') as fp:
        saved_file = pickle.load(fp)

    return saved_file


def compute_coherence_values(corpus, k):

    def compute_coherence(lda_model, vectors, vocab): 
        return tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='u_mass', 
                            top_n=25, 
                            topic_word_distrib=lda_model.components_, 
                            dtm=vectors, 
                            vocab=np.array([x for x in vocab.keys()]),
                            return_mean=True)
    
    _, lda_model, vectors, count_vector = sk_LDA(corpus, k)
    
    coherence_model_lda = compute_coherence(lda_model, vectors, count_vector.vocabulary_)
    
    return coherence_model_lda


def sk_LDA(corpus, n_topic):
    en_stops = list(set(stopwords.words('english')).union(set(['jpg', 'src', 'png', 'mp4', 'mp3', 'ref', 'url', 'pdf'])))

    vectorizer = CountVectorizer(analyzer='word',       
                                min_df=0.01,
                                max_df=0.40,                       
                                stop_words= en_stops,#'english',
                                lowercase=True,                   
                                token_pattern='[a-zA-Z0-9]{3,}',  
                                max_features=10000,          
                                )

    data_vectorized = vectorizer.fit_transform(corpus)

    lda_model = LatentDirichletAllocation(n_components=n_topic, # Number of topics
                                        learning_method='online',
                                        random_state=0,       
                                        n_jobs =-1  # Use all available CPUs
                                        )
    lda_output = lda_model.fit_transform(data_vectorized)

    return lda_output, lda_model, data_vectorized, vectorizer


def perform_LDA(n_portions=20, min_topics=10, max_topics=31, step_size=1):
    texts = []
    id_list = []
    
    for idx in tqdm(range(n_portions)):
        new_texts = open_pickle(f'texts_spacy/texts_topic_modeling_{idx}')
        new_id_list = open_pickle(f'ids_list_topic_modeling/n_gram_ids_list_topic_modeling_{idx}')
        
        for k in range(len(new_texts)):
            texts.append(new_texts[k])
            id_list.append(new_id_list[k])

    # Topics range
    topics_range = range(min_topics, max_topics, step_size)

    model_results = {'Topics': [],'Coherence': []}

    # iterate through number of topics
    for k in tqdm(topics_range):
        
        # get the coherence score for the given parameters
        cv = compute_coherence_values(corpus=texts, k=k)
        # Save the model results
        model_results['Topics'].append(k)
        model_results['Coherence'].append(cv)
                    
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

# perform_LDA(n_portions=10)

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained("bert-base-multilingual-cased")
def bert_preprocess(channel, batch_size=2):
    batches = split_list(channel, batch_size)
    # tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    # model = BertModel.from_pretrained("bert-base-multilingual-cased")
    # print('calc embeddings')
    embeddings = []
    for batch in batches:
        # print(batch)
        messages = tokenizer(batch, return_tensors='pt', max_length=512, truncation=True, padding=True)
        embedding = model(**messages)['pooler_output'].detach()
        for e in embedding:
            embeddings.append(e.numpy())
    # return embeddings
    mean = np.mean(embeddings, axis=0)
    return mean

In [6]:
def preprocessing_bert(portion_size=1000, n_pool=1):
    # print('Loading Bert')
    # print('Getting channels')
    df = pd.read_csv('TGDataset/labeled_data/channel_to_language_mapping.csv', sep='\t')

    done_prev = open_pickle(f'processed_channels')
    done_channels = done_prev.keys()
    df = df[~df['ch_id'].isin(done_channels)]
    done_numbers = max(done_prev.values())
    iteration = done_numbers+1

    english = False
    if english == True:
        df_ = df[df['language']=='en']
        channels = list(df_['ch_id'])
    channels = list(df['ch_id'])
    portions = split_list(channels, portion_size)
    # print(len(channels))

    done = done_prev

    # print('Starting preprocessing')
    for i, portion in tqdm(enumerate(portions), total=len(portions)):
        print(len(portion))
        # print('Getting corpus')
        # print('Getting channels')
        channels = db_utilities.get_channels_by_ids(portion, db_name='Telegram_test')
        # print('Getting messages')
        corpus = []
        all_messages = []
        id_list = []
        discarded_messages = 0  
        with Pool(n_pool) as pool:
            for single_corpus, _id, s_discarded_messages, ok_messages in pool.map(get_corpus, channels):
                corpus.append(single_corpus)
                id_list.append(_id)
                all_messages.append(ok_messages)
                discarded_messages += s_discarded_messages

        save_as_pickle(id_list, f'ids_list_topic_modeling/n_gram_ids_list_topic_modeling_{i+iteration}')
        save_as_pickle(discarded_messages, f'discarded_messages_topic_modeling/n_gram_discarded_messages_topic_modeling_{i+iteration}')
        save_as_pickle(corpus, f'corpus/n_gram_corpus_{i+iteration}')
        save_as_pickle(all_messages, f'messages_per_channel/messages_{i+iteration}')
        # print(len(all_messages))
        # print('Calculating embeddings')
        embeddings = []
        for channel_messages in tqdm(all_messages):
            channel_embedding = bert_preprocess(channel_messages)
            embeddings.append(channel_embedding)
        
        save_as_pickle(embeddings, f'texts_bert/texts_topic_modeling_{i+iteration}')

        for channel in portion:
            done[channel] = i

        save_as_pickle(done, f'processed_channels')

preprocessing_bert(1000, 1)

  0%|          | 0/2 [00:00<?, ?it/s]

1000


  0%|          | 0/117 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


979


  0%|          | 0/160 [00:00<?, ?it/s]

In [5]:
doc = open_pickle(f'processed_channels')

In [None]:
from transformers import BertTokenizer, BertModel
import torch
# tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# model = BertModel.from_pretrained("bert-base-multilingual-cased")
text = "Replace me by any text you'd like like a longer text"
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)


In [None]:
print(encoded_input)
print(output['pooler_output'].shape)

{'input_ids': tensor([[  101, 72337, 72654, 10911, 10155, 11178, 15541, 13028,   112,   172,
         11850, 11850,   169, 20165, 15541,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
torch.Size([1, 768])


In [None]:
def bert_preprocess(channel):
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    model = BertModel.from_pretrained("bert-base-multilingual-cased")
    embeddings = []
    for message in channel:
        message = tokenizer(message, return_tensors='pt', max_length=512, truncation=True)
        try:
            embedding = model(**message)['pooler_output'].detach()
        except:
            print('failed')
            continue
        embeddings.append(embedding)
    return np.mean(embeddings)

In [None]:
done_prev = open_pickle(f'processed_channels')
done_numbers = done_prev.values()
print(max(done_numbers))

12


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained("bert-base-multilingual-cased")
def bert_preprocess(channel, batch_size=2):
    batches = split_list(channel, batch_size)
    # tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    # model = BertModel.from_pretrained("bert-base-multilingual-cased")
    # print('calc embeddings')
    embeddings = []
    for batch in batches:
        # print(batch)
        messages = tokenizer(batch, return_tensors='pt', max_length=512, truncation=True, padding=True)
        embedding = model(**messages)['pooler_output'].detach()
        for e in embedding:
            embeddings.append(e.numpy())
    # return embeddings
    mean = np.mean(embeddings, axis=0)
    return mean

In [None]:
channel = open_pickle(f'/texts_bert/texts_topic_modeling_0')

In [None]:
e = bert_preprocess(channel, 8)

In [None]:
e = bert_preprocess(channel, 2)

In [None]:
e = bert_preprocess(channel, 1)

In [None]:
df = pd.read_csv('TGDataset/labeled_data/channel_to_language_mapping.csv', sep='\t')
df.shape

(120979, 2)

In [None]:
len(list(df['ch_id']))

120979

In [None]:
channels = list(df['ch_id'])

In [None]:
print(len(channels))

120979


In [None]:
db_channels = db_utilities.get_channels_by_ids(channels, db_name='Telegram_test')

In [None]:
(db_channels)

[{'_id': 1000002626,
  'username': None,
  'creation_date': 1442918740,
  'title': 'Hyperadio',
  'description': 'HR official channel http://hyperadio.retroscene.org',
  'scam': False,
  'generic_media': {'75': {'title': None,
    'date': 1511324314,
    'author': None,
    'extension': '.jpe',
    'is_forwarded': False,
    'forwarded_from_id': None,
    'media_id': 'AgADAgADkKgxG5b0qEgkUZcki9xzQbXrAw4ADBwDAAIC',
    'forwarded_message_date': None},
   '64': {'title': 'xpeh — dina blasters_PT2_01.ogg',
    'date': 1491141922,
    'author': None,
    'extension': '.ogg',
    'is_forwarded': False,
    'forwarded_from_id': None,
    'media_id': 'CQADAgADDgADiLoISws0m14t-prqAg',
    'forwarded_message_date': None},
   '63': {'title': 'xpeh — magicxer_PT2_01.ogg',
    'date': 1491141870,
    'author': None,
    'extension': '.ogg',
    'is_forwarded': False,
    'forwarded_from_id': None,
    'media_id': 'CQADAgADDQADiLoIS7-QBEVFuG7dAg',
    'forwarded_message_date': None},
   '62': {'tit

In [None]:
len(channel)

71

In [None]:
corpus = open_pickle('/corpus/n_gram_corpus_0')

In [None]:
len(corpus)

71

In [None]:
ids = open_pickle('/ids_list_topic_modeling/n_gram_ids_list_topic_modeling_0')

In [None]:
len(ids)

71

In [None]:
list(doc.keys())

[1008309605,
 1009620245,
 1051563501,
 1058903397,
 1060738408,
 1098749357,
 1106613741,
 1137022322,
 1146983923,
 1360369018,
 1460507926,
 1000183358,
 1011455708,
 1050252884,
 1050514988,
 1054709267,
 1065719529,
 1074370071,
 1080399474,
 1110283958,
 1167955581,
 1207539348,
 1256560337,
 1277793819,
 1292211814,
 1315804794,
 1326814876,
 1348310617,
 1001494364,
 1003329464,
 1003853720,
 1005426627,
 1006212930,
 1007523621,
 1046845208,
 1148033008,
 1223792426,
 1332582177,
 1339660298,
 1454216998,
 1477810066,
 1485412155,
 1005426892,
 1006475428,
 1009359128,
 1009621337,
 1048418525,
 1080399952,
 1086429419,
 1132828803,
 1133877310,
 1146984771,
 1207802156,
 1238472847,
 1323931881,
 1347262809,
 1350408480,
 1463130336,
 1480431834,
 1003854222,
 1005951468,
 1025349986,
 1050253757,
 1054185827,
 1069390349,
 1070115450,
 1072011737,
 1076730261,
 1100323266,
 1103469024,
 1115527783,
 1119197545,
 1125136401,
 1145674352,
 1215928857,
 1223006612,
 1318951328,