# Functions

## Import libraries

In [1]:
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import re
import spacy
from nltk.corpus import stopwords

import networkx as nx
from itertools import combinations
from itertools import permutations

import pickle

from collections import OrderedDict

## Data processing

### • Preprocess data

In [2]:
def preprocess_data(df_posts, df_comments):
    """Merge posts and comments into one date-ordered table of texts.

    Posts contribute ``sub_title + " " + sub_body``; a body that is missing or
    equal to ``[removed]`` / ``[deleted]`` counts as an empty string.
    Comments contribute ``com_body``; comments whose body is missing or is one
    of those placeholders are dropped.  Ticker abbreviations in the combined
    text are then expanded to coin names (case-insensitive, whole words only).

    Parameters
    ----------
    df_posts : pandas.DataFrame
        Needs the columns ``sub_id``, ``sub_title``, ``sub_body`` and
        ``sub_date``.  NOTE: this frame is modified in place, as it always
        was: placeholder bodies become NaN, a ``text`` column is added and
        ``sub_date`` is renamed to ``date``.
    df_comments : pandas.DataFrame
        Needs the columns ``sub_id``, ``com_body`` and ``com_date``.
        The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``sub_id``, ``text`` and ``date`` (parsed with
        ``pd.to_datetime``), sorted by ``date`` ascending.
    """
    placeholders = ['[removed]', '[deleted]']

    # Posts: blank out placeholder bodies, then join title and body.
    # .loc with a boolean mask also works when the index has duplicate labels.
    df_posts.loc[df_posts['sub_body'].isin(placeholders), 'sub_body'] = np.nan
    df_posts['text'] = df_posts['sub_title'] + " " + df_posts['sub_body'].fillna("")
    df_posts.rename(columns={'sub_date': 'date'}, inplace=True)

    # Comments: keep only rows with a real body.  rename() returns a new frame,
    # so nothing is written into a slice of the caller's data.
    bodies = df_comments['com_body']
    df_comments = df_comments[~bodies.isin(placeholders) & ~bodies.isnull()]
    df_comments = df_comments.rename(columns={'com_body': 'text', 'com_date': 'date'})

    # Flatten posts and comments into one table.
    columns = ['sub_id', 'text', 'date']
    df_preprocessed = pd.concat([df_posts[columns], df_comments[columns]])

    # Parse the dates *before* sorting so the order is chronological even when
    # the raw values are strings that do not sort lexically.
    df_preprocessed['date'] = pd.to_datetime(df_preprocessed['date'])
    df_preprocessed = df_preprocessed.sort_values(by='date', ascending=True)

    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, so r'\beth\b' would never match.
    # NOTE(review): 'etc' and 'lit' are also ordinary English words, so these
    # two rules will rewrite e.g. "etc." as "ethereum" -- confirm this is wanted.
    ticker_rules = (
        (r'\beth\b', 'ethereum'),
        (r'\betc\b', 'ethereum'),
        (r'\blit\b', 'litecoin'),
        (r'\bltc\b', 'litecoin'),
        (r'\bbtc\b', 'bitcoin'),
        (r'\bdoge\b', 'dogecoin'),
    )
    for pattern, coin in ticker_rules:
        df_preprocessed['text'] = df_preprocessed['text'].str.replace(
            pattern, coin, case=False, regex=True)

    return df_preprocessed

### • Split document based on time frame intervals

In [3]:
def get_timespan(df):
    start_date = np.datetime64(df.date.min(), 'D')
    end_date = np.datetime64(df.date.max(), 'D')
    time_span = (start_date, end_date)
    return time_span

In [4]:
def get_timeframes(time_span, delta):
    """Split an inclusive date span into consecutive half-open intervals.

    The span covers every day from ``start_date`` up to and including
    ``end_date``.  It is cut into intervals ``[start, end)`` that are ``delta``
    days long; the last interval is shortened so that it ends on the day after
    ``end_date``.

    Parameters
    ----------
    time_span : tuple
        ``(start_date, end_date)``; each value is converted with
        ``np.datetime64(value, 'D')``.
    delta : numpy.timedelta64 or int
        Interval length; converted with ``np.timedelta64(delta, 'D')``, so a
        plain integer is read as a number of days.

    Returns
    -------
    N : int
        Number of intervals (always equal to the length of both arrays below).
    timeframe_list : list
        ``[start_date_list, end_date_list]``, two ``datetime64[D]`` arrays.

    Raises
    ------
    ValueError
        If ``delta`` is not positive, or the span is empty or undefined
        (``end_date`` before ``start_date``, or a NaT bound).
    """
    start_date, end_date = time_span
    one_day = np.timedelta64(1, 'D')
    first_day = np.datetime64(start_date, 'D')
    # Exclusive upper bound: the day after the last day of the span.
    stop_day = np.datetime64(end_date, 'D') + one_day
    step = np.timedelta64(delta, 'D')

    # Written as "not a > b" so that NaT values (every comparison is False) are rejected too.
    if not step > np.timedelta64(0, 'D'):
        raise ValueError("delta must be a positive number of days")
    if not stop_day > first_day:
        raise ValueError("time_span must satisfy start_date <= end_date")

    start_date_list = np.arange(first_day, stop_day, step, dtype='datetime64[D]')
    # Each interval ends where the next one starts; the last ends at stop_day.
    # This also covers a span shorter than one delta, where the previous
    # implementation indexed an empty array.
    end_date_list = np.append(start_date_list[1:], stop_day)

    # Derive N from the intervals that were actually built; the previous
    # formula tested divisibility of (end - start) rather than of the
    # inclusive day count, so N could disagree with the arrays.
    N = len(start_date_list)
    timeframe_list = [start_date_list, end_date_list]

    return N, timeframe_list

In [5]:
def get_formatted_timeframes_str(timeframe_list):
    """Render each (start, end) pair as the label ``'[ start - end ) '``.

    ``timeframe_list`` is ``[start_date_list, end_date_list]``; the two
    sequences are paired element by element.
    """
    starts, ends = timeframe_list[0], timeframe_list[1]
    labels = []
    for begin, finish in zip(starts, ends):
        labels.append('[ {} - {} ) '.format(str(begin), str(finish)))
    return labels

In [6]:
def split_doc_by_timeframe(timeframe_list, df):
    start_date_list = timeframe_list[0] 
    end_date_list = timeframe_list[1]
    N = len(start_date_list)
    
    # Split document into list of time-frame documents (doc_list[i]) 
    doc_list = [None] * N
    
    for i in range(N):
        start_date = str(start_date_list[i])
        end_date = str(end_date_list[i])
        mask = (start_date <= df['date']) & (df['date'] < end_date)
        df_doc = df.loc[mask]
        doc_list[i] = df_doc['text'].to_list()
         
    return doc_list

### • Process words (tokenisation, building N-grams and lemmatisation)

In [7]:
# Stopword list: nltk's English stopwords followed by extra words that are
# filtered out during word processing.
stop_words = stopwords.words('english') + [
    'way', 'thank', 'address', 'lol', 'lot', 'people', 'info', 'year', 'sub', 'term', 'bread', 'file', 'question', 'word',
    'think', 'use', 'say', 'fuck', 'come', 'understand', 'try', 'see', 'send', 'look', 'want', 'disagree', 'celsius', 'need',
    'make', 'write', 'know', 'let', 'help', 'block',
    'always', 'read', 'thing', 'hope', 'well', 'work', 'happen', 'congratulation', 'team', 'bag', 'complete', 'even',
    'full', 'happy', 'tip', 'color', 'sweet', 'find', 'talk', 'play', 'step', 'create', 'new', 'different', 'mention', 'name',
    'aware', 'automatically', 'gotcha', 'check', 'generate', 'shoe', 'part', 'coffee', 'test', 'forget', 'yet', 'ever', 'shitty',
    'fucking', 'really', 'right', 'still', 'much', 'never', 'also',
]

def process_words(texts, stop_words=stop_words, allowed_tags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Turn raw documents into lists of cleaned, lemmatised tokens.

    Steps, in order:
      1. strip URLs (anything matching ``http\\S+``);
      2. tokenise with gensim's ``simple_preprocess`` (accents removed, tokens
         shorter than 3 characters dropped) and remove stopwords;
      3. learn bigram and trigram phrase models on those token lists and
         merge the detected phrases;
      4. lemmatise with spaCy ``en_core_web_sm`` and keep only tokens whose
         part-of-speech tag is in ``allowed_tags``;
      5. tokenise and remove stopwords once more, because lemmatisation can
         produce new stopwords or short tokens.

    Parameters
    ----------
    texts : sequence of str
        One raw document per entry.  The sequence is NOT modified (the
        previous implementation overwrote its entries with URL-stripped text).
        Entries that are not strings are treated as empty documents.
    stop_words : iterable of str
        Words to discard.
    allowed_tags : iterable of str
        spaCy coarse part-of-speech tags (``token.pos_``) to keep.

    Returns
    -------
    list of list of str
        One token list per input document, in the same order.
    """
    if len(texts) == 0:
        return []

    # Set membership is O(1); the stopword list is consulted for every token.
    stop_set = set(stop_words)

    # remove urls (into a new list, leaving the caller's data untouched)
    url_pattern = re.compile(r'http\S+')
    cleaned = [url_pattern.sub('', doc) if isinstance(doc, str) else '' for doc in texts]

    # remove stopwords, short tokens and letter accents
    tokenised = [[word for word in simple_preprocess(doc, deacc=True, min_len=3) if word not in stop_set]
                 for doc in cleaned]

    # build bigram and trigram models.
    # Phrases expects an iterable of token lists.  It used to be given the raw
    # strings, which it iterates character by character, so the models learned
    # character pairs and could never merge words.  Training on the same
    # token lists the models are applied to fixes that.
    bigram = gensim.models.Phrases(tokenised, min_count=20, threshold=100)
    trigram = gensim.models.Phrases(bigram[tokenised], threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # bi-gram and tri-gram implementation
    phrased = [trigram_mod[bigram_mod[doc]] for doc in tokenised]

    # Using spacy 'en_core_web_sm' model without the parser and NER components
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    texts_out = []
    for tokens in phrased:
        parsed = nlp(" ".join(tokens))
        # implement lemmatisation and filter out unwanted part of speech tags
        lemmas = [token.lemma_ for token in parsed if token.pos_ in allowed_tags]
        # remove stopwords and short tokens again after lemmatisation
        texts_out.append([word for word in simple_preprocess(" ".join(lemmas), deacc=True, min_len=3)
                          if word not in stop_set])

    return texts_out

In [8]:
def get_df_processed(formatted_timeframes_str, numOfSamples_list, doc_list_ready):
    """Build a per-timeframe summary table.

    The result has the columns ``'Timeframe interval'``, ``'Sample size'``
    (cast to int) and ``'Document'``, and its rows are labelled
    ``'Timeframe 1'``, ``'Timeframe 2'``, ... (one label per entry of
    ``doc_list_ready``).
    """
    columns = {
        'Timeframe interval': formatted_timeframes_str,
        'Sample size': numOfSamples_list,
        'Document': doc_list_ready,
    }
    summary = pd.DataFrame(columns)
    summary['Sample size'] = summary['Sample size'].astype(int)

    row_labels = []
    for number in range(1, len(doc_list_ready) + 1):
        row_labels.append('Timeframe ' + str(number))
    summary.index = row_labels
    return summary

### • Create the Dictionary and Corpus for Topic Modeling

In [9]:
def convert_to_bagOfWords(doc_list_ready):
    """Create a gensim dictionary and bag-of-words corpus for every time frame.

    ``doc_list_ready`` holds, per time frame, a list of tokenised documents.
    Returns ``(id2word_list, corpus_list)``: one ``corpora.Dictionary`` per
    frame, and per frame the ``doc2bow`` vector of each of its documents.
    """
    # One dictionary per time frame
    id2word_list = [corpora.Dictionary(documents) for documents in doc_list_ready]

    # One corpus (term-document frequencies) per time frame, built with that frame's dictionary
    corpus_list = [
        [dictionary.doc2bow(tokens) for tokens in documents]
        for dictionary, documents in zip(id2word_list, doc_list_ready)
    ]
    return id2word_list, corpus_list

## Topic Modelling

In [10]:
def get_topics_with_lda_models(corpus_list, id2word_list, num_topics_list, random_state=100, workers=8):
    """Train one LDA model per time frame and return each frame's topic labels.

    Every topic is labelled with its highest-weighted word; the labels of a
    frame are de-duplicated and sorted with ``np.unique``.

    Parameters
    ----------
    corpus_list : list
        One bag-of-words corpus per time frame.
    id2word_list : list
        The dictionary matching each corpus.
    num_topics_list : list of int
        Number of topics to fit for each time frame.
    random_state : int
        Seed passed to every model.
    workers : int
        Number of worker processes passed to ``LdaMulticore`` (previously
        fixed at 8, which remains the default).

    Returns
    -------
    list
        Per time frame, either an empty list (nothing to model) or a NumPy
        array of unique topic labels.
    """
    topics_list = []

    for i, corpus in enumerate(corpus_list):
        # Nothing to model: no documents at all, or documents but no vocabulary
        # (gensim raises "cannot compute LDA over an empty collection" then).
        if not corpus or len(id2word_list[i]) == 0:
            topics_list.append([])
            continue

        lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                               id2word=id2word_list[i],
                                               num_topics=num_topics_list[i],
                                               random_state=random_state,
                                               chunksize=100,
                                               passes=10,
                                               per_word_topics=True,
                                               workers=workers)

        # num_topics=-1 returns every topic.  The default (10) silently
        # truncated the list and caused an IndexError for larger models.
        topics_scores_words = lda_model.show_topics(num_topics=-1, formatted=False)
        # Label each topic with the highest weightage word
        labels = [scored_words[0][0] for _, scored_words in topics_scores_words]
        topics_list.append(np.unique(labels))

    return topics_list

In [11]:
def get_lda_models(corpus_list, id2word_list, num_topics_list, random_state=100, workers=8):
    """Train one ``LdaMulticore`` model per time frame.

    Parameters
    ----------
    corpus_list : list
        One bag-of-words corpus per time frame.
    id2word_list : list
        The dictionary matching each corpus.
    num_topics_list : list of int
        Number of topics to fit for each time frame.
    random_state : int
        Seed passed to every model.
    workers : int
        Number of worker processes passed to ``LdaMulticore`` (previously
        fixed at 8, which remains the default).

    Returns
    -------
    list
        One entry per time frame: the trained model, or ``None`` when the
        frame has nothing to model.
    """
    lda_model_list = []

    for i, corpus in enumerate(corpus_list):
        # Nothing to model: no documents at all, or documents but no vocabulary
        # (gensim raises "cannot compute LDA over an empty collection" then).
        if not corpus or len(id2word_list[i]) == 0:
            lda_model_list.append(None)
            continue

        lda_model_list.append(gensim.models.LdaMulticore(corpus=corpus,
                                                         id2word=id2word_list[i],
                                                         num_topics=num_topics_list[i],
                                                         random_state=random_state,
                                                         chunksize=100,
                                                         passes=10,
                                                         per_word_topics=True,
                                                         workers=workers))
    return lda_model_list

In [12]:
def get_topics(lda_model_list, num_topics_list):
    """Extract topic labels from already-trained LDA models.

    Every topic is labelled with its highest-weighted word; the labels of a
    time frame are de-duplicated and sorted with ``np.unique``.

    Parameters
    ----------
    lda_model_list : list
        One trained model per time frame, or ``None`` for a frame without a
        model.
    num_topics_list : list of int
        Number of topics to label for each time frame (only read for frames
        that have a model).

    Returns
    -------
    list
        Per time frame, either an empty list (no model) or a NumPy array of
        unique topic labels.
    """
    topics_list = []

    for i, lda_model in enumerate(lda_model_list):
        if lda_model is None:
            topics_list.append([])
            continue

        # num_topics=-1 returns every topic in topic-id order.  The default
        # (10) silently truncated the list, so asking for more than ten
        # topics raised an IndexError below.
        topics_scores_words = lda_model.show_topics(num_topics=-1, formatted=False)
        wanted = topics_scores_words[:num_topics_list[i]]
        # Label each topic with the highest weightage word
        labels = [scored_words[0][0] for _, scored_words in wanted]
        topics_list.append(np.unique(labels))

    return topics_list

In [13]:
def get_df_topics(topics_list):
    """Lay out the topic labels of every time frame as a table.

    Rows are labelled ``'Timeframe 1'``, ``'Timeframe 2'``, ... and columns
    ``'Topic 1'``, ``'Topic 2'``, ... up to the largest number of topics found
    in any frame.  Frames with fewer topics are padded with ``'-'``.

    An empty ``topics_list`` yields an empty DataFrame (the previous
    implementation raised ``ValueError`` from ``max()`` on an empty list).
    """
    # Plain lists give pandas one uniform row type, whether a frame's labels
    # arrive as a list or as a NumPy array.
    rows = [list(topics) for topics in topics_list]
    df_topics = pd.DataFrame(rows)

    df_topics.index = ['Timeframe ' + str(i + 1) for i in range(len(rows))]
    # The frame is as wide as the longest row, so its width is the column count.
    df_topics.columns = ['Topic ' + str(i + 1) for i in range(df_topics.shape[1])]
    df_topics.fillna('-', inplace=True)
    return df_topics

## Network analysis

In [14]:
def get_node_pair_list(topics_list):
    """Return every ordered pair of distinct positions within each topic group.

    Pairs are generated per group with ``permutations(group, 2)`` and
    concatenated in group order, so both ``(a, b)`` and ``(b, a)`` appear.
    """
    return [pair for topics in topics_list for pair in permutations(topics, 2)]

In [15]:
def get_network_graph(topics_list):
    """Build an undirected co-occurrence graph of topics.

    Every distinct topic becomes a node, and two topics are joined by an edge
    when they appear in the same group of ``topics_list``.  Each edge gets a
    ``'betweenness'`` attribute holding its unnormalised edge betweenness
    centrality.

    Returns ``(G, node_pair_list)``, where ``node_pair_list`` contains the
    ``combinations(group, 2)`` pairs of all groups, in group order.
    """
    graph = nx.Graph()
    graph.add_nodes_from(list(set().union(*topics_list)))

    all_pairs = []
    for topics in topics_list:
        group_pairs = list(combinations(topics, 2))
        graph.add_edges_from(group_pairs)
        all_pairs.extend(group_pairs)

    centrality = nx.edge_betweenness_centrality(graph, normalized=False)
    nx.set_edge_attributes(graph, centrality, 'betweenness')
    return graph, all_pairs

## Others

In [16]:
def flatten(t):
    """Concatenate the items of every sub-iterable of ``t`` into one flat list (one level deep)."""
    flat = []
    for group in t:
        flat.extend(group)
    return flat

In [17]:
def save_list(filename, mylist):
    """Pickle ``mylist`` into the file ``filename``, overwriting any existing file."""
    serialised = pickle.dumps(mylist)
    with open(filename, mode='wb') as handle:
        handle.write(serialised)

In [18]:
def load_list(filename):
    """Read the file ``filename`` and return the object pickled in it.

    WARNING: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(filename, mode='rb') as handle:
        payload = handle.read()
    return pickle.loads(payload)

In [19]:
def get_sorted_dict(l):
    """Sum the values of ``(key, value)`` pairs per key and rank the keys.

    Returns a dict mapping each key to the sum of its values, ordered from the
    largest total to the smallest.  Keys with equal totals keep the order in
    which they first appeared in ``l``.
    """
    totals = {}
    for key, value in l:
        # Start from 0 so the additions happen in the same order as sum() would do them.
        totals[key] = totals.get(key, 0) + value

    ranking = sorted(totals.items(), key=lambda entry: entry[1], reverse=True)
    return dict(ranking)