In [None]:
# imports

import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
from glob import glob
import networkx as nx
import itertools
from collections import defaultdict
import re
import nltk
import json

import matplotlib.pyplot as plt
from matplotlib_venn import venn3
import plotly.express as px
import plotly.graph_objects as go
import plotly

In [None]:
# setups

pd.options.display.max_columns = 50
# Download every NLTK resource the notebook uses later, so a fresh kernel can
# run top to bottom: 'punkt' for word_tokenize, 'stopwords' for the English
# stop-word list and 'wordnet' for WordNetLemmatizer.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# one dict per field of knowledge; each will map a table name to its data frame
bio, chem, ph = {}, {}, {}

In [None]:
# load all dfs: every CSV found in a domain directory becomes one data frame,
# stored under the file's base name (e.g. 'Posts', 'Tags')
domain_dicts = [('biology', bio), ('chemistry', chem), ('physics', ph)]
# iterating a list (not a zip object) lets tqdm know the total
for domain, data_dict in tqdm(domain_dicts):
    for path in glob(f'{domain}/*.csv'):
        print(path)
        # os.path.basename understands the platform's path separator, whereas
        # splitting on '/' yields a wrong key for Windows-style paths
        name = os.path.basename(path).split('.')[0]
        data_dict[name] = pd.read_csv(path, index_col=0)

# Tags

In [None]:
# build one set of tag names per stack, plus the union of all three
bio_tags, chem_tags, ph_tags = (
    set(stack['Tags'].TagName.tolist()) for stack in (bio, chem, ph)
)
all_tags = bio_tags.union(chem_tags, ph_tags)

# Venn diagram showing how the three tag vocabularies overlap
venn3([bio_tags, chem_tags, ph_tags], set_labels=['bio', 'chem', 'physics'])

In [None]:
# tags shared by all three domains
set.intersection(bio_tags, chem_tags, ph_tags)

In [None]:
# tags present in both bio and chem but absent from ph
# ('-' binds tighter than '&', so this is bio & (chem - ph), which is the same set)
bio_tags.intersection(chem_tags.difference(ph_tags))

In [None]:
# remove '<>' from tags
def split_tags(string):
    """Turn a raw tag field such as '<a><b>' into a list of tag names.

    Parameters
    ----------
    string : object
        Raw value of a post's Tags field.

    Returns
    -------
    list
        The tag names. A list input is returned as a copy, so applying the
        function to an already-parsed column does not erase the tags. Any
        other non-string value, an empty string and a bare '<>' all give [].
    """
    if isinstance(string, list):
        # already parsed (e.g. the cell was run twice) - keep the tags
        return list(string)
    if not isinstance(string, str):
        return []
    inner = string.lstrip('<').rstrip('>')
    # without this guard ''.split('><') would produce [''], a bogus empty tag
    return inner.split('><') if inner else []

In [None]:
# parse the raw Tags field of every stack's posts into lists of tag names
for base in (bio, chem, ph):
    posts = base['Posts']
    posts['Tags'] = posts['Tags'].map(split_tags)

In [None]:
# preview the parsed tag lists of the first bio posts
bio['Posts']['Tags'].head()

## Graph analysis

In [None]:
# build a tag co-occurrence graph: each tag is a node and two tags are connected
# when they co-occur on the same post; the more often they co-occur, the higher
# the edge weight
def create_graph(dbs):
    """Build a weighted, undirected tag co-occurrence graph.

    Parameters
    ----------
    dbs : iterable of dict
        Each dict provides a 'Tags' frame with a TagName column and a 'Posts'
        frame whose 'Tags' column holds an iterable of tag names per post.

    Returns
    -------
    networkx.Graph
        Nodes are tag names; the 'weight' of an edge is the number of posts on
        which its two tags appear together.
    """
    graph = nx.Graph()
    for db in dbs:
        graph.add_nodes_from(db['Tags'].TagName.tolist())

    pair_counts = defaultdict(int)
    for db in dbs:
        for post_tags in db['Posts']['Tags'].tolist():
            for first, second in itertools.combinations(post_tags, r=2):
                # the graph is undirected, so count (a, b) and (b, a) under one key
                key = (second, first) if first > second else (first, second)
                pair_counts[key] += 1

    weighted_edges = [(a, b, count) for (a, b), count in pair_counts.items()]
    graph.add_weighted_edges_from(weighted_edges)
    return graph
    
# one graph over all three stacks, then one graph per individual stack
tag_graph = create_graph([bio, chem, ph])
bio_graph, chem_graph, ph_graph = (create_graph([db]) for db in (bio, chem, ph))

In [None]:
# number of tag nodes in the combined graph
tag_graph.number_of_nodes()

In [None]:
# number of distinct co-occurring tag pairs in the combined graph
tag_graph.number_of_edges()

In [None]:
# bio graph drawn with the Kamada-Kawai layout
plt.figure(figsize=(20, 20))
bio_positions = nx.kamada_kawai_layout(bio_graph)
nx.draw(bio_graph, bio_positions, edge_color=(0, 0, 0, 0.15))

In [None]:
# bio graph drawn with the spring (force-directed) layout
plt.figure(figsize=(20, 20))
# the spring layout starts from random positions; seeding it makes the figure
# identical on every run
bio_spring_positions = nx.spring_layout(bio_graph, seed=100)
nx.draw(bio_graph, bio_spring_positions, edge_color=(0, 0, 0, 0.15))

In [None]:
# assign color to tags from domains and their combinations;
# the key is (in bio_tags, in chem_tags, in ph_tags)
color_assignment = {
    # Graph nodes also come from the tags used on posts, so a tag may be
    # missing from every Tags table; without this entry such a node raises
    # KeyError.
    (False, False, False): 0,
    (True, False, False): 1,
    (False, True, False): 2,
    (False, False, True): 3,
    (True, True, False): 4,
    (True, False, True): 5,
    (False, True, True): 6,
    (True, True, True): 7,
}

# one color code per node, in the node order of tag_graph
tag_colors = [
    color_assignment[(tag in bio_tags, tag in chem_tags, tag in ph_tags)]
    for tag in tag_graph.nodes()
]

In [None]:
# plot the combined graph with the Kamada-Kawai layout, colored by domain membership
plt.figure(figsize=(20, 20))
tag_positions = nx.kamada_kawai_layout(tag_graph)
nx.draw(tag_graph, tag_positions, node_color=tag_colors, edge_color=(0, 0, 0, 0.15))

## Tags count and frequency

In [None]:
# rank the tags of each graph by node degree (number of distinct co-occurring
# tags), highest first
def _rank_by_degree(graph):
    """Return the (tag, degree) pairs of `graph`, highest degree first."""
    ranked = [(tag, degree) for tag, degree in graph.degree()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

bio_sorted_tags = _rank_by_degree(bio_graph)
chem_sorted_tags = _rank_by_degree(chem_graph)
ph_sorted_tags = _rank_by_degree(ph_graph)
bio_sorted_tags[:20]

In [None]:
# draw normalized distribution of tag frequency
def _normalized_trace(sorted_tags, name, color):
    """Build one line trace of tag degrees scaled by the largest degree.

    `sorted_tags` is a list of (tag, degree) pairs; x is the rank of each
    pair in that list and the hover text is the tag name.
    """
    # computed once per trace; the original recomputed max() for every element
    top_degree = max(degree for _, degree in sorted_tags)
    return go.Scatter(
        x=list(range(len(sorted_tags))),
        y=[degree / top_degree for _, degree in sorted_tags],
        hovertext=[tag for tag, _ in sorted_tags],
        name=name, line_color=color, mode='lines'
    )

go.Figure(
    [
        _normalized_trace(bio_sorted_tags, 'bio', 'green'),
        _normalized_trace(chem_sorted_tags, 'chem', 'blue'),
        _normalized_trace(ph_sorted_tags, 'ph', 'red'),
    ]
)

## Clustering by betweenness centrality

In [None]:
# edge betweenness centrality of the bio graph
# NOTE(review): networkx treats `weight` as a distance when computing shortest
# paths, while here it is a co-occurrence count - confirm this is intended
bio_btw_centrality = nx.edge_betweenness_centrality(
    bio_graph,
    weight='weight',
    normalized=True,
)

In [None]:
# turn the {edge: centrality} dict into a list of pairs, most central edge first
bio_btw_centrality = list(bio_btw_centrality.items())
bio_btw_centrality.sort(key=lambda item: item[1], reverse=True)

In [None]:
# work on a copy so that removing edges below leaves bio_graph untouched
bio_graph_clusters = bio_graph.copy()

In [None]:
# number of edges that have a betweenness value
len(bio_btw_centrality)

In [None]:
# cut the edges with the highest betweenness so the graph falls apart into clusters
n_edges_to_cut = 13000
to_remove = [edge for edge, _ in bio_btw_centrality[:n_edges_to_cut]]
bio_graph_clusters.remove_edges_from(to_remove)

In [None]:
# the remaining connected components, each a set of tags
list(nx.connected_components(bio_graph_clusters))

In [None]:
# the 30 tags with the highest node betweenness centrality in the bio graph
bio_node_centrality = list(nx.betweenness_centrality(bio_graph).items())
bio_node_centrality.sort(key=lambda item: item[1], reverse=True)
bio_node_centrality[:30]

In [None]:
# the 30 tags with the highest node betweenness centrality in the combined graph
tag_node_centrality = list(nx.betweenness_centrality(tag_graph).items())
tag_node_centrality.sort(key=lambda item: item[1], reverse=True)
tag_node_centrality[:30]

## Tags through time

In [None]:
# parse each stack's own CreationDate column into datetimes
# (the original read bio's column for every stack, overwriting chem and ph
# dates with index-aligned bio dates)
for db in [bio, chem, ph]:
    db['Posts']['CreationDate'] = pd.to_datetime(db['Posts']['CreationDate'])

In [None]:
# 'YYYY-MM' label for every bio post, with a zero-padded month
created = bio['Posts']['CreationDate']
year = created.dt.year
month = created.dt.month
month_padded = month.astype(str).str.zfill(2)
bio['Posts'].loc[:, 'y_month'] = year.astype(str) + '-' + month_padded

In [None]:
# monthly number of bio posts per tag: one row per (month, tag)
all_tags_in_time = bio['Posts'].loc[:, ['y_month', 'Tags']].explode('Tags')
all_tags_in_time.loc[:, 'cnt'] = 1
all_tags_in_time = all_tags_in_time.groupby(['y_month', 'Tags']).count().reset_index()
all_tags_in_time = all_tags_in_time.sort_values(['y_month', 'Tags']).reset_index(drop=True)
# Rank tags by their total number of posts. The frame now has one row per
# (month, tag), so value_counts() on Tags would count the months in which a
# tag appears, not how often it is used.
top50 = all_tags_in_time.groupby('Tags')['cnt'].sum().nlargest(50).index.tolist()
# take a real copy so that adding columns later does not write into a slice
tags_in_time = all_tags_in_time.loc[all_tags_in_time.Tags.isin(top50)].copy()

In [None]:
# total tag occurrences per month, over all tags (not only the selected ones)
yr_month_cnt = all_tags_in_time.groupby('y_month')['cnt'].sum().to_dict()
# share of the month's tag occurrences that belongs to each tag; assign()
# builds a new frame instead of writing a column into a slice of
# all_tags_in_time (which triggers SettingWithCopyWarning)
tags_in_time = tags_in_time.assign(
    freq=tags_in_time.cnt / tags_in_time.y_month.map(yr_month_cnt)
)

In [None]:
# monthly number of occurrences of the selected tags
px.line(
    # sum only the numeric 'cnt' column; a bare .sum() would also try to
    # aggregate the string 'Tags' column
    tags_in_time.groupby('y_month', as_index=False)['cnt'].sum(),
    x='y_month',
    y='cnt',
)

In [None]:
# one thin line per selected tag: monthly frequency over time
fig = go.Figure()
for tag, data in tags_in_time.groupby('Tags'):
    tag_line = go.Scatter(
        x=data.y_month,
        y=data.freq,
        name=tag,
        mode='lines',
        line=dict(width=1),
    )
    fig.add_trace(tag_line)
fig.update_layout(width=1000, yaxis_range=[0, 0.08])

In [None]:
# same plot restricted to a hand-picked list of tags
variable_tags = ['entomology', 'species-identification', 'human-biology', 'bioinformatics']
is_variable = tags_in_time.Tags.isin(variable_tags)
fig = go.Figure()
for tag, data in tags_in_time.loc[is_variable].groupby('Tags'):
    tag_line = go.Scatter(
        x=data.y_month,
        y=data.freq,
        name=tag,
        mode='lines',
        line=dict(width=1),
    )
    fig.add_trace(tag_line)
fig.update_layout(
    yaxis_title='frequency of tag',
    yaxis_range=[0, 0.08],
    width=1000,
)

Here is a very interesting thing regarding the entomology and species-identification tags. They show strong periodicity through time, peaking during the summer months. A possible explanation is that during those months there is a drastic increase in the abundance of organisms that people would like to identify.

Marek TODO

- zbadać zaleznosc pomiedzy frekwencja taga a nagrodami nobla


In [None]:
from statsmodels.tsa.seasonal import STL

In [None]:
# STL decomposition of the monthly frequency of the species-identification tag
spec_iden = tags_in_time.loc[tags_in_time.Tags=='species-identification']
# NOTE(review): period=12 assumes one row per calendar month; months in which
# the tag has no posts have no row here - confirm the series has no gaps
stl = STL(spec_iden.freq, seasonal=13, period=12)
res = stl.fit()
fig = go.Figure(
    layout=dict(title='Species-identification tag occurance decomposition over time')
)
# (legend name, values, extra trace options) for every component to plot
components = [
    ('observed', res.observed, {}),
    ('trend', res.trend, {}),
    ('seasonal', res.seasonal, {}),
    ('residual', res.resid, dict(line_dash='dashdot')),
]
fig.add_traces(
    [
        go.Scatter(x=spec_iden.y_month, y=values, line_shape='spline', name=label, **extra)
        for label, values, extra in components
    ]
)

# Posts texts

In [None]:
# import 
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA

In [None]:
import gensim

In [None]:
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import corpus

lemmatizer = WordNetLemmatizer() 

## Preprocessing

In [None]:
# English stop words as a set, so the per-word membership test during
# preprocessing is O(1). Accessed through nltk.corpus because the bare name
# `corpus` is rebound to the bag-of-words list in the LDA section, which would
# break a re-run of this cell.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [None]:
# patterns used to strip markup: any '<...>' tag on one line, and newlines
html_tag = re.compile('<.*?>')
new_line = re.compile('\n')

def cleanhtml(raw_html):
    """Remove HTML tags from `raw_html` and replace newlines with spaces.

    Non-string input (e.g. a missing value) gives an empty string.
    """
    if not isinstance(raw_html, str):
        return ''
    without_tags = html_tag.sub('', raw_html)
    return new_line.sub(' ', without_tags)

In [None]:
# strip markup from every post body in all three stacks
for db in (bio, chem, ph):
    posts = db['Posts']
    posts['Body'] = posts['Body'].map(cleanhtml)

In [None]:
# spot-check one cleaned post body (the entry selected by key 1)
bio['Posts']['Body'][1]

In [None]:
# the most important step of preprocessing
def _post_to_words(body, stopword_set):
    """Turn one cleaned post body into a list of normalized words.

    Steps: tokenize, keep alphabetic tokens and lowercase them, lemmatize,
    drop stop words.
    """
    tokens = nltk.word_tokenize(body)  # tokenize
    # lowercase BEFORE lemmatizing: the WordNet lookup is case-sensitive, so
    # capitalized words would otherwise stay unlemmatized
    tokens = [token.lower() for token in tokens if token.isalpha()]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]  # lemmatize
    return [token for token in tokens if token not in stopword_set]  # remove stopwords

# works whether `stopwords` is a list or already a set
_stopword_set = set(stopwords)
for db in [bio, chem, ph]:
    # apply() keeps every result on its own row. The original grouped by the
    # index (iterating in sorted-label order and merging duplicate labels) and
    # then assigned the resulting list positionally, which can misalign words
    # and posts.
    db['Posts']['words'] = db['Posts']['Body'].apply(_post_to_words, args=(_stopword_set,))

In [None]:
# label every post with the stack it comes from
for domain_label, domain_posts in (('bio', bio['Posts']), ('chem', chem['Posts']), ('ph', ph['Posts'])):
    domain_posts.loc[:, 'domain'] = domain_label

In [None]:
# decrease the number of posts for faster calculations and concatenate dfs
# - random_state makes the sample (and everything derived from it) reproducible
# - min() keeps sample() from raising when a stack has fewer than 10000 posts
all_posts = pd.concat([
    db['Posts'].sample(n=min(10000, len(db['Posts'])), random_state=100)
    for db in (bio, chem, ph)
])

In [None]:
# TfidfVectorizer expects plain strings, so join each post's words back together
posts_words = all_posts.words.apply(lambda x: ' '.join(x))

# transform words into tfidf vectors
from sklearn.feature_extraction.text import TfidfVectorizer
# stop_words must be the string 'english' to select the built-in English list;
# the set {'english'} is either rejected or treated as a custom list holding
# the single word "english", depending on the scikit-learn version
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X = vectorizer.fit_transform(posts_words)

In [None]:
# (number of posts, number of tf-idf features)
X.shape

In [None]:
# reduce the dimensionality to 50 principal components
# random_state fixes the result when PCA picks its randomized solver.
# This cell overwrites X (sparse tf-idf -> dense array), so it cannot be
# re-run on its own: re-run the tf-idf cell first.
X = PCA(n_components=50, random_state=100).fit_transform(X.toarray())

In [None]:
from sklearn.manifold import TSNE

In [None]:
# map 50D space into 2D with t-SNE
# calculates for 50 minutes! Just load the precalculated image
# random_state makes the embedding reproducible between runs
# NOTE(review): newer scikit-learn releases rename n_iter to max_iter -
# confirm against the installed version
tsne = TSNE(n_components=2, perplexity=70, n_iter=3000, random_state=100)
mapped = tsne.fit_transform(X)

In [None]:
# store the two t-SNE coordinates next to each post
for axis_position, column in enumerate(('x_tsne', 'y_tsne')):
    all_posts.loc[:, column] = mapped[:, axis_position]

In [None]:
# visualize t-SNE mapping; show only posts with tags
mask = all_posts['Tags'].map(lambda tags: tags != [])
fig = go.Figure()
for domain, data in all_posts.loc[mask].groupby('domain'):
    # hover text: the post's tags separated by spaces
    hover_labels = data['Tags'].map(' '.join)
    domain_points = go.Scatter(
        x=data['x_tsne'], y=data['y_tsne'],
        mode='markers', marker=dict(size=3), name=domain,
        hovertext=hover_labels
    )
    fig.add_trace(domain_points)
# equal axis scaling so distances look the same in both directions
fig.update_layout(
    width=1500, height=1500,
    xaxis=dict(scaleanchor='y', scaleratio=1)
)
fig.show('browser')

In [None]:
# render the same figure inline in the notebook
fig

In [None]:
# persist the computed figure as JSON so it can be reloaded without recomputing
tsne_json_path = 'tsne.json'
fig.write_json(tsne_json_path)

In [None]:
# load json to make figure
# the context manager closes the file handle, which a bare open().read() leaves open
with open('tsne.json', 'r') as tsne_file:
    loaded_tsne_fig = plotly.io.from_json(tsne_file.read())
loaded_tsne_fig

## Clustering of posts

In [None]:
# agglomerative clustering of the posts into 30 clusters
clusterer = AgglomerativeClustering(n_clusters=30)
clustering = clusterer.fit(X)

In [None]:
# cluster label of every sampled post
cluster_labels = clustering.labels_
all_posts.loc[:, 'cluster'] = cluster_labels

In [None]:
# space-separated tags of each post, used as hover text
all_posts.loc[:, 'tags_joined'] = all_posts['Tags'].map(' '.join)

In [None]:
# t-SNE map with one animation frame per cluster; only posts with tags
cluster_scatter = px.scatter(
    all_posts.loc[mask],
    x='x_tsne', y='y_tsne',
    color='domain', animation_frame='cluster',
    hover_data=['tags_joined']
)
cluster_scatter.update_layout(
    width=1000, height=1000,
    xaxis=dict(scaleanchor='y', scaleratio=1)
)
cluster_scatter.show('browser')

In [None]:
# make df containing clusters, tags and domains: one row per (post, tag)
# Built straight from the columns. The original grouped by ['cluster', 'Id']
# and kept only the first row of each group, which drops posts from different
# stacks that share an Id and a cluster (pd.concat keeps overlapping Ids).
cluster_tags = (
    all_posts.loc[:, ['cluster', 'Tags', 'domain']]
    .rename(columns={'Tags': 'tags'})
    .sort_values('cluster', kind='stable')
    .reset_index(drop=True)
    .explode('tags')
)

In [None]:
# cluster x tag table holding the number of (post, tag) rows per combination
tag_count_table = pd.pivot_table(
    cluster_tags, index='cluster', columns='tags', aggfunc='count'
)
# combinations that never occur become 0; 'domain' is the counted value column
tags_per_cluster = tag_count_table.fillna(0).astype(int)['domain']

In [None]:
# counts per cluster for the 50 most frequent tags in the sample
tag_totals = cluster_tags.tags.value_counts()
top_tags = tag_totals.head(50).index.tolist()
tags_per_cluster.loc[:, top_tags]

In [None]:
# preview the long-format cluster/tag table
cluster_tags.head()

In [None]:
# get top5 tags per each cluster

def f(x):
    """Return the five tags with the highest count in one cluster's table.

    `x` holds the rows of a single cluster from the (cluster, tags) count
    table; the 'domain' column carries the counts.
    """
    return x.sort_values('domain', ascending=False).head(5).reset_index()['tags'].tolist()

top5 = cluster_tags.groupby(['cluster', 'tags']).count().groupby('cluster').apply(f)
# Series.iteritems() was removed in pandas 2.0; items() is the equivalent call
for Id, tags in top5.items():
    print(f'{Id}) {tags}')

## LDA modelling

In [None]:
# map every word to an integer ID, then prune the vocabulary
tokenized_posts = all_posts.words.tolist()
dictionary = gensim.corpora.Dictionary(tokenized_posts)
dictionary.filter_extremes(
    no_below=5,
    no_above=0.5,
    keep_n=10000,
)

In [None]:
# convert every post into its bag-of-words form using the dictionary
# note: this rebinds `corpus`, which until here referred to the nltk corpus
# package imported earlier
corpus = list(map(dictionary.doc2bow, all_posts.words.tolist()))

In [None]:
# fit a 30-topic LDA model on the bag-of-words corpus
lda_settings = dict(
    id2word=dictionary,
    num_topics=30,
    offset=2,
    random_state=100,
    update_every=1,
    passes=10,
    alpha='auto',
    eta="auto",
    per_word_topics=True,
)
lda_model = gensim.models.LdaModel(corpus, **lda_settings)

In [None]:
# Detected topics
# print_topics() shows only 20 topics by default, but the model has 30;
# num_topics=-1 asks for all of them
for Id, formula in lda_model.print_topics(num_topics=-1):
    print(f'{formula}', end='\n\n')