In [None]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
from glob import glob
import networkx as nx
import itertools
from collections import defaultdict
import re
import nltk

import matplotlib.pyplot as plt
from matplotlib_venn import venn3
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Show up to 50 columns when a wide frame is displayed.
pd.options.display.max_columns = 50
# Tokenizer models used by nltk.word_tokenize. NLTK >= 3.8.2 loads the
# 'punkt_tab' resource instead of the pickled 'punkt' one, so fetch both to
# keep the notebook runnable on old and new NLTK releases.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# One dict per field of knowledge; each will map a table name to its DataFrame.
bio, chem, ph = {}, {}, {}

In [None]:
# Load every CSV found in each domain directory into that domain's dict,
# keyed by the file name up to its first dot (e.g. 'Posts' for 'Posts.csv').
domain_dicts = list(zip(['biology', 'chemistry', 'physics'], [bio, chem, ph]))
for domain, data_dict in tqdm(domain_dicts):  # a list gives tqdm a known total
    # os.path builds and parses paths with the platform's separator, so the
    # table names come out right on Windows too; sorting makes the load order
    # deterministic.
    for path in sorted(glob(os.path.join(domain, '*.csv'))):
        print(path)
        name = os.path.basename(path).split('.')[0]
        # The first CSV column becomes the frame's index.
        data_dict[name] = pd.read_csv(path, index_col=0)

# Tags

In [None]:
# Unique tag names per domain, read from each domain's Tags table.
bio_tags, chem_tags, ph_tags = (
    set(tables['Tags'].TagName.tolist()) for tables in (bio, chem, ph)
)
all_tags = set().union(bio_tags, chem_tags, ph_tags)

# Venn diagram of how the three tag vocabularies overlap.
venn3([bio_tags, chem_tags, ph_tags], set_labels=['bio', 'chem', 'physics'])

In [None]:
# Tags present in all three domains.
bio_tags.intersection(chem_tags, ph_tags)

In [None]:
# Tags shared by bio and chem but not used in physics. `-` binds tighter than
# `&`, so the grouping is spelled out; either grouping yields the same set.
(bio_tags & chem_tags) - ph_tags

In [None]:
def split_tags(string):
    """Turn an angle-bracket tag string such as '<a><b>' into ['a', 'b'].

    Parameters
    ----------
    string : str, list, tuple or anything else
        The raw value of a Tags cell.

    Returns
    -------
    list of str
        The individual tag names. A value that is already a list/tuple is
        returned as a list unchanged, so applying this function a second time
        (e.g. by re-running the conversion cell) does not wipe the tags.
        Any other non-string value (such as NaN for a missing cell) gives [].
    """
    if isinstance(string, (list, tuple)):
        return list(string)
    if not isinstance(string, str):
        return []
    inner = string.lstrip('<').rstrip('>')
    # Drop empty fragments so '' or '<>' produce [] rather than [''].
    return [tag for tag in inner.split('><') if tag]

In [None]:
# Replace each domain's raw Tags strings with lists of tag names.
for base in (bio, chem, ph):
    posts = base['Posts']
    posts['Tags'] = posts['Tags'].apply(split_tags)

In [None]:
# Inspect the converted Tags column of the biology posts
# (each entry should now be a list of tag names).
bio['Posts']['Tags']

In [None]:
def create_graph(dbs):
    """Build an undirected tag co-occurrence graph from one or more domains.

    Parameters
    ----------
    dbs : list of dict
        Each dict must hold a 'Tags' frame with a `TagName` column and a
        'Posts' frame whose 'Tags' column contains lists of tag names.

    Returns
    -------
    networkx.Graph
        One node per tag name; an edge between two tags carries a `weight`
        equal to the number of posts in which both tags appear together.
    """
    graph = nx.Graph()
    # Register every known tag first so tags that never co-occur still appear.
    for db in dbs:
        graph.add_nodes_from(db['Tags'].TagName.tolist())

    # Count unordered tag pairs; sorting each pair makes (a, b) and (b, a)
    # share one key.
    pair_counts = defaultdict(int)
    for db in dbs:
        for post_tags in db['Posts']['Tags'].tolist():
            for pair in itertools.combinations(post_tags, r=2):
                first, second = sorted(pair)
                pair_counts[(first, second)] += 1

    weighted_edges = [
        (first, second, count) for (first, second), count in pair_counts.items()
    ]
    graph.add_weighted_edges_from(weighted_edges)
    return graph
    
# Combined graph over all three domains, then one graph per domain.
tag_graph = create_graph([bio, chem, ph])
bio_graph, chem_graph, ph_graph = (
    create_graph([tables]) for tables in (bio, chem, ph)
)

In [None]:
# Number of distinct tags (nodes) in the combined graph.
tag_graph.number_of_nodes()

In [None]:
# Number of distinct tag pairs (edges) that co-occur in at least one post.
tag_graph.number_of_edges()

In [None]:
# Colour code per node of *bio_graph*, keyed by (in bio, in chem, in physics).
# The codes repeat the `color_assignment` mapping used for the combined graph,
# but are built here because that mapping and `tag_colors` are only defined in
# a later cell, and `tag_colors` has one entry per node of `tag_graph`, which
# does not line up with the nodes of `bio_graph`.
membership_codes = {
    (True, False, False): 1,
    (False, True, False): 2,
    (False, False, True): 3,
    (True, True, False): 4,
    (True, False, True): 5,
    (False, True, True): 6,
    (True, True, True): 7,
}
bio_tag_colors = [
    # 0 marks a tag that is in none of the three Tags tables.
    membership_codes.get((tag in bio_tags, tag in chem_tags, tag in ph_tags), 0)
    for tag in bio_graph.nodes()
]

plt.figure(figsize=(20,20))
nx.draw_kamada_kawai(bio_graph, node_color=bio_tag_colors, edge_color=(0,0,0,0.15))

In [None]:
# Numeric colour code for every combination of domains a tag belongs to,
# keyed by (in bio, in chem, in physics).
color_assignment = {
    # Tag that appears on posts but in none of the Tags tables; without this
    # entry such a node would raise KeyError below.
    (False, False, False): 0,
    (True, False, False): 1,
    (False, True, False): 2,
    (False, False, True): 3,
    (True, True, False): 4,
    (True, False, True): 5,
    (False, True, True): 6,
    (True, True, True): 7,
}

# One colour code per node, in the node order of the combined graph.
tag_colors = [
    color_assignment[(tag in bio_tags, tag in chem_tags, tag in ph_tags)]
    for tag in tag_graph.nodes()
]

In [None]:
# Kamada-Kawai drawing of the combined tag graph, nodes coloured by `tag_colors`.
plt.figure(figsize=(20, 20))
nx.draw_kamada_kawai(
    tag_graph,
    node_color=tag_colors,
    edge_color=(0, 0, 0, 0.15),  # mostly transparent black so dense areas stay readable
)

In [None]:
def _by_degree_desc(graph):
    """Return the graph's (tag, degree) pairs, highest degree first."""
    return sorted(graph.degree(), key=lambda pair: pair[1], reverse=True)


bio_sorted_tags = _by_degree_desc(bio_graph)
chem_sorted_tags = _by_degree_desc(chem_graph)
ph_sorted_tags = _by_degree_desc(ph_graph)
# The twenty best-connected biology tags.
bio_sorted_tags[:20]

In [None]:
def _normalized_degree_trace(sorted_tags, name, color):
    """Build a line trace of tag degrees scaled by the largest degree.

    Parameters
    ----------
    sorted_tags : list of (tag, degree) pairs
    name : str
        Legend label of the trace.
    color : str
        Line colour of the trace.
    """
    # Find the maximum once rather than once per plotted point, and fall back
    # to 1 so a graph without edges (all degrees 0) does not divide by zero.
    top_degree = max((degree for _, degree in sorted_tags), default=0) or 1
    return go.Scatter(
        x=list(range(len(sorted_tags))),
        y=[degree / top_degree for _, degree in sorted_tags],
        hovertext=[tag for tag, _ in sorted_tags],
        name=name, line_color=color, mode='lines'
    )


# Relative degree against rank, one line per domain.
go.Figure(
    [
        _normalized_degree_trace(bio_sorted_tags, 'bio', 'green'),
        _normalized_degree_trace(chem_sorted_tags, 'chem', 'blue'),
        _normalized_degree_trace(ph_sorted_tags, 'ph', 'red'),
    ]
)

## clustering by betweenness centrality - wip

In [None]:
# Normalised edge betweenness for every edge of the biology graph.
# NOTE(review): NetworkX uses `weight` as a path length when computing
# shortest paths, while the edge weights here are co-occurrence counts, so
# frequently co-occurring tags count as far apart - confirm this is intended.
bio_btw_centrality = nx.edge_betweenness_centrality(
    bio_graph,
    weight='weight',
    normalized=True,
)

In [None]:
# Turn the scores into (edge, centrality) pairs, highest centrality first.
# Going through dict() accepts both the original mapping and the already
# sorted list of pairs, so re-running this cell no longer raises.
bio_btw_centrality = sorted(
    dict(bio_btw_centrality).items(), key=lambda item: item[1], reverse=True
)

In [None]:
# Work on a copy so that removing edges leaves bio_graph itself untouched.
bio_graph_clusters = bio_graph.copy()

In [None]:
# How many edges received a betweenness score.
len(bio_btw_centrality)

In [None]:
# Cut the highest-betweenness edges so the graph falls apart into clusters.
N_EDGES_TO_REMOVE = 13000  # hand-picked cut-off (work in progress)
to_remove = [edge for edge, _ in bio_btw_centrality[:N_EDGES_TO_REMOVE]]
bio_graph_clusters.remove_edges_from(to_remove)

In [None]:
# Node sets of the connected components left after the edge removal.
[component for component in nx.connected_components(bio_graph_clusters)]

In [None]:
# The 30 biology tags with the highest (unweighted) node betweenness.
bio_node_centrality = nx.betweenness_centrality(bio_graph)
sorted(bio_node_centrality.items(), key=lambda item: item[1], reverse=True)[:30]

In [None]:
# The 30 tags with the highest (unweighted) node betweenness in the combined graph.
tag_node_centrality = nx.betweenness_centrality(tag_graph)
sorted(tag_node_centrality.items(), key=lambda item: item[1], reverse=True)[:30]

# Posts texts

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA

In [None]:
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import corpus

# WordNet data backs the lemmatizer created below.
nltk.download('wordnet')
# Single shared lemmatizer instance for the tokenisation step.
lemmatizer = WordNetLemmatizer()

In [None]:
# Fetch only the stop-word corpus that is read right after this. A bare
# nltk.download() opens the interactive downloader and stalls "Run All".
nltk.download('stopwords')

In [None]:
# English stop-word list from NLTK's stopwords corpus (must be downloaded first).
stopwords = corpus.stopwords.words('english')

In [None]:
# DOTALL lets a tag that is broken across lines (e.g. an <a ...> whose
# attributes wrap) match as one tag; without it such a tag stays in the text.
html_tag = re.compile('<.*?>', re.DOTALL)
new_line = re.compile('\n')

def cleanhtml(raw_html):
    """Strip HTML tags from a post body and flatten it to a single line.

    Parameters
    ----------
    raw_html : str or anything else
        The raw body. Non-string values (e.g. NaN for a missing body) are
        accepted.

    Returns
    -------
    str
        The text with every `<...>` tag removed and each newline replaced by
        a space; '' when `raw_html` is not a string.
    """
    if not isinstance(raw_html, str):
        return ''
    without_tags = html_tag.sub('', raw_html)
    return new_line.sub(' ', without_tags)

In [None]:
# Replace each domain's raw post bodies with their tag-free, single-line text.
for db in (bio, chem, ph):
    posts = db['Posts']
    posts['Body'] = posts['Body'].apply(cleanhtml)

In [None]:
# Spot-check one cleaned body.
# NOTE(review): presumably this selects the post whose index label is 1
# (the index comes from the first CSV column) - confirm such a row exists.
bio['Posts']['Body'][1]

In [None]:
# Set lookup instead of scanning the stop-word list for every token.
stopword_set = set(stopwords)


def _post_to_words(body):
    """Return the lemmatised, lower-cased content words of one post body."""
    tokens = (token.lower() for token in nltk.word_tokenize(body))
    # Lower-case and drop stop words *before* lemmatising: WordNetLemmatizer
    # looks words up case-sensitively and, with its default noun POS, turns
    # e.g. "was" into "wa", which would then slip past the stop-word filter.
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stopword_set
    ]


# `apply` keeps every word list on the row it was computed from, whatever the
# order or uniqueness of the index (iterating a groupby over the index returns
# groups in sorted key order, which need not match the row order).
bio['Posts']['words'] = bio['Posts']['Body'].apply(_post_to_words)
# Word lists in row order, for the cells that follow.
post_words = bio['Posts']['words'].tolist()

In [None]:
# One space-separated string per post, as the vectoriser expects. Reading from
# the stored 'words' column rather than from `post_words` itself keeps this
# cell re-runnable: joining an already joined string would put a space between
# every character.
post_words = [' '.join(words) for words in bio['Posts']['words']]

In [None]:
# First post as a single space-separated string of words.
post_words[0]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over the 3000 most frequent terms. `stop_words` must be the *string*
# 'english' to select scikit-learn's built-in list; the set {'english'} is
# treated as a custom list containing only the word "english".
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X = vectorizer.fit_transform(post_words)

In [None]:
# (number of posts, number of TF-IDF features)
X.shape

In [None]:
# Reduce the TF-IDF matrix to 50 principal components (PCA needs a dense
# array, hence .toarray()). random_state pins the randomized SVD solver that
# scikit-learn may select for a matrix of this size, so results are repeatable.
# NOTE: this overwrites X; re-run the TF-IDF cell before re-running this one.
X = PCA(n_components=50, random_state=0).fit_transform(X.toarray())

In [None]:
# Group the PCA-reduced posts into 50 clusters. The variable keeps its
# original spelling so any code that refers to it still works.
cluster_model = AgglomerativeClustering(n_clusters=50)
clusering = cluster_model.fit(X)

In [None]:
# topic detection