In [1]:
import warnings
warnings.simplefilter("ignore")

import os
import ast
import random
import logging
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

from joblib import Parallel, delayed

from collections import OrderedDict, Counter
from tqdm import tqdm_notebook as tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

import sys
sys.path.append('..')

In [2]:
from gensim.parsing import preprocessing
from gensim.utils import tokenize

def document_preprocess(text):
    """Normalise one raw document string.

    The text is reduced to lower-case ASCII (characters that cannot be
    encoded as ASCII are dropped) and then passed through gensim's
    ``remove_stopwords``, ``strip_punctuation``, ``strip_numeric`` and
    ``strip_short`` filters, in that order.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The filtered text.
    """
    cleaned = text.encode('ascii', 'ignore').decode('utf-8').lower()

    # Order matters: stop words are removed before punctuation is stripped.
    pipeline = (
        preprocessing.remove_stopwords,
        preprocessing.strip_punctuation,
        preprocessing.strip_numeric,
        preprocessing.strip_short,
    )
    for apply_filter in pipeline:
        cleaned = apply_filter(cleaned)
    return cleaned

In [3]:
file = "../../../Starspace/data/oms/text/oms-all_raw.txt"

In [4]:
all_df = pd.read_csv("C:/Users/harshasivajit/Documents/master-ai/rr13/OmniScience/ArXiv_BMED_Evise_title_abstract_os_2019-01-14.tsv",
                     sep="\t", encoding="utf-8")

In [5]:
len(all_df["file_id"].unique())

853142

In [6]:
# Rows that have a missing value in at least one column.
q = all_df[all_df.isna().any(axis=1)]
# Rows without a parent concept id, and rows without a title.
p1 = all_df[all_df["parentconceptid"].isna()]
p2 = all_df[all_df["title"].isna()]
# The incomplete rows again, but with the 'abstract' column left out ...
r = q[q.columns[q.columns!='abstract']]
# ... so `u` keeps only rows that miss something other than the abstract.
u = r[r.isna().any(axis=1)]

# Working subset: every row that has an abstract.
sub = all_df[all_df["abstract"].notnull()]

In [7]:
sub.columns

Index(['abstract', 'conceptid', 'file_id', 'label', 'level', 'parentconceptid',
       'parentlabel', 'parentpath', 'path', 'pathlabels', 'rootconceptid',
       'rootlabel', 'title', 'used_as'],
      dtype='object')

In [8]:
[sub[i].isna().sum() for i in sub.columns]

[0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 3, 0]

In [9]:
all_df["abstract"].isna().sum()

11459

In [10]:
all_df.shape[0] - all_df.dropna().shape[0]

11467

In [11]:
# Keep only the columns used below and drop rows whose split is "unused".
sub_df = sub[["abstract", "conceptid", "file_id", "label", "title", "used_as"]]
sub_df = sub_df[sub_df["used_as"] != "unused"]

# One group per remaining value of `used_as`.
groupeddf = sub_df.groupby("used_as")

In [12]:
sub_df

Unnamed: 0,abstract,conceptid,file_id,label,title,used_as
0,The fixed point that governs the critical beha...,189721267,ArXiv:0205046v2,Particle Physics,Chiral phase transitions: focus driven critica...,validation
1,We investigate the effects of impurities on th...,342949555,ArXiv:0205048v2,Disordered System,Competition between fluctuations and disorder ...,validation
2,When a high voltage (~30 kV) is applied to a c...,170589438,ArXiv:0211001v2,Physics,Force on an Asymmetric Capacitor,validation
3,We use quantum walks to construct a new quantu...,257942154,ArXiv:0311001v9,Quantum Physics,Quantum walk algorithm for element distinctness,validation
4,Consider laying out a fixed-topology tree of N...,190195160,ArXiv:0410048v4,Algorithms,Worst-Case Optimal Tree Layout in External Memory,validation
5,Consider laying out a fixed-topology tree of N...,190195160,ArXiv:0410048v4,Algorithms,Worst-Case Optimal Tree Layout in External Memory,validation
6,Consider laying out a fixed-topology tree of N...,190195160,ArXiv:0410048v4,Algorithms,Worst-Case Optimal Tree Layout in External Memory,validation
7,Consider laying out a fixed-topology tree of N...,190195160,ArXiv:0410048v4,Algorithms,Worst-Case Optimal Tree Layout in External Memory,validation
8,Consider laying out a fixed-topology tree of N...,190195160,ArXiv:0410048v4,Algorithms,Worst-Case Optimal Tree Layout in External Memory,validation
9,A new method for constructing minimum-redundan...,191884351,ArXiv:0509015v4,Information Theory,Optimal Prefix Codes with Fewer Distinct Codew...,validation


In [13]:
# tt = pd.DataFrame(columns=["file_id", "abstract", "labels", "label_id", "used_as"])
def group_fn(undo_join, group_id):
    """Collapse all rows of one ``file_id`` group into a single record.

    Parameters
    ----------
    undo_join : pandas GroupBy object
        Grouped frame whose groups provide the columns ``abstract``,
        ``label``, ``conceptid`` and ``used_as``.
    group_id : hashable
        Key of the group to collapse; it is stored as ``file_id`` in the
        returned record.

    Returns
    -------
    dict
        ``{"file_id", "abstract", "labels", "label_id", "used_as"}`` where
        ``abstract`` and ``used_as`` come from the first row of the group and
        ``labels`` / ``label_id`` are the distinct values of ``label`` /
        ``conceptid`` in order of first appearance.
    """
    temp_df = undo_join.get_group(group_id)

    # dict.fromkeys de-duplicates while keeping row order. Going through
    # set() made the order arbitrary, so the two lists could not be relied on
    # to line up with each other or to be reproducible between runs.
    labels = list(dict.fromkeys(temp_df["label"]))
    label_id = list(dict.fromkeys(temp_df["conceptid"]))

    return {"file_id": group_id,
            # First row instead of an arbitrary set element: deterministic.
            "abstract": temp_df["abstract"].iloc[0],
            "labels": labels,
            "label_id": label_id,
            "used_as": temp_df["used_as"].iloc[0]}

In [None]:
# Build one list of per-document records for every `used_as` split.
new_df = []
wanted_columns = ["abstract", "conceptid", "file_id", "label", "title", "used_as"]
for gid in groupeddf.groups.keys():
    # De-duplicate the rows of this split, then regroup them per document.
    undo_join = (
        groupeddf.get_group(gid)[wanted_columns]
        .drop_duplicates()
        .groupby("file_id")
    )

    # Collapse every file_id group with group_fn on a pool of 12 threads.
    tasks = (
        delayed(group_fn)(undo_join, group_id)
        for group_id in tqdm(undo_join.groups.keys())
    )
    out = Parallel(n_jobs=12, prefer="threads")(tasks)
    new_df.append(out)

In [None]:
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the record
# lists of every split in new_df (not only the first two) and, like the
# original append call, keeps each frame's own index.
rer = pd.concat([pd.DataFrame(records) for records in new_df])

In [None]:
rer = rer.reset_index()

In [4]:
main = "../../../Starspace/data/oms/text/jan_oms.tsv"

In [5]:
rer = pd.read_csv(main, sep="\t", encoding="utf-8")

In [6]:
# rer[rer["doc_len"] < 5].to_csv("weird.csv", index=False, encoding="utf-8")
# Clean every abstract; the function is passed directly, no lambda needed.
rer["text"] = rer["abstract"].apply(document_preprocess)

In [7]:
# Token count per cleaned document. split() without an argument ignores
# empty and repeated whitespace, so an empty text counts 0 tokens
# (split(" ") reported 1 for "" and counted empty strings between spaces).
rer["doc_len"] = rer["text"].apply(lambda x: len(x.split()))

In [18]:
list(rer[rer["doc_len"]==6]["text"])

['submitted notes discussions american journal physics',
 'paper withdrawn author similarity author paper',
 'pedagological introduction effective field theory presented',
 'formula matrix exponentials partial fraction decompositions',
 'research announcement theory orbifold quantum cohomology',
 'paper withdrawn authors crucial mistake lemma',
 'possible generalization method orbits slq discussed',
 'paper withdrawn author presented idea wrong',
 'paper withdrawn author presented idea wrong',
 'paper withdrawn material revised paper math',
 'proof generation gravitational waves physically impossible',
 'martin axiom imply automorphisms aleph trivial',
 'paper withdrawn unclearness notions material based',
 'simulation trading activity based implementation book',
 'present elliptic version selberg integral formula',
 'paper withdrawn mistakes proofs proposition theorem',
 'construct non separable algebra prime primitive',
 'proof set real numbers denumerable given',
 'paper gives condi

In [60]:
dum = rer[rer["doc_len"]>5]
dum = dum.drop(["text"], axis=1)

In [61]:
dum.to_csv(main, sep="\t", index=False, encoding="utf-8")

In [27]:
corpus = pd.read_csv("../../../Starspace/data/model/oms/oms-d128-init-h.tsv", encoding="utf-8", sep="\t", header=None)

In [30]:
words = list(corpus[0])

In [36]:
with open("../../../Starspace/data/oms/text/oms-valid.txt", "r") as f:
    r = f.readlines()

In [48]:
r[8207]

'__label__philosophy-methods __label__philosophic-inquiry highlightsnominalizers develop attitudinal stance markers insubordination nominalization constructions gives rise finite clauses nominalizers combine speaker semantic prosody encode speaker attitude nominalizers combine particles form complex mood markers complex attitudinal nominalizers shed light cartography mood markers\n'

In [45]:
words.pop(0)

'patients'

In [46]:
words[0]

'study'

In [None]:
# NOTE(review): no earlier cell shown here assigns `t`, and the only visible
# assignment further down makes it a set (which has no .append) — confirm
# whether this scratch cell is still needed.
t.append(t)

In [None]:
pd.DataFrame(t)

In [None]:
rer.shape

In [62]:
valid_indx = np.where(dum["used_as"]=="validation")[0]
train_indx = np.where(dum["used_as"]=="training")[0]
# test_indx = np.where(all_df["used_as"]=="unused")[0]

In [63]:
valid_indx

array([484129, 484130, 484131, ..., 583930, 583931, 583932], dtype=int64)

In [64]:
# train_indx / valid_indx come from np.where, so they are row *positions*,
# not index labels. `dum` is a filtered frame whose index is never reset, so
# label-based .loc can pick the wrong rows or raise KeyError; .iloc selects by
# position and gives the same result whenever the index happens to be 0..n-1.
training_labels = dum["label_id"].iloc[train_indx]
valid_labels = dum["label_id"].iloc[valid_indx]

In [65]:
# Write one abstract per line as UTF-8 bytes. The context manager closes the
# file even when a write fails part-way through.
with open(file, "wb") as fil:
    for idx in tqdm(rer.index):
        tmp = rer.at[idx, "abstract"] + "\n"
        fil.write(tmp.encode("utf-8"))

HBox(children=(IntProgress(value=0, max=589171), HTML(value='')))




In [66]:
with open(file, "rb") as fmain:
    reader=fmain.readlines()

In [67]:
# Decode every raw line and run it through the cleaning pipeline.
preprocessed = [
    document_preprocess(raw_line.decode("utf-8"))
    for raw_line in tqdm(reader)
]

HBox(children=(IntProgress(value=0, max=589171), HTML(value='')))




In [68]:
vectorizer = TfidfVectorizer(max_features = 2000, max_df=0.5)

In [69]:
vectorizer.fit(preprocessed)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=2000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [70]:
X = vectorizer.fit_transform(preprocessed)

In [71]:
X

<589171x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 23013575 stored elements in Compressed Sparse Row format>

In [None]:
0.001, 100 - 47818
0.01, 100 - 499619 

In [None]:
X[:3].todense()

In [None]:
t = set(training_labels)
v = set(valid_labels)

In [None]:
labels = pd.read_csv("../../../Starspace/data/model/oms/oms-d64-2-h.tsv", sep = "\t", encoding="utf-8", header=None)

In [None]:
labels.tail(5)

In [None]:
labes = pd.read_csv("../../../Starspace/data/model/oms/oms-d64-2-hless.tsv", sep = "\t", encoding="utf-8", header=None)

In [None]:
labes.tail(5)

In [None]:
labels = pd.read_csv("../../../Starspace/data/model/oms/oms-d20-ht.tsv", sep = "\t", encoding="utf-8", header=None)

In [None]:
from scipy.spatial import distance

In [None]:
labels.head(5)

In [None]:
np.where(labels[0]=="__label__historiography")

In [None]:
325310 + 614

In [None]:
u = np.array(labes.iloc[326137][1:]).astype(float)
v = np.array(labes.iloc[325992][1:]).astype(float)

In [None]:
1-distance.cosine(u,v)

In [None]:
X_train = X[train_indx]
X_valid = X[valid_indx]

In [None]:
tl = list(training_labels)
vl = list(valid_labels)

In [None]:
# NOTE(review): X_train / X_valid are row slices of the sparse TF-IDF matrix
# X, so np.save presumably stores them as pickled object arrays rather than
# dense numeric arrays — confirm the consumer expects that.
# The "test_*" files receive the validation split (X_valid / vl).
np.save("../../../Starspace/data/oms/tfidf/train_data.npy", X_train)
np.save("../../../Starspace/data/oms/tfidf/test_data.npy", X_valid)
np.save("../../../Starspace/data/oms/tfidf/train_labels.npy", tl)
np.save("../../../Starspace/data/oms/tfidf/test_labels.npy", vl)

In [None]:
y = np.load("../../hiercost/hiercost/data/oms/train_data.npy")

In [None]:
y.item()

In [None]:
g = -1

In [None]:
np.array(g).reshape(-1)

In [None]:
# Split the rows of `model` into label rows (first column contains
# "__label__") and all other rows, collecting their index values.
cnt = []
other = []
for ind in model.index:
    token = model.at[ind, 0]
    try:
        is_label = "__label__" in token
    except TypeError:
        # The cell is not a string/container (presumably a missing value read
        # as NaN — verify): report it and treat it as a non-label row. Only
        # TypeError is caught so that interrupts and real bugs still surface.
        is_label = False
        print(ind, token)
    if is_label:
        cnt.append(ind)
    else:
        other.append(ind)

In [None]:
model_labels = model.loc[cnt]

In [None]:
model_words = model.loc[other]

In [None]:
label_tsv = model_labels.to_csv("C:/Users/harshasivajit/Documents/Starspace/data/model/oms/oms-d64-hless-labels.tsv",
                               sep = "\t", columns=None, header=False, index=False)

In [None]:
updates_labels_tsv = pd.read_csv("C:/Users/harshasivajit/Documents/Starspace/data/model/oms/oms-d64-hless-labels.tsv",
                               sep = "\t", header=None)

In [None]:
model_labels.head(2)

In [None]:
updates_labels_tsv.head(2)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat with
# ignore_index=True produces the same stacked frame with a fresh 0..n-1 index.
m = pd.concat([model_words, updates_labels_tsv], ignore_index=True)

In [None]:
m.to_csv("C:/Users/harshasivajit/Documents/Starspace/data/model/oms/oms-d64-hless.tsv",
                               sep = "\t", columns=None, header=False, index=False)

In [None]:
s.df["label"] = s.df["label"].apply(lambda x: ast.literal_eval(x)) 

In [None]:
s.df["vec"][0].shape

In [None]:
def return_x_y(df, split, out_dir="C:/Users/harshasivajit/Documents/Starspace/data/swiki/text"):
    """Stack the feature vectors of ``df``, collect its labels and save both.

    Parameters
    ----------
    df : mapping of column name to sequence (e.g. a pandas DataFrame)
        Must provide a ``"vec"`` column of equally sized vectors and a
        ``"label"`` column.
    split : str
        Name inserted into the output file names
        (``swiki_<split>_data.npy`` / ``swiki_<split>_labels.npy``).
    out_dir : str, optional
        Directory the two ``.npy`` files are written to. The default is the
        location that used to be hard-coded, so existing calls write to the
        same paths as before.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is the row-wise stacked array of ``df["vec"]``
        and ``y`` is ``df["label"]`` as a list.
    """
    x = np.vstack(list(df["vec"]))
    y = list(df["label"])

    # Plain "/" joining keeps the default paths byte-identical to the
    # previously hard-coded strings.
    np.save("{}/swiki_{}_data.npy".format(out_dir, split), x)
    np.save("{}/swiki_{}_labels.npy".format(out_dir, split), y)
    return x, y

In [None]:
x,y = return_x_y(s.df, "train")

In [None]:
y[0]

In [None]:
a = [(6, 0), (6, 1), (1, 2), (0, 3), (1, 4), (0, 5)]

In [None]:
a = [(0,1), (0,2), (1,3), (2,3)]

In [None]:
b = [(30, 1), (30, 2), (2, 3), (1,3)]

In [None]:
G = nx.DiGraph()

In [None]:
G.add_edges_from(a)

In [None]:
T = EntityProcessor(path, ' ')

In [None]:
G = nx.read_graphml("C:/Users/harshasivajit/Documents/Starspace/data/oms/cat_hier_graph.graphml")

In [None]:
len(G.nodes())

In [None]:
def get_root(graph):
    """Return every node of ``graph`` that has no predecessor.

    Parameters
    ----------
    graph : directed graph exposing ``nodes()`` and ``predecessors(node)``

    Returns
    -------
    list
        The nodes without incoming edges, in ``graph.nodes()`` order. The
        list can hold zero, one or several nodes.
    """
    roots = []
    for node in graph.nodes():
        has_parent = any(True for _ in graph.predecessors(node))
        if not has_parent:
            roots.append(node)
    return roots

In [None]:
root = get_root(G)

In [None]:
root

In [None]:
def BFS(s, graph=None):
    """Breadth-first unfolding of a directed graph into an edge list.

    Starting at ``s``, every outgoing edge that is reached is emitted once
    per time its source node is dequeued. The first time a node is reached
    the edge ``(parent, node)`` is kept as is; each later time the target is
    replaced by the new string label ``"<parent>-<node>"``.

    Visit counts are kept in a ``Counter`` keyed by node, so nodes may be any
    hashable value (integers, strings, ...) and do not have to be numbered
    ``0..len(graph)-1``.

    Every neighbour is enqueued again each time it is reached, so the graph
    must be acyclic: a cycle reachable from ``s`` keeps the loop running
    forever.

    Parameters
    ----------
    s : hashable
        Start node.
    graph : object exposing ``neighbors(node)``, optional
        Graph to traverse. Defaults to the notebook-level graph ``G``.

    Returns
    -------
    list of tuple
        The emitted ``(source, target)`` edges in traversal order.
    """
    # Local import keeps this cell runnable on its own.
    from collections import Counter, deque

    if graph is None:
        graph = G

    visited = Counter()
    new_edges = []
    queue = deque([s])  # deque: O(1) pops from the left
    visited[s] += 1

    while queue:
        current = queue.popleft()

        for child in graph.neighbors(current):
            queue.append(child)
            visited[child] += 1
            if visited[child] != 1:
                # Reached before: point this edge at a renamed copy instead.
                new_edges.append((current, "{}-{}".format(current, child)))
            else:
                new_edges.append((current, child))

    return new_edges

In [None]:
# Converts DAG to tree. A LOT OF INFORMATION IS LOST THOUGH
def BFSs(s, graph=None):
    """Breadth-first unfolding of a directed graph, started from a root list.

    Only the first entry of ``s`` is used as the start node. Every outgoing
    edge that is reached is emitted once per time its source node is
    dequeued: the first time a node is reached the edge ``(parent, node)`` is
    kept as is, each later time the target is replaced by the new string
    label ``"<parent>/<node>"``.

    Every neighbour is enqueued again each time it is reached, so the graph
    must be acyclic: a cycle reachable from the start node keeps the loop
    running forever.

    Parameters
    ----------
    s : sequence of hashable
        Candidate start nodes; only ``s[0]`` is traversed. An empty sequence
        yields an empty edge list.
    graph : object exposing ``neighbors(node)``, optional
        Graph to traverse. Defaults to the notebook-level graph ``G``.

    Returns
    -------
    list of tuple
        The emitted ``(source, target)`` edges in traversal order.
    """
    # Local import keeps this cell runnable on its own.
    from collections import Counter, deque

    if len(s) == 0:
        # No start node available: nothing to traverse.
        return []
    if graph is None:
        graph = G

    visited = Counter()
    new_edges = []

    start = s[0]
    queue = deque([start])  # deque: O(1) pops from the left
    visited[start] += 1

    while queue:
        current = queue.popleft()

        for child in graph.neighbors(current):
            queue.append(child)
            visited[child] += 1
            if visited[child] != 1:
                # Reached before: point this edge at a renamed copy instead.
                new_edges.append((current, "{}/{}".format(current, child)))
            else:
                new_edges.append((current, child))

    return new_edges

In [None]:
new_edges = BFSs(root)

In [None]:
r = nx.DiGraph()

In [None]:
r.add_edges_from(new_edges)

In [None]:
list(set(r.nodes()).difference(G.nodes()))[::-1]

In [None]:
nx.is_tree(r)

In [None]:
R = nx.convert_node_labels_to_integers(r)

In [None]:
plt.figure(figsize=(10,6))
nx.draw_circular(r, with_labels=True, node_size=500)