In [1]:
import os
import ast
import random
import logging
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

from joblib import Parallel, delayed

from collections import OrderedDict, Counter, defaultdict
from tqdm import tqdm_notebook as tqdm

from gensim.parsing import preprocessing
from gensim.utils import tokenize


In [2]:
df = pd.read_csv("../../../Starspace/data/oms/text/oms-prep.tsv", sep="\t")


In [8]:
df = df.dropna()

In [9]:
val = df[df["used_as"]=="validation"]

In [49]:
val.iloc[99504]

abstract    Highlights•Regional regression models of perce...
file_id                       EVISE.PII:S2214-5818(17)30315-4
label_id                               [189126970, 170590667]
labels                         ['Geology', 'Aquatic Science']
used_as                                            validation
doc_len                                                    34
doc         highlightsregion percentil flow creat basin de...
Name: 583635, dtype: object

In [11]:
graph = nx.read_graphml("../../../Starspace/data/oms/cat_hier_TREE_INT.graphml", node_type=int)

In [25]:
print(list(graph.predecessors(257778730)))

[189720899]


In [34]:
un_graph = graph.to_undirected()
nod = [250232503,253033933,
252578107,189721083,
210628467,249133392,
248811484,249672056,
189721148,253230766,
209895092]
par = [250232503, 252578107]

In [35]:
for p in par:
    for n in nod:
        try:
            p = nx.shortest_path_length(un_graph, n, p)
            print(n, p)
        except:
            print("no path")

250232503 0
no path
no path
no path
no path
no path
no path
no path
no path
no path
no path
250232503 9
no path
no path
no path
no path
no path
no path
no path
no path
no path
no path


In [2]:
def document_preprocess(text):
    first = text.encode('ascii', 'ignore').decode('utf-8').lower()
    second = preprocessing.remove_stopwords(first)
    third = preprocessing.strip_punctuation(second)
    fourth =preprocessing.strip_short(preprocessing.strip_numeric(third))
    return fourth

In [3]:
fopen = open("../../../Starspace/data/oms/text/oms-all_raw.txt", "rb")
file = fopen.readlines()

In [4]:
list_of_docs = []
for line in tqdm(file):
    preprocessed1 = document_preprocess(line.decode("utf-8"))
    list_of_docs.append(preprocessed1)

HBox(children=(IntProgress(value=0, max=583933), HTML(value='')))




In [5]:
vocabulary = []
word_count = Counter()

for doc in tqdm(list_of_docs):
    words = list(tokenize(doc))
    for word in words:
        vocabulary.append(word)
        word_count[word]+=1        

HBox(children=(IntProgress(value=0, max=583933), HTML(value='')))




In [6]:
unique_tokens = list(set(vocabulary))

In [7]:
# A storage to store tokens after subsampling
new_tokens = {}

# tokens is a list of word indexes from original text
for word in tqdm(unique_tokens):
    frac = word_count[word]/len(unique_tokens)
    prob = (np.sqrt(frac/0.001) + 1) * (0.001/frac)
    
    if prob > 0.2 :
        new_tokens[word]=prob
        
unique_words = list(new_tokens.keys())
corpus_specific_stopwords = set(unique_tokens).difference(unique_words)

HBox(children=(IntProgress(value=0, max=364521), HTML(value='')))




In [None]:
aim, analyze, drug, theories, variable, volume

In [8]:
corpus_specific_stopwordss_specific_stopwords

{'able',
 'according',
 'account',
 'accuracy',
 'accurate',
 'achieved',
 'acid',
 'action',
 'active',
 'activity',
 'addition',
 'additional',
 'age',
 'agreement',
 'aim',
 'algebra',
 'algebraic',
 'algebras',
 'algorithm',
 'algorithms',
 'allow',
 'allows',
 'alpha',
 'alternative',
 'analyses',
 'analysis',
 'analytical',
 'analyze',
 'analyzed',
 'and',
 'anti',
 'application',
 'applications',
 'applied',
 'apply',
 'approach',
 'approaches',
 'approximation',
 'arbitrary',
 'area',
 'article',
 'assessed',
 'associated',
 'association',
 'asymptotic',
 'atoms',
 'available',
 'average',
 'background',
 'band',
 'based',
 'basic',
 'basis',
 'beam',
 'behavior',
 'best',
 'beta',
 'better',
 'binary',
 'binding',
 'black',
 'blood',
 'body',
 'bound',
 'boundary',
 'bounded',
 'bounds',
 'brain',
 'calculated',
 'calculations',
 'called',
 'cancer',
 'capacity',
 'care',
 'case',
 'cases',
 'category',
 'cell',
 'cells',
 'central',
 'certain',
 'chain',
 'change',
 'changes'

In [8]:
from nltk.stem import *
snow = SnowballStemmer(language="english")

In [9]:
updated_doc_list = []
for doc in tqdm(list_of_docs):
    string = ""
    for word in doc.split(" "):
        if word not in corpus_specific_stopwords:
            word = snow.stem(word)
            string+="{} ".format(word)
            
    updated_doc_list.append(preprocessing.strip_multiple_whitespaces(string))
    

HBox(children=(IntProgress(value=0, max=583933), HTML(value='')))




In [10]:
fopen = open("../../../Starspace/data/oms/text/oms-all_raw_preprocessed.txt", "w+")


In [11]:
for doc in updated_doc_list:
    fopen.write("{}\n".format(doc))
    
fopen.close()