# Tex4GCN

In [None]:
%pip install -q -U text4gcn

In [1]:
from text4gcn.datasets import data

data.list()

['R8', 'R52', 'AG_NEWS']

In [2]:
!mkdir data

mkdir: cannot create directory ‘data’: File exists


In [3]:
path = "data"

data.R8(path=path)
data.R52(path=path)
data.AG_NEWS(path=path)

In [4]:
!ls data

20AG_NEWS.meta	R52.meta  R8.cleaned  R8.node_features	R8.txt
20AG_NEWS.txt	R52.txt   R8.meta     R8.shuffled	log


In [5]:
from text4gcn.preprocess import TextPipeline

pipe = TextPipeline(
    dataset_name="R8",
    rare_count=5,
    dataset_path=path,
    language="english")

pipe.execute()



[nltk_data] Downloading package stopwords to /home/meriat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/meriat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/meriat/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


[2022/5/29 01:36:06] [INFO] 
[2022/5/29 01:36:06] [INFO] Start process: 1653784566.447867
[2022/5/29 01:36:06] [WARN] Directory:'data/R8.cleaned' already exists, not overwritten.
[2022/5/29 01:36:07] [INFO] Rare-Count = <5>
[2022/5/29 01:36:07] [INFO] Elapsed time is 1.387645 seconds.
[2022/5/29 01:36:07] [INFO] 
[2022/5/29 01:36:07] [INFO] Start process: 1653784567.8356483
[2022/5/29 01:36:07] [WARN] Directory:'data/R8.shuffled/' already exists, not overwritten.
[2022/5/29 01:36:08] [INFO] Elapsed time is 0.480390 seconds.
[2022/5/29 01:36:08] [INFO] 
[2022/5/29 01:36:08] [INFO] Start process: 1653784568.3162353
[2022/5/29 01:36:15] [INFO] Elapsed time is 7.484410 seconds.
[2022/5/29 01:36:15] [INFO] 
[2022/5/29 01:36:15] [INFO] Start process: 1653784575.8010874
[2022/5/29 01:36:15] [WARN] Directory:'data/R8.node_features' already exists, not overwritten.
[2022/5/29 01:36:18] [INFO] x.shape=(4937, 300), y.shape=(4937, 8)
[2022/5/29 01:36:18] [INFO] tx.shape=(2189, 300), ty.shape=(2189

In [None]:
from text4gcn.builder import FrequencyAdjacency

adj = FrequencyAdjacency(
    dataset_name="R8",
    dataset_path=path
)

adj.build()

In [None]:
from text4gcn.builder import CosineSimilarityAdjacency

adj = CosineSimilarityAdjacency(
    dataset_name="R8",
    dataset_path="data"
)

adj.build()

In [None]:
from text4gcn.builder import EmbeddingAdjacency

adj = EmbeddingAdjacency(
    dataset_name="test",
    dataset_path="data",
    num_epochs=20,
    embedding_dimension=300,
    training_regime=1
)

adj.build()

In [None]:
from text4gcn.builder import DependencyParsingAdjacency

adj = DependencyParsingAdjacency(
    dataset_name="test",
    dataset_path="data",
    core_nlp_path="C:/bin/CoreNLP/stanford-corenlp-full-2018-10-05"
)

adj.build()

In [None]:
from text4gcn.builder import ConstituencyParsingAdjacency

#freq = ConstituencyParsingAdjacency()

In [None]:
from text4gcn.builder import LiwcAdjacency

adj = LiwcAdjacency(
    dataset_name="test",
    dataset_path="data",
    liwc_path="C:/bin/LIWC/LIWC2007_English100131.dic"
)

adj.build()

In [None]:
from text4gcn.models import Layer as layer
from text4gcn.models import GNN

gnn = GNN(
    dataset="R8",
    path="data",
    log_dir="log",
    layer=layer.GCN,
    epoches=200,
    dropout=0.5,
    val_ratio=0.1,
    early_stopping=10,
    lr=00.2,
    nhid=200
)

gnn.fit()

In [None]:
corpus = [
    "if you want to be happy be",
    "time is money",
    "i came i saw i conquered",
    #"",
    #"",
    #"",
    #"",
    #""
]

In [None]:
from typing import List, Iterable
from collections import OrderedDict
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def extract_vocabulary(docs_of_words: Iterable[List[str]]) -> List[str]:
    vocabulary = OrderedDict()
    for words in docs_of_words:
        vocabulary.update((word, None) for word in words)
    return list(vocabulary.keys())

def extract_word_definitions(vocabulary: List[str]) -> List[str]:
    from nltk.corpus import wordnet
    #from nltk import download
    #temporary_nltk_folder = 'venv/nltk_data/'
    #download(info_or_id='wordnet', download_dir=temporary_nltk_folder)
    #download(info_or_id='wordnet')
    #download(info_or_id='omw-1.4')

    merged_definitions_of_words = []
    for word in vocabulary:
        syn_sets_of_word = wordnet.synsets(word.strip())
        word_definitions = [syn_set.definition()
                            for syn_set in syn_sets_of_word]
        merged_definitions_of_word = ' '.join(word_definitions) if word_definitions else '<PAD>'

        merged_definitions_of_words.append(merged_definitions_of_word)
    # rmtree(temporary_nltk_folder)
    return merged_definitions_of_words

def extract_tf_idf_word_vectors(word_definitions: List[str], max_features: int) -> List[np.ndarray]:
    tf_idf_vectorizer = TfidfVectorizer(max_features=max_features)
    return tf_idf_vectorizer.fit_transform(word_definitions).toarray()

def word_to_vectors(vocabulary, word_vectors):
    return OrderedDict((word, vec.tolist()) for word, vec in zip(vocabulary, word_vectors))

In [None]:
# Build vocabulary
docs_of_words_generator = (line.split() for line in corpus)

vocabulary = extract_vocabulary(docs_of_words=docs_of_words_generator)

In [None]:
print(vocabulary)

In [None]:
# Extract word definitions
word_definitions = extract_word_definitions(vocabulary=vocabulary)

In [None]:
print(word_definitions[:3])

In [None]:
# Extract & Dump word vectors
word_vectors = extract_tf_idf_word_vectors(word_definitions=word_definitions, max_features=1000)

In [None]:
word_vectors[0]

In [None]:
word_to_word_vectors_dict = word_to_vectors(vocabulary, word_vectors)

In [None]:
word_to_word_vectors_dict.keys()