In [4]:
from extract_relations import get_directed_relations, order_directed_relations
from ontology_algorithm import construct_ontology_hierarchy, draw_hierarchy_tree_from_ontology
from topic_modeling import kmeans_tfidf_clustering
from preprocessing import get_word_types_with_tf_idf
import pandas as pd

In [5]:
# Preprocessing stage

# Read the CSV file and drop every row that contains an N/A value.
# note to self: can I also use DEBRA for song lyrics depending on how I break it up?
df = pd.read_csv("../data/data_structures.csv").dropna()
print(df.head())

# One title per chapter, in order of first appearance. De-duplicating on the
# "chapter" column (rather than on the title text) keeps this array aligned
# index-for-index with chapter_texts even if two chapters share a title.
chapter_titles = df.drop_duplicates(subset="chapter")["chapter-title"].to_numpy()

# One string per chapter: that chapter's "text" rows, each followed by a single
# space. sort=False keeps the chapters in order of first appearance, which is
# the same order df["chapter"].unique() would produce.
chapter_texts = [
    " ".join(rows) + " "
    for _, rows in df.groupby("chapter", sort=False)["text"]
]

                                                text    chapter  \
0  How many cities with more than 250,000 people ...  chapter-1   
1  Texas? How many people in my company make over...  chapter-1   
2  connect all of our telephone customers with le...  chapter-1   
3  answer questions like these, it is not enough ...  chapter-1   
4  must organize that information in a way that a...  chapter-1   

                    chapter-title  
0  Data Structures and Algorithms  
1  Data Structures and Algorithms  
2  Data Structures and Algorithms  
3  Data Structures and Algorithms  
4  Data Structures and Algorithms  


In [6]:
# Topic modeling stage.
# kmeans_tfidf_clustering also performs the TF-IDF vectorization itself; in
# DEBRA's architecture that vectorization belongs to the preprocessing stage.
num_topics = 4
clusters, cluster_terms = kmeans_tfidf_clustering(
    chapters=chapter_texts,
    num_topics=num_topics,
    n_key_terms=50,
)

CLUSTER #1
Cluster Chapters: [1, 10, 15, 16]
Key Features: ['vertex', 'polynomial', 'graph', 'skip list', 'vertices', 'induction', 'edge', 'polynomial time', 'set', 'skip', 'matrix', 'shortest', 'solution', 'proof', 'figure', 'mod', 'pairing', 'knapsack', 'mst', 'input', 'algorithms', 'mathematical', 'prove', 'problems', 'edges', 'weight', 'integer', 'node', 'adjacency', 'program', 'return', 'theorem', 'clique', 'level', 'figure 11', 'relation', 'complete', 'pole', 'integers', 'log', 'recursive', 'path', 'base', 'computer', 'induction hypothesis', 'hard', 'numbers', 'positive', 'halt', 'hypothesis']
CLUSTER #2
Cluster Chapters: [4, 5, 9, 12]
Key Features: ['tree', 'node', 'nodes', 'child', 'figure', 'leaf', 'root', 'rt', 'trees', 'internal', 'children', 'binary', 'subtree', 'bst', 'parent', 'binary tree', 'records', 'right', 'record', 'search', 'binary trees', 'figure 13', 'quadtree', 'general tree', 'internal nodes', 'index', 'pointer', 'trie', 'huffman', 'implementation', 'public', '

In [4]:
# find top n words

# Ask the user for a cluster and keep asking until the answer is one of the
# keys of `clusters`. Every conversion happens inside the try block, so any
# number of non-numeric entries simply re-prompts instead of raising.
valid_clusters = list(clusters.keys())
prompt = f'Enter a number in this list: {valid_clusters}'
while True:
    try:
        cluster_num = int(input(prompt))
    except ValueError:
        # not a number at all; treat it like any other invalid selection
        cluster_num = None
    if cluster_num in valid_clusters:
        break
    prompt = f'Invalid selection, enter a number in this list: {valid_clusters}'

# Chapters that belong to the chosen cluster.
# NOTE(review): the entries of cluster_chapters are used here as 0-based
# positions into chapter_titles / chapter_texts, while the relation-extraction
# cell interpolates the same values into "chapter-{n}" labels. Confirm which
# convention kmeans_tfidf_clustering returns.
cluster_chapters = clusters[cluster_num]
cluster_chapter_titles = [chapter_titles[index] for index in cluster_chapters]

# Key terms of the chosen cluster.
terms = cluster_terms[cluster_num]

# Text of every chapter in the chosen cluster, one string per chapter.
cluster_chapter_text = [chapter_texts[index] for index in cluster_chapters]

# Dataframe of words with their word types (ex. noun, adjective) and term
# frequency values, ordered by the "tf" column.
pre_filtering = get_word_types_with_tf_idf(
    cluster_chapter_text,
    "tf",
    skip_stopwords=True,
)

# The top 50 most frequently occurring words are the first 50 rows of that dataframe.
top_words = pre_filtering.head(50)["word"].values

# Show the top words, the cluster's key terms, and the first 50 dataframe rows
# (the rows the top words were taken from).
print(f'Top words are: {top_words}')
print(f'Key terms are: {terms}')
pre_filtering.head(50)


  tf = pd.concat([tf, row_df], ignore_index=True)
  idf = pd.concat([idf, row_df], ignore_index=True)


In [None]:
# extracting word relations

# spaCy is the natural language processing library used to parse the chosen
# sentence; displacy is its visualizer.
import spacy
from spacy import displacy

# Collect every "text" row from the chapters of the chosen cluster.
sentences = []
for chapter_index in cluster_chapters:
    print(chapter_index)
    chapter_df = df.query(f'chapter == "chapter-{chapter_index}"')
    sentences.extend(chapter_df["text"].values)

# Without at least one sentence no selection can ever be valid, so fail
# loudly instead of prompting forever.
if not sentences:
    raise ValueError("No sentences found for the chapters in the selected cluster")

# Ask the user for a sentence index and keep asking until it is valid. The
# prompt shows the real upper bound (len - 1), and every conversion happens
# inside the try block so repeated non-numeric entries just re-prompt.
last_index = len(sentences) - 1
prompt = f'Enter a number in the range 0 - {last_index}'
while True:
    try:
        user_sentence = int(input(prompt))
    except ValueError:
        # not a number at all; treat it like any other invalid selection
        user_sentence = -1
    if 0 <= user_sentence <= last_index:
        break
    prompt = f'Invalid selection, enter a number in the range 0 - {last_index}'

# Load the spaCy model and parse the selected sentence.
nlp = spacy.load("en_core_web_lg")
doc = nlp(sentences[user_sentence])

# Render the dependency parse of the selected sentence inline in the notebook.
displacy.render(doc, style='dep', jupyter=True)

2
6
13
14


In [None]:
# get word relations
# Extract the directed relations between the top words from the cluster's sentences.
directed_relations, relations = get_directed_relations(
    top_n_words=top_words,
    all_verses=sentences,
    verbose=False,
)

# Order those relations based on the TF-IDF and the number of relations with
# respect to the first word in each relation; pre_filtering supplies the
# per-word statistics.
ordered_relations = order_directed_relations(
    directed_relations=directed_relations,
    tf_idf_pre_filtering=pre_filtering,
    order_by='product',
    verbose=False,
    include_ordering_wrt_occurences=True,
)

# Build the ontology hierarchy from the ordered relations.
ontology_hierarchy, parent_words = construct_ontology_hierarchy(
    ordered_directed_relations=ordered_relations,
)

# Visualize the ontology hierarchy using Graphviz.
draw_hierarchy_tree_from_ontology(
    ontological_hierarchy=ontology_hierarchy,
    relations_to_verbs=relations,
)

{('1', 'n'): 2, ('value', '0'): 1, ('search', 'n'): 3, ('example', 'search'): 1, ('example', 'value'): 1, ('k', 'case'): 1, ('example', 'cost'): 1, ('example', 'algorithm'): 1, ('search', 'case'): 1, ('0', '1'): 1}


In [7]:
# Run for each cluster
# Iterate over the keys actually present in `clusters` (in sorted order) rather
# than assuming they are exactly 1..num_topics; the cluster-selection prompt
# likewise treats clusters.keys() as the list of valid clusters.
for cluster in sorted(clusters):
    print(f'Cluster: {cluster}')
    print(clusters[cluster])

1
2
3
4
