Data preparation for nHDP

In [1]:
import pandas as pd

def preprocess(raw_data, reference_data):
    """
    Build per-subject token documents and an indexed corpus from raw triples.

    Only triples whose subject appears in ``reference_data['entities']`` are
    kept. Every remaining subject becomes one "document": the flat token list
    ``[predicate, object, predicate, object, ...]`` of its triples.

    Args:
    raw_data (pd.DataFrame): Triples with columns 'Subject', 'Predicate' and 'Object'.
    reference_data (pd.DataFrame): Entity references; must contain an 'entities' column.

    Returns:
    tuple: A 5-tuple, in this order:
        - indexed_corpus (dict): Subject -> list of vocabulary indices, one per token of its document.
        - vocab (list): Sorted list of the unique tokens (predicates and objects).
        - filtered_data (pd.DataFrame): Rows of raw_data whose subject is referenced.
        - all_subjects (list): Unique subjects of filtered_data, in order of first appearance.
        - subject_documents (dict): Subject -> flat list of predicate/object tokens.
    """

    # Keep only the triples whose subject is a referenced entity
    filtered_data = raw_data[raw_data['Subject'].isin(reference_data['entities'])]

    all_subjects = filtered_data['Subject'].unique().tolist()
    print(f"Number of subjects: {len(all_subjects)}")

    # One document per subject: its (predicate, object) pairs flattened row by row
    subject_documents = {
        subj: group[['Predicate', 'Object']].values.flatten().tolist()
        for subj, group in filtered_data.groupby('Subject')
    }
    print('subject_documents',subject_documents)

    # Collect unique tokens with a set comprehension; sum(lists, []) would
    # re-copy the growing list for every document (quadratic in corpus size).
    vocab = sorted({token for tokens in subject_documents.values() for token in tokens})
    vocab_index = {w: i for i, w in enumerate(vocab)}

    # Same documents, with every token replaced by its position in vocab
    indexed_corpus = {
        subj: [vocab_index[word] for word in words]
        for subj, words in subject_documents.items()
    }

    return indexed_corpus, vocab, filtered_data, all_subjects, subject_documents

# Load the raw triples and the entity reference table, then preprocess them.
import traceback

try:
    raw_data = pd.read_csv('data/fb15k-237/train.txt', delimiter='\t', names=['Subject', 'Predicate', 'Object'])
    print('---------- Raw data loaded successfully ----------')

    # Reference snapshot currently in use. Earlier snapshots of the same file
    # family: 20231030-154020, 20231207-075638, 20231215-160315, 20240102-162052.
    reference_data = pd.read_csv('data/fb15k-237/freebase_reference_data_nhdp_20240102-170616.csv')
    # Drop reference rows whose entity never occurs as a subject in the raw triples
    reference_data = reference_data[reference_data['entities'].isin(raw_data['Subject'])].reset_index(drop=True)
    print(reference_data)

    # Preprocessing
    indexed_corpus, vocab, new_data_df, all_subjects, subject_documents = preprocess(raw_data, reference_data)

    # Output sample for verification
    print(f"Sample processed data:\n{new_data_df}")
    print(f"Sample indexed corpus for a subject:\n{next(iter(indexed_corpus.items()))}")
    print(f"Sample vocabulary:\n{vocab[:10]}")

except Exception as e:
    # The cell stays best-effort (it reports instead of raising), but the
    # message alone does not say where the failure happened, so the full
    # traceback is printed as well.
    print(f"An error occurred: {e}")
    traceback.print_exc()


---------- Raw data loaded successfully ----------
                            classes    entities  \
0     wordnet_institution_108053576   /m/01y67v   
1     wordnet_institution_108053576  /m/026wmz6   
2     wordnet_institution_108053576   /m/012vwb   
3     wordnet_institution_108053576   /m/09f5vv   
4     wordnet_institution_108053576    /m/05q2c   
...                             ...         ...   
4452       wordnet_artist_109812338   /m/016gkf   
4453       wordnet_artist_109812338    /m/019fz   
4454       wordnet_artist_109812338   /m/012d40   
4455       wordnet_artist_109812338   /m/0cdf37   
4456       wordnet_artist_109812338    /m/04bgy   

                       level1classes  
0     wordnet_organization_108008335  
1     wordnet_organization_108008335  
2     wordnet_organization_108008335  
3     wordnet_organization_108008335  
4     wordnet_organization_108008335  
...                              ...  
4452        wordnet_person_100007846  
4453        wordnet_pers

In [None]:
import pickle

# Destination of the pickled subject documents
# (dict: subject -> flat [predicate, object, ...] token list)
file_path = 'data/fb15k-237/subject_documents.pkl'

# Serialize the whole dict in one go; the context manager closes the file
with open(file_path, 'wb') as file:
    pickle.dump(subject_documents, file)

print(f"Subject documents saved to {file_path}")


In [None]:
# Sanity check: distinct referenced entities vs. number of documents built
unique_entities_total = reference_data['entities'].nunique()
print('unique entities', unique_entities_total)

# Both per-subject mappings are expected to report the same size
for per_subject_mapping in (indexed_corpus, subject_documents):
    print(len(per_subject_mapping))

In [None]:
# Label statistics over the 'classes' column of the reference table

# Number of distinct class labels overall
unique_labels = reference_data['classes'].nunique()

# Number of distinct class labels attached to each entity
labels_per_entity = reference_data.groupby('entities')['classes'].nunique()
print(labels_per_entity)

# Entities with exactly one label vs. entities with more than one
single_label_count = labels_per_entity.eq(1).sum()
multiple_label_count = labels_per_entity.gt(1).sum()

for caption, stat_value in (
    ('unique sencond hierarchy label', unique_labels),
    ('single label count', single_label_count),
    ('multiple label count', multiple_label_count),
):
    print(caption, stat_value)

In [None]:
import pandas as pd
from datetime import datetime

def word_count_func(text):
    '''
    Count how often each token occurs.

    Args:
        text (str or iterable): A string, which is split on whitespace, or an
            iterable of hashable tokens (e.g. a list of words), which is
            counted as-is.

    Returns:
        dict: A dictionary with tokens as keys and counts as values, in order
            of first appearance.

    Raises:
        TypeError: If text is neither a string nor an iterable.
    '''
    from collections import Counter

    words = text.split() if isinstance(text, str) else text

    # iter() makes Counter count elements even when a mapping is passed
    # (Counter(mapping) would instead copy the mapping's values as counts)
    # and raises TypeError for non-iterable input.
    return dict(Counter(iter(words)))

# Build the sparse bag-of-words table (doc_idx, word_idx, count) from
# subject_documents, plus the matching vocabulary table.
new_corpus = indexed_corpus  # alias kept from the original cell; not used below
all_data = {'doc_key':[],'doc_idx': [], 'word_idx': [], 'count': []}

# Set to True to actually write the corpus and vocabulary files. With False
# the cell only builds and inspects the tables.
SAVE_OUTPUTS = False

# token -> first position in vocab, built once so every lookup is O(1)
# instead of two linear scans (`word in vocab` and `vocab.index(word)`).
vocab_positions = {}
for position, token in enumerate(vocab):
    vocab_positions.setdefault(token, position)

# Generate a timestamp shared by both output file names
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

# NOTE(review): doc_idx and word_idx are 0-based here; confirm that this is
# what the downstream consumer of the corpus file expects.
for doc_idx, (doc_key, text_value) in enumerate(subject_documents.items()):
    words_count_dict = word_count_func(text_value)
    for word, count in words_count_dict.items():
        all_data['doc_key'].append(doc_key)  # for inspection only; dropped before saving
        all_data['doc_idx'].append(doc_idx)
        all_data['word_idx'].append(vocab_positions.get(word, -1))  # -1 if the word is not found
        all_data['count'].append(count)

corpus_df = pd.DataFrame(all_data)

# Corpus file: space-separated "doc_idx word_idx count" rows without header
filename = f'data/fb15k-237/corpus_po_{timestamp}.txt'
corpus_df_to_save = corpus_df[['doc_idx', 'word_idx', 'count']]  # Remove the 'doc_key' column
if SAVE_OUTPUTS:
    corpus_df_to_save.to_csv(filename, sep=' ', index=False, header=False)
    print(f'DataFrame saved to {filename}')
else:
    print(f'DataFrame NOT saved (SAVE_OUTPUTS is False); target was {filename}')

# Inspection output
print(len(all_data['doc_idx']))
print(corpus_df)
print(corpus_df[corpus_df['count'] > 1])  # Rows where count is greater than 1

# Vocabulary file: one token per line, line order == word_idx
vocab_df = pd.DataFrame({'vocab': vocab})
vocab_filename = f'data/fb15k-237/vocab_po_{timestamp}.txt'
if SAVE_OUTPUTS:
    vocab_df.to_csv(vocab_filename, index=False, header=False)
    print(f'Vocabulary DataFrame saved to {vocab_filename}')
else:
    print(f'Vocabulary DataFrame NOT saved (SAVE_OUTPUTS is False); target was {vocab_filename}')

# Spot-check one vocabulary entry; skipped when the vocabulary is too small
if len(vocab_df) > 210:
    print(vocab_df.iloc[210])


Save the tree for evaluation

In [None]:
# Run under Git Bash. Arguments: nHDP tree CSV, vocabulary file.
! python print-tree.py '../nHDP_matlab/output/tree/freebase/20240102_nhdp_tree_freebase_po_t22_b1_1000_172952.csv' 'data/fb15k-237/vocab_po_20240102172952.txt'

In [None]:
# Run under Git Bash. Arguments: nHDP tree CSV, vocabulary file, output CSV path.
! python save_tree.py '../nHDP_matlab/output/tree/freebase/20240102_nhdp_tree_freebase_po_t22_b1_1000_172952.csv' 'data/fb15k-237/vocab_po_20240102172952.txt' 'output/test/20240102_nhdp_tree_freebase_po_t22_b1_1000_172952.csv'

In [8]:
# Load the saved tree and turn each level's space-separated words into a list
save_tree_path = 'output/test/20240102_nhdp_tree_freebase_po_t22_b1_1000_172952.csv'
tree_df = pd.read_csv(save_tree_path)
print(tree_df)

# level0 shows only NaN in the printed frame, so the column is dropped
tree_df = tree_df.drop(columns='level0')

# One topic per cell: "w1 w2 w3" -> ['w1', 'w2', 'w3']
for level_name in ('level1', 'level2'):
    tree_df[level_name] = tree_df[level_name].str.split(' ')

print(tree_df)

   level0                                             level1  \
0     NaN  /people/person/profession /award/award_nominee...   
1     NaN  /people/person/profession /award/award_nominee...   
2     NaN  /m/09nqf /m/08mbj5d /common/topic/webpage./com...   
3     NaN  /m/09nqf /m/08mbj5d /common/topic/webpage./com...   

                                              level2  
0  /award/award_nominee/award_nominations./award/...  
1  /award/award_nominee/award_nominations./award/...  
2  /education/educational_institution/students_gr...  
3  /award/award_nominee/award_nominations./award/...  
                                              level1  \
0  [/people/person/profession, /award/award_nomin...   
1  [/people/person/profession, /award/award_nomin...   
2  [/m/09nqf, /m/08mbj5d, /common/topic/webpage./...   
3  [/m/09nqf, /m/08mbj5d, /common/topic/webpage./...   

                                              level2  
0  [/award/award_nominee/award_nominations./award...  
1  [/award/aw

In [9]:
# Inspect the first subject document: a flat list alternating
# predicate, object, predicate, object, ...
print(list(subject_documents.values())[0])

['/people/person/places_lived./people/place_lived/location', '/m/01ktz1', '/common/topic/webpage./common/webpage/category', '/m/08mbj5d', '/people/person/places_lived./people/place_lived/location', '/m/0_xdd', '/people/person/profession', '/m/016z4k', '/award/award_nominee/award_nominations./award/award_nomination/award', '/m/0c4z8', '/people/person/nationality', '/m/09c7w0', '/award/award_nominee/award_nominations./award/award_nomination/award', '/m/02f6ym', '/people/person/gender', '/m/02zsn', '/people/person/spouse_s./people/marriage/spouse', '/m/02fn5r', '/award/award_nominee/award_nominations./award/award_nomination/award', '/m/01c99j', '/award/award_nominee/award_nominations./award/award_nomination/award', '/m/01dk00', '/people/person/place_of_birth', '/m/01ktz1', '/award/award_nominee/award_nominations./award/award_nomination/award', '/m/01c9jp', '/award/award_nominee/award_nominations./award/award_nomination/award', '/m/01by1l', '/award/award_winner/awards_won./award/award_hono

In [10]:
from gensim import corpora

# One token list per subject, in subject_documents iteration order
texts = list(subject_documents.values())
# gensim dictionary built over all documents; passed to the coherence computation below
dictionary = corpora.Dictionary(texts)

In [11]:
# htq
from evaluation.coherence import *

############################################# evaluation #############################################
# Per-branch (phi_WB, d_B) and per-level (phi_WL, d_L) coherence / diversity scores
phi_WB, d_B = [], []
phi_WL, d_L = [], []

# Branch-wise: every row of tree_df is one branch, its cells are the topics along it
for index, row_branch in tree_df.iterrows():
    # Keep only real, non-empty word lists. A missing cell stays NaN after
    # str.split, and NaN is truthy, so a bare `if topics` would let it through.
    branch_topics = [
        topics for topics in row_branch.tolist()
        if isinstance(topics, (list, tuple)) and topics
    ]
    print(f'branch topics {index}', branch_topics)
    # (An earlier variant called compute_coherence(branch_topics, corpus, dictionary) instead.)
    phi_WB.append(compute_coherence_cv(branch_topics, texts, dictionary))
    d_B.append(compute_topic_diversity(branch_topics))


# Level-wise: every column of tree_df is one level, its distinct cells are the topics
for level in tree_df.columns:
    # Tuples are hashable, which the de-duplication below requires;
    # missing (NaN) and empty cells are skipped as above.
    level_topics = [
        tuple(x) for x in tree_df[level].tolist()
        if isinstance(x, (list, tuple)) and x
    ]
    print(f'level topics {level}', level_topics)

    # De-duplicate while keeping first-appearance order, so the printed list
    # and the scores are reproducible across runs (set order is not).
    unique_topics = list(dict.fromkeys(level_topics))

    # Print the unique topics
    print(f'unique topics in level {level}',unique_topics)

    # (An earlier variant called compute_coherence(unique_topics, corpus, dictionary) instead.)
    phi_WL.append(compute_coherence_cv(unique_topics, texts, dictionary))
    d_L.append(compute_topic_diversity(unique_topics))

BTQ, LTQ = compute_btq_ltq(phi_WB, phi_WL, d_B, d_L)
HTQ = compute_HTQ(BTQ, LTQ)
print('BTQ_scores:', BTQ)
print('LTQ_scores:', LTQ)
print('HTQ:', HTQ)

branch topics 0 [['/people/person/profession', '/award/award_nominee/award_nominations./award/award_nomination/award', '/film/actor/film./film/performance/film', '/award/award_nominee/award_nominations./award/award_nomination/nominated_for', '/people/person/nationality'], ['/award/award_nominee/award_nominations./award/award_nomination/award_nominee', '/award/award_winner/awards_won./award/award_honor/award_winner', '/award/award_nominee/award_nominations./award/award_nomination/nominated_for', '/film/actor/film./film/performance/film', '/award/award_nominee/award_nominations./award/award_nomination/award']]
