In [5]:
import pandas as pd


In [10]:

def preprocess(raw_data, reference_data):
    """
    Build per-subject token documents and an integer-indexed corpus from raw triples.

    Only triples whose subject appears in the reference dataset are kept. For each
    remaining subject, its predicate/object pairs are flattened row by row
    (predicate, object, predicate, object, ...) into a single token list.

    Args:
    raw_data (pd.DataFrame): Triples with 'Subject', 'Predicate' and 'Object' columns.
    reference_data (pd.DataFrame): Entity references; must contain a 'subject' column.

    Returns:
    tuple: A 5-tuple containing, in this order:
        - indexed_corpus (dict): Subject -> list of vocabulary indices, one per token
          of that subject's document.
        - vocab (list): Sorted list of the unique tokens (predicates and objects).
        - filtered_data (pd.DataFrame): Rows of raw_data whose subject is in reference_data.
        - all_subjects (list): Unique subjects of filtered_data, in order of appearance.
        - subject_documents (dict): Subject -> flat list of that subject's predicates
          and objects.

    Note:
    Building the vocabulary sorts the tokens, so they must be mutually comparable
    (e.g. all strings). Missing values mixed with strings would make sorted() raise
    TypeError.
    """

    # Keep only the triples whose subject is known to the reference dataset.
    subject_is_referenced = raw_data['Subject'].isin(reference_data['subject'])
    filtered_data = raw_data[subject_is_referenced]

    all_subjects = filtered_data['Subject'].unique().tolist()
    print(f"Number of subjects: {len(all_subjects)}")

    # One "document" per subject: its predicates and objects interleaved row by row.
    subject_documents = {
        subj: group[['Predicate', 'Object']].values.flatten().tolist()
        for subj, group in filtered_data.groupby('Subject')
    }

    # Collect unique tokens in a single pass; concatenating the lists with
    # sum(..., []) would copy the accumulated list once per subject.
    vocab = sorted({token for tokens in subject_documents.values() for token in tokens})
    vocab_index = {token: idx for idx, token in enumerate(vocab)}

    # Same documents, with every token replaced by its position in vocab.
    indexed_corpus = {
        subj: [vocab_index[token] for token in tokens]
        for subj, tokens in subject_documents.items()
    }

    return indexed_corpus, vocab, filtered_data, all_subjects, subject_documents




In [11]:
# Directory holding the DBpedia input files; edit this one constant to point at
# another copy of the data.
DBPEDIA_DIR = 'E:/course-phd/202009-project1/hlda/hlda/data/dbpedia'

try:
    # Triples file: tab-separated; column names are supplied explicitly, so the
    # first line of the file is read as data rather than as a header.
    raw_data = pd.read_csv(f'{DBPEDIA_DIR}/triples.txt', sep='\t', names=['Subject', 'Predicate', 'Object'])
    print('---------- Raw data loaded successfully ----------')

    # Reference file: keep only the entities that actually occur as a subject in the triples.
    reference_data = pd.read_csv(f'{DBPEDIA_DIR}/classes.txt', sep='\t')
    reference_data = reference_data[reference_data['subject'].isin(raw_data['Subject'])].reset_index(drop=True)
    print(reference_data)

    # Preprocessing
    indexed_corpus, vocab, new_data_df, all_subjects, subject_documents = preprocess(raw_data, reference_data)

    # Output sample for verification
    print(f"Sample processed data:\n{new_data_df}")
    # Default of None keeps this line from raising StopIteration when no subject matched.
    print(f"Sample indexed corpus for a subject:\n{next(iter(indexed_corpus.items()), None)}")
    print(f"Sample vocabulary:\n{vocab[:10]}")

except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise so the notebook stops here: later cells read subject_documents and
    # would otherwise fail with NameError or silently reuse values from an earlier run.
    raise

---------- Raw data loaded successfully ----------
                                               subject  \
0               http://dbpedia.org/resource/Siren_Lake   
1    http://dbpedia.org/resource/Dogtrot_Lake_(Minn...   
2       http://dbpedia.org/resource/Lac_de_Montsalvens   
3        http://dbpedia.org/resource/Farmoor_Reservoir   
4             http://dbpedia.org/resource/Lake_Glenada   
..                                                 ...   
903         http://dbpedia.org/resource/Eamon_Sullivan   
904         http://dbpedia.org/resource/Miya_Tachibana   
905  http://dbpedia.org/resource/Chris_Thompson_(sw...   
906           http://dbpedia.org/resource/Anna_Kulkina   
907        http://dbpedia.org/resource/Ingeborg_Renner   

                                  level1  \
0     https://dbpedia.org/ontology/Place   
1     https://dbpedia.org/ontology/Place   
2     https://dbpedia.org/ontology/Place   
3     https://dbpedia.org/ontology/Place   
4     https://dbpedia.org/ontolo

In [None]:

# One token list per subject, in the iteration order of subject_documents.
doc_texts = [tokens for tokens in subject_documents.values()]