In [74]:
import pandas as pd 
from bs4 import BeautifulSoup

def extract_text(html_string):
    """Parse ``html_string`` as HTML and return the document's text.

    The input is parsed by BeautifulSoup using the 'html.parser' parser, and
    the result of ``get_text()`` on the parsed document is returned.
    """
    return BeautifulSoup(html_string, 'html.parser').get_text()


df = pd.read_excel("data/registered.xlsx", sheet_name="Results")

# Convert Timestamps to 'YYYY-MM-DD' strings in datetime columns; a column
# whose values are all null is left untouched.
datetime_columns = df.select_dtypes(include=['datetime']).columns
df[datetime_columns] = df[datetime_columns].apply(
    lambda x: x.dt.strftime('%Y-%m-%d') if not pd.isnull(x).all() else x
)

# Replace every missing value with None so the records serialise to JSON null.
# The frame is cast to object first: on numeric and datetime columns pandas may
# coerce a None fill value back to NaN/NaT, which would leave the missing
# markers in place (and later produce NaN in the JSON or an unserialisable NaT).
df = df.astype(object).where(df.notna(), None)

In [83]:
"""['ID', 'Name', 'Proponent', 'Project Type', 'AFOLU Activities',
    'Methodology', 'Status', 'Country/Area',
    'Estimated Annual Emission Reductions', 'Region',
    'Project Registration Date', 'Crediting Period Start Date',
    'Crediting Period End Date'
"""

# Spreadsheet header -> snake_case field name used in the rest of the notebook.
column_renames = {
    'ID': 'project_id',
    'Name': 'name',
    'Proponent': 'proponent',
    'Project Type': 'project_type',
    'AFOLU Activities': 'afolu_activities',
    'Methodology': 'methodology',
    'Status': 'status',
    'Country/Area': 'country_area',
    'Estimated Annual Emission Reductions': 'estimated_annual_emission_reductions',
    'Region': 'region',
    'Project Registration Date': 'project_registration_date',
    'Crediting Period Start Date': 'crediting_period_start_date',
    'Crediting Period End Date': 'crediting_period_end_date',
}

# Rename the columns of the existing frame in place.
df.rename(columns=column_renames, inplace=True)

In [84]:
json_output = {}

# Build one record per DataFrame row, keyed by project ID, and attach the text
# extracted from the HTML stored in data/registered/<project_id>.txt.
for json_object in df.to_dict('records'):
    # Named project_id rather than `id` so the builtin id() is not shadowed.
    project_id = json_object['project_id']

    try:
        with open(f"data/registered/{project_id}.txt", 'r', encoding='utf-8') as file:
            html_content = file.read()

        # Extracting the text
        json_object['text'] = extract_text(html_content)
    except Exception as exc:
        # Best-effort: a row whose file cannot be read or parsed is skipped,
        # but the reason is reported so the failure can be diagnosed.
        print(f"Error with {project_id}: {exc!r}")
        continue

    json_output[project_id] = json_object
    

In [85]:
import json
# Write the collected records to disk as indented JSON with sorted keys.
with open('data/clean_registered.json', mode='w') as out_file:
    json.dump(json_output, out_file, sort_keys=True, indent=4)

In [86]:

# Importing necessary modules for the Weaviate script
import weaviate

# Address of the Weaviate instance to use; change it to match your deployment.
WEAVIATE_URL = "http://localhost:8080"

def setup_weaviate_connection():
    """Create and return a Weaviate client for the instance at WEAVIATE_URL."""
    return weaviate.Client(url=WEAVIATE_URL)


def import_data_to_weaviate(client, data, class_name="Registed_Metadata"):
    """Import the prepared data into Weaviate, one object at a time.

    Parameters:
    client: Weaviate client; only ``client.data_object.create`` is used.
    data: Iterable of dict-like objects to create.
    class_name (str): Weaviate class the objects are created under. Defaults
        to "Registed_Metadata", the name this function originally hard-coded.

    Returns:
    None. A failure on one object is printed and the import continues with
    the next object.
    """
    # NOTE(review): query_by_project_id in this file targets the class "TEST",
    # not "Registed_Metadata" -- confirm which class name is intended.
    for obj in data:
        try:
            client.data_object.create(data_object=obj, class_name=class_name)
        except Exception as e:
            # .get() so that a record without 'project_id' cannot raise a
            # KeyError from inside the error handler itself.
            print(f"Error importing object with ID {obj.get('project_id')}: {e}")


In [87]:
# Create the Weaviate client, then upload every record collected in json_output.
client = setup_weaviate_connection()
import_data_to_weaviate(client=client, data=json_output.values())

In [90]:

def query_by_project_id(client, project_id, class_name="TEST"):
    """
    Query data by project_id in Weaviate.

    Parameters:
    client: Weaviate client; only ``client.query.raw`` is used.
    project_id (int): The project_id to query for. It is inserted with ``%d``,
        so a non-numeric value raises ``TypeError``.
    class_name (str): Weaviate class to query. Defaults to "TEST", the class
        this function originally hard-coded.

    Returns:
    dict: The query result, exactly as returned by ``client.query.raw``.

    Raises:
    ValueError: If ``class_name`` is not a plain identifier string.
    """
    # NOTE(review): import_data_to_weaviate in this file writes to the class
    # "Registed_Metadata" by default, not "TEST" -- confirm which class name is
    # intended and pass it explicitly if needed.

    # class_name is spliced into the GraphQL text, so reject anything that is
    # not a bare identifier (no braces, quotes, or whitespace).
    if not (isinstance(class_name, str) and class_name.isidentifier()):
        raise ValueError(f"Invalid Weaviate class name: {class_name!r}")

    query = """
    {
        Get {
            %s(where: {path: ["project_id"], operator: Equal, valueNumber: %d}) {
                crediting_period_end_date
                project_type
                status
                estimated_annual_emission_reductions
                region
                crediting_period_start_date
                afolu_activities
                methodology
                text
                project_id
                name
                proponent
                country_area
                project_registration_date
            }
        }
    }
    """ % (class_name, project_id)

    return client.query.raw(query)


In [91]:
# Fetch the stored record whose project_id is 1.
query_by_project_id(client, project_id=1)

{'data': {'Get': {'TEST': [{'afolu_activities': None,
     'country_area': 'India',
     'crediting_period_end_date': None,
     'crediting_period_start_date': None,
     'estimated_annual_emission_reductions': 13331,
     'methodology': 'AMS-I.D.',
     'name': '7.25 MW wind energy project of Aruppukottai Sri Jayavilas Ltd',
     'project_id': 1,
     'project_registration_date': '2009-03-17',
     'project_type': 'Energy industries (renewable/non-renewable sources)',
     'proponent': 'Aruppukottai Sri Jayavilas Limited',
     'region': 'Asia',
     'status': 'Late to verify',
     'text': 'The project activity consists of 17 wind turbine generators (WTGs) in Tirunelveli district of Tamil Nadu state in India. The project activity has 5 WTGs of 850 kW and 12 WTGs of 250 kW.'}]}}}