# InfluxDB data loading

Writing original and preprocessed data into InfluxDB buckets.

In [None]:
from pyspark.sql import SparkSession
from influxdb_client import InfluxDBClient, Point
from influxdb_client.client.write_api import SYNCHRONOUS
import pandas as pd

Spark configuration

In [None]:
# Spark setup: create (or reuse) the "mema" session and only log warnings and above
spark = (
    SparkSession.builder
    .appName("mema")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("WARN")

The function reads data from a CSV file into a Spark DataFrame and writes the data to an InfluxDB database. It iterates through the Spark DataFrame, creating InfluxDB Points for each row and excluding the "timestamp" column. The function includes error handling to print any encountered exceptions during the process.

In [None]:
# Function for writing the content of a CSV file to InfluxDB
def process_spark_dataframe(spark, csv_file_path, influxdb_bucket):
    """Load a CSV file with Spark and write each row to an InfluxDB bucket.

    Every row becomes one point of the "HAI_measurements" measurement. All
    columns except "timestamp" are added to the point as fields.

    The connection settings default to the local development instance and can
    be overridden with the INFLUXDB_V2_URL, INFLUXDB_V2_TOKEN and
    INFLUXDB_V2_ORG environment variables.

    Args:
        spark: SparkSession used to read the CSV file.
        csv_file_path: Path of the CSV file; it is read with a header row and
            schema inference enabled.
        influxdb_bucket: Name of the bucket the points are written to.

    Returns:
        None. A failure while building or writing a single row is printed and
        the remaining rows are still processed.
    """
    import os

    # Read CSV file into a Spark DataFrame
    spark_df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

    # InfluxDB setup (environment variables take precedence over the defaults).
    # NOTE(review): the fallback token is a credential committed to source;
    # prefer setting INFLUXDB_V2_TOKEN and rotating/removing this literal.
    influxdb_url = os.environ.get("INFLUXDB_V2_URL", "http://localhost:8086")
    influxdb_token = os.environ.get(
        "INFLUXDB_V2_TOKEN",
        "BD8-Z9Rcrb-lCOQcWJ7h-5kzuvX0ZIWMmlw8uza-1psB_8jjQOSbXC8XyOaWyjEBUWUZTXZtp-rYhnQPEZShxw==",
    )
    influxdb_org = os.environ.get("INFLUXDB_V2_ORG", "mema_org")

    # The field columns are the same for every row, so resolve them once.
    field_names = [name for name in spark_df.columns if name != "timestamp"]

    with InfluxDBClient(url=influxdb_url, token=influxdb_token, org=influxdb_org) as client:
        write_api = client.write_api(write_options=SYNCHRONOUS)

        # Stream the rows to the driver instead of collect()-ing the whole
        # DataFrame into memory at once.
        for row in spark_df.toLocalIterator():
            try:
                # Create InfluxDB Point
                point = Point("HAI_measurements")

                # Add fields to the InfluxDB Point, excluding "timestamp"
                for col_name in field_names:
                    point.field(col_name, row[col_name])

                # Write the InfluxDB Point to the database.
                # NOTE(review): points carry no explicit time, so InfluxDB
                # presumably stamps them on arrival; keep one write per row
                # and confirm how timestamps are assigned before batching.
                write_api.write(influxdb_bucket, influxdb_org, point)

            except Exception as e:
                # Best-effort load: report the failing row and keep going.
                print(f"Error processing row: {e}")

Setting the CSV file paths for loading.

In [None]:
# Set CSV file paths: raw (merged) data sets first, then their scaled counterparts
_merged_dir = "/Users/emmatosato/Documents/UNI_Locale/Erasmus/OST/ost-sm-change-detection/data_analysis/merged_data"
_preprocessed_dir = "/Users/emmatosato/Documents/UNI_Locale/Erasmus/OST/ost-sm-change-detection/data_analysis/preprocessed_data"

csv_test1 = _merged_dir + "/test_pd1.csv"
csv_test2 = _merged_dir + "/test_pd2.csv"
csv_complete = _merged_dir + "/complete_pd.csv"

csv_test1_proc = _preprocessed_dir + "/scaled_test1.csv"
csv_test2_proc = _preprocessed_dir + "/scaled_test2.csv"
csv_complete_proc = _preprocessed_dir + "/scaled_complete.csv"

Calling the function for the scaled and unscaled test sets, and also for the train1 and train2 sets, which were merged together in the data analysis step.

In [None]:
# Load the two raw (unscaled) test sets, each into its own bucket
for _csv_path, _bucket in ((csv_test1, "Test1"), (csv_test2, "Test2")):
    process_spark_dataframe(spark, _csv_path, _bucket)

In [None]:
# Load the two preprocessed (scaled) test sets, each into its own bucket
for _csv_path, _bucket in (
    (csv_test1_proc, "Test1Processed"),
    (csv_test2_proc, "Test2Processed"),
):
    process_spark_dataframe(spark, _csv_path, _bucket)

In [None]:
# Load the merged training set, first raw and then preprocessed
for _csv_path, _bucket in (
    (csv_complete, "CompleteTrain"),
    (csv_complete_proc, "CompleteTrainProcessed"),
):
    process_spark_dataframe(spark, _csv_path, _bucket)