In [None]:
# Load the per-sensor CSV exports for sensors 2-5.
# NOTE(review): neither `df_iaq1` nor the `pd` import is created in the visible
# part of this notebook; this cell presumably relies on an earlier cell that
# already loaded IAQ1.csv -- confirm, otherwise a fresh-kernel "Run All" stops
# here with a NameError.
df_iaq2 = pd.read_csv('/mnt/data/IAQ2.csv')
df_iaq3 = pd.read_csv('/mnt/data/IAQ3.csv')
df_iaq4 = pd.read_csv('/mnt/data/IAQ4.csv')
df_iaq5 = pd.read_csv('/mnt/data/IAQ5.csv')

# Tag every frame with the sensor it came from so rows stay identifiable
# after concatenation.
df_iaq1['Sensor'] = 'sensor01'
df_iaq2['Sensor'] = 'sensor02'
df_iaq3['Sensor'] = 'sensor03'
df_iaq4['Sensor'] = 'sensor04'
df_iaq5['Sensor'] = 'sensor05'

# Stack all sensors into one long frame with a fresh 0..n-1 index.
df_combined = pd.concat([df_iaq1, df_iaq2, df_iaq3, df_iaq4, df_iaq5], ignore_index=True)

# Parse the 'PKT' timestamp column so it supports .dt access, grouping and resampling.
df_combined['PKT'] = pd.to_datetime(df_combined['PKT'])

# `info()` prints its report and returns None, so it is called on its own line;
# putting it in a tuple with `head()` would display `(None, ...)` and lose the
# rich table rendering of the preview.
df_combined.info()
df_combined.head()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error

# Function to plot the data for each sensor and parameter
def plot_parameter_data(df, parameter):
    """Plot one parameter for every sensor together with the per-timestamp median.

    Args:
        df: Long-format DataFrame with a 'PKT' timestamp column, a 'Sensor'
            label column and a column named `parameter`.
        parameter: Name of the column to plot.

    Returns:
        The mean, over all sensors, of each sensor's RMSE against the
        per-timestamp median of `parameter`.
    """
    sensors = df['Sensor'].unique()
    plt.figure(figsize=(14, 7))

    # One line per sensor.
    for sensor in sensors:
        sensor_data = df[df['Sensor'] == sensor]
        plt.plot(sensor_data['PKT'], sensor_data[parameter], label=sensor)

    # Median of the parameter across all rows sharing a timestamp.
    median_values = df.groupby('PKT')[parameter].median().reset_index(name='Median')
    plt.plot(median_values['PKT'], median_values['Median'], 'k--', label='Median')

    # RMSE of each sensor against the median at the same timestamps.
    rmse_values = []
    for sensor in sensors:
        sensor_data = df[df['Sensor'] == sensor]
        # Only 'PKT' is shared between the two frames, so the merge adds no
        # '_x'/'_y' suffixes: the sensor readings stay under `parameter`.
        merged_data = pd.merge(sensor_data[['PKT', parameter]], median_values, on='PKT')
        rmse = np.sqrt(mean_squared_error(merged_data[parameter], merged_data['Median']))
        rmse_values.append(rmse)

    # Computed once and reused for both the title and the return value.
    average_rmse = np.mean(rmse_values)

    plt.title(f'{parameter} Data for Sensors\nRMSE: {average_rmse:.2f}')
    plt.xlabel('Timestamp')
    plt.ylabel(parameter)
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()

    plt.show()
    return average_rmse

# Every column that is neither the timestamp ('PKT') nor the sensor label is
# treated as a measured parameter.
parameters = df_combined.columns.difference(['PKT', 'Sensor'])

# Plotting everything can be slow, so try a single parameter first and inspect
# the resulting average RMSE.
average_rmse = plot_parameter_data(df=df_combined, parameter='PM 2.5 (ug/m3)')
average_rmse


In [None]:
# Corrected function to plot the data for each sensor and parameter
def plot_parameter_data_corrected(df, parameter):
    """Plot one parameter for every sensor plus the per-timestamp median line.

    Args:
        df: Long-format DataFrame with a 'PKT' timestamp column, a 'Sensor'
            label column and a column named `parameter`.
        parameter: Name of the column to plot.

    Returns:
        The mean, over all sensors, of each sensor's RMSE against the
        per-timestamp median of `parameter`.
    """
    sensors = df['Sensor'].unique()
    plt.figure(figsize=(14, 7))

    # One line per sensor.
    for sensor in sensors:
        sensor_data = df[df['Sensor'] == sensor]
        plt.plot(sensor_data['PKT'], sensor_data[parameter], label=sensor)

    # Median across all rows sharing a timestamp; the result is indexed by
    # 'PKT' in sorted order.
    median_values = df.groupby('PKT')[parameter].median()
    # Plot against the Series' own index. `df['PKT'].unique()` is in order of
    # appearance, which does not line up with the sorted medians when sensors
    # cover different timestamps.
    plt.plot(median_values.index, median_values.values, 'k--', label='Median')

    # RMSE of each sensor against the median looked up at that sensor's timestamps.
    rmse_values = {}
    for sensor in sensors:
        sensor_data = df[df['Sensor'] == sensor]
        median_at_sensor_times = median_values.reindex(sensor_data['PKT']).values
        rmse_values[sensor] = np.sqrt(
            mean_squared_error(sensor_data[parameter], median_at_sensor_times)
        )

    avg_rmse = np.mean(list(rmse_values.values()))
    plt.title(f'{parameter} Data for Sensors\nRMSE: {avg_rmse:.2f}')
    plt.xlabel('Timestamp')
    plt.ylabel(parameter)
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)
    plt.tight_layout()

    plt.show()

    return avg_rmse

# Repeat the PM 2.5 plot with the corrected helper and show its average RMSE.
average_rmse_pm25 = plot_parameter_data_corrected(df=df_combined, parameter='PM 2.5 (ug/m3)')
average_rmse_pm25


In [None]:
# Keep only rows whose timestamp falls in month 1 (January, regardless of year).
df_january = df_combined.loc[df_combined['PKT'].dt.month.eq(1)]

# Plot every parameter for the January subset and record its average RMSE.
january_rmses = {}
for parameter in parameters:
    print(f"Plotting for {parameter} (January data)...")
    january_rmses[parameter] = plot_parameter_data_corrected(df=df_january, parameter=parameter)

# Average RMSE per parameter for January.
january_rmses


In [None]:
# Select 20 January 2024 through 31 January 2024, both days included.
# The upper bound is written as "strictly before 1 February": comparing with
# '2024-01-31' means midnight at the START of the 31st, which would drop
# almost all of that day's readings.
df_january_20_to_31 = df_combined[(df_combined['PKT'] >= '2024-01-20') & (df_combined['PKT'] < '2024-02-01')]

# Plot every parameter for the selected range and record its average RMSE.
january_20_to_31_rmses = {}
for parameter in parameters:
    print(f"Plotting for {parameter} (20 Jan 2024 to 31 Jan 2024 data)...")
    january_20_to_31_rmses[parameter] = plot_parameter_data_corrected(df_january_20_to_31, parameter)

# Average RMSE per parameter for the selected range.
january_20_to_31_rmses


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Function to create scatter plots with and without intercept for linear regression
def scatter_plot_linear_regression(df, parameter):
    """Scatter each reading against its timestamp's median and overlay two linear fits.

    The left panel shows a fit with an intercept and the right panel a fit
    forced through the origin; each title reports the fitted equation and R².

    Args:
        df: Long-format DataFrame with a 'PKT' timestamp column and a column
            named `parameter`.
        parameter: Name of the column to regress on its per-timestamp median.

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    # Pair every reading with the median of its own timestamp. Taking the
    # grouped medians directly gives one row per timestamp, which does not
    # line up with the one-row-per-reading target vector.
    median_values = df.groupby('PKT')[parameter].median().reset_index(name='Median')
    df_with_median = pd.merge(df[['PKT', parameter]], median_values, on='PKT')

    # scikit-learn expects a 2-D feature matrix, hence the reshape.
    X_median = df_with_median['Median'].values.reshape(-1, 1)
    y_values = df_with_median[parameter].values

    # Linear regression with intercept
    model_with_intercept = LinearRegression(fit_intercept=True)
    model_with_intercept.fit(X_median, y_values)
    y_pred_with_intercept = model_with_intercept.predict(X_median)
    r2_with_intercept = r2_score(y_values, y_pred_with_intercept)

    # Linear regression without intercept (line forced through the origin)
    model_without_intercept = LinearRegression(fit_intercept=False)
    model_without_intercept.fit(X_median, y_values)
    y_pred_without_intercept = model_without_intercept.predict(X_median)
    r2_without_intercept = r2_score(y_values, y_pred_without_intercept)

    # Left panel: scatter and regression line with intercept
    plt.figure(figsize=(14, 6))
    plt.subplot(1, 2, 1)
    plt.scatter(X_median, y_values, alpha=0.5)
    plt.plot(X_median, y_pred_with_intercept, color='red', linewidth=2)
    plt.title(f'{parameter} with Intercept\ny = {model_with_intercept.coef_[0]:.2f}x + {model_with_intercept.intercept_:.2f} | R² = {r2_with_intercept:.2f}')
    plt.xlabel('Median')
    plt.ylabel(parameter)
    plt.grid(True)

    # Right panel: scatter and regression line without intercept
    plt.subplot(1, 2, 2)
    plt.scatter(X_median, y_values, alpha=0.5)
    plt.plot(X_median, y_pred_without_intercept, color='green', linewidth=2)
    plt.title(f'{parameter} without Intercept\ny = {model_without_intercept.coef_[0]:.2f}x | R² = {r2_without_intercept:.2f}')
    plt.xlabel('Median')
    plt.ylabel(parameter)
    plt.grid(True)

    plt.tight_layout()
    plt.show()

# Draw the median-vs-reading regression panels for PM 2.5 on the 20-31 January subset.
scatter_plot_linear_regression(df=df_january_20_to_31, parameter='PM 2.5 (ug/m3)')


In [None]:
# Function to create scatter plots with and without intercept for linear regression, corrected for paired data
def scatter_plot_linear_regression_corrected(df, parameter):
    """Scatter each reading against its timestamp's median and overlay two linear fits.

    Every row of `df` is paired with the median of `parameter` over all rows
    sharing its 'PKT' timestamp. Two models are fitted on those pairs, one
    with an intercept and one through the origin, and each is drawn in its
    own panel with the fitted equation and R² in the title.

    Args:
        df: DataFrame with a 'PKT' column and a column named `parameter`.
        parameter: Name of the column to regress on its per-timestamp median.

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    # Per-timestamp median, joined back onto every row with that timestamp.
    per_timestamp_median = df.groupby('PKT')[parameter].median().rename('Median').reset_index()
    paired = df.merge(per_timestamp_median, on='PKT')

    # Feature matrix (n, 1) and target vector (n,).
    X_median = paired['Median'].values.reshape(-1, 1)
    y_values = paired[parameter].values

    # Fit both variants first (intercept, then origin-constrained), keeping
    # the model, its predictions and its R² for the plotting step.
    fitted = []
    for use_intercept in (True, False):
        model = LinearRegression(fit_intercept=use_intercept)
        model.fit(X_median, y_values)
        predictions = model.predict(X_median)
        fitted.append((model, predictions, r2_score(y_values, predictions)))
    (with_model, with_pred, with_r2), (without_model, without_pred, without_r2) = fitted

    # (subplot position, predictions, line colour, title) for each panel.
    panels = (
        (
            1,
            with_pred,
            'red',
            f'{parameter} with Intercept\ny = {with_model.coef_[0]:.2f}x + {with_model.intercept_:.2f}\nR² = {with_r2:.2f}',
        ),
        (
            2,
            without_pred,
            'green',
            f'{parameter} without Intercept\ny = {without_model.coef_[0]:.2f}x\nR² = {without_r2:.2f}',
        ),
    )

    plt.figure(figsize=(14, 6))
    for position, predictions, line_color, panel_title in panels:
        plt.subplot(1, 2, position)
        plt.scatter(X_median, y_values, alpha=0.5)
        plt.plot(X_median, predictions, color=line_color, linewidth=2)
        plt.title(panel_title)
        plt.xlabel('Median')
        plt.ylabel(parameter)

    plt.tight_layout()
    plt.show()

# Draw the paired-data regression panels for PM 2.5 on the 20-31 January subset.
scatter_plot_linear_regression_corrected(df=df_january_20_to_31, parameter='PM 2.5 (ug/m3)')


In [None]:
# Produce the with/without-intercept regression panels for every parameter
# in the 20-31 January subset, announcing each one before it is drawn.
for parameter in parameters:
    print(f"Creating scatter plots for {parameter}...")
    scatter_plot_linear_regression_corrected(df=df_january_20_to_31, parameter=parameter)


In [None]:
# Column-oriented accumulator for the regression summary: one independent,
# initially empty list per output column, in the order the columns should
# appear in the final table.
regression_results = {
    column: []
    for column in (
        'Parameter',
        'With Intercept Coefficient',
        'With Intercept Intercept',
        'With Intercept R2',
        'Without Intercept Coefficient',
        'Without Intercept R2',
    )
}

# Function to calculate regression results and add to the dictionary
def calculate_regression_results(df, parameter):
    """Fit reading-vs-median regressions for one parameter and record the results.

    Each row of `df` is paired with the median of `parameter` over all rows
    sharing its 'PKT' timestamp. A model with an intercept and a model
    through the origin are fitted on those pairs, and their coefficients and
    R² values are appended to the module-level `regression_results` lists.

    Args:
        df: DataFrame with a 'PKT' column and a column named `parameter`.
        parameter: Name of the column to analyse.

    Returns:
        None. `regression_results` is mutated in place.
    """
    # Per-timestamp median, joined back onto every row with that timestamp.
    per_timestamp_median = df.groupby('PKT')[parameter].median().rename('Median').reset_index()
    paired = df.merge(per_timestamp_median, on='PKT')

    # Feature matrix (n, 1) and target vector (n,).
    X_median = paired['Median'].values.reshape(-1, 1)
    y_values = paired[parameter].values

    # Model with a free intercept.
    intercept_model = LinearRegression(fit_intercept=True)
    intercept_model.fit(X_median, y_values)
    intercept_r2 = r2_score(y_values, intercept_model.predict(X_median))

    # Model constrained to pass through the origin.
    origin_model = LinearRegression(fit_intercept=False)
    origin_model.fit(X_median, y_values)
    origin_r2 = r2_score(y_values, origin_model.predict(X_median))

    # Append one value to each column list of the shared accumulator.
    summary_row = {
        'Parameter': parameter,
        'With Intercept Coefficient': intercept_model.coef_[0],
        'With Intercept Intercept': intercept_model.intercept_,
        'With Intercept R2': intercept_r2,
        'Without Intercept Coefficient': origin_model.coef_[0],
        'Without Intercept R2': origin_r2,
    }
    for column, value in summary_row.items():
        regression_results[column].append(value)

# Fill `regression_results` with one entry per parameter for the
# 20-31 January subset.
for parameter in parameters:
    calculate_regression_results(df_january_20_to_31, parameter)

# Turn the column-oriented accumulator into a table (one row per parameter).
df_regression_results = pd.DataFrame(regression_results)

# Write the summary to disk. `index=False` keeps the positional row index out
# of the file, matching the other CSV exports in this notebook.
results_csv_path = '/mnt/data/regression_results_summary.csv'
df_regression_results.to_csv(results_csv_path, index=False)


In [None]:
# Function to sanitize parameter names for use in filenames
def sanitize_filename(name):
    """Return `name` rewritten for use inside a file name.

    Spaces become underscores, parentheses are removed and every '/' is
    replaced by '_per_'. All other characters are left untouched.
    """
    substitutions = str.maketrans({" ": "_", "(": "", ")": "", "/": "_per_"})
    return name.translate(substitutions)

# Function to create and save CSV for each parameter with median values included, with filename correction
def save_parameter_data_with_median_corrected(df, parameter):
    """Write one parameter's readings, with the per-timestamp median, to a CSV file.

    The output has the columns 'PKT', 'Sensor', `parameter` and 'Median',
    where 'Median' is the median of `parameter` over all rows sharing the
    row's 'PKT' value. The file is written without the row index to
    '/mnt/data/<sanitized parameter>_data_with_median.csv'.

    Args:
        df: DataFrame with 'PKT', 'Sensor' and `parameter` columns.
        parameter: Name of the column to export.

    Returns:
        The path of the written CSV file.
    """
    # Median per timestamp, as a two-column frame ('PKT', 'Median').
    per_timestamp_median = df.groupby('PKT')[parameter].median().rename('Median').reset_index()

    # Left join keeps every original row, then only the export columns are retained.
    export_columns = ['PKT', 'Sensor', parameter, 'Median']
    export_frame = df.merge(per_timestamp_median, on='PKT', how='left')[export_columns]

    # The parameter name may contain spaces, parentheses or '/', so it is
    # sanitized before being embedded in the path.
    file_path = f'/mnt/data/{sanitize_filename(parameter)}_data_with_median.csv'
    export_frame.to_csv(file_path, index=False)
    return file_path

# Export one CSV per parameter for the 20-31 January subset and keep a
# parameter -> file path mapping.
parameter_csv_paths_corrected = {
    parameter: save_parameter_data_with_median_corrected(df_january_20_to_31, parameter)
    for parameter in parameters
}

parameter_csv_paths_corrected


In [None]:
# Function to resample sensor data to hourly for all sensors and save to CSV
def save_hourly_all_sensors_data_with_median(df, parameter):
    """Write hourly means per sensor, plus an hourly median, to a CSV file.

    The output has a 'PKT' column (hour start), one '<sensor>_<parameter>'
    column per sensor holding that sensor's hourly mean, and a 'Median'
    column. 'Median' is the median of all raw `parameter` readings in that
    hour pooled across sensors (it is not the median of the hourly means).
    The file is written without the row index to
    '/mnt/data/hourly_all_sensors_<sanitized parameter>_data_with_median.csv'.

    Args:
        df: DataFrame with a datetime 'PKT' column, a 'Sensor' column and a
            column named `parameter`.
        parameter: Name of the column to resample and export.

    Returns:
        The path of the written CSV file.
    """
    # The parameter name may contain spaces, parentheses or '/'.
    filename = sanitize_filename(parameter)

    # None means "no sensor processed yet"; this is more reliable than
    # testing `.empty` on a placeholder DataFrame.
    hourly_all_sensors_data = None

    for sensor in df['Sensor'].unique():
        # This sensor's readings, indexed by timestamp so they can be resampled.
        sensor_series = df[df['Sensor'] == sensor].set_index('PKT')[parameter]
        # Hourly mean. 'h' is the current hourly alias; the upper-case 'H'
        # spelling is deprecated in recent pandas releases.
        hourly_data = sensor_series.resample('h').mean().reset_index(name=f'{sensor}_{parameter}')
        if hourly_all_sensors_data is None:
            hourly_all_sensors_data = hourly_data
        else:
            # Outer join keeps hours that only some sensors cover.
            hourly_all_sensors_data = pd.merge(hourly_all_sensors_data, hourly_data, on='PKT', how='outer')

    # Hourly median over all raw readings, pooled across sensors.
    hourly_median = df.resample('h', on='PKT')[parameter].median().reset_index(name='Median')

    if hourly_all_sensors_data is None:
        # No sensors in `df`: export just the (empty) median table instead of
        # failing on a merge that has no 'PKT' column to join on.
        hourly_all_sensors_data_with_median = hourly_median
    else:
        hourly_all_sensors_data_with_median = pd.merge(
            hourly_all_sensors_data, hourly_median, on='PKT', how='left'
        )

    file_path = f'/mnt/data/hourly_all_sensors_{filename}_data_with_median.csv'
    hourly_all_sensors_data_with_median.to_csv(file_path, index=False)
    return file_path

# Export one hourly, all-sensor CSV per parameter for the 20-31 January
# subset and keep a parameter -> file path mapping.
hourly_all_sensors_parameter_csv_paths = {
    parameter: save_hourly_all_sensors_data_with_median(df_january_20_to_31, parameter)
    for parameter in parameters
}

hourly_all_sensors_parameter_csv_paths
