# Air Quality

## Analysis

In [None]:
import pandas as pd
import numpy as np

In [None]:
import math
import os
import glob
import datetime
import re
#import openpyxl
#import xlrd

In [None]:
#Read the station channels
list_of_columns = pd.read_csv("data/nb_air_quality_station_channels.csv")

list_of_columns.head(5)

In [None]:
#Create List of columns

def create_list_of_columns(station_id):
    """Build the ordered column names for one station's raw rows.

    The result starts with ``STATION_ID`` and ``DATE_TIME`` and is followed,
    for each channel number 1..N (N = number of rows the channel table
    ``list_of_columns`` holds for the station), by the channel's ``NAME`` and
    ``<NAME>_FLAG``.

    Parameters
    ----------
    station_id
        Value compared against the ``STATION_ID`` column of ``list_of_columns``.

    Returns
    -------
    list
        Column names. A channel number with no row in the table contributes
        the placeholder ``"Error_loading_<station_id>"`` (and its ``_FLAG``).
    """
    # Filter the channel table once per station instead of re-filtering the
    # whole table several times for every channel.
    station_channels = list_of_columns[list_of_columns["STATION_ID"] == station_id]

    cols = ["STATION_ID", "DATE_TIME"]
    for channel in range(1, len(station_channels) + 1):
        channel_rows = station_channels[station_channels["CHANNEL"] == channel]
        if len(channel_rows) > 0:
            analyte_name = channel_rows["NAME"].values[0]
        else:
            # Channel numbers are not contiguous for this station.
            analyte_name = "Error_loading_" + str(station_id)
        cols.append(analyte_name)
        cols.append(str(analyte_name) + "_FLAG")

    return cols



In [None]:
def drop_empty_columns(current_dataframe):
    """Drop, in place, every column that contains only missing values.

    Columns are dropped by label. The previous implementation passed column
    positions to ``drop``, which is only correct when the labels happen to be
    0..n-1 (as with ``header=None`` reads).

    Parameters
    ----------
    current_dataframe : pandas.DataFrame
        Frame to clean; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    all_missing = current_dataframe.isna().all(axis=0)
    empty_labels = [
        label
        for label, is_empty in zip(current_dataframe.columns, all_missing)
        if is_empty
    ]
    current_dataframe.drop(columns=empty_labels, inplace=True)
    return current_dataframe

In [None]:
#Read CSV files from a folder

# Build the folder path with os.path.join so the notebook also runs where the
# path separator is not "\\".
current_directory = os.path.join(os.getcwd(), "raw_data")
dataframes = []

# Base names already visited; a file whose base name was seen in another
# sub-folder is skipped.
all_files = set()

# Loop-invariant filters: substrings a file name must / must not contain.
format_matches = [".lsi", ".csv"]
exclude_matches = []

for path, subdirs, files in os.walk(current_directory):
    for name in files:
        file_name = os.path.join(path, name)

        if name in all_files:
            continue
        all_files.add(name)

        if not any(x in name for x in format_matches):
            print("Non Excel File: " + file_name)
            continue
        if any(y in file_name for y in exclude_matches):
            print("Files Excluded : " + file_name)
            continue

        try:
            current_dataframe = pd.read_csv(file_name, low_memory=False, header=None, sep=",")

            # Column 0 holds the station id. Group by the scalar label (not a
            # one-element list) so each key is the bare station id rather than
            # a 1-tuple on newer pandas versions.
            for station_id, station_rows in current_dataframe.groupby(0):
                temp_dataframe = drop_empty_columns(station_rows.copy())
                temp_dataframe.columns = create_list_of_columns(station_id)
                dataframes.append(temp_dataframe)
        except Exception as e:
            # Best effort: report the problem and carry on with the next file.
            print("Error reading file: " + file_name)
            print(e)

all_files = []


In [None]:
dataframes[0].head(1)

In [None]:
dataframes[1].head(1)

In [None]:
dataframes[3].head(1)

In [None]:
dataframes[4].head(1)

In [None]:
dataframes[4].info()

In [None]:
#Check if dataframes have the same columns

# Compare every frame's column set with the first frame's column set.
columns_match = all(
    set(df.columns) == set(dataframes[0].columns) for df in dataframes
)
print('Datasets have the same columns' if columns_match else 'Datasets do not have the same columns')


In [None]:
# Collect the column names that appear in some dataframes but not in others,
# so that those columns can later be created for every dataframe.

columns = []

for source_df in dataframes:
    for other_df in dataframes:
        for column_name in source_df.columns:
            missing_elsewhere = column_name not in other_df.columns
            if missing_elsewhere and column_name not in columns:
                columns.append(column_name)

print(columns)

In [None]:
#Combine all the dataframes into one

dataset_nb_air_quality_raw = pd.concat(dataframes)
dataset_nb_air_quality_raw.tail(5)

In [None]:
#Export Combined Dataset to a CSV

dataset_nb_air_quality_raw.to_csv("data/nb_air_quality_raw.csv", sep=',',index=False,encoding='utf-8-sig')

#Shape of raw data
dataset_nb_air_quality_raw.shape

In [None]:
#Create a copy of the dataset

nb_air_quality_p1 = dataset_nb_air_quality_raw.copy()
nb_air_quality_p1.head(5)

In [None]:
# Replace -9999 with NaN
nb_air_quality_p1 = nb_air_quality_p1.replace(-9999,np.nan)

In [None]:
def find_datetime_format(dt_str):
    """Return the first known ``strptime`` format that parses ``dt_str``.

    Candidates are tried in a fixed order (date-and-time layouts first, then
    date-only layouts). Returns ``None`` when no candidate matches.
    """
    candidate_formats = (
        '%Y/%m/%d %I:%M:%S %p',
        '%Y-%m-%d %I:%M:%S %p',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d/%m/%Y %I:%M:%S %p',
        '%d-%m-%Y %I:%M:%S %p',
        '%d/%m/%Y %H:%M:%S',
        '%d-%m-%Y %H:%M:%S',
        '%Y/%m/%d',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%d-%m-%Y',
    )

    def parses_with(pattern):
        # A ValueError from strptime means "this layout does not fit".
        try:
            datetime.datetime.strptime(dt_str, pattern)
        except ValueError:
            return False
        return True

    return next(
        (pattern for pattern in candidate_formats if parses_with(pattern)),
        None,
    )

# Example usage: detect the layout of one sample timestamp.
dt_str = "2019/01/01 5:00:00 AM"
format_found = find_datetime_format(dt_str)
print(f"Format found: {format_found}" if format_found else "Format not found")


In [None]:
#nb_air_quality_p1["DATE_TIME"] = nb_air_quality["DATE_TIME"].replace('24:00','00:00' , regex=True)

# Parse the raw timestamps once (12-hour clock with AM/PM) and derive the
# calendar date and year from the parsed values.
parsed_date_time = pd.to_datetime(nb_air_quality_p1["DATE_TIME"], format='%Y/%m/%d %I:%M:%S %p')

nb_air_quality_p1["DATE_TIME"] = parsed_date_time
nb_air_quality_p1["DATE"] = parsed_date_time.dt.date
nb_air_quality_p1["YEAR"] = parsed_date_time.dt.year

In [None]:
nb_air_quality_p1.head(5)

#### Station information

In [None]:
#Import data

station_information = pd.read_csv("data/nb_air_quality_station_info.csv")

# Only these station attributes are attached to the measurements.
station_columns = ["STATION_ID","STATION_NAME","LATITUDE","LONGITUDE","STATION_NAPS_ID","ORGANIZATION","CITY"]

# Left join: every measurement row is kept, with or without station details.
nb_air_quality_p1 = nb_air_quality_p1.merge(
    station_information[station_columns],
    how='left',
    left_on=['STATION_ID'],
    right_on=['STATION_ID'],
)

nb_air_quality_p1.head(5)

#### Units Information

In [None]:
def add_unit_column(unitName,station_id):
    """Describe one analyte channel of a station as a single text value.

    Looks up the row of ``list_of_columns`` whose ``STATION_ID`` equals
    ``station_id`` and whose ``NAME`` equals ``unitName``.

    Parameters
    ----------
    unitName
        Channel name to look up (e.g. ``"SO2"``).
    station_id
        Station identifier to look up.

    Returns
    -------
    str or None
        ``"Unit: ..., Average Type: ..., Number Format: ..., Low Range: ...,
        High Range: ..., Station State: ..."`` built from the first matching
        row, or ``None`` when the station has no such channel.
    """
    channel_series = list_of_columns[(list_of_columns["STATION_ID"]==station_id) & (list_of_columns["NAME"] == unitName)]

    # An empty selection is the "no such channel" case. Testing emptiness
    # directly (instead of math.isnan on the station id) also works when the
    # station ids are strings.
    if channel_series.empty:
        return None

    labelled_fields = (
        ("Unit", "UNITS_EUG"),
        ("Average Type", "AVERAGE_TYPE"),
        ("Number Format", "NUMERIC_FORMAT"),
        ("Low Range", "LOW_RANGE"),
        ("High Range", "HIGH_RANGE"),
        ("Station State", "STATE"),
    )
    return ", ".join(
        label + ": " + str(channel_series[column].values[0])
        for label, column in labelled_fields
    )

In [None]:
# Info column to create -> channel name it describes. Insertion order is the
# order in which the columns are added to each station's frame.
info_columns = {
    "SO2_INFO": "SO2",
    "O3_INFO": "O3",
    "CO_INFO": "CO",
    "TRS_INFO": "TRS",
    "NO2_INFO": "NO2",
    "NO_INFO": "NO",
    "NOX_INFO": "NOX",
    #"H2S_INFO": "H2S",
    #"AQHI_INFO": "AQHI",
    #"AQI_INFO": "AQI",
    "PM_25_BAM_INFO": "PM25 BAM",
    "PM_25_API_INFO": "PM_2.5API",
    #"PM_10_API_INFO": "PM_10API",
    #"PM_25_TEOM_INFO": "PM25 TEOM",
    "WIND_DIR_INFO": "Wind Dir",
    "WIND_SPEED_INFO": "Wind Speed",
}

dataframes_with_info = []

# Group by the bare column name (not a one-element list) so each group key is
# the scalar station id that add_unit_column compares against.
nb_air_quality_group = nb_air_quality_p1.groupby("STATION_ID")

for station_id, station_rows in nb_air_quality_group:
    temp_dataframe = station_rows.copy()

    # The same description is written to every row of the station.
    for info_column, channel_name in info_columns.items():
        temp_dataframe[info_column] = add_unit_column(channel_name, station_id)

    dataframes_with_info.append(temp_dataframe)

In [None]:
dataframes_with_info[1].head(5)

In [None]:

nb_air_quality_p1 = pd.concat(dataframes_with_info)

In [None]:
# Store station ids as text, left-padded with "0" to at least three characters.
nb_air_quality_p1['STATION_ID'] = nb_air_quality_p1['STATION_ID'].astype(str).str.rjust(3,"0")

In [None]:
nb_air_quality_p1.shape

In [None]:
#Check for null values
pd.set_option('display.max_rows',None)
nb_air_quality_p1.isna().sum()

In [None]:
pd.reset_option('display.max_rows')

#### Create another copy of the dataset for further pre-processing

Some processing steps are slow. Working on a copy of the dataset means that, while developing the later steps, we can restart from this point instead of re-running the whole notebook.

In [None]:
nb_air_quality_p2 = nb_air_quality_p1.copy()

#### Check and remove null columns

In [None]:
def drop_empty_columns_dataset(dataset):
    """Print and drop, in place, every column made up only of missing values.

    Returns the same ``dataset`` object after the columns were removed.
    """
    empty_columns = [
        column_name
        for column_name in dataset.columns
        if dataset[column_name].isna().sum() == len(dataset[column_name])
    ]

    # Report which columns are about to be removed.
    for column_name in empty_columns:
        print(column_name)

    dataset.drop(columns=empty_columns, inplace=True)
    return dataset

In [None]:
nb_air_quality_p2 = drop_empty_columns_dataset(nb_air_quality_p2) #finish this later when we have whole data

In [None]:
nb_air_quality_p2.columns

In [None]:
cols = ["SO2", "O3", "CO", "TRS", "NO2", "NO", "NOX", "PM25 BAM", "PM_2.5API", "Wind Dir", "Wind Speed"]

nb_air_quality_p2[cols] = nb_air_quality_p2[cols].astype(str)

##### Remove unit information field value where there is no analyte value

In [None]:
#Import flag data 

status_flags = pd.read_csv("data/nb_air_quality_status_code.csv")

# Keep only the status codes whose DATA_STATUS_VALID value is 1.
is_valid_status = status_flags["DATA_STATUS_VALID"] == 1
get_flag = status_flags[is_valid_status]
match_status_list = get_flag["STATUS_CODE"].tolist()

print(match_status_list)

In [None]:
def remove_unit_when_flag(unitVal, flagVal):
    """Mark a measurement whose flag is not in ``match_status_list``.

    Returns ``unitVal`` unchanged when ``flagVal`` is one of the accepted
    status codes. Otherwise returns the value wrapped in parentheses
    (``"(<value>)"``), or NaN when the value itself is null.
    """
    if flagVal in match_status_list:
        return unitVal

    if pd.isnull(unitVal):
        return np.nan

    return "(" + str(unitVal) + ")"

remove_unit_when_flag_vec = np.vectorize(remove_unit_when_flag)

In [None]:
# Every analyte column has a companion "<name>_FLAG" column; wrap the values
# whose flag is not an accepted status code in parentheses.
for analyte in ["SO2", "O3", "CO", "TRS", "NO2", "NO", "NOX", "PM25 BAM", "PM_2.5API", "Wind Dir", "Wind Speed"]:
    nb_air_quality_p2[analyte] = remove_unit_when_flag_vec(
        nb_air_quality_p2[analyte],
        nb_air_quality_p2[analyte + "_FLAG"],
    )

In [None]:
def add_flagged_value_to_info(unit_val,info_val):
    """Append a flagged measurement to its info text.

    A flagged measurement is a value whose text form starts with ``"("`` and
    has the shape ``"(<value>)"``.

    Parameters
    ----------
    unit_val
        Measurement value, possibly wrapped in parentheses.
    info_val
        Info text for the channel; may be null.

    Returns
    -------
    ``info_val + ", Flagged Value: <value>"`` when the measurement is flagged
    and ``info_val`` is not null; otherwise ``info_val`` unchanged.
    """
    text = str(unit_val)
    # Raw-string pattern: "\(" is not a valid escape in a normal string literal.
    flagged = re.search(r'\((.*?)\)', text) if text.startswith("(") else None

    # Not flagged, no closing parenthesis, or nothing to append to.
    if flagged is None or pd.isnull(info_val):
        return info_val

    return info_val + ", Flagged Value: " + flagged.group(1)

add_flagged_value_to_info_vec = np.vectorize(add_flagged_value_to_info)

In [None]:
# Analyte column -> info column that receives the flagged value.
analyte_info_columns = [
    ("SO2", "SO2_INFO"),
    ("O3", "O3_INFO"),
    ("CO", "CO_INFO"),
    ("TRS", "TRS_INFO"),
    ("NO2", "NO2_INFO"),
    ("NO", "NO_INFO"),
    ("NOX", "NOX_INFO"),
    ("PM25 BAM", "PM_25_BAM_INFO"),
    ("PM_2.5API", "PM_25_API_INFO"),
    ("Wind Dir", "WIND_DIR_INFO"),
    ("Wind Speed", "WIND_SPEED_INFO"),
]

for analyte, info_column in analyte_info_columns:
    nb_air_quality_p2[info_column] = add_flagged_value_to_info_vec(
        nb_air_quality_p2[analyte],
        nb_air_quality_p2[info_column],
    )

In [None]:
def remove_flagged_values(unit_val):
    """Return NaN for a value whose text form starts with "(", else the value."""
    return np.nan if str(unit_val).startswith("(") else unit_val

remove_flagged_values_vec = np.vectorize(remove_flagged_values)

In [None]:
#vec_re = np.vectorize(remove_flagged_values)
#nb_air_quality_p2["SO2"] = vec_re(nb_air_quality_p2["SO2"])

# Blank out (NaN) every parenthesised, i.e. flagged, value in the analyte columns.
for analyte in ["SO2", "O3", "CO", "TRS", "NO2", "NO", "NOX", "PM25 BAM", "PM_2.5API", "Wind Dir", "Wind Speed"]:
    nb_air_quality_p2[analyte] = remove_flagged_values_vec(nb_air_quality_p2[analyte])

In [None]:
nb_air_quality_p3 = nb_air_quality_p2.copy()

In [None]:
cols = ["SO2", "O3", "CO", "TRS", "NO2", "NO", "NOX", "PM25 BAM", "PM_2.5API", "Wind Dir", "Wind Speed"]

nb_air_quality_p3[cols] = nb_air_quality_p3[cols].astype(float)

In [None]:
def _is_blank_value(value):
    """Return True for an empty string or a null (None/NaN) value."""
    if isinstance(value, str):
        return value == ""
    return bool(pd.isnull(value))


def select_one_variale(var_secondary, var_primary,var_secondary_source,var_primary_source):
    """Pick the primary value when present, otherwise the secondary one.

    Parameters
    ----------
    var_secondary, var_primary
        Candidate values; an empty string or a null value counts as missing.
        Non-empty strings are accepted (the previous ``math.isnan`` check
        raised ``TypeError`` for them).
    var_secondary_source, var_primary_source : str
        Source labels appended to the chosen value.

    Returns
    -------
    str
        ``"<value>(<source>)"`` for the chosen value, or ``""`` when both
        values are missing.
    """
    if not _is_blank_value(var_primary):
        return str(var_primary) + "("+var_primary_source +")"
    if not _is_blank_value(var_secondary):
        return str(var_secondary) + "("+var_secondary_source +")"
    return ""

select_one_variale_vec = np.vectorize(select_one_variale)

In [None]:
nb_air_quality_p3["PM_25"] = select_one_variale_vec(nb_air_quality_p3["PM25 BAM"],nb_air_quality_p3["PM_2.5API"],"PM_25_BAM","PM_25_API")

In [None]:
#Separate source columns

def separate_source_columns(raw_value):
    """Extract the source label from a ``"<value>(<source>)"`` text.

    Parameters
    ----------
    raw_value
        Text such as ``"12.5(PM_25_API)"``; may be null.

    Returns
    -------
    str
        The text inside the first pair of parentheses, or ``""`` when the
        value is null (None/NaN) or has no parenthesised part.
    """
    # pd.isnull covers both None and NaN; the previous extra math.isnan check
    # raised TypeError for None.
    if pd.isnull(raw_value):
        return ""

    source_match = re.search(r'\((.*?)\)', str(raw_value))
    return source_match.group(1) if source_match else ""

separate_source_columns_vec = np.vectorize(separate_source_columns)

In [None]:
nb_air_quality_p3["PM_25_SOURCE"] = separate_source_columns_vec(nb_air_quality_p3["PM_25"])
#nb_air_quality_p3["PM_25_SOURCE"] = ""

In [None]:
#Remove source data from analytes

def remove_source_name(raw_value):
    """Strip every parenthesised part from a ``"<value>(<source>)"`` text.

    Parameters
    ----------
    raw_value
        Text such as ``"12.5(PM_25_API)"``; may be null. Non-string values
        are converted with ``str`` before cleaning.

    Returns
    -------
    str
        The text without its ``"(...)"`` parts, or ``""`` for a null value.
    """
    if pd.isnull(raw_value):
        return ""

    # Raw-string pattern: "\(" is not a valid escape in a normal string literal.
    return re.sub(r'\((.*?)\)', '', str(raw_value))

remove_source_name_vec = np.vectorize(remove_source_name)

In [None]:
nb_air_quality_p3["PM_25"] = remove_source_name_vec(nb_air_quality_p3["PM_25"])

In [None]:
#Create PM flag columns

def create_one_flag(flag_source,flag_source_to_test,flag_primary,flag_secondary):
    """Return ``flag_primary`` when the two source labels are equal, else ``flag_secondary``."""
    return flag_primary if flag_source == flag_source_to_test else flag_secondary

create_one_flag_vec = np.vectorize(create_one_flag)

In [None]:
nb_air_quality_p3["PM_25_FLAG"] = create_one_flag_vec(nb_air_quality_p3["PM_25_SOURCE"],"PM_25_API",nb_air_quality_p3["PM_2.5API_FLAG"],nb_air_quality_p3["PM25 BAM_FLAG"])

In [None]:
nb_air_quality_p3.columns

In [None]:
nb_air_quality_p4 = nb_air_quality_p3.copy()

In [None]:
#Rename columns to remove whitespace
nb_air_quality_p4.rename(columns={'PM25 BAM': 'PM_25_BAM', 'PM25 BAM_FLAG': 'PM_25_BAM_FLAG', 'PM_2.5API': 'PM_25_API', 'PM_2.5API_FLAG': 'PM_25_API_FLAG', 'Wind Dir': 'WIND_DIR', 'Wind Dir_FLAG': 'WIND_DIR_FLAG', 'Wind Speed': 'WIND_SPEED', 'Wind Speed_FLAG': 'WIND_SPEED_FLAG'}, inplace=True)

In [None]:
#Manually drop the `_FLAG` columns that do not have related analyte information

nb_air_quality_p4 = nb_air_quality_p4.drop(['AQHI_FLAG','AQI_FLAG','StDv_FLAG', 'PM25 TEOM_FLAG', 'PM10 TEOM_FLAG', 'GRIMM PM 1.0_FLAG', 'GRIMM PM 2.5_FLAG', 'GRIMM PM 10_FLAG', 'TEMP_FLAG', 'Rain_FLAG', 'Bar Press_FLAG', 'RH_FLAG', 'PM_10API_FLAG', 'TRS_API_FLAG', 'SO2_Run24hr_FLAG', 'H2S_FLAG', 'Battery_FLAG', 'Rain Intensity_FLAG', 'Rain Duration_FLAG', 'Wind Dir V_FLAG', 'WD_LS_FLAG', 'WS_LS_FLAG'], axis=1)

In [None]:
nb_air_quality_p4.columns

In [None]:
# Flag columns data type to int

# Flag columns to coerce. The three statements below are order-dependent:
# missing flags are filled with the placeholder -1 so the int32 cast can
# succeed, and afterwards every -1 is turned back into NaN.
# NOTE(review): putting NaN back presumably upcasts the columns to float
# again, and a genuine status code of -1 (if one exists) would be lost too —
# confirm against the status-code table.
cols = ["SO2_FLAG", "O3_FLAG", "NO2_FLAG", "NO_FLAG", "NOX_FLAG", "PM_25_BAM_FLAG", "PM_25_API_FLAG", "TRS_FLAG", "CO_FLAG", "WIND_DIR_FLAG", "WIND_SPEED_FLAG"]

nb_air_quality_p4[cols] = nb_air_quality_p4[cols].fillna(-1)
nb_air_quality_p4[cols] = nb_air_quality_p4[cols].astype('int32')
nb_air_quality_p4[cols] = nb_air_quality_p4[cols].replace(-1, np.nan, regex=False)

In [None]:
#Empty String to NaN

cols = ['PM_25','PM_25_SOURCE']
nb_air_quality_p4[cols] = nb_air_quality_p4[cols].replace("",np.nan)

nb_air_quality_p4['PM_25'] = nb_air_quality_p4['PM_25'].astype(float)

In [None]:
#Round the columns to 1 decimal place

#cols = ['SO2', 'O3','NO2','NO', 'NOX','PM_25_BAM','TRS', 'WIND_DIR','WIND_SPEED', 'PM_25_API', 'CO']

#nb_air_quality_p4[cols] = nb_air_quality_p4[cols].round(1)

In [None]:
#Rearrange columns

cols = ['STATION_ID', 'DATE_TIME','DATE', 'YEAR', 'STATION_NAME', 'LATITUDE', 'LONGITUDE', 'STATION_NAPS_ID', 'ORGANIZATION', 'CITY', 'SO2', 'SO2_FLAG', 'O3', 'O3_FLAG', 'NO2', 'NO2_FLAG', 'NO', 'NO_FLAG', 'NOX', 'NOX_FLAG','PM_25','PM_25_FLAG', 'PM_25_SOURCE', 'TRS', 'TRS_FLAG', 'CO', 'CO_FLAG', 'WIND_DIR', 'WIND_DIR_FLAG', 'WIND_SPEED', 'WIND_SPEED_FLAG']

nb_air_quality_p4 = nb_air_quality_p4[cols]

##### Create a final copy of processed data

In [None]:
nb_air_quality = nb_air_quality_p4.copy()

In [None]:
#Export Combined Dataset to a CSV

nb_air_quality.to_csv("data/nb_air_quality.csv", sep=',',index=False,encoding='utf-8-sig')

#Shape of the processed data
nb_air_quality.shape

In [None]:
nb_air_quality.info()

In [None]:
nb_air_quality.columns

#### Visualizations

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### Null Values 

In [None]:

#Plot null values
def plot_null_values(dataset,group_by):
    """Draw one bar chart per group showing the null count of each column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to inspect.
    group_by : str
        Column whose distinct values define the groups (one subplot each).

    Only columns with at least one null value in a group appear in that
    group's chart. The figure is laid out three subplots per row and shown
    with ``plt.show()``; nothing is returned.
    """
    # Group by the bare column name (not a one-element list) so each group key
    # is a scalar that can be used directly as the subplot title.
    grouped = dataset.groupby(group_by)

    totalCols = 3
    totalRows = math.ceil(len(grouped) / totalCols)

    fig = plt.figure(figsize=((totalCols+3)*3,(totalRows+1)*5))
    plt.subplots_adjust(wspace=0.2, hspace=0.7)

    for index, (group_name, group_rows) in enumerate(grouped):
        # Null count per column, computed once for the whole group.
        null_counts = group_rows.isna().sum()
        null_counts = null_counts[null_counts > 0]

        # Keep the axes in a local variable instead of writing it to globals().
        ax = fig.add_subplot(totalRows, totalCols, index + 1)
        ax.set_title(str(group_name).upper())
        ax.bar(list(null_counts.index), list(null_counts.values))
        ax.set(xlabel=None)
        ax.tick_params(axis='x', labelrotation=90)

    plt.show()


In [None]:
plot_null_values(nb_air_quality,'STATION_NAME')

#### Analyse variables 

In [None]:

#This is just an example of a line graph, visualization can be better
def linechart_of_categories(dataset,group_by,time_column,value_column):
    """Draw one line chart of ``value_column`` over time for each group.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to plot.
    group_by : str
        Column whose distinct values define the groups (one subplot each).
    time_column : str
        Column used as the x axis (set as the index before plotting).
    value_column : str
        Column plotted on the y axis.

    Groups whose ``value_column`` is entirely null get no subplot. The figure
    is laid out three subplots per row and shown with ``plt.show()``; nothing
    is returned.
    """
    # Group by the bare column name (not a one-element list) so each group key
    # is a scalar that can be used directly as the subplot title.
    grouped_values = dataset.set_index(time_column).groupby(group_by)[value_column]

    # Only groups with at least one non-null value are plotted.
    plottable = [
        (group_name, series)
        for group_name, series in grouped_values
        if series.notna().any()
    ]

    totalCols = 3
    totalRows = math.ceil(len(plottable) / totalCols)

    fig = plt.figure(figsize=((totalCols+3)*3,(totalRows+1)*5))
    plt.subplots_adjust(wspace=0.2, hspace=0.6)

    for position, (group_name, series) in enumerate(plottable, start=1):
        # Keep the axes in a local variable instead of writing it to globals(),
        # and plot onto it explicitly rather than relying on the current axes.
        ax = fig.add_subplot(totalRows, totalCols, position)
        ax.set_title(str(group_name).upper())
        series.plot(ax=ax)
        ax.set(xlabel=None)
        ax.tick_params(axis='x', labelrotation=90)

    plt.show()


In [None]:
linechart_of_categories(nb_air_quality,'STATION_NAME','DATE','PM_25')

In [None]:
#Display unique values

def unique_values__or_count(listOfColumns,options,dataset):
    """Print a summary of each listed column of ``dataset``.

    ``options == "unique"`` prints the distinct values of every column;
    ``options == "count"`` prints the value counts. Any other option prints
    nothing. Returns ``None``.
    """
    for column_name in listOfColumns:
        if options == "unique":
            distinct_values = dataset[column_name].unique()
            print("unique_values " + column_name)
            print(distinct_values)
            print("------------------------")
        elif options == "count":
            distribution = dataset[column_name].value_counts()
            print("-----------" + column_name + "------------")
            print(distribution)
            print("-----------------------")

In [None]:
unique_values__or_count(['SO2_FLAG', 'O3_FLAG', 'NO2_FLAG', 'NO_FLAG', 'NOX_FLAG', 'PM_25_FLAG', 'TRS_FLAG', 'CO_FLAG', 'WIND_DIR_FLAG', 'WIND_SPEED_FLAG'],"count",nb_air_quality)

#83,85,89,78,86,27,77,22

#### Data Validation

In [None]:
#Import cleaned data 

nb_air_quality_validate = pd.read_csv("data/nb_air_quality.csv", low_memory=False)

In [None]:
Col_name_to_validate = "NO2"
flag_name = Col_name_to_validate+"_FLAG"

In [None]:
# The exported CSV stores timestamps as "YYYY-MM-DD HH:MM:SS". Only the
# explicit format is passed: infer_datetime_format is deprecated in pandas and
# is redundant when a format is given.
nb_air_quality_validate["DATE_TIME"] = pd.to_datetime(nb_air_quality_validate["DATE_TIME"], format='%Y-%m-%d %H:%M:%S')

# Keep only the key columns plus the analyte under validation and its flag.
nb_acid_rain_validate = nb_air_quality_validate[["STATION_ID", "DATE_TIME", Col_name_to_validate,flag_name]].copy()

nb_acid_rain_validate.head(5)

In [None]:
#Import original data

nb_air_quality_original = pd.read_csv("raw_data/28_04_2023 09_43132.lsi", low_memory=False, header=None,sep=",")


In [None]:
nb_air_quality_original.columns

In [None]:
Col_index_in_original = 6

#Name the station, timestamp, analyte and flag columns of the raw file (it is read without a header row)
nb_air_quality_original.rename(columns={0: 'STATION_ID', 1: 'DATE_TIME', Col_index_in_original: Col_name_to_validate, Col_index_in_original+1: flag_name}, inplace=True)

In [None]:
def remove_braces(unit_val):
    """Return the text inside the leading parentheses of a flagged value.

    Parameters
    ----------
    unit_val
        Value to inspect; anything whose text form starts with ``"("`` is
        treated as a parenthesised value.

    Returns
    -------
    The text between the first ``"("`` and the next ``")"`` when the value
    starts with ``"("`` and has a closing parenthesis; otherwise ``unit_val``
    unchanged.
    """
    text = str(unit_val)
    if text.startswith("("):
        # Raw-string pattern: "\(" is not a valid escape in a normal string
        # literal. A missing ")" falls through to the unchanged value instead
        # of raising IndexError.
        inner = re.search(r'\((.*?)\)', text)
        if inner is not None:
            return inner.group(1)

    return unit_val

In [None]:
# Replace -9999 with NaN
nb_air_quality_original = nb_air_quality_original.replace(-9999,np.nan)

# Parse the timestamps with the same layout used for the combined raw data
# earlier in this notebook (12-hour clock with AM/PM). The previous
# '%Y-%m-%d %H:%M:%S' format does not match that layout and only worked
# through the deprecated infer_datetime_format fallback.
# NOTE(review): assumes this raw file uses the same timestamp layout as the
# other files under raw_data — confirm.
nb_air_quality_original["DATE_TIME"] = pd.to_datetime(nb_air_quality_original["DATE_TIME"], format='%Y/%m/%d %I:%M:%S %p')

#Round to 1 decimal
nb_air_quality_original[Col_name_to_validate] = nb_air_quality_original[Col_name_to_validate].round(1)

#Trim data to validate an analyte
nb_air_quality_original = nb_air_quality_original[["STATION_ID", "DATE_TIME", Col_name_to_validate,flag_name]].copy()

#Wrap values whose flag is not an accepted status code in parentheses
nb_air_quality_original[Col_name_to_validate] = nb_air_quality_original.apply(lambda x: remove_unit_when_flag(x[Col_name_to_validate],x[flag_name]),axis=1)

#Replace the parenthesised (flagged) values with NaN
nb_air_quality_original[Col_name_to_validate] = nb_air_quality_original.apply(lambda x: remove_flagged_values(x[Col_name_to_validate]),axis=1)

In [None]:
nb_air_quality_original.head(5)

In [None]:
# Right join on all four columns: every row of the original file is kept and
# 'Exist' records whether an identical row is present in the processed data.
merge_keys = ["STATION_ID", "DATE_TIME", Col_name_to_validate, flag_name]
nb_air_quality_validate_results = nb_acid_rain_validate.merge(
    nb_air_quality_original,
    on=merge_keys,
    how='right',
    indicator='Exist',
)
nb_air_quality_validate_results['Exist'] = nb_air_quality_validate_results['Exist'] == 'both'

In [None]:
unique_values__or_count(['Exist'],"count",nb_air_quality_validate_results)

In [None]:
list_of_missing_rows = nb_air_quality_validate_results[(nb_air_quality_validate_results['Exist'] == False)].copy()

list_of_missing_rows.head(5)