# Recreational Beach Monitoring Program

## Analysis

In [None]:
import pandas as pd
import numpy as np

In [None]:
import math
import os
import glob
import datetime
import re
#import openpyxl
#import xlrd

In [None]:
# Read every CSV file found under ./raw_data (searched recursively) into `dataframes`.

# Build the folder path with os.path.join so the notebook also runs on
# non-Windows systems (a hard-coded "\\raw_data\\" only resolves on Windows).
current_directory = os.path.join(os.getcwd(), "raw_data")
dataframes = []

all_files = []

# A file is loaded when its name contains one of `format_matches` and its
# full path contains none of `exclue_matches`.
format_matches = [".csv"]
exclue_matches = []

for path, subdirs, files in os.walk(current_directory):
    for name in files:
        file_name = os.path.join(path, name)
        # A file name already seen in another folder is skipped silently.
        if name not in all_files:
            if not any(x in name for x in format_matches):
                print("Non CSV File: " + file_name)
            elif any(y in file_name for y in exclue_matches):
                print("Files Excluded : " + file_name)
            else:
                try:
                    current_dataframe = pd.read_csv(file_name, low_memory=False,sep=",")
                    dataframes.append(current_dataframe)
                except Exception as e:
                    # Best effort: report the unreadable file and keep going.
                    print("Error reading file: " + file_name)
                    print(e)

        all_files.append(name)
all_files = []

# os.walk yields nothing for a missing folder, so make that case visible here
# instead of failing later on dataframes[0].
if not dataframes:
    print("No CSV files were loaded from: " + current_directory)


In [None]:
dataframes[0].head(1)

In [None]:
# Report whether every loaded dataframe has the same set of column names
# as the first one (column order is ignored).

print(
    'Datasets have the same columns'
    if all(set(frame.columns) == set(dataframes[0].columns) for frame in dataframes)
    else 'Datasets do not have the same columns'
)



In [None]:
# Collect the column names that exist in at least one dataframe but are missing
# from at least one other, so those columns can be created for all dataframes.

columns = []

for frame in dataframes:
    for other in dataframes:
        for column_name in frame.columns:
            # Record a name once, the first time some other frame lacks it.
            if column_name not in other.columns and column_name not in columns:
                columns.append(column_name)

print(columns)

In [None]:
#Combine all the dataframes into one
# The frames are stacked row-wise in the order they were loaded; ignore_index is
# not passed, so each frame keeps its own index labels in the result.

recreational_beach_monitoring_raw = pd.concat(dataframes)
# Show the last five rows of the combined data.
recreational_beach_monitoring_raw.tail(5)

In [None]:
#Export Combined Dataset to a CSV
# Written without the index column and with a UTF-8 byte-order mark (utf-8-sig).

recreational_beach_monitoring_raw.to_csv("data/recreational_beach_monitoring_raw.csv", sep=',',index=False,encoding='utf-8-sig')

#Shape of raw data (rows, columns)
recreational_beach_monitoring_raw.shape

In [None]:
#Create a copy of the dataset

recreational_beach_monitoring_p1 = recreational_beach_monitoring_raw.copy()
recreational_beach_monitoring_p1.head(5)

In [None]:
# Replace -9999 with NaN
#nb_air_quality_p1 = nb_air_quality_p1.replace(-9999,np.nan)

In [None]:
def find_datetime_format(dt_str):
    """Return the first known strptime format that parses ``dt_str``.

    The candidate formats are tried in the order listed below (date-time
    formats first, then date-only formats). Returns the matching format
    string, or None when no candidate parses the text.
    """
    formats_to_check = (
        '%Y/%m/%d %I:%M:%S %p',
        '%Y-%m-%d %I:%M:%S %p',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d/%m/%Y %I:%M:%S %p',
        '%d-%m-%Y %I:%M:%S %p',
        '%d/%m/%Y %H:%M:%S',
        '%d-%m-%Y %H:%M:%S',
        '%Y/%m/%d',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%d-%m-%Y',
    )

    def _parses_with(fmt):
        # strptime raises ValueError when the text does not fit the format.
        try:
            datetime.datetime.strptime(dt_str, fmt)
        except ValueError:
            return False
        return True

    return next((fmt for fmt in formats_to_check if _parses_with(fmt)), None)

# Example usage
dt_str = "2004/03/04"
format_found = find_datetime_format(dt_str)
print(f"Format found: {format_found}" if format_found else "Format not found")


In [None]:
#nb_air_quality_p1["DATE_TIME"] = nb_air_quality["DATE_TIME"].replace('24:00','00:00' , regex=True)

# Parse the FromDate column as dates written year/month/day.
recreational_beach_monitoring_p1["FromDate"] = pd.to_datetime(recreational_beach_monitoring_p1["FromDate"],format='%Y/%m/%d')

# Derive the calendar year of each row from the parsed date.
recreational_beach_monitoring_p1["YEAR"] = recreational_beach_monitoring_p1["FromDate"].dt.year

In [None]:
recreational_beach_monitoring_p1.head(5)

#### Station information

In [None]:
#Rename two columns before attaching station information
# 'Station' becomes 'STATION_NAME' (the key used for the station merge) and
# 'FromDate' becomes 'DATE'; the rename is applied in place.
recreational_beach_monitoring_p1.rename(columns={'Station': 'STATION_NAME', 'FromDate': 'DATE'}, inplace=True)

#Import the station/site reference data

station_information = pd.read_csv("data/recreational-beach-monitoring-stations-sites.csv")

In [None]:
# Convert DMS (degrees, minutes, seconds) to DD (decimal degrees)
def dms2dd(degrees, minutes, seconds, direction):
    """Return decimal degrees for a degrees/minutes/seconds reading.

    ``degrees``, ``minutes`` and ``seconds`` may be numbers or numeric text.
    The result is negated when ``direction`` is exactly 'S' or 'W'.
    """
    magnitude = float(degrees) + float(minutes)/60 + float(seconds)/(60*60)
    return magnitude * -1 if direction in ('S', 'W') else magnitude

def dd2dms(dms,dd, pre_fix_latlong):
    """Return a DMS label for ``dd`` when ``dms`` is missing, otherwise ``dms``.

    Parameters:
        dms: existing DMS text; returned unchanged when it is not null.
        dd: decimal degrees used to build the label when ``dms`` is null.
        pre_fix_latlong: text placed in front of the label (e.g. "N", "W").

    Returns:
        A string of the form ``<prefix> <deg>˚ <min>' <sec>"`` with seconds
        rounded to one decimal and the degrees shown without sign, or
        ``dms`` when it is already present or ``dd`` is null.
    """
    if pd.isnull(dms) and not pd.isnull(dd):
        d = int(dd)
        md = abs(dd - d) * 60
        m = int(md)
        sd = round((md - m) * 60, 1)
        deg = abs(d)
        # Rounding can push the seconds up to 60.0; carry into minutes (and
        # degrees) so the label never shows 60 seconds or 60 minutes.
        if sd >= 60:
            sd = 0.0
            m += 1
        if m >= 60:
            m = 0
            deg += 1
        return "%s %s˚ %s' %s\"" % (pre_fix_latlong, deg, m, sd)
    return dms

def parse_dms(dms,latlong):
    """Convert DMS text to decimal degrees, or fall back to ``latlong``.

    ``dms`` is expected to look like ``W 67˚ 44' 01.3"``: a direction letter,
    whitespace, degrees, then minutes and seconds separated by ``˚`` and ``'``.
    When ``dms`` is null the existing ``latlong`` value is returned unchanged.
    Text that does not split into exactly these parts raises ValueError.
    """
    if pd.isnull(dms):
        return latlong

    cleaned = dms.replace('"', '').strip()
    degDirection, minutes, seconds = re.split(r"[˚']", cleaned)
    # str.split() tolerates repeated or surrounding whitespace between the
    # direction letter and the degrees.
    direction, deg = degDirection.split()
    return dms2dd(deg, minutes, seconds, direction)

#dd = parse_dms("36°57'9' N 110°4'21' W")

#print(parse_dms("W 67˚ 44' 01.3",np.nan))


In [None]:
# (decimal-degree column, DMS text column, prefix used when writing DMS text)
coordinate_columns = (("LATITUDE", "DMS_LATITUDE", "N"), ("LONGITUDE", "DMS_LONGITUDE", "W"))

# First pass: where DMS text exists, compute the decimal degrees from it.
for dd_column, dms_column, _ in coordinate_columns:
    station_information[dd_column] = station_information.apply(
        lambda row: parse_dms(row[dms_column], row[dd_column]), axis=1
    )

# Second pass: where DMS text is missing, build it from the decimal degrees.
for dd_column, dms_column, hemisphere in coordinate_columns:
    station_information[dms_column] = station_information.apply(
        lambda row: dd2dms(row[dms_column], row[dd_column], hemisphere), axis=1
    )

#Export stations to a CSV (this overwrites the file the stations were read from)

station_information.to_csv("data/recreational-beach-monitoring-stations-sites.csv", sep=',',index=False,encoding='utf-8-sig')

In [None]:
#Attach station information
# Left join on STATION_NAME: every monitoring row is kept and gains the
# station id and coordinates when the station name is found.

recreational_beach_monitoring_p1 = recreational_beach_monitoring_p1.merge(
    station_information[["STATION_ID","STATION_NAME","LATITUDE","LONGITUDE"]],
    how='left',
    on='STATION_NAME',
)

recreational_beach_monitoring_p1.head(5)

In [None]:
# Replace -9999 with NaN
#nb_surface_water_monitoring_p1 = nb_surface_water_monitoring_p1.replace(-9999,np.nan)

In [None]:
#Check for null values
pd.set_option('display.max_rows',None)
#pd.set_option('display.max_columns', None)
recreational_beach_monitoring_p1.isna().sum()

In [None]:
pd.reset_option('display.max_rows')
#pd.reset_option('display.max_columns')

#### Create another copy of the dataset for further pre-processing

Some processing steps are slow. Working on a copy of the dataset lets us re-run the later steps during development without re-running the entire notebook from the start.

In [None]:
recreational_beach_monitoring_p2 = recreational_beach_monitoring_p1.copy()

#### Check and remove null columns

In [None]:
def drop_empty_columns_dataset(dataset):
    """Drop every column whose values are all missing.

    The name of each dropped column is printed. The columns are removed from
    ``dataset`` in place, and the same DataFrame object is returned.
    """
    empty_columns = []
    for column in dataset.columns:
        values = dataset[column]
        # A column is empty when its missing-value count equals its length.
        if values.isna().sum() == len(values):
            empty_columns.append(column)
            print(column)

    dataset.drop(columns=empty_columns, inplace=True)
    return dataset

In [None]:
recreational_beach_monitoring_p2 = drop_empty_columns_dataset(recreational_beach_monitoring_p2)

In [None]:
recreational_beach_monitoring_p2.columns.tolist()

##### Remove unit information field value where there is no analyte value

In [None]:
def remove_unit_from_empty(unitVal, unitName):
    """Return ``unitName`` only when ``unitVal`` holds a value.

    Returns NaN when ``unitVal`` is missing (NaN/None) or an empty string;
    otherwise returns ``unitName`` unchanged. pd.isnull is used for the
    missing check so that text values such as "<10" do not raise TypeError
    the way math.isnan does.
    """
    if pd.isnull(unitVal) or unitVal == "":
        return np.nan
    return unitName


In [None]:
""" nb_surface_water_monitoring_p2["SO2_INFO"] = nb_surface_water_monitoring_p2.apply(lambda x: remove_unit_from_empty(x["SO2"],x["SO2_INFO"]),axis=1) """

In [None]:
#recreational_beach_monitoring_p2.columns.tolist()

In [None]:
def clean_column_names(column_name):
    """Split a raw column header into a cleaned name and its unit.

    The unit is the text inside the first pair of parentheses. The cleaned
    name is the header with every parenthesised part removed, separators
    (spaces, '-', '.', ',') turned into underscores, and upper-cased.

    Returns:
        A two-item list ``[cleaned_name, unit]``. ``unit`` is an empty list
        when the header contains no parentheses.
    """
    # Raw strings keep the regex backslashes from being read as (invalid)
    # Python string escapes.
    units_found = re.findall(r'\((.*?)\)', column_name)
    unit_only = units_found[0] if units_found else units_found

    cleaned = re.sub(r'\((.*?)\)', '', column_name)
    # Multi-character separators first, so " - " and double spaces collapse
    # to a single character before the one-to-one replacements below.
    cleaned = cleaned.replace(" - ", "-").replace("  ", "_")
    cleaned = cleaned.translate(str.maketrans(" -.,", "____"))
    return [cleaned.upper(), unit_only]

In [None]:
# Get units from columns and store in a dataframe

non_unit_columns = ['STATION_NAME', 'DATE', 'Subprg', 'Field Number', 'Medium Code', 'Medium Desc', 'YEAR', 'LATITUDE', 'LONGITUDE','Flag','STATION_ID']

# Collect one record per analyte column and build the frame in a single call;
# growing a DataFrame with pd.concat inside the loop copies it on every pass.
unit_records = []
unit_record_index = []
for i, original_name in enumerate(recreational_beach_monitoring_p2.columns):
    # Skip any column whose name contains one of the non-unit names.
    if any(y in original_name for y in non_unit_columns):
        continue
    cleaned_name, unit = clean_column_names(original_name)
    unit_records.append({
        'UNIT_NAME_ORIGINAL': original_name,
        'UNIT_NAME_CLEANED': cleaned_name,
        # clean_column_names gives an empty list when the header has no
        # "(unit)" part; store that as a missing value.
        'UNIT': unit if isinstance(unit, str) else np.nan,
    })
    # Keep the column position as the row label, as before.
    unit_record_index.append(i)

recreational_beach_monitoring_units = pd.DataFrame(
    unit_records,
    columns=['UNIT_NAME_ORIGINAL','UNIT_NAME_CLEANED', 'UNIT'],
    index=unit_record_index,
)

recreational_beach_monitoring_units["UNIT_NAME_CLEANED"] = recreational_beach_monitoring_units["UNIT_NAME_CLEANED"].replace("Þ_=TDS_RPC_LAB","TDS_RPC_LAB_CALC")

#Export the units table to a CSV

recreational_beach_monitoring_units.to_csv("data/recreational-beach-monitoring-units.csv", sep=',',index=False,encoding='utf-8-sig')

recreational_beach_monitoring_units.head(5)

In [None]:
#Rename all columns

recreational_beach_monitoring_p2 = recreational_beach_monitoring_p2.rename(columns=lambda x: clean_column_names(x)[0])

In [None]:
#Manually rename calculated variables

recreational_beach_monitoring_p2 = recreational_beach_monitoring_p2.rename(columns={"Þ_=TDS_RPC_LAB":"TDS_RPC_LAB_CALC"})
 

In [None]:
#Manually drop columns 

recreational_beach_monitoring_p2 = recreational_beach_monitoring_p2.drop(['SUBPRG','MEDIUM_CODE','MEDIUM_DESC'], axis=1)


In [None]:
recreational_beach_monitoring_p2.columns.tolist()

In [None]:
#Round the Coulmns to 1 decimal point

#cols = ['AL_ENV_LAB', 'ALK_G_ENV_LAB']

#recreational_beach_monitoring_p2[cols] = recreational_beach_monitoring_p2[cols].round(2)

##### Merge multiple source data 

In [None]:
recreational_beach_monitoring_p3 = recreational_beach_monitoring_p2.copy()

In [None]:
recreational_beach_monitoring_p3.shape

In [None]:
# Drop a row of wrong data as suggested by the Water Sciences team 

# Rows matching this station, date and field number are the ones to remove.
rows_to_drop = (
    (recreational_beach_monitoring_p3["STATION_NAME"] == "Mount Carleton Provincial Park - Station 1")
    & (recreational_beach_monitoring_p3["DATE"] == "2020/07/15")
    & (recreational_beach_monitoring_p3["FIELD_NUMBER"] =="1448-20-29041")
)
recreational_beach_monitoring_p3 = recreational_beach_monitoring_p3.drop(
    recreational_beach_monitoring_p3[rows_to_drop].index
)

recreational_beach_monitoring_p3.shape

In [None]:
recreational_beach_monitoring_p3 = drop_empty_columns_dataset(recreational_beach_monitoring_p3)

In [None]:
recreational_beach_monitoring_p3.columns.tolist()

In [None]:
#print(len(recreational_beach_monitoring_p3["ALK_T_RPC_LAB_FLAG"]) - recreational_beach_monitoring_p3["ALK_T_RPC_LAB_FLAG"].isna().sum())
#print(len(recreational_beach_monitoring_p3["ALK_T_ENV_LAB_FLAG"]) - recreational_beach_monitoring_p3["ALK_T_ENV_LAB_FLAG"].isna().sum())
#print(len(recreational_beach_monitoring_p3["ALK_T_ENV_LAB_FLAG"]))

In [None]:
def check_null_groups(arr_values_analytes,dateval,station_id,source_of_val,empty_val):
    """Pick the single populated value out of several alternative columns.

    Parameters:
        arr_values_analytes: candidate values for one row.
        dateval, station_id: only used in the message printed on conflict.
        source_of_val: source label for each candidate, in the same order.
        empty_val: returned when no single value can be chosen.

    Returns:
        ``"<value>(<source>)"`` when exactly one candidate is populated.
        ``empty_val`` when none is populated, or when more than one is
        (in which case a message naming the date and station is printed).
    """
    populated = 0
    analyte_val = np.nan
    source_name = ""
    for position in range(len(arr_values_analytes)):
        candidate = arr_values_analytes[position]
        # Blank text and missing values both count as "not populated".
        if str(candidate).strip() == "" or pd.isnull(candidate)==True:
            continue
        populated += 1
        analyte_val = candidate
        source_name = source_of_val[position]

    if populated == 1:
        return str(analyte_val) + "("+source_name +")"
    if populated > 1:
        print("issue in group" + str(dateval) +"||" + str(station_id))

    return empty_val

#check_null_groups_vec = np.vectorize(check_null_groups)

#check_null_groups_vec = np.vectorize(check_null_groups)

In [None]:
# Collapse each family of alternative source columns into one "value(source)" column.
# merged column -> (candidate columns, source label for each candidate)
merged_column_sources = {
    "AIR_TEMP_FIELD": (["AIR_TEMP_FIELD_ENV", "AIR_TEMP_FIELD_ENV_1"], ["ENV Field", "ENV Field.1"]),
    "E_COLI_MPN": (["E_COLI_MPN_RPC_LAB", "E_COLI_MPN_RPC_LAB_1"], ["RPC Lab", "RPC Lab.1"]),
    "E_COLI_MPN_FLAG": (["E_COLI_MPN_RPC_LAB_FLAG", "E_COLI_MPN_RPC_LAB_FLAG_1"], ["RPC Lab", "RPC Lab.1"]),
    "EC_MF": (["EC_MF_RPC_LAB", "EC_MF_RPC_LAB_2"], ["RPC Lab", "RPC Lab.2"]),
    "EC_MF_FLAG": (["EC_MF_RPC_LAB_FLAG", "EC_MF_RPC_LAB_FLAG_1", "EC_MF_RPC_LAB_FLAG_2"], ["RPC Lab", "RPC Lab.1", "RPC Lab.2"]),
    "ENT_MPN": (["ENT/MPN_RPC_LAB", "ENT/MPN_RPC_LAB_1"], ["RPC Lab", "RPC Lab.1"]),
    "PH_FIELD": (["PH_CLIENT_FLD", "PH_FIELD_ENV", "PH_FIELD_ENV_1"], ["Client Field", "ENV Field", "ENV Field.1"]),
    "TEMP_FIELD": (["TEMP_FIELD_ENV", "TEMP_FIELD_ENV_1", "TEMP_UNKNOWN"], ["ENV Field", "ENV Field.1", "Unknown"]),
}

for merged_column, (candidate_columns, source_labels) in merged_column_sources.items():
    recreational_beach_monitoring_p3[merged_column] = recreational_beach_monitoring_p3.apply(
        lambda x: check_null_groups(
            [x[candidate] for candidate in candidate_columns],
            x["DATE"],
            x["STATION_ID"],
            source_labels,
            np.nan,
        ),
        axis=1,
    )

In [None]:
#Separate source columns

def separate_source_columns(raw_value):
    """Return the source label stored in parentheses inside ``raw_value``.

    ``raw_value`` is text of the form ``"<value>(<source>)"``. The text inside
    the first pair of parentheses is returned. An empty string is returned
    when ``raw_value`` is missing or contains no parentheses, so the result
    is always a string.
    """
    if pd.isnull(raw_value):
        return ""
    match = re.search(r'\((.*?)\)', raw_value)
    return match.group(1) if match else ""

# otypes is given so the vectorized call also accepts an empty column.
separate_source_columns_vec = np.vectorize(separate_source_columns, otypes=[object])

In [None]:
# For each merged "value(source)" column, store the source label in its own column.
for merged_column, source_column in (
    ("AIR_TEMP_FIELD", "AIR_TEMP_SOURCE"),
    ("E_COLI_MPN", "E_COLI_MPN_SOURCE"),
    ("EC_MF", "EC_MF_SOURCE"),
    ("ENT_MPN", "ENT_MPN_SOURCE"),
    ("PH_FIELD", "PH_FIELD_SOURCE"),
    ("TEMP_FIELD", "TEMP_FIELD_SOURCE"),
):
    recreational_beach_monitoring_p3[source_column] = separate_source_columns_vec(
        recreational_beach_monitoring_p3[merged_column]
    )

In [None]:
#Remove source data from analytes

def remove_source_name(raw_value):
    """Return ``raw_value`` with every parenthesised part removed.

    For text of the form ``"<value>(<source>)"`` this leaves ``"<value>"``.
    An empty string is returned when ``raw_value`` is missing.
    """
    if pd.isnull(raw_value):
        return ""
    return re.sub(r'\((.*?)\)', '', raw_value)

# otypes is given so the vectorized call also accepts an empty column.
remove_source_name_vec = np.vectorize(remove_source_name, otypes=[object])

In [None]:
# Strip the "(source)" part from every merged column, leaving only the value text.
for merged_column in (
    "AIR_TEMP_FIELD",
    "E_COLI_MPN",
    "E_COLI_MPN_FLAG",
    "EC_MF",
    "EC_MF_FLAG",
    "ENT_MPN",
    "PH_FIELD",
    "TEMP_FIELD",
):
    recreational_beach_monitoring_p3[merged_column] = remove_source_name_vec(
        recreational_beach_monitoring_p3[merged_column]
    )

In [None]:
recreational_beach_monitoring_p3.columns.tolist()

In [None]:
#Rename some flag columns

recreational_beach_monitoring_p3 = recreational_beach_monitoring_p3.rename(columns={"ENT/MPN_RPC_LAB_FLAG":"ENT_MPN_FLAG"})


In [None]:
#Create Missing Flag Columns
# Each of these flag columns is (re)initialised to an empty string for every row.

for missing_flag_column in (
    'AIR_TEMP_FIELD_FLAG',
    'COND_FIELD_ENV_FLAG',
    'DO_FIELD_ENV_FLAG',
    'DOHBCHADV_RPC_LAB_FLAG',
    'DOHBCHRAIN_RPC_LAB_FLAG',
    'PH_FIELD_FLAG',
    'TEMP_FIELD_FLAG',
    'TURB_FIELD_ENV_FLAG',
):
    recreational_beach_monitoring_p3[missing_flag_column] = ""


In [None]:
recreational_beach_monitoring_p3.columns.tolist()

In [None]:
# Flag columns in which missing values are turned into empty strings.
cols = [
    'E_COLI_MPN_RPC_LAB_FLAG',
    'E_COLI_MPN_RPC_LAB_FLAG_1',
    'EC_MF_RPC_LAB_FLAG',
    'EC_MF_RPC_LAB_FLAG_1',
    'EC_MF_RPC_LAB_FLAG_2',
    'ENT_MPN_FLAG',
    'E_COLI_MPN_FLAG',
    'EC_MF_FLAG',
    'AIR_TEMP_FIELD_FLAG',
    'COND_FIELD_ENV_FLAG',
    'DO_FIELD_ENV_FLAG',
    'DOHBCHADV_RPC_LAB_FLAG',
    'DOHBCHRAIN_RPC_LAB_FLAG',
    'TEMP_FIELD_FLAG',
]

recreational_beach_monitoring_p3[cols] = recreational_beach_monitoring_p3[cols].replace(np.nan, "")

In [None]:
#Empty String to NaN
# Analyte (value) columns in which empty strings are turned into missing values.

cols = [
    'AIR_TEMP_FIELD',
    'AIR_TEMP_FIELD_ENV',
    'AIR_TEMP_FIELD_ENV_1',
    'COND_FIELD_ENV',
    'DO_FIELD_ENV',
    'DOHBCHADV_RPC_LAB',
    'DOHBCHRAIN_RPC_LAB',
    'E_COLI_MPN',
    'E_COLI_MPN_RPC_LAB',
    'E_COLI_MPN_RPC_LAB_1',
    'EC_MF',
    'EC_MF_RPC_LAB',
    'EC_MF_RPC_LAB_2',
    'ENT/MPN_RPC_LAB',
    'ENT/MPN_RPC_LAB_1',
    'ENT_MPN',
    'PH_CLIENT_FLD',
    'PH_FIELD',
    'PH_FIELD_ENV',
    'PH_FIELD_ENV_1',
    'TEMP_FIELD',
    'TEMP_FIELD_ENV',
    'TEMP_FIELD_ENV_1',
    'TEMP_UNKNOWN',
    'TURB_FIELD_ENV',
]

recreational_beach_monitoring_p3[cols] = recreational_beach_monitoring_p3[cols].replace("", np.nan)

In [None]:
print(recreational_beach_monitoring_p3['E_COLI_MPN_FLAG'].unique())

In [None]:

def create_missing_flag_col(non_numeric_value,flag_val):
    """Return the flag to store for one analyte value.

    When ``non_numeric_value`` is present and its text is not a plain number
    (optional leading minus, digits, optional decimal part), the value
    itself is returned so it can be kept as the flag. Otherwise the existing
    ``flag_val`` is returned unchanged.
    """
    if pd.isnull(non_numeric_value):
        return flag_val
    # Raw string: the backslashes belong to the regex, not to Python escapes.
    if re.search(r"^-?[0-9]\d*(\.\d+)?$", str(non_numeric_value)):
        return flag_val
    return non_numeric_value

create_missing_flag_col_vec = np.vectorize(create_missing_flag_col)

In [None]:
# For every analyte column, update its "<analyte>_FLAG" column: non-numeric
# analyte entries are copied into the flag, other rows keep their current flag.
for value_column in (
    'AIR_TEMP_FIELD',
    'COND_FIELD_ENV',
    'DO_FIELD_ENV',
    'DOHBCHADV_RPC_LAB',
    'DOHBCHRAIN_RPC_LAB',
    'E_COLI_MPN',
    'EC_MF',
    'ENT_MPN',
    'PH_FIELD',
    'TEMP_FIELD',
    'TURB_FIELD_ENV',
):
    flag_column = value_column + '_FLAG'
    recreational_beach_monitoring_p3[flag_column] = create_missing_flag_col_vec(
        recreational_beach_monitoring_p3[value_column],
        recreational_beach_monitoring_p3[flag_column],
    )

In [None]:
recreational_beach_monitoring_p4 = recreational_beach_monitoring_p3.copy()

In [None]:
#Remove RPV from analyte columns

# List the analyte columns here instead of relying on whatever `cols` was left
# holding by an earlier cell, so this cell gives the same result when cells
# are re-run out of order.
cols = ['AIR_TEMP_FIELD', 'AIR_TEMP_FIELD_ENV', 'AIR_TEMP_FIELD_ENV_1', 'COND_FIELD_ENV', 'DO_FIELD_ENV', 'DOHBCHADV_RPC_LAB', 'DOHBCHRAIN_RPC_LAB', 'E_COLI_MPN', 'E_COLI_MPN_RPC_LAB', 'E_COLI_MPN_RPC_LAB_1', 'EC_MF', 'EC_MF_RPC_LAB', 'EC_MF_RPC_LAB_2', 'ENT/MPN_RPC_LAB', 'ENT/MPN_RPC_LAB_1', 'ENT_MPN', 'PH_CLIENT_FLD', 'PH_FIELD', 'PH_FIELD_ENV', 'PH_FIELD_ENV_1', 'TEMP_FIELD', 'TEMP_FIELD_ENV', 'TEMP_FIELD_ENV_1', 'TEMP_UNKNOWN', 'TURB_FIELD_ENV']

# "RPV" entries become missing values, then every analyte column is cast to float.
recreational_beach_monitoring_p4[cols] = recreational_beach_monitoring_p4[cols].replace("RPV",np.nan)
recreational_beach_monitoring_p4[cols] = recreational_beach_monitoring_p4[cols].astype(float)

In [None]:
#Remove NT from flag columns as per the Water Sciences Team's suggestion
# Only cells whose whole value is "NT" are replaced with a missing value.

cols = ['E_COLI_MPN_FLAG', 'ENT_MPN_FLAG', 'EC_MF_FLAG', 'TURB_FIELD_ENV_FLAG', 'TEMP_FIELD_FLAG', 'AIR_TEMP_FIELD_FLAG']

recreational_beach_monitoring_p4[cols] = recreational_beach_monitoring_p4[cols].replace("NT",np.nan)

In [None]:
#Convert empty srting to nan values for flags 

cols = [ 'E_COLI_MPN_RPC_LAB_FLAG', 'E_COLI_MPN_RPC_LAB_FLAG_1', 'EC_MF_RPC_LAB_FLAG', 'EC_MF_RPC_LAB_FLAG_1', 'EC_MF_RPC_LAB_FLAG_2', 'ENT_MPN_FLAG',   'E_COLI_MPN_FLAG', 'EC_MF_FLAG', 'AIR_TEMP_FIELD_FLAG',  'COND_FIELD_ENV_FLAG',  'DO_FIELD_ENV_FLAG', 'DOHBCHADV_RPC_LAB_FLAG', 'DOHBCHRAIN_RPC_LAB_FLAG',   'PH_FIELD_FLAG', 'TEMP_FIELD_FLAG', 'TURB_FIELD_ENV_FLAG']

recreational_beach_monitoring_p4[cols] = recreational_beach_monitoring_p4[cols].replace("",np.nan)

In [None]:
cols = ["AIR_TEMP_SOURCE", "E_COLI_MPN_SOURCE", "EC_MF_SOURCE", "ENT_MPN_SOURCE", "PH_FIELD_SOURCE", "TEMP_FIELD_SOURCE"]

# Remove the ".1" / ".2" duplicate-column suffix from the source labels.
# The dot is escaped and the pattern anchored to the end of the text: an
# unescaped "." would match any character followed by 1 or 2.
recreational_beach_monitoring_p4[cols] = recreational_beach_monitoring_p4[cols].replace(r"\.[12]$", "", regex=True)

In [None]:
#Drop null columns
recreational_beach_monitoring_p4 = drop_empty_columns_dataset(recreational_beach_monitoring_p4)

In [None]:
#Drop duplicate columns 
cols = ['AIR_TEMP_FIELD_ENV', 'AIR_TEMP_FIELD_ENV_1', 'E_COLI_MPN_RPC_LAB', 'E_COLI_MPN_RPC_LAB_1', 'E_COLI_MPN_RPC_LAB_FLAG', 'E_COLI_MPN_RPC_LAB_FLAG_1', 'EC_MF_RPC_LAB', 'EC_MF_RPC_LAB_2', 'EC_MF_RPC_LAB_FLAG', 'EC_MF_RPC_LAB_FLAG_1', 'EC_MF_RPC_LAB_FLAG_2', 'ENT/MPN_RPC_LAB', 'ENT/MPN_RPC_LAB_1', 'PH_FIELD_ENV', 'PH_FIELD_ENV_1', 'TEMP_UNKNOWN', 'TEMP_FIELD_ENV', 'TEMP_FIELD_ENV_1']

#recreational_beach_monitoring_p4 = recreational_beach_monitoring_p4.drop(cols, axis=1)


In [None]:
#Drop columns suggested by the Water Sciences team

cols = ['DOHBCHADV_RPC_LAB','TEMP_FIELD_SOURCE','PH_FIELD_SOURCE','TEMP_FIELD_ENV','PH_FIELD_ENV','FIELD_NUMBER']

recreational_beach_monitoring_p4 = recreational_beach_monitoring_p4.drop(cols, axis=1)

In [None]:
#Rename columns suggested by the Water Sciences team

recreational_beach_monitoring_p4 = recreational_beach_monitoring_p4.rename(columns={"TEMP_FIELD":"TEMP_FIELD_ENV", "TEMP_FIELD_FLAG":"TEMP_FIELD_ENV_FLAG", "PH_FIELD":"PH_FIELD_ENV"})


In [None]:
#Rearrange columns
# Selecting with this list sets the column order and keeps only the listed
# columns; any column not named here is left out of the resulting frame.

cols = ['STATION_NAME', 'STATION_ID', 'DATE', 'YEAR', 'LATITUDE', 'LONGITUDE', 'E_COLI_MPN', 'E_COLI_MPN_FLAG', 'ENT_MPN', 'ENT_MPN_FLAG', 'EC_MF', 'EC_MF_FLAG', 'TURB_FIELD_ENV', 'TEMP_FIELD_ENV', 'AIR_TEMP_FIELD', 'COND_FIELD_ENV', 'DO_FIELD_ENV', 'DOHBCHRAIN_RPC_LAB', 'PH_FIELD_ENV']

recreational_beach_monitoring_p4 = recreational_beach_monitoring_p4[cols]

In [None]:
recreational_beach_monitoring_p4.columns.tolist()

##### Create a final copy of processed data

In [None]:
recreational_beach_monitoring = recreational_beach_monitoring_p4.copy()

In [None]:
#Export Combined Dataset to a CSV

recreational_beach_monitoring.to_csv("data/recreational_beach_monitoring.csv", sep=',',index=False,encoding='utf-8-sig')

#Shape of row data
recreational_beach_monitoring.shape

In [None]:
recreational_beach_monitoring.info()

In [None]:
recreational_beach_monitoring.columns.tolist()

#### Visualizations

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### Null Values 

In [None]:

#Plot null values
def plot_null_values(dataset,group_by,year_filter_switch, year_filter):
    """Draw one bar chart per group showing the missing-value count per column.

    Parameters:
        dataset: DataFrame to inspect; must contain a 'YEAR' column when the
            year filter is switched on.
        group_by: column name used to split the data into groups.
        year_filter_switch: when truthy, keep only rows whose YEAR equals
            ``year_filter``.
        year_filter: the year to keep when the switch is on.

    Each group gets its own subplot (one per row) listing only the columns
    that have at least one missing value. Every subplot axes object is also
    stored in the module globals as ``ax_count_plots_<n>``.
    """
    if year_filter_switch:
        dataset = dataset[dataset['YEAR'] == year_filter]

    grouped = dataset.groupby([group_by])
    # extract keys from groups
    keys = grouped.groups.keys()

    totalCols = 1
    totalRows = math.ceil(len(grouped)/totalCols)

    fig = plt.figure(figsize=((totalCols+3)*4,(totalRows+1)*5))
    plt.subplots_adjust(wspace=0.2, hspace=0.7)

    for index, x in enumerate(keys):
        group_frame = grouped.get_group(x)

        # Keep only the columns that have missing values in this group.
        null_columns = []
        null_column_values = []
        for column in group_frame.columns:
            missing_count = group_frame[column].isna().sum()
            if missing_count > 0:
                null_columns.append(column)
                null_column_values.append(missing_count)

        subplot_ax = fig.add_subplot(totalRows,totalCols,(index+1))
        globals()[f"ax_count_plots_{index}"] = subplot_ax

        subplot_ax.set_title(x.upper(),backgroundcolor='gray')
        subplot_ax.bar(null_columns,null_column_values)
        subplot_ax.set(xlabel=None)
        subplot_ax.tick_params(axis='x', labelrotation = 90)

    plt.show()


In [None]:
plot_null_values(recreational_beach_monitoring,'STATION_NAME',False,2010) #false if dont want to use year filter

In [None]:
#Check for null values
pd.set_option('display.max_rows',None)
#pd.set_option('display.max_columns', None)
recreational_beach_monitoring.isna().sum()

In [None]:
pd.reset_option('display.max_rows')

#### Analyse variables 

In [None]:

#This is just an example of a line graph, visualization can be better
def linechart_of_categories(dataset,group_by,time_column,value_column,year_filter_switch,year_filter):
    """Draw one line chart of ``value_column`` over time for each group.

    Parameters:
        dataset: DataFrame holding the data to plot.
        group_by: column name used to split the data into groups.
        time_column: column used as the index (x axis) of every chart.
        value_column: column whose values are plotted.
        year_filter_switch: when truthy, keep only rows whose YEAR equals
            ``year_filter``.
        year_filter: the year to keep when the switch is on.

    Groups whose values are all missing are skipped. Charts are laid out
    three per row, and every subplot axes object is also stored in the module
    globals as ``ax_count_plots_<n>`` (n is the position of the group key).
    """
    if year_filter_switch:
        dataset = dataset[dataset['YEAR'] == year_filter]

    grouped = dataset.set_index(time_column).groupby([group_by])
    # extract keys from groups
    keys = grouped.groups.keys()

    def _has_values(key):
        # True when the group holds at least one non-missing value.
        values = grouped[value_column].get_group(key)
        return len(values) != values.isna().sum()

    plotted_groups = [(index, key) for index, key in enumerate(keys) if _has_values(key)]

    totalCols = 3
    totalRows = math.ceil(len(plotted_groups)/totalCols)

    fig = plt.figure(figsize=((totalCols+3)*3,(totalRows+1)*5))
    plt.subplots_adjust(wspace=0.2, hspace=0.6)

    for newInx, (index, x) in enumerate(plotted_groups, start=1):
        subplot_ax = fig.add_subplot(totalRows,totalCols,newInx)
        globals()[f"ax_count_plots_{index}"] = subplot_ax
        subplot_ax.set_title(x.upper())
        # The series is drawn on the current axes, i.e. the subplot just added.
        grouped[value_column].get_group(x).plot()

        subplot_ax.set(xlabel=None)
        subplot_ax.tick_params(axis='x', labelrotation = 90)
    plt.show()


In [None]:
linechart_of_categories(recreational_beach_monitoring,'STATION_NAME','DATE','E_COLI_MPN',False,2022)

In [None]:
#Display unique values

def unique_values__or_count(listOfColumns,options,dataset):
    """Print the unique values or the value counts of the given columns.

    Parameters:
        listOfColumns: column names to report on, in order.
        options: "unique" prints each column's distinct values; "count"
            prints each column's value counts. Any other text prints nothing.
        dataset: DataFrame that holds the columns.
    """
    for column_name in listOfColumns:
        if options == "unique":
            print("unique_values " + column_name)
            print(dataset[column_name].unique())
            print("------------------------")
        if options == "count":
            print("-----------"+column_name +"------------")
            print(dataset[column_name].value_counts())
            print("-----------------------")

In [None]:
#Check station values

unique_values__or_count(['ENT_MPN_FLAG'],"unique",recreational_beach_monitoring)

#"AIR_TEMP_SOURCE", "E_COLI_MPN_SOURCE", "EC_MF_SOURCE", "ENT_MPN_SOURCE", "PH_FIELD_SOURCE", "TEMP_FIELD_SOURCE"


#### Data Validation

In [None]:
#Import cleaned data 

recreational_beach_monitoring_validate = pd.read_csv("data/recreational_beach_monitoring.csv", low_memory=False)

In [None]:
recreational_beach_monitoring_validate.columns.tolist()

In [None]:
Col_name_to_validate = "E_COLI_MPN"

In [None]:
# The cleaned CSV was written from a datetime column, which pandas exports in
# ISO form (YYYY-MM-DD), so parse it with the matching dash-separated format;
# the slash format used for the raw files does not fit this file.
recreational_beach_monitoring_validate["DATE"] = pd.to_datetime(recreational_beach_monitoring_validate["DATE"],format='%Y-%m-%d')
# Keep only the key columns and the analyte being validated.
recreational_beach_monitoring_validate = recreational_beach_monitoring_validate[["STATION_NAME", "DATE", Col_name_to_validate]].copy()

In [None]:
#Import original data 

recreational_beach_monitoring_original = pd.read_csv("raw_data/2000-2022.csv", low_memory=False, sep=",")


In [None]:
print("List of Columns")
print(recreational_beach_monitoring_original.columns.to_list())
print("---------------")

In [None]:
col_to_validate_with = "E_coli-MPN (MPN/100ml) RPC-Lab"

In [None]:
#Rename columns 
recreational_beach_monitoring_original.rename(columns={'Station': 'STATION_NAME', 'FromDate': 'DATE',col_to_validate_with:Col_name_to_validate}, inplace=True)

#recreational_beach_monitoring_original = recreational_beach_monitoring_original.rename(columns=lambda x: clean_column_names(x)[0])

In [None]:
#Change date format
recreational_beach_monitoring_original["DATE"] = pd.to_datetime(recreational_beach_monitoring_original["DATE"],format='%Y/%m/%d')

#Trim data to validate an analyte
recreational_beach_monitoring_original = recreational_beach_monitoring_original[["STATION_NAME", "DATE", Col_name_to_validate]].copy()

In [None]:
# Right join on station, date and analyte value: every original row is kept,
# and the indicator column records whether the cleaned data has a matching row.
recreational_beach_monitoring_validate_results = pd.merge(
    recreational_beach_monitoring_validate,
    recreational_beach_monitoring_original,
    how='right',
    on=["STATION_NAME", "DATE",Col_name_to_validate],
    indicator='Exist',
)
# Turn the indicator into a boolean: True only when the row exists in both.
recreational_beach_monitoring_validate_results['Exist'] = (
    recreational_beach_monitoring_validate_results['Exist'] == 'both'
)

In [None]:
unique_values__or_count(['Exist'],"count",recreational_beach_monitoring_validate_results)

In [None]:
list_of_missing_rows = recreational_beach_monitoring_validate_results[(recreational_beach_monitoring_validate_results['Exist'] == False)].copy()

list_of_missing_rows.head(5)

#ist_of_missing_rows.to_csv("data/temp.csv", sep=',',index=False,encoding='utf-8-sig')