# Surface Water Monitoring Network

## Analysis

In [None]:
import pandas as pd
import numpy as np

In [None]:
import math
import os
import glob
import datetime
import re
#import openpyxl
#import xlrd

In [None]:
#Read CSV files from a folder

# Build the input folder with os.path.join so the separator matches the host OS
# (the previous hard-coded "\\raw_data\\" only resolved on Windows). The final ""
# keeps the trailing separator the original string had.
current_directory = os.path.join(os.getcwd(), "raw_data", "")
dataframes = []

all_files = []

# The filters do not change per file, so they are defined once outside the loop.
format_matches = [".csv"]  # substrings a file name must contain to be read
exclue_matches = []        # substrings of the full path that exclude a file

for path, subdirs, files in os.walk(current_directory):
    for name in files:
        file_name = os.path.join(path, name)
        # A base name that was already seen (e.g. in another sub-folder) is skipped silently.
        if name not in all_files:
            if not any(x in name for x in format_matches):
                print("Non CSV File: " + file_name)
            elif any(y in file_name for y in exclue_matches):
                print("Files Excluded : " + file_name)
            else:
                try:
                    current_dataframe = pd.read_csv(file_name, low_memory=False,sep=",")
                    dataframes.append(current_dataframe)
                except Exception as e:
                    # Best effort: report the unreadable file and continue with the rest.
                    print("Error reading file: " + file_name)
                    print(e)

        all_files.append(name)
# The list of seen names is only needed while walking the folder.
all_files = []


In [None]:
dataframes[0].head(1)

In [None]:
dataframes[1].head(1)

In [None]:
dataframes[3].head(1)

In [None]:
dataframes[3].info()

In [None]:
#Check if dataframes have the same columns

# Every dataframe is compared (as a set of column names, so order is ignored)
# against the first one; any mismatch means the schemas differ.
frames_with_other_columns = [
    frame for frame in dataframes
    if set(frame.columns) != set(dataframes[0].columns)
]

if frames_with_other_columns:
    print('Datasets do not have the same columns')
else:
    print('Datasets have the same columns')


In [None]:
#Find the column names that are present in some dataframes but missing from others, so that those columns can be created for all the dataframes

columns = []

for source_frame in dataframes:
    for other_frame in dataframes:
        for column_name in source_frame.columns:
            # Record a column once, the first time it is found missing from another dataframe
            if column_name not in other_frame.columns and column_name not in columns:
                columns.append(column_name)

print(columns)

In [None]:
#Combine all the dataframes into one

nb_surface_water_monitoring_raw = pd.concat(dataframes)
nb_surface_water_monitoring_raw.tail(5)

In [None]:
#Export Combined Dataset to a CSV

nb_surface_water_monitoring_raw.to_csv("data/nb_surface_water_monitoring_raw.csv", sep=',',index=False,encoding='utf-8-sig')

#Shape of raw data
nb_surface_water_monitoring_raw.shape

In [None]:
#Create a copy of the dataset

nb_surface_water_monitoring_p1 = nb_surface_water_monitoring_raw.copy()
nb_surface_water_monitoring_p1.head(5)

In [None]:
# Replace -9999 with NaN
#nb_air_quality_p1 = nb_air_quality_p1.replace(-9999,np.nan)

In [None]:
def find_datetime_format(dt_str):
    """Return the first known strptime format that parses ``dt_str``.

    The candidate formats are tried in the order listed below, so a string
    that is valid under several formats is reported under the earliest one.

    Parameters:
        dt_str: the date/time text to probe.

    Returns:
        The matching format string, or None when no candidate matches or
        when ``dt_str`` is not a string (e.g. NaN or None from a dataframe).
    """
    # strptime raises TypeError (not ValueError) for non-string input, which
    # would escape the loop below; treat such values as "no format found".
    if not isinstance(dt_str, str):
        return None

    formats_to_check = [
        '%Y/%m/%d %I:%M:%S %p',
        '%Y-%m-%d %I:%M:%S %p',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d/%m/%Y %I:%M:%S %p',
        '%d-%m-%Y %I:%M:%S %p',
        '%d/%m/%Y %H:%M:%S',
        '%d-%m-%Y %H:%M:%S',
        '%Y/%m/%d',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%d-%m-%Y',
    ]

    for fmt in formats_to_check:
        try:
            datetime.datetime.strptime(dt_str, fmt)
        except ValueError:
            continue
        return fmt

    return None

# Example usage: probe one sample date and report which format matched
dt_str = "2004/03/04"
format_found = find_datetime_format(dt_str)
print(f"Format found: {format_found}" if format_found else "Format not found")


In [None]:
#nb_air_quality_p1["DATE_TIME"] = nb_air_quality["DATE_TIME"].replace('24:00','00:00' , regex=True)

# Parse the sampling date using the fixed 'YYYY/MM/DD' layout.
# NOTE(review): with an explicit format, values in any other layout make
# pd.to_datetime raise by default — confirm every input file uses this layout.
nb_surface_water_monitoring_p1["FromDate"] = pd.to_datetime(nb_surface_water_monitoring_p1["FromDate"],format='%Y/%m/%d')

# Keep the calendar year of the sampling date as its own column.
nb_surface_water_monitoring_p1["YEAR"] = nb_surface_water_monitoring_p1["FromDate"].dt.year

In [None]:
nb_surface_water_monitoring_p1.head(5)

#### Station information

In [None]:
#Rename two columns before attaching station information
nb_surface_water_monitoring_p1.rename(columns={'Station': 'STATION_NAME', 'FromDate': 'DATE'}, inplace=True)

#Import data 

station_information = pd.read_csv("data/surface-water-monitoring-stations.csv")

In [None]:
# Convert DMS (degrees, minutes, seconds) to DD (decimal degrees)
def dms2dd(degrees, minutes, seconds, direction):
    """Convert degrees/minutes/seconds plus a hemisphere letter to decimal degrees.

    ``degrees``, ``minutes`` and ``seconds`` may be numbers or numeric strings
    (surrounding whitespace is accepted by ``float``). The result is negated
    when ``direction`` is 'S' or 'W'.
    """
    dd = float(degrees) + float(minutes)/60 + float(seconds)/(60*60)
    if direction in ('S', 'W'):
        dd *= -1
    return dd

def dd2dms(deg):
    """Split decimal degrees into ``[degrees, minutes, seconds]``.

    Degrees and minutes are ints, seconds is a float. Only the degrees
    component carries the sign.
    NOTE(review): for values between -1 and 0 the integer degrees are 0, so
    the sign is lost in the returned list.
    """
    d = int(deg)
    md = abs(deg - d) * 60
    m = int(md)
    sd = (md - m) * 60
    return [d, m, sd]

def parse_dms(dms,latlong):
    """Parse a DMS string such as ``W 67˚ 44' 01.3"`` into decimal degrees.

    Parameters:
        dms: text laid out as ``<direction> <deg>˚ <min>' <sec>``; either the
            ring (˚) or the degree sign (°) may follow the degrees, and a
            trailing double quote after the seconds is ignored.
        latlong: value returned unchanged when ``dms`` is null.

    Returns:
        The decimal-degree value computed by ``dms2dd``, or ``latlong`` when
        there is no DMS text to parse.
    """
    if pd.isnull(dms):
        return latlong

    dms = dms.replace('"', '')
    # Split after the degree symbol and after the minutes apostrophe.
    degDirection, minutes, seconds = re.split(r"[˚°']", dms)
    # str.split() tolerates leading/trailing and repeated whitespace between
    # the hemisphere letter and the degrees.
    direction, deg = degDirection.split()
    return dms2dd(deg, minutes, seconds, direction)

#dd = parse_dms("36°57'9' N 110°4'21' W")

#print(parse_dms("W 67˚ 44' 01.3",np.nan))


In [None]:
# Fill LATITUDE/LONGITUDE row by row: parse_dms converts the DMS text when it
# is present and otherwise returns the existing LATITUDE/LONGITUDE value.
station_information["LATITUDE"] = station_information.apply(lambda x: parse_dms(x["DMS_LATITUDE"],x["LATITUDE"]),axis=1)
station_information["LONGITUDE"] = station_information.apply(lambda x: parse_dms(x["DMS_LONGITUDE"],x["LONGITUDE"]),axis=1)

#Export stations to a CSV
# NOTE(review): this is the same path the station table is read from earlier in
# the notebook, so the input file is overwritten in place — confirm this is intended.

station_information.to_csv("data/surface-water-monitoring-stations.csv", sep=',',index=False,encoding='utf-8-sig')

In [None]:
#Attach station information

# Left join on STATION_NAME: every measurement row is kept and gains
# STATION_ID, LATITUDE and LONGITUDE where the station name is found.
# NOTE(review): if STATION_NAME is not unique in station_information, matching
# measurement rows are duplicated by the join — verify the station table.
nb_surface_water_monitoring_p1 = pd.merge(nb_surface_water_monitoring_p1, station_information[["STATION_ID","STATION_NAME","LATITUDE","LONGITUDE"]],  how='left', left_on=['STATION_NAME'], right_on = ['STATION_NAME'])

nb_surface_water_monitoring_p1.head(5)

In [None]:
# Replace -9999 with NaN
#nb_surface_water_monitoring_p1 = nb_surface_water_monitoring_p1.replace(-9999,np.nan)

In [None]:
#Check for null values
pd.set_option('display.max_rows',None)
#pd.set_option('display.max_columns', None)
nb_surface_water_monitoring_p1.isna().sum()

In [None]:
pd.reset_option('display.max_rows')
#pd.reset_option('display.max_columns')

#### Create another copy of the dataset for further pre-processing

Some methods are slow when processing data. Working on a copy of the dataset lets us re-run the later steps without re-running the entire notebook during development.

In [None]:
nb_surface_water_monitoring_p2 = nb_surface_water_monitoring_p1.copy()

#### Check and remove null columns

In [None]:
def drop_empty_columns_dataset(dataset):
    """Drop, in place, every column of ``dataset`` that holds only missing values.

    The name of each dropped column is printed. The same (mutated) dataframe
    object is returned for convenience.
    """
    empty_columns = []
    for column_name in dataset.columns:
        column = dataset[column_name]
        # A column is empty when its missing-value count equals its length.
        if len(column) == column.isna().sum():
            empty_columns.append(column_name)
            print(column_name)

    dataset.drop(empty_columns, inplace=True, axis=1)
    return dataset

In [None]:
nb_surface_water_monitoring_p2 = drop_empty_columns_dataset(nb_surface_water_monitoring_p2) #finish this later when we have whole data

In [None]:
nb_surface_water_monitoring_p2.columns.tolist()

##### Remove unit information field value where there is no analyte value

In [None]:
def remove_unit_from_empty(unitVal, unitName):
    """Return ``unitName`` only when the analyte value ``unitVal`` is present.

    Parameters:
        unitVal: the analyte value; an empty string, NaN or None counts as missing.
        unitName: the unit/info text that accompanies the analyte.

    Returns:
        ``np.nan`` when the analyte value is missing, otherwise ``unitName``.
    """
    # pd.isnull accepts any scalar; math.isnan raised TypeError for non-empty
    # strings (e.g. "<0.1") and for None.
    if unitVal == "" or pd.isnull(unitVal):
        return np.nan
    return unitName


In [None]:
""" nb_surface_water_monitoring_p2["SO2_INFO"] = nb_surface_water_monitoring_p2.apply(lambda x: remove_unit_from_empty(x["SO2"],x["SO2_INFO"]),axis=1) """

In [None]:
#nb_surface_water_monitoring_p2.columns.tolist()

In [None]:
def clean_column_names(column_name):
    """Normalise a raw column header and extract its unit.

    The first parenthesised group (e.g. ``(mg/L)``) is taken as the unit and
    all parenthesised groups are removed from the name. Separators (spaces,
    hyphens, dots, commas) are then turned into underscores and the name is
    upper-cased.

    Returns:
        ``[cleaned_name, unit]``; ``unit`` is an empty string when the header
        has no parenthesised group, so the second item is always a string.
    """
    units_found = re.findall(r'\((.*?)\)', column_name)
    unit_only = units_found[0] if units_found else ""

    column_name_cleaned = re.sub(r'\((.*?)\)', '', column_name)
    # Order matters: " - " is collapsed first, and the double space left behind
    # by a removed "(unit)" becomes a single underscore before single spaces do.
    separator_replacements = (
        (" - ", "-"),
        ("  ", "_"),
        (" ", "_"),
        ("-", "_"),
        (".", "_"),
        (",", "_"),
    )
    for old, new in separator_replacements:
        column_name_cleaned = column_name_cleaned.replace(old, new)

    return [column_name_cleaned.upper(), unit_only]

In [None]:
# Get units from columns and store in a dataframe

non_unit_columns = ['STATION_NAME', 'DATE', 'Subprg', 'Field Number', 'Medium Code', 'Medium Desc', 'YEAR', 'LATITUDE', 'LONGITUDE','Flag','STATION_ID']

# Cleaned names of the calculated variables, mapped to readable *_CALC names.
calculated_unit_names = {
    "Þ_@B_ENV_LAB": "B_ENV_LAB_CALC",
    "Þ_=TDS_ENV_LAB": "TDS_ENV_LAB_CALC",
    "Þ_=TDS_RPC_LAB": "TDS_RPC_LAB_CALC",
    "Þ_@C_ENV_LAB": "C_ENV_LAB_CALC",
    "Þ_=COND_ENV_LAB": "COND_ENV_LAB_CALC",
    "Þ_AN_ENV_LAB": "AN_ENV_LAB_CALC",
    "Þ_CAT_ENV_LAB": "CAT_ENV_LAB_CALC",
    "Þ_CO3_ENV_LAB": "CO3_ENV_LAB_CALC",
    "Þ_DIFB_ENV_LAB": "DIFB_ENV_LAB_CALC",
    "Þ_DIFC_ENV_LAB": "DIFC_ENV_LAB_CALC",
    "Þ_DIFTDS_ENV_LAB": "DIFTDS_ENV_LAB_CALC",
    "Þ_HCO3_ENV_LAB": "HCO3_ENV_LAB_CALC",
    "Þ_OH_ENV_LAB": "OH_ENV_LAB_CALC",
    "Þ_SIN_ENV_LAB": "SIN_ENV_LAB_CALC",
}

# Collect one row per unit-bearing column, then build the dataframe once
# (concatenating inside the loop copies the whole frame on every iteration).
unit_rows = []
unit_row_index = []
for i, original_name in enumerate(nb_surface_water_monitoring_p2.columns):
    # Substring match: a column is skipped when its name contains any non-unit marker.
    if any(marker in original_name for marker in non_unit_columns):
        continue
    cleaned_name, unit = clean_column_names(original_name)
    unit_rows.append({'UNIT_NAME_ORIGINAL': original_name, 'UNIT_NAME_CLEANED': cleaned_name, 'UNIT': unit})
    # Keep the column's position in the source dataframe as the row label.
    unit_row_index.append(i)

nb_surface_water_monitoring_units = pd.DataFrame(unit_rows, columns=['UNIT_NAME_ORIGINAL','UNIT_NAME_CLEANED', 'UNIT'], index=unit_row_index)

# Exact-value replacement (not regex) of the calculated-variable names.
nb_surface_water_monitoring_units["UNIT_NAME_CLEANED"] = nb_surface_water_monitoring_units["UNIT_NAME_CLEANED"].replace(calculated_unit_names)

#Export the units table to a CSV

nb_surface_water_monitoring_units.to_csv("data/surface-water-monitoring-units.csv", sep=',',index=False,encoding='utf-8-sig')

nb_surface_water_monitoring_units.head(5)

In [None]:
#Rename all columns

nb_surface_water_monitoring_p2 = nb_surface_water_monitoring_p2.rename(columns=lambda x: clean_column_names(x)[0])

In [None]:
#Manually rename calculated variables

nb_surface_water_monitoring_p2 = nb_surface_water_monitoring_p2.rename(columns={"Þ_@B_ENV_LAB":"B_ENV_LAB_CALC", "Þ_=TDS_ENV_LAB":"TDS_ENV_LAB_CALC", "Þ_=TDS_RPC_LAB":"TDS_RPC_LAB_CALC", "Þ_@C_ENV_LAB":"C_ENV_LAB_CALC", "Þ_=COND_ENV_LAB":"COND_ENV_LAB_CALC", "Þ_AN_ENV_LAB":"AN_ENV_LAB_CALC", "Þ_CAT_ENV_LAB":"CAT_ENV_LAB_CALC", "Þ_CO3_ENV_LAB":"CO3_ENV_LAB_CALC", "Þ_DIFB_ENV_LAB":"DIFB_ENV_LAB_CALC", "Þ_DIFC_ENV_LAB":"DIFC_ENV_LAB_CALC", "Þ_DIFTDS_ENV_LAB":"DIFTDS_ENV_LAB_CALC", "Þ_HCO3_ENV_LAB":"HCO3_ENV_LAB_CALC", "Þ_OH_ENV_LAB":"OH_ENV_LAB_CALC", "Þ_SIN_ENV_LAB":"SIN_ENV_LAB_CALC"})
 

In [None]:
#Manually drop columns 

nb_surface_water_monitoring_p2 = nb_surface_water_monitoring_p2.drop(['SUBPRG','MEDIUM_CODE','MEDIUM_DESC'], axis=1)


In [None]:
nb_surface_water_monitoring_p2.columns.tolist()

In [None]:
#Round the columns to 2 decimal places

#cols = ['AL_ENV_LAB', 'ALK_G_ENV_LAB']

#nb_surface_water_monitoring_p2[cols] = nb_surface_water_monitoring_p2[cols].round(2)

##### Merge multiple source data 

In [None]:
nb_surface_water_monitoring_p3 = nb_surface_water_monitoring_p2.copy()

In [None]:
nb_surface_water_monitoring_p3.columns.tolist()

In [None]:
#print(len(nb_surface_water_monitoring_p3["ALK_T_RPC_LAB_FLAG"]) - nb_surface_water_monitoring_p3["ALK_T_RPC_LAB_FLAG"].isna().sum())
#print(len(nb_surface_water_monitoring_p3["ALK_T_ENV_LAB_FLAG"]) - nb_surface_water_monitoring_p3["ALK_T_ENV_LAB_FLAG"].isna().sum())
#print(len(nb_surface_water_monitoring_p3["ALK_T_ENV_LAB_FLAG"]))

In [None]:
def check_null_groups(arr_values_analytes,dateval,station_id,source_of_val,empty_val):
    """Collapse the per-source values of one analyte into a single tagged value.

    Parameters:
        arr_values_analytes: the analyte's value from each source, in order.
        dateval, station_id: only used to identify the row in the warning message.
        source_of_val: source label for each position of ``arr_values_analytes``.
        empty_val: value returned when no single source can be chosen.

    Returns:
        ``"<value>(<source>)"`` when exactly one source has a value. When no
        source has a value, or when several do (a warning is printed in that
        case), ``empty_val`` is returned.
    """
    # A value counts as present when it is neither null nor blank text.
    present = [
        (value, source_of_val[position])
        for position, value in enumerate(arr_values_analytes)
        if str(value).strip() != "" and not pd.isnull(value)
    ]

    if len(present) == 1:
        analyte_val, source_name = present[0]
        return str(analyte_val) + "("+source_name +")"

    if len(present) > 1:
        print("issue in group" + str(dateval) +"||" + str(station_id))

    return empty_val

#check_null_groups_vec = np.vectorize(check_null_groups)

In [None]:
# Source labels shared by several analyte groups.
lab_sources = ["ENV Lab", "RPC Lab"]
client_env_field_sources = ["Client Field", "Client Field.1", "ENV Field", "ENV Field.1"]

# One entry per merged column, in output order:
# (merged column, per-source input columns, source label per input column,
#  value to store when no single source can be chosen)
source_merge_specs = [
    ("AL", ["AL_ENV_LAB", "AL_RPC_LAB"], lab_sources, np.nan),
    # The second label was "RPC Field.1" although the column is AIR_TEMP_FIELD_ENV_1;
    # corrected to match the "ENV Field.1" label used for the other *_FIELD_ENV_1 columns.
    ("AIR_TEMP_FIELD", ["AIR_TEMP_FIELD_ENV", "AIR_TEMP_FIELD_ENV_1"], ["ENV Field", "ENV Field.1"], np.nan),
    ("ALK_T", ["ALK_T_ENV_LAB", "ALK_T_RPC_LAB"], lab_sources, np.nan),
    ("ALK_T_FLAG", ["ALK_T_ENV_LAB_FLAG", "ALK_T_RPC_LAB_FLAG"], lab_sources, ""),
    ("AS", ["AS_ENV_LAB", "AS_RPC_LAB"], lab_sources, np.nan),
    ("AS_FLAG", ["AS_ENV_LAB_FLAG", "AS_RPC_LAB_FLAG"], lab_sources, ""),
    ("CA", ["CA_ENV_LAB", "CA_RPC_LAB"], lab_sources, np.nan),
    ("CD", ["CD_ENV_LAB", "CD_RPC_LAB"], lab_sources, np.nan),
    ("CD_FLAG", ["CD_ENV_LAB_FLAG", "CD_RPC_LAB_FLAG"], lab_sources, ""),
    ("CL", ["CL_ENV_LAB", "CL_RPC_LAB"], lab_sources, np.nan),
    ("CL_FLAG", ["CL_ENV_LAB_FLAG", "CL_RPC_LAB_FLAG"], lab_sources, ""),
    ("COND_FIELD", ["COND_CLIENT_FLD", "COND_CLIENT_FLD_1", "COND_FIELD_ENV", "COND_FIELD_ENV_1"], client_env_field_sources, np.nan),
    ("COND", ["COND_ENV_LAB", "COND_RPC_LAB"], lab_sources, np.nan),
    ("CR", ["CR_ENV_LAB", "CR_RPC_LAB"], lab_sources, np.nan),
    ("CR_FLAG", ["CR_ENV_LAB_FLAG", "CR_RPC_LAB_FLAG"], lab_sources, ""),
    ("CU", ["CU_ENV_LAB", "CU_RPC_LAB"], lab_sources, np.nan),
    ("CU_FLAG", ["CU_ENV_LAB_FLAG", "CU_RPC_LAB_FLAG"], lab_sources, ""),
    ("DO_FIELD", ["DO_CLIENT_FLD", "DO_CLIENT_FLD_1", "DO_FIELD_ENV", "DO_FIELD_ENV_1"], client_env_field_sources, np.nan),
    ("E_COLI_MPN", ["E_COLI_MPN_ENV_LAB", "E_COLI_MPN_ENV_LAB_1", "E_COLI_MPN_ENV_LAB_2", "E_COLI_MPN_RPC_LAB"], ["ENV Lab", "ENV Lab.1", "ENV Lab.2", "RPC Lab"], np.nan),
    ("E_COLI_MPN_FLAG", ["E_COLI_MPN_ENV_LAB_FLAG", "E_COLI_MPN_ENV_LAB_FLAG_1", "E_COLI_MPN_RPC_LAB_FLAG"], ["ENV Lab", "ENV Lab.1", "RPC Lab"], ""),
    ("F", ["F_ENV_LAB", "F_RPC_LAB"], lab_sources, np.nan),
    ("F_FLAG", ["F_ENV_LAB_FLAG", "F_RPC_LAB_FLAG"], lab_sources, ""),
    ("FE", ["FE_ENV_LAB", "FE_RPC_LAB"], lab_sources, np.nan),
    ("FE_FLAG", ["FE_ENV_LAB_FLAG", "FE_RPC_LAB_FLAG"], lab_sources, ""),
    ("HARD", ["HARD_ENV_LAB", "HARD_RPC_LAB"], lab_sources, np.nan),
    ("K", ["K_ENV_LAB", "K_RPC_LAB"], lab_sources, np.nan),
    ("MG", ["MG_ENV_LAB", "MG_RPC_LAB"], lab_sources, np.nan),
    ("MN", ["MN_ENV_LAB", "MN_RPC_LAB"], lab_sources, np.nan),
    ("MN_FLAG", ["MN_ENV_LAB_FLAG", "MN_RPC_LAB_FLAG"], lab_sources, ""),
    ("NA", ["NA_ENV_LAB", "NA_RPC_LAB"], lab_sources, np.nan),
    ("NH3T", ["NH3T_ENV_LAB", "NH3T_RPC_LAB"], lab_sources, np.nan),
    ("NH3T_FLAG", ["NH3T_ENV_LAB_FLAG", "NH3T_RPC_LAB_FLAG"], lab_sources, ""),
    ("NI", ["NI_ENV_LAB", "NI_RPC_LAB"], lab_sources, np.nan),
    ("NI_FLAG", ["NI_ENV_LAB_FLAG", "NI_RPC_LAB_FLAG"], lab_sources, ""),
    ("NO2", ["NO2_ENV_LAB", "NO2_RPC_LAB"], lab_sources, np.nan),
    ("NO2_FLAG", ["NO2_ENV_LAB_FLAG", "NO2_RPC_LAB_FLAG"], lab_sources, ""),
    ("NO3", ["NO3_ENV_LAB", "NO3_RPC_LAB"], lab_sources, np.nan),
    ("NO3_FLAG", ["NO3_ENV_LAB_FLAG", "NO3_RPC_LAB_FLAG"], lab_sources, ""),
    ("NOX", ["NOX_ENV_LAB", "NOX_RPC_LAB"], lab_sources, np.nan),
    ("NOX_FLAG", ["NOX_ENV_LAB_FLAG", "NOX_RPC_LAB_FLAG"], lab_sources, ""),
    ("PB", ["PB_ENV_LAB", "PB_RPC_LAB"], lab_sources, np.nan),
    ("PB_FLAG", ["PB_ENV_LAB_FLAG", "PB_RPC_LAB_FLAG"], lab_sources, ""),
    ("PH_FIELD", ["PH_CLIENT_FLD", "PH_FIELD_ENV", "PH_FIELD_ENV_1"], ["Client Field", "ENV Field", "ENV Field.1"], np.nan),
    ("PH", ["PH_ENV_LAB", "PH_RPC_LAB"], lab_sources, np.nan),
    ("SB", ["SB_ENV_LAB", "SB_RPC_LAB"], lab_sources, np.nan),
    ("SB_FLAG", ["SB_ENV_LAB_FLAG", "SB_RPC_LAB_FLAG"], lab_sources, ""),
    ("SO4", ["SO4_ENV_LAB", "SO4_RPC_LAB"], lab_sources, np.nan),
    ("SO4_FLAG", ["SO4_ENV_LAB_FLAG", "SO4_RPC_LAB_FLAG"], lab_sources, ""),
    ("TDS_CALC", ["TDS_ENV_LAB_CALC", "TDS_RPC_LAB_CALC"], lab_sources, np.nan),
    ("TEMP_FIELD", ["TEMP_CLIENT_FLD", "TEMP_CLIENT_FLD_1", "TEMP_FIELD_ENV", "TEMP_FIELD_ENV_1", "TEMP_FIELD_ENV_2", "TEMP_UNKNOWN"], ["Client Field", "Client Field.1", "ENV Field", "ENV Field.1", "ENV Field.2", "Unknown"], np.nan),
    ("TKN", ["TKN_RPC_LAB", "TKN_RPC_LAB_1"], ["RPC Lab", "RPC Lab.1"], np.nan),
    ("TKN_FLAG", ["TKN_RPC_LAB_FLAG", "TKN_RPC_LAB_FLAG_1"], ["RPC Lab", "RPC Lab.1"], ""),
    ("TN", ["TN_ENV_LAB", "TN_RPC_LAB"], lab_sources, np.nan),
    ("TN_FLAG", ["TN_ENV_LAB_FLAG", "TN_RPC_LAB_FLAG"], lab_sources, ""),
    ("TOC", ["TOC_ENV_LAB", "TOC_RPC_LAB"], lab_sources, np.nan),
    ("TP", ["TP_L_ENV_LAB", "TP_L_RPC_LAB"], lab_sources, np.nan),
    ("TP_FLAG", ["TP_L_ENV_LAB_FLAG", "TP_L_RPC_LAB_FLAG"], lab_sources, ""),
    ("TURB", ["TURB_ENV_LAB", "TURB_RPC_LAB"], lab_sources, np.nan),
    ("TURB_FLAG", ["TURB_ENV_LAB_FLAG", "TURB_RPC_LAB_FLAG"], lab_sources, ""),
    ("TURB_FIELD", ["TURB_FIELD_ENV", "TURB_UNKNOWN"], ["ENV Field", "Unknown"], np.nan),
    ("ZN", ["ZN_ENV_LAB", "ZN_RPC_LAB"], lab_sources, np.nan),
    ("ZN_FLAG", ["ZN_ENV_LAB_FLAG", "ZN_RPC_LAB_FLAG"], lab_sources, ""),
]


def merge_source_columns(row, input_columns, source_labels, empty_val):
    """Merge one row's per-source values into a single "value(source)" string via check_null_groups."""
    return check_null_groups(
        [row[column] for column in input_columns],
        row["DATE"],
        row["STATION_ID"],
        source_labels,
        empty_val,
    )


# Create the merged columns one at a time, in the order listed above.
for merged_column, input_columns, source_labels, empty_val in source_merge_specs:
    nb_surface_water_monitoring_p3[merged_column] = nb_surface_water_monitoring_p3.apply(
        merge_source_columns, axis=1, args=(input_columns, source_labels, empty_val)
    )

In [None]:
#Separate source columns

def separate_source_columns(raw_value):
    """Return the source label embedded in a merged value such as ``"1.5(ENV Lab)"``.

    Returns:
        The text inside the first pair of parentheses, or an empty string when
        ``raw_value`` is null or contains no parenthesised label, so the result
        is always a string.
    """
    if pd.isnull(raw_value):
        return ""
    sources_found = re.findall(r'\((.*?)\)', raw_value)
    return sources_found[0] if sources_found else ""

# Element-wise version for applying the function to a whole column.
separate_source_columns_vec = np.vectorize(separate_source_columns)

In [None]:
# Merged analyte columns whose "(source)" suffix is copied into a matching
# <ANALYTE>_SOURCE column, in the order the new columns are created.
analytes_with_source = [
    "AL", "AIR_TEMP_FIELD", "ALK_T", "AS", "CA", "CD", "CL", "COND_FIELD",
    "COND", "CR", "CU", "DO_FIELD", "E_COLI_MPN", "F", "FE", "HARD", "K",
    "MG", "MN", "NA", "NH3T", "NI", "NO2", "NO3", "NOX", "PB", "PH_FIELD",
    "PH", "SB", "SO4", "TDS_CALC", "TEMP_FIELD", "TKN", "TN", "TOC", "TP",
    "TURB", "TURB_FIELD", "ZN",
]

for analyte in analytes_with_source:
    nb_surface_water_monitoring_p3[analyte + "_SOURCE"] = separate_source_columns_vec(nb_surface_water_monitoring_p3[analyte])

In [None]:
#Remove source data from analytes

# Matches one parenthesised group, e.g. the "(XYZ)" in "1.2(XYZ)". The match is
# non-greedy so several groups in the same value are removed one by one.
# Raw string: "\(" in a normal string literal is an invalid escape sequence.
_PARENTHESISED_TAG = re.compile(r'\((.*?)\)')


def remove_source_name(raw_value):
    """Return *raw_value* with every parenthesised group removed.

    Parameters
    ----------
    raw_value : object
        A single cell value. Null values (None / NaN, as judged by
        ``pd.isnull``) yield an empty string. Non-string values are converted
        with ``str()`` first so that numeric cells do not raise ``TypeError``.

    Returns
    -------
    str
        The cleaned text, or ``""`` for a null input.
    """
    if pd.isnull(raw_value):
        return ""
    if not isinstance(raw_value, str):
        # re.sub only accepts text; a bare number has nothing to strip anyway.
        raw_value = str(raw_value)
    return _PARENTHESISED_TAG.sub('', raw_value)


# Element-wise version for whole columns / arrays.
remove_source_name_vec = np.vectorize(remove_source_name)

In [None]:
# Overwrite each listed column with its values passed through
# remove_source_name_vec (parenthesised source text removed, nulls -> "").
for _tagged_column in [
    "AL", "AIR_TEMP_FIELD", "ALK_T", "ALK_T_FLAG", "AS", "AS_FLAG",
    "CA", "CD", "CD_FLAG", "CL", "CL_FLAG", "COND_FIELD",
    "COND", "CR", "CR_FLAG", "CU", "CU_FLAG", "DO_FIELD",
    "E_COLI_MPN", "E_COLI_MPN_FLAG", "F", "F_FLAG", "FE", "FE_FLAG",
    "HARD", "K", "MG", "MN", "MN_FLAG", "NA",
    "NH3T", "NH3T_FLAG", "NI", "NI_FLAG", "NO2", "NO2_FLAG",
    "NO3", "NO3_FLAG", "NOX", "NOX_FLAG", "PB", "PB_FLAG",
    "PH_FIELD", "PH", "SB", "SB_FLAG", "SO4", "SO4_FLAG",
    "TDS_CALC", "TEMP_FIELD", "TKN", "TKN_FLAG", "TN", "TN_FLAG",
    "TOC", "TP", "TP_FLAG", "TURB", "TURB_FLAG", "TURB_FIELD",
    "ZN", "ZN_FLAG",
]:
    nb_surface_water_monitoring_p3[_tagged_column] = remove_source_name_vec(
        nb_surface_water_monitoring_p3[_tagged_column]
    )

In [None]:
# Display the current column names as a plain list.
list(nb_surface_water_monitoring_p3.columns)

In [None]:
#Rename some flag columns

# For each stem, "<stem>_ENV_LAB_FLAG" is renamed to "<stem>_FLAG".
_flag_renames = {
    _stem + "_ENV_LAB_FLAG": _stem + "_FLAG"
    for _stem in ("CA", "COND", "HARD", "PH", "TOC")
}
nb_surface_water_monitoring_p3 = nb_surface_water_monitoring_p3.rename(columns=_flag_renames)



In [None]:
#Create Missing Flag Columns

# Every column listed here is set to an empty string for all rows.
for _empty_flag_column in [
    'AIR_TEMP_FIELD_FLAG', 'AL_FLAG', 'AN_ENV_LAB_CALC_FLAG', 'B_ENV_LAB_CALC_FLAG',
    'B_RPC_LAB_FLAG', 'BA_RPC_LAB_FLAG', 'C_ENV_LAB_CALC_FLAG', 'C03_RPC_LAB_FLAG',
    'CAT_ENV_LAB_CALC_FLAG', 'CO3_ENV_LAB_CALC_FLAG', 'COND_ENV_LAB_CALC_FLAG', 'COND_FIELD_FLAG',
    'DIFB_ENV_LAB_CALC_FLAG', 'DIFC_ENV_LAB_CALC_FLAG', 'DIFTDS_ENV_LAB_CALC_FLAG', 'DO_FIELD_FLAG',
    'DOC_RPC_LAB_FLAG', 'HCO3_ENV_LAB_CALC_FLAG', 'HCO3_RPC_LAB_FLAG', 'K_FLAG',
    'LI_RPC_LAB_FLAG', 'MG_FLAG', 'NA_FLAG', 'OH_ENV_LAB_CALC_FLAG',
    'PH_FIELD_FLAG', 'RB_RPC_LAB_FLAG', 'SALINITY_FIELD_ENV_FLAG', 'SIN_ENV_LAB_CALC_FLAG',
    'SR_RPC_LAB_FLAG', 'TDS_CALC_FLAG', 'TEMP_FIELD_FLAG', 'TURB_FIELD_FLAG',
]:
    nb_surface_water_monitoring_p3[_empty_flag_column] = ""

In [None]:
# Display the column names again, now including the flag columns added above.
list(nb_surface_water_monitoring_p3.columns)

In [None]:
# Flag columns whose NaN entries are replaced by empty strings.
cols = [
    'AS_ENV_LAB_FLAG', 'CA_FLAG', 'CD_ENV_LAB_FLAG', 'CLRA_ENV_LAB_FLAG',
    'CR_ENV_LAB_FLAG', 'CU_ENV_LAB_FLAG', 'E_COLI_MPN_ENV_LAB_FLAG', 'F_ENV_LAB_FLAG',
    'FE_ENV_LAB_FLAG', 'MN_ENV_LAB_FLAG', 'NH3T_ENV_LAB_FLAG', 'NI_ENV_LAB_FLAG',
    'NO2_ENV_LAB_FLAG', 'NO3_ENV_LAB_FLAG', 'NOX_ENV_LAB_FLAG', 'PB_ENV_LAB_FLAG',
    'SB_ENV_LAB_FLAG', 'SS_ENV_LAB_FLAG', 'TN_ENV_LAB_FLAG', 'TOC_FLAG',
    'TP_L_ENV_LAB_FLAG', 'TURB_ENV_LAB_FLAG', 'ZN_ENV_LAB_FLAG', 'ALK_G_ENV_LAB_FLAG',
    'ALK_T_ENV_LAB_FLAG', 'CL_ENV_LAB_FLAG', 'COND_FLAG', 'HARD_FLAG',
    'PH_FLAG', 'SO4_ENV_LAB_FLAG', 'AG_RPC_LAB_FLAG', 'ALK_T_RPC_LAB_FLAG',
    'AS_RPC_LAB_FLAG', 'BE_X_RPC_LAB_FLAG', 'BI_RPC_LAB_FLAG', 'BR2_RPC_LAB_FLAG',
    'CD_RPC_LAB_FLAG', 'CL_RPC_LAB_FLAG', 'CLRT_RPC_LAB_FLAG', 'CO_RPC_LAB_FLAG',
    'CR_RPC_LAB_FLAG', 'CU_RPC_LAB_FLAG', 'E_COLI_MPN_RPC_LAB_FLAG', 'F_RPC_LAB_FLAG',
    'FE_RPC_LAB_FLAG', 'MN_RPC_LAB_FLAG', 'MO_RPC_LAB_FLAG', 'NH3_UN_ION_RPC_LAB_FLAG',
    'NH3T_RPC_LAB_FLAG', 'NI_RPC_LAB_FLAG', 'NO2_RPC_LAB_FLAG', 'NO3_RPC_LAB_FLAG',
    'NOX_RPC_LAB_FLAG', 'PB_RPC_LAB_FLAG', 'SB_RPC_LAB_FLAG', 'SE_RPC_LAB_FLAG',
    'SN_RPC_LAB_FLAG', 'SO4_RPC_LAB_FLAG', 'TC_MPN_RPC_LAB_FLAG', 'TE_RPC_LAB_FLAG',
    'TKN_RPC_LAB_FLAG', 'TL_RPC_LAB_FLAG', 'TN_RPC_LAB_FLAG', 'TP_L_RPC_LAB_FLAG',
    'TURB_RPC_LAB_FLAG', 'U_RPC_LAB_FLAG', 'V_RPC_LAB_FLAG', 'ZN_RPC_LAB_FLAG',
    'ALK_T_FLAG', 'AS_FLAG', 'CD_FLAG', 'CL_FLAG',
    'CR_FLAG', 'CU_FLAG', 'E_COLI_MPN_FLAG', 'F_FLAG',
    'FE_FLAG', 'MN_FLAG', 'NH3T_FLAG', 'NI_FLAG',
    'NO2_FLAG', 'NO3_FLAG', 'NOX_FLAG', 'PB_FLAG',
    'SB_FLAG', 'SO4_FLAG', 'TKN_FLAG', 'TN_FLAG',
    'TP_FLAG', 'TURB_FLAG', 'ZN_FLAG', 'AIR_TEMP_FIELD_FLAG',
    'AL_FLAG', 'AN_ENV_LAB_CALC_FLAG', 'B_ENV_LAB_CALC_FLAG', 'B_RPC_LAB_FLAG',
    'BA_RPC_LAB_FLAG', 'C_ENV_LAB_CALC_FLAG', 'C03_RPC_LAB_FLAG', 'CAT_ENV_LAB_CALC_FLAG',
    'CO3_ENV_LAB_CALC_FLAG', 'COND_ENV_LAB_CALC_FLAG', 'COND_FIELD_FLAG', 'DIFB_ENV_LAB_CALC_FLAG',
    'DIFC_ENV_LAB_CALC_FLAG', 'DIFTDS_ENV_LAB_CALC_FLAG', 'DO_FIELD_FLAG', 'DOC_RPC_LAB_FLAG',
    'HCO3_ENV_LAB_CALC_FLAG', 'HCO3_RPC_LAB_FLAG', 'K_FLAG', 'LI_RPC_LAB_FLAG',
    'MG_FLAG', 'NA_FLAG', 'OH_ENV_LAB_CALC_FLAG', 'PH_FIELD_FLAG',
    'RB_RPC_LAB_FLAG', 'SALINITY_FIELD_ENV_FLAG', 'SIN_ENV_LAB_CALC_FLAG', 'SR_RPC_LAB_FLAG',
    'TDS_CALC_FLAG', 'TEMP_FIELD_FLAG', 'TURB_FIELD_FLAG',
]

_flags_without_nan = nb_surface_water_monitoring_p3[cols].replace(np.nan, "")
nb_surface_water_monitoring_p3[cols] = _flags_without_nan

In [None]:
#Empty String to NaN

# Analyte (value) columns; `cols` keeps this list for the cells that follow.
cols = [
    'AG_RPC_LAB', 'AIR_TEMP_FIELD', 'AL', 'ALK_G_ENV_LAB', 'ALK_T', 'AN_ENV_LAB_CALC',
    'AS', 'B_ENV_LAB_CALC', 'B_RPC_LAB', 'BA_RPC_LAB', 'BE_X_RPC_LAB', 'BI_RPC_LAB',
    'BR2_RPC_LAB', 'C_ENV_LAB_CALC', 'C03_RPC_LAB', 'CA', 'CAT_ENV_LAB_CALC', 'CD',
    'CL', 'CLRA_ENV_LAB', 'CLRT_RPC_LAB', 'CO_RPC_LAB', 'CO3_ENV_LAB_CALC', 'COND',
    'COND_ENV_LAB_CALC', 'COND_FIELD', 'CR', 'CU', 'DIFB_ENV_LAB_CALC', 'DIFC_ENV_LAB_CALC',
    'DIFTDS_ENV_LAB_CALC', 'DO_FIELD', 'DOC_RPC_LAB', 'E_COLI_MPN', 'F', 'FE',
    'HARD', 'HCO3_ENV_LAB_CALC', 'HCO3_RPC_LAB', 'K', 'LI_RPC_LAB', 'MG',
    'MN', 'MO_RPC_LAB', 'NA', 'NH3_UN_ION_RPC_LAB', 'NH3T', 'NI',
    'NO2', 'NO3', 'NOX', 'OH_ENV_LAB_CALC', 'PB', 'PH',
    'PH_FIELD', 'RB_RPC_LAB', 'SALINITY_FIELD_ENV', 'SB', 'SIN_ENV_LAB_CALC', 'SN_RPC_LAB',
    'SO4', 'SR_RPC_LAB', 'SS_ENV_LAB', 'TC_MPN_RPC_LAB', 'TDS_CALC', 'TE_RPC_LAB',
    'TEMP_FIELD', 'TKN', 'TL_RPC_LAB', 'TN', 'TOC', 'TP',
    'TURB', 'TURB_FIELD', 'U_RPC_LAB', 'V_RPC_LAB', 'ZN',
]
_analytes_with_nan = nb_surface_water_monitoring_p3[cols].replace("", np.nan)
nb_surface_water_monitoring_p3[cols] = _analytes_with_nan

In [None]:

# Plain decimal text: optional minus sign, digits, optional ".digits" part.
# Raw string: "\d" / "\." in a normal string literal are invalid escapes.
_PLAIN_DECIMAL = re.compile(r"^-?[0-9]\d*(\.\d+)?$")


def create_missing_flag_col(non_numeric_value, flag_val):
    """Choose the flag for one analyte cell.

    If the analyte cell holds something that is neither null nor numeric
    (for example the text ``"RPV"``), that value itself is returned so it can
    be stored in the flag column. Otherwise the existing flag is kept.

    Parameters
    ----------
    non_numeric_value : object
        The analyte cell to inspect.
    flag_val : object
        The flag currently stored for that cell.

    Returns
    -------
    object
        ``non_numeric_value`` when it is non-null and non-numeric,
        else ``flag_val``.
    """
    # pd.isnull already covers NaN; a `!= np.nan` comparison is always True
    # and therefore adds nothing.
    if pd.isnull(non_numeric_value):
        return flag_val

    # Real numeric scalars are numeric however str() renders them: str(1e-05)
    # is '1e-05', which the decimal pattern below would wrongly reject.
    is_number = isinstance(non_numeric_value, (int, float, np.number))
    if is_number and not isinstance(non_numeric_value, bool):
        return flag_val

    if _PLAIN_DECIMAL.search(str(non_numeric_value)):
        return flag_val

    return non_numeric_value


# Element-wise version taking an analyte column and its flag column.
create_missing_flag_col_vec = np.vectorize(create_missing_flag_col)

In [None]:
# For every analyte, recompute "<analyte>_FLAG" from the analyte column and the
# current flag column via create_missing_flag_col_vec.
for _flagged_analyte in [
    "AG_RPC_LAB", "AIR_TEMP_FIELD", "AL", "ALK_G_ENV_LAB", "ALK_T", "AN_ENV_LAB_CALC",
    "AS", "B_ENV_LAB_CALC", "B_RPC_LAB", "BA_RPC_LAB", "BE_X_RPC_LAB", "BI_RPC_LAB",
    "BR2_RPC_LAB", "C_ENV_LAB_CALC", "C03_RPC_LAB", "CA", "CAT_ENV_LAB_CALC", "CD",
    "CL", "CLRA_ENV_LAB", "CLRT_RPC_LAB", "CO_RPC_LAB", "CO3_ENV_LAB_CALC", "COND_ENV_LAB_CALC",
    "COND_FIELD", "COND", "CR", "CU", "DIFB_ENV_LAB_CALC", "DIFC_ENV_LAB_CALC",
    "DIFTDS_ENV_LAB_CALC", "DO_FIELD", "DOC_RPC_LAB", "E_COLI_MPN", "F", "FE",
    "HARD", "HCO3_ENV_LAB_CALC", "HCO3_RPC_LAB", "K", "LI_RPC_LAB", "MG",
    "MN", "MO_RPC_LAB", "NA", "NH3_UN_ION_RPC_LAB", "NH3T", "NI",
    "NO2", "NO3", "NOX", "OH_ENV_LAB_CALC", "PB", "PH_FIELD",
    "PH", "RB_RPC_LAB", "SALINITY_FIELD_ENV", "SB", "SIN_ENV_LAB_CALC", "SN_RPC_LAB",
    "SO4", "SR_RPC_LAB", "SS_ENV_LAB", "TC_MPN_RPC_LAB", "TDS_CALC", "TE_RPC_LAB",
    "TEMP_FIELD", "TKN", "TL_RPC_LAB", "TN", "TOC", "TP",
    "TURB_FIELD", "TURB", "U_RPC_LAB", "V_RPC_LAB", "ZN",
]:
    _flag_column = _flagged_analyte + '_FLAG'
    nb_surface_water_monitoring_p3[_flag_column] = create_missing_flag_col_vec(
        nb_surface_water_monitoring_p3[_flagged_analyte],
        nb_surface_water_monitoring_p3[_flag_column],
    )

In [None]:
# Continue on an independent (deep) copy so the p3 frame stays unchanged.
nb_surface_water_monitoring_p4 = nb_surface_water_monitoring_p3.copy(deep=True)

In [None]:
#Remove RPV from analyte columns

# Replace the "RPV" marker with NaN in the analyte columns, then cast them to float.
_analyte_values = nb_surface_water_monitoring_p4[cols]
_analyte_values = _analyte_values.replace("RPV", np.nan)
nb_surface_water_monitoring_p4[cols] = _analyte_values.astype(float)

In [None]:
#Convert empty string to NaN values for flags

# Flag columns whose empty-string entries are turned back into NaN.
cols = [
    'AS_ENV_LAB_FLAG', 'CA_FLAG', 'CD_ENV_LAB_FLAG', 'CLRA_ENV_LAB_FLAG',
    'CR_ENV_LAB_FLAG', 'CU_ENV_LAB_FLAG', 'E_COLI_MPN_ENV_LAB_FLAG', 'F_ENV_LAB_FLAG',
    'FE_ENV_LAB_FLAG', 'MN_ENV_LAB_FLAG', 'NH3T_ENV_LAB_FLAG', 'NI_ENV_LAB_FLAG',
    'NO2_ENV_LAB_FLAG', 'NO3_ENV_LAB_FLAG', 'NOX_ENV_LAB_FLAG', 'PB_ENV_LAB_FLAG',
    'SB_ENV_LAB_FLAG', 'SS_ENV_LAB_FLAG', 'TN_ENV_LAB_FLAG', 'TOC_FLAG',
    'TP_L_ENV_LAB_FLAG', 'TURB_ENV_LAB_FLAG', 'ZN_ENV_LAB_FLAG', 'ALK_G_ENV_LAB_FLAG',
    'ALK_T_ENV_LAB_FLAG', 'CL_ENV_LAB_FLAG', 'COND_FLAG', 'HARD_FLAG',
    'PH_FLAG', 'SO4_ENV_LAB_FLAG', 'AG_RPC_LAB_FLAG', 'ALK_T_RPC_LAB_FLAG',
    'AS_RPC_LAB_FLAG', 'BE_X_RPC_LAB_FLAG', 'BI_RPC_LAB_FLAG', 'BR2_RPC_LAB_FLAG',
    'CD_RPC_LAB_FLAG', 'CL_RPC_LAB_FLAG', 'CLRT_RPC_LAB_FLAG', 'CO_RPC_LAB_FLAG',
    'CR_RPC_LAB_FLAG', 'CU_RPC_LAB_FLAG', 'E_COLI_MPN_RPC_LAB_FLAG', 'F_RPC_LAB_FLAG',
    'FE_RPC_LAB_FLAG', 'MN_RPC_LAB_FLAG', 'MO_RPC_LAB_FLAG', 'NH3_UN_ION_RPC_LAB_FLAG',
    'NH3T_RPC_LAB_FLAG', 'NI_RPC_LAB_FLAG', 'NO2_RPC_LAB_FLAG', 'NO3_RPC_LAB_FLAG',
    'NOX_RPC_LAB_FLAG', 'PB_RPC_LAB_FLAG', 'SB_RPC_LAB_FLAG', 'SE_RPC_LAB_FLAG',
    'SN_RPC_LAB_FLAG', 'SO4_RPC_LAB_FLAG', 'TC_MPN_RPC_LAB_FLAG', 'TE_RPC_LAB_FLAG',
    'TKN_RPC_LAB_FLAG', 'TL_RPC_LAB_FLAG', 'TN_RPC_LAB_FLAG', 'TP_L_RPC_LAB_FLAG',
    'TURB_RPC_LAB_FLAG', 'U_RPC_LAB_FLAG', 'V_RPC_LAB_FLAG', 'ZN_RPC_LAB_FLAG',
    'ALK_T_FLAG', 'AS_FLAG', 'CD_FLAG', 'CL_FLAG',
    'CR_FLAG', 'CU_FLAG', 'E_COLI_MPN_FLAG', 'F_FLAG',
    'FE_FLAG', 'MN_FLAG', 'NH3T_FLAG', 'NI_FLAG',
    'NO2_FLAG', 'NO3_FLAG', 'NOX_FLAG', 'PB_FLAG',
    'SB_FLAG', 'SO4_FLAG', 'TKN_FLAG', 'TN_FLAG',
    'TP_FLAG', 'TURB_FLAG', 'ZN_FLAG', 'AIR_TEMP_FIELD_FLAG',
    'AL_FLAG', 'AN_ENV_LAB_CALC_FLAG', 'B_ENV_LAB_CALC_FLAG', 'B_RPC_LAB_FLAG',
    'BA_RPC_LAB_FLAG', 'C_ENV_LAB_CALC_FLAG', 'C03_RPC_LAB_FLAG', 'CAT_ENV_LAB_CALC_FLAG',
    'CO3_ENV_LAB_CALC_FLAG', 'COND_ENV_LAB_CALC_FLAG', 'COND_FIELD_FLAG', 'DIFB_ENV_LAB_CALC_FLAG',
    'DIFC_ENV_LAB_CALC_FLAG', 'DIFTDS_ENV_LAB_CALC_FLAG', 'DO_FIELD_FLAG', 'DOC_RPC_LAB_FLAG',
    'HCO3_ENV_LAB_CALC_FLAG', 'HCO3_RPC_LAB_FLAG', 'K_FLAG', 'LI_RPC_LAB_FLAG',
    'MG_FLAG', 'NA_FLAG', 'OH_ENV_LAB_CALC_FLAG', 'PH_FIELD_FLAG',
    'RB_RPC_LAB_FLAG', 'SALINITY_FIELD_ENV_FLAG', 'SIN_ENV_LAB_CALC_FLAG', 'SR_RPC_LAB_FLAG',
    'TDS_CALC_FLAG', 'TEMP_FIELD_FLAG', 'TURB_FIELD_FLAG',
]

_flags_with_nan = nb_surface_water_monitoring_p4[cols].replace("", np.nan)
nb_surface_water_monitoring_p4[cols] = _flags_with_nan

In [None]:
#Drop null columns
# NOTE(review): drop_empty_columns_dataset is defined outside this section;
# presumably it removes columns that hold no data -- confirm against its
# definition. Its return value replaces the p4 frame.
nb_surface_water_monitoring_p4 = drop_empty_columns_dataset(nb_surface_water_monitoring_p4)

In [None]:
#Drop duplicate columns
# Columns removed from the p4 frame in this step.
cols = [
    "AL_ENV_LAB", "AL_RPC_LAB", "AIR_TEMP_FIELD_ENV", "AIR_TEMP_FIELD_ENV_1",
    "ALK_T_ENV_LAB", "ALK_T_RPC_LAB", "ALK_T_ENV_LAB_FLAG", "ALK_T_RPC_LAB_FLAG",
    "AS_ENV_LAB", "AS_RPC_LAB", "AS_ENV_LAB_FLAG", "AS_RPC_LAB_FLAG",
    "CA_ENV_LAB", "CA_RPC_LAB", "CD_ENV_LAB", "CD_RPC_LAB",
    "CD_ENV_LAB_FLAG", "CD_RPC_LAB_FLAG", "CL_ENV_LAB", "CL_RPC_LAB",
    "CL_ENV_LAB_FLAG", "CL_RPC_LAB_FLAG", "COND_CLIENT_FLD", "COND_CLIENT_FLD_1",
    "COND_FIELD_ENV", "COND_FIELD_ENV_1", "COND_ENV_LAB", "COND_RPC_LAB",
    "CR_ENV_LAB", "CR_RPC_LAB", "CR_ENV_LAB_FLAG", "CR_RPC_LAB_FLAG",
    "CU_ENV_LAB", "CU_RPC_LAB", "CU_ENV_LAB_FLAG", "CU_RPC_LAB_FLAG",
    "DO_CLIENT_FLD", "DO_CLIENT_FLD_1", "DO_FIELD_ENV", "DO_FIELD_ENV_1",
    "E_COLI_MPN_ENV_LAB", "E_COLI_MPN_ENV_LAB_1", "E_COLI_MPN_ENV_LAB_2", "E_COLI_MPN_RPC_LAB",
    "E_COLI_MPN_ENV_LAB_FLAG", "E_COLI_MPN_ENV_LAB_FLAG_1", "E_COLI_MPN_RPC_LAB_FLAG", "F_ENV_LAB",
    "F_RPC_LAB", "F_ENV_LAB_FLAG", "F_RPC_LAB_FLAG", "FE_ENV_LAB",
    "FE_RPC_LAB", "FE_ENV_LAB_FLAG", "FE_RPC_LAB_FLAG", "HARD_ENV_LAB",
    "HARD_RPC_LAB", "K_ENV_LAB", "K_RPC_LAB", "MG_ENV_LAB",
    "MG_RPC_LAB", "MN_ENV_LAB", "MN_RPC_LAB", "MN_ENV_LAB_FLAG",
    "MN_RPC_LAB_FLAG", "NA_ENV_LAB", "NA_RPC_LAB", "NH3T_ENV_LAB",
    "NH3T_RPC_LAB", "NH3T_ENV_LAB_FLAG", "NH3T_RPC_LAB_FLAG", "NI_ENV_LAB",
    "NI_RPC_LAB", "NI_ENV_LAB_FLAG", "NI_RPC_LAB_FLAG", "NO2_ENV_LAB",
    "NO2_RPC_LAB", "NO2_ENV_LAB_FLAG", "NO2_RPC_LAB_FLAG", "NO3_ENV_LAB",
    "NO3_RPC_LAB", "NO3_ENV_LAB_FLAG", "NO3_RPC_LAB_FLAG", "NOX_ENV_LAB",
    "NOX_RPC_LAB", "NOX_ENV_LAB_FLAG", "NOX_RPC_LAB_FLAG", "PB_ENV_LAB",
    "PB_RPC_LAB", "PB_ENV_LAB_FLAG", "PB_RPC_LAB_FLAG", "PH_CLIENT_FLD",
    "PH_FIELD_ENV", "PH_FIELD_ENV_1", "PH_ENV_LAB", "PH_RPC_LAB",
    "SB_ENV_LAB", "SB_RPC_LAB", "SB_ENV_LAB_FLAG", "SB_RPC_LAB_FLAG",
    "SO4_ENV_LAB", "SO4_RPC_LAB", "SO4_ENV_LAB_FLAG", "SO4_RPC_LAB_FLAG",
    "TDS_ENV_LAB_CALC", "TDS_RPC_LAB_CALC", "TEMP_CLIENT_FLD", "TEMP_CLIENT_FLD_1",
    "TEMP_FIELD_ENV", "TEMP_FIELD_ENV_1", "TEMP_FIELD_ENV_2", "TEMP_UNKNOWN",
    "TKN_RPC_LAB", "TKN_RPC_LAB_1", "TKN_RPC_LAB_FLAG", "TKN_RPC_LAB_FLAG_1",
    "TN_ENV_LAB", "TN_RPC_LAB", "TN_ENV_LAB_FLAG", "TN_RPC_LAB_FLAG",
    "TOC_ENV_LAB", "TOC_RPC_LAB", "TP_L_ENV_LAB", "TP_L_RPC_LAB",
    "TP_L_ENV_LAB_FLAG", "TP_L_RPC_LAB_FLAG", "TURB_ENV_LAB", "TURB_RPC_LAB",
    "TURB_ENV_LAB_FLAG", "TURB_RPC_LAB_FLAG", "TURB_FIELD_ENV", "TURB_UNKNOWN",
    "ZN_ENV_LAB", "ZN_RPC_LAB", "ZN_ENV_LAB_FLAG", "ZN_RPC_LAB_FLAG",
]

nb_surface_water_monitoring_p4 = nb_surface_water_monitoring_p4.drop(columns=cols)

In [None]:
#Rearrange columns

cols = ['STATION_NAME', 'DATE', 'FIELD_NUMBER', 'YEAR', 'STATION_ID', 'LATITUDE', 'LONGITUDE', 'AG_RPC_LAB', 'AG_RPC_LAB_FLAG', 'AIR_TEMP_FIELD', 'AIR_TEMP_FIELD_SOURCE', 'AL', 'AL_SOURCE', 'ALK_G_ENV_LAB', 'ALK_G_ENV_LAB_FLAG', 'ALK_T', 'ALK_T_FLAG', 'ALK_T_SOURCE', 'AN_ENV_LAB_CALC', 'AS', 'AS_FLAG', 'AS_SOURCE', 'B_ENV_LAB_CALC', 'B_RPC_LAB', 'BA_RPC_LAB', 'BE_X_RPC_LAB', 'BE_X_RPC_LAB_FLAG', 'BI_RPC_LAB', 'BI_RPC_LAB_FLAG', 'BR2_RPC_LAB', 'BR2_RPC_LAB_FLAG', 'C_ENV_LAB_CALC', 'C03_RPC_LAB', 'CA', 'CA_SOURCE', 'CAT_ENV_LAB_CALC', 'CD', 'CD_FLAG', 'CD_SOURCE', 'CL', 'CL_FLAG', 'CL_SOURCE', 'CLRA_ENV_LAB', 'CLRA_ENV_LAB_FLAG', 'CLRT_RPC_LAB', 'CLRT_RPC_LAB_FLAG', 'CO_RPC_LAB', 'CO_RPC_LAB_FLAG', 'CO3_ENV_LAB_CALC', 'COND', 'COND_FLAG', 'COND_SOURCE', 'COND_ENV_LAB_CALC', 'COND_FIELD', 'COND_FIELD_SOURCE', 'CR', 'CR_FLAG', 'CR_SOURCE', 'CU', 'CU_FLAG', 'CU_SOURCE', 'DIFB_ENV_LAB_CALC', 'DIFC_ENV_LAB_CALC', 'DIFTDS_ENV_LAB_CALC', 'DO_FIELD', 'DO_FIELD_FLAG', 'DO_FIELD_SOURCE', 'DOC_RPC_LAB', 'E_COLI_MPN', 'E_COLI_MPN_FLAG', 'E_COLI_MPN_SOURCE', 'F', 'F_FLAG', 'F_SOURCE', 'FE', 'FE_FLAG', 'FE_SOURCE', 'HARD', 'HARD_FLAG', 'HARD_SOURCE', 'HCO3_ENV_LAB_CALC', 'HCO3_RPC_LAB', 'K', 'K_SOURCE', 'LI_RPC_LAB', 'MG', 'MG_SOURCE', 'MN', 'MN_FLAG', 'MN_SOURCE', 'MO_RPC_LAB', 'MO_RPC_LAB_FLAG', 'NA', 'NA_SOURCE', 'NH3_UN_ION_RPC_LAB', 'NH3_UN_ION_RPC_LAB_FLAG', 'NH3T', 'NH3T_FLAG', 'NH3T_SOURCE', 'NI', 'NI_FLAG', 'NI_SOURCE', 'NO2', 'NO2_FLAG', 'NO2_SOURCE', 'NO3', 'NO3_FLAG', 'NO3_SOURCE', 'NOX', 'NOX_FLAG', 'NOX_SOURCE', 'OH_ENV_LAB_CALC', 'PB', 'PB_FLAG', 'PB_SOURCE', 'PH', 'PH_FLAG', 'PH_SOURCE', 'PH_FIELD', 'PH_FIELD_FLAG', 'PH_FIELD_SOURCE', 'RB_RPC_LAB', 'SALINITY_FIELD_ENV', 'SB', 'SB_FLAG', 'SB_SOURCE', 'SE_RPC_LAB', 'SE_RPC_LAB_FLAG', 'SIN_ENV_LAB_CALC', 'SN_RPC_LAB', 'SN_RPC_LAB_FLAG', 'SO4', 'SO4_FLAG', 'SO4_SOURCE', 'SR_RPC_LAB', 'SS_ENV_LAB', 'SS_ENV_LAB_FLAG', 'TC_MPN_RPC_LAB', 'TC_MPN_RPC_LAB_FLAG', 'TDS_CALC', 'TDS_CALC_SOURCE', 'TE_RPC_LAB', 'TE_RPC_LAB_FLAG', 
'TEMP_FIELD', 'TEMP_FIELD_FLAG', 'TEMP_FIELD_SOURCE', 'TKN', 'TKN_FLAG', 'TKN_SOURCE', 'TL_RPC_LAB', 'TL_RPC_LAB_FLAG', 'TN', 'TN_FLAG', 'TN_SOURCE', 'TOC', 'TOC_FLAG', 'TOC_SOURCE', 'TP', 'TP_FLAG', 'TP_SOURCE', 'TURB', 'TURB_FLAG', 'TURB_SOURCE', 'TURB_FIELD', 'TURB_FIELD_SOURCE', 'U_RPC_LAB', 'U_RPC_LAB_FLAG', 'V_RPC_LAB', 'V_RPC_LAB_FLAG', 'ZN', 'ZN_FLAG', 'ZN_SOURCE']

# Keep only the columns listed in `cols`, in exactly that order.
nb_surface_water_monitoring_p4 = nb_surface_water_monitoring_p4.loc[:, cols]

In [None]:
# List the column names of the frame (shown as the cell output).
list(nb_surface_water_monitoring_p4.columns)

##### Create a final copy of processed data

In [None]:
# Take an independent (deep) copy so later steps do not touch the p4 frame.
nb_surface_water_monitoring = nb_surface_water_monitoring_p4.copy(deep=True)

In [None]:
#Export Combined Dataset to a CSV

# to_csv does not create missing folders, so make sure the target folder exists first.
os.makedirs("data", exist_ok=True)

# index=False leaves the row index out of the file; 'utf-8-sig' writes a byte-order mark at the start.
nb_surface_water_monitoring.to_csv("data/nb_surface_water_monitoring.csv", sep=',',index=False,encoding='utf-8-sig')

#Shape of the exported data as (rows, columns)
nb_surface_water_monitoring.shape

In [None]:
# Print a summary of the frame: its columns, non-null counts and dtypes.
nb_surface_water_monitoring.info()

In [None]:
# List the column names of the final frame (shown as the cell output).
list(nb_surface_water_monitoring.columns)

#### Visualizations

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### Null Values 

In [None]:

#Plot null values
def plot_null_values(dataset,group_by,year_filter_switch, year_filter):
    """Draw one bar chart per group showing how many null values each column holds.

    The charts are stacked vertically in a single column of subplots, one per
    distinct value of ``group_by``. Only columns that contain at least one
    null value inside the group get a bar.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to inspect. A 'YEAR' column is required only when
        ``year_filter_switch`` is truthy.
    group_by : column label
        Column whose distinct values define the groups.
    year_filter_switch : bool
        When truthy, only rows whose 'YEAR' equals ``year_filter`` are used.
    year_filter
        Value compared against the 'YEAR' column; ignored when the switch is off.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    if year_filter_switch:
        dataset = dataset[dataset['YEAR'] == year_filter]

    # Group by the bare label (not a one-element list) so that group keys are
    # plain values rather than 1-tuples across pandas versions.
    grouped = dataset.groupby(group_by)

    totalCols = 1
    totalRows = math.ceil(grouped.ngroups / totalCols)

    fig = plt.figure(figsize=((totalCols+3)*4,(totalRows+1)*5))
    plt.subplots_adjust(wspace=0.2, hspace=0.7)

    for position, (key, group) in enumerate(grouped, start=1):
        # Count nulls for every column of the group in one pass and keep
        # only the columns that actually have missing values.
        null_counts = group.isna().sum()
        null_counts = null_counts[null_counts > 0]

        # Axes are kept local; nothing is written into the module namespace.
        ax = fig.add_subplot(totalRows, totalCols, position)
        # str() lets non-string group keys (e.g. numeric ids) be used as titles.
        ax.set_title(str(key).upper(), backgroundcolor='gray')

        ax.bar(null_counts.index.tolist(), null_counts.tolist())

        ax.set(xlabel=None)
        ax.tick_params(axis='x', labelrotation=90)

    plt.show()


In [None]:
#plot_null_values(nb_surface_water_monitoring,'STATION_NAME',False,2010)

#### Analyse variables 

In [None]:

#This is just an example of a line graph; the visualization could be improved
def linechart_of_categories(dataset,group_by,time_column,value_column):
    """Plot ``value_column`` over ``time_column`` as one line chart per group.

    ``time_column`` becomes the index (and therefore the x-axis) of each
    plotted series. Groups whose ``value_column`` holds no non-null value are
    skipped. The remaining charts are laid out on a grid three subplots wide.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to plot.
    group_by : column label
        Column whose distinct values define the groups (one subplot each).
    time_column : column label
        Column used as the index of the plotted series.
    value_column : column label
        Column whose values are drawn.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Group by the bare label (not a one-element list) so that group keys are
    # plain values rather than 1-tuples across pandas versions.
    grouped = dataset.set_index(time_column).groupby(group_by)

    # Decide once per group whether it has anything to draw; the same list
    # drives both the grid size and the plotting loop.
    series_to_plot = [
        (key, group[value_column])
        for key, group in grouped
        if group[value_column].notna().any()
    ]

    totalCols = 3
    totalRows = math.ceil(len(series_to_plot) / totalCols)

    fig = plt.figure(figsize=((totalCols+3)*3,(totalRows+1)*5))
    plt.subplots_adjust(wspace=0.2, hspace=0.6)

    for position, (key, series) in enumerate(series_to_plot, start=1):
        # Axes are kept local; nothing is written into the module namespace.
        ax = fig.add_subplot(totalRows, totalCols, position)
        # str() lets non-string group keys (e.g. numeric ids) be used as titles.
        ax.set_title(str(key).upper())
        # Draw on this subplot explicitly instead of relying on the current axes.
        series.plot(ax=ax)

        ax.set(xlabel=None)
        ax.tick_params(axis='x', labelrotation=90)
    plt.show()


In [None]:
#linechart_of_categories(nb_surface_water_monitoring,'STATION_NAME','DATE','COND')

In [None]:
#Display unique values

def unique_values__or_count(listOfColumns,options,dataset):
    """Print a small report for each column named in ``listOfColumns``.

    ``options`` selects the report:

    * ``"unique"`` prints the result of ``Series.unique()`` for the column.
    * ``"count"`` prints the result of ``Series.value_counts()`` for the column.

    Any other value of ``options`` prints nothing. Returns ``None``.
    """
    for column_name in listOfColumns:
        if options == "unique":
            distinct_values = dataset[column_name].unique()
            print(
                "unique_values " + column_name,
                distinct_values,
                "------------------------",
                sep="\n",
            )
        elif options == "count":
            frequencies = dataset[column_name].value_counts()
            print(
                "-----------" + column_name + "------------",
                frequencies,
                "-----------------------",
                sep="\n",
            )

In [None]:
#Print the distinct station names found in the processed data

unique_values__or_count(
    listOfColumns=['STATION_NAME'],
    options="unique",
    dataset=nb_surface_water_monitoring,
)


#### Data Validation

In [None]:
#Import cleaned data (the same path the export cell writes to)

# low_memory=False makes pandas parse the file in one pass rather than in chunks.
nb_surface_water_monitoring_validate = pd.read_csv("data/nb_surface_water_monitoring.csv", low_memory=False)

In [None]:
# List the column names of the re-loaded frame (shown as the cell output).
list(nb_surface_water_monitoring_validate.columns)

In [None]:
# Column of the cleaned data to validate; the matching original column is renamed to this
# name further down so both frames can be merged on it.
Col_name_to_validate = "AS_FLAG"

In [None]:
# Parse DATE with the year/month/day pattern, then keep an independent copy of
# just the merge keys and the column under validation.
nb_surface_water_monitoring_validate = (
    nb_surface_water_monitoring_validate
    .assign(DATE=lambda frame: pd.to_datetime(frame["DATE"], format='%Y/%m/%d'))
    .loc[:, ["STATION_NAME", "DATE", Col_name_to_validate]]
    .copy()
)

In [None]:
#Import original data (one raw, comma-separated source file)

# low_memory=False makes pandas parse the file in one pass rather than in chunks.
nb_surface_water_monitoring_original = pd.read_csv("raw_data/2000-2007.csv", low_memory=False, sep=",")


In [None]:
# Show the raw column names so the column to validate against can be picked.
print(
    "List of Columns",
    nb_surface_water_monitoring_original.columns.to_list(),
    "---------------",
    sep="\n",
)

In [None]:
# Column name in the original file that is renamed to Col_name_to_validate before the comparison.
col_to_validate_with = "As (µg/L) ENV-Lab - Flag"

In [None]:
#Rename the original columns to the names used in the cleaned data
nb_surface_water_monitoring_original = nb_surface_water_monitoring_original.rename(
    columns={
        'Station': 'STATION_NAME',
        'FromDate': 'DATE',
        col_to_validate_with: Col_name_to_validate,
    }
)

#nb_surface_water_monitoring_original = nb_surface_water_monitoring_original.rename(columns=lambda x: clean_column_names(x)[0])

In [None]:
#Change date format (parse DATE with the year/month/day pattern),
#then trim data to validate an analyte: keep only the merge keys and the validated column
nb_surface_water_monitoring_original = (
    nb_surface_water_monitoring_original
    .assign(DATE=lambda frame: pd.to_datetime(frame["DATE"], format='%Y/%m/%d'))
    .loc[:, ["STATION_NAME", "DATE", Col_name_to_validate]]
    .copy()
)

In [None]:
# Right merge: every row of the original file is kept. The indicator column
# 'Exist' records whether a row with the same station, date and value was
# also found in the cleaned data ('both') or only in the original.
nb_surface_water_monitoring_validate_results = nb_surface_water_monitoring_validate.merge(
    nb_surface_water_monitoring_original,
    how='right',
    on=["STATION_NAME", "DATE", Col_name_to_validate],
    indicator='Exist',
)
# Collapse the indicator to a boolean: True when the row exists in both frames.
nb_surface_water_monitoring_validate_results['Exist'] = (
    nb_surface_water_monitoring_validate_results['Exist'].eq('both')
)

In [None]:
# Print how many original rows were / were not found in the cleaned data.
unique_values__or_count(
    listOfColumns=['Exist'],
    options="count",
    dataset=nb_surface_water_monitoring_validate_results,
)

In [None]:
# Original rows that have no exact match in the cleaned data.
list_of_missing_rows = nb_surface_water_monitoring_validate_results.loc[
    ~nb_surface_water_monitoring_validate_results['Exist']
].copy()

list_of_missing_rows.head(5)