# ========================================

# Import Dependencies

# ========================================

In [1]:
import os
# ---------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# ---------------------------------------------------------
from splinter import Browser
# ---------------------------------------------------------
from bs4 import BeautifulSoup as BS
# ---------------------------------------------------------
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import Select
# ---------------------------------------------------------
import urllib.request, urllib.error, urllib.parse
# ---------------------------------------------------------
from itertools import dropwhile
# ---------------------------------------------------------
import time



# import warnings
# import pymongo
# import datetime
# import requests
# import datefinder

# ========================================

# URLs for the Websites

# ========================================

In [None]:
USGS_WaterWatch_url = "https://waterwatch.usgs.gov/index.php?id=wwdrought"

# +++++++++++++++++++++++++++++++++++++++

# ========================================

# Step 00 - Scrape the Tables from USGS' WaterWatch Retrieval Summary of 7-day Flow Conditions Using Pandas

# ========================================

In [None]:
# Importing the USGS' WaterWatch Retrieval Summary of 7-day Flow Conditions Websites
River_Stream_7Day_Flow_Conditions_Tables = pd.read_html("https://waterwatch.usgs.gov/index.php?id=wwdrought")

In [None]:
len(River_Stream_7Day_Flow_Conditions_Tables)

In [None]:
River_Stream_7Day_Flow_Conditions_Tables[5]

# +++++++++++++++++++++++++++++++++++++++

# ========================================

# Step 2 - Scraping the Website with BeautifulSoup

# ========================================

## Scrape a webpage and create a BeautifulSoup object from the results

## 2.1 USGS' WaterWatch

### 2.1.1 Retrieve the data/information on USGS' WaterWatch website

# +++++++++++++++++++++++++++++++++++++++

In [None]:
# identify location of chromedriver and store it as a variable
chromedriver = !which chromedriver
print(type(chromedriver))
chromedriver[0]

In [None]:
# Launch a Chrome browser through splinter
executable_path = {"executable_path": "chromedriver"}
# OR
# executable_path = {"executable_path": chromedriver[0]}
# I am not sure why the statement above works and the one below does not. I think it's because chromedriver is an 'IPython.utils.text.SList' (a list of output lines) rather than a path string.
# executable_path = {"executable_path": chromedriver}

browser = Browser('chrome', **executable_path, headless = False)

In [None]:
# URL of page to be scraped
url = "https://waterwatch.usgs.gov/index.php?id=wwdrought"
browser.visit(url)
# window = browser.windows.current

In [None]:
html = browser.html
soup = BS(html, "html.parser")

In [None]:
type(soup)

In [None]:
# Print the HTML of the USGS WaterWatch page
print(soup.prettify())

In [None]:
# Click the element with id 'st' (the same id the Selenium cells below use for the state drop-down)
browser.click_link_by_id('st')
# browser.fill('st', "Idaho")

In [None]:
# https://stackoverflow.com/questions/19392466/python-beautifulsoup-get-select-value-not-text

for option in soup.find_all('option'):
    print(option)

In [None]:
# from selenium import webdriver
# from selenium.webdriver.support.ui import Select

driver = webdriver.Chrome()
driver.get(USGS_WaterWatch_url)

In [None]:
# https://stackoverflow.com/questions/7867537/how-to-select-a-drop-down-menu-value-with-selenium-using-python

select = Select(driver.find_element_by_id('st'))

# Select by visible text
select.select_by_visible_text('Idaho')

In [None]:
# Fill in Input Fills
#     - https://stackoverflow.com/questions/25537567/how-to-open-website-and-fill-in-input-using-selenium-webdriver
# Clear the Input Field
#     - http://10minbasics.com/clear-fill-input-field-with-selenium/

element = driver.find_element_by_name("bdt")
element.clear()
element.send_keys("1990-01-01")

element = driver.find_element_by_name("edt")
element.clear()
element.send_keys("1990-12-31")

In [None]:
# Press/Click a Button Without an ID
#     - https://stackoverflow.com/questions/8871654/how-to-press-click-the-button-using-selenium-if-the-button-does-not-have-the-id

NEXT_BUTTON_XPATH = '//input[@type="submit" and @value="GO"]'

button = driver.find_element_by_xpath(NEXT_BUTTON_XPATH)
button.click()

In [None]:
# https://stackoverflow.com/questions/5041008/how-to-find-elements-by-class

# test = soup.find_all("div", class_= "ztable")
# test

# https://stackoverflow.com/questions/20522820/how-to-get-tbody-from-table-from-python-beautiful-soup
soup.findAll('table')[0].findAll('tr')

In [None]:
# Grab All Page Source on the Page
soup_lxml = BS(driver.page_source, "lxml")

# Find All the Tables on the Page
tables = soup_lxml.find_all("table")
tables

In [None]:
# Read the Tables with Pandas
dfs = pd.read_html(str(tables))

In [None]:
# Access the Table
print(f"Number of Tables on the page: {len(dfs)}")
print("*********************************************************************************************************")
print(f"Data Types for the Table: {dfs[11].dtypes}")
print("*********************************************************************************************************")
print(f"Number of Rows in the dataframe: {len(dfs[11])}")
dfs[11].head()

In [None]:
print(dfs[11]["USGSstationnumber"].value_counts())

In [None]:
import requests, json
text = requests.get("https://waterwatch.usgs.gov/index.php?id=wwdrought").text
data = json.loads(text)
print(data['Scty'])

In [None]:
for tr in soup.find_all('tr')[2:]:
    tds = tr.find_all('td')
    print (tds)#"Nome: %s, Cognome: %s, Email: %s" % \
#           (tds[0].text, tds[1].text, tds[2].text)

# +++++++++++++++++++++++++++++++++++++++

# ========================================

# Step 1 - Scraping

# ========================================

## Scrape a webpage and create a BeautifulSoup object from the results

## 2.1 USGS' Science for a Changing World

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

In [2]:
# from selenium import webdriver
# from selenium.webdriver.support.ui import Select

url = "https://waterwatch.usgs.gov/index.php?id=wwdrought"

driver = webdriver.Chrome()
driver.get(url)

In [3]:
# https://stackoverflow.com/questions/12323403/how-do-i-find-an-element-that-contains-specific-text-in-selenium-webdriver-pyth
# https://selenium-python.readthedocs.io/locating-elements.html

driver.find_element_by_xpath("//*[contains(text(), 'Current Streamflow')]").click()

In [4]:
# https://stackoverflow.com/questions/52873433/python-selenium-clicking-based-on-alt-attribute

driver.find_element_by_css_selector('[alt="id"]').click()

In [5]:
# Press/Click a Button Without an ID
#     - https://stackoverflow.com/questions/8871654/how-to-press-click-the-button-using-selenium-if-the-button-does-not-have-the-id

lst_all_statns = '//input[@type="radio" and @value="statelist"]'

button = driver.find_element_by_xpath(lst_all_statns)
button.click()

In [6]:
# https://stackoverflow.com/questions/7867537/how-to-select-a-drop-down-menu-value-with-selenium-using-python

select = Select(driver.find_element_by_id('select_display'))

# Select by visible text
# select.select_by_visible_text('Daily Stage and Streamflow')

# Select by value text
select.select_by_value('dailystagedischarge')

In [7]:
# https://stackoverflow.com/questions/7867537/how-to-select-a-drop-down-menu-value-with-selenium-using-python

select = Select(driver.find_element_by_id('group_table_by'))

# Select by visible text
# select.select_by_visible_text('Daily Stage and Streamflow')

# Select by value text
select.select_by_value('county_cd')

In [8]:
# Press/Click a Button Without an ID
#     - https://stackoverflow.com/questions/8871654/how-to-press-click-the-button-using-selenium-if-the-button-does-not-have-the-id

sbmt_bttn = '//input[@type="submit" and @value="go"]'

button = driver.find_element_by_xpath(sbmt_bttn)
button.click()

In [None]:
# Find all the station numbers: scrape the table that lists every station, then use that table
# to loop through and click each station's link.

In [9]:
crrnt_url = driver.current_url

In [10]:
# https://stackoverflow.com/questions/5041008/how-to-find-elements-by-class

statn_table = pd.read_html(crrnt_url)
print(f"Number of Tables on the current page: {len(statn_table)}")
statn_table[1]

Number of Tables on the current page: 3


Unnamed: 0,StationNumber,Station name,Dailymeangage height(ft)2/26,Dailymeanstream- flow (ft3/s)2/26
0,Ada County,Ada County,Ada County,Ada County
1,13206000,BOISE RIVER AT GLENWOOD BRIDGE NR BOISE ID,,271
2,13206305,BOISE RIVER SOUTH CHANNEL AT EAGLE ID,,235
3,13206400,"EAGLE DRAIN AT EAGLE, ID",,7.38
4,Bannock County,Bannock County,Bannock County,Bannock County
...,...,...,...,...
271,13011500,"PACIFIC CREEK AT MORAN, WY",,--
272,13011900,BUFFALO FORK AB LAVA CREEK NR MORAN WY,,--
273,13013650,"SNAKE RIVER AT MOOSE, WY",,849
274,13015000,"GROS VENTRE RIVER AT ZENITH, WY",,--


In [None]:
print(statn_table[1].dtypes)

In [11]:
statn_table_df = statn_table[1]

In [12]:
# Separate the text from the digits in the "StationNumber" column.
# https://stackoverflow.com/questions/56851679/how-to-separate-pandas-column-that-contains-values-stored-as-text-and-numbers-in
#
# County header rows hold a name in "StationNumber" and station rows hold digits, so the
# pattern yields two new columns: "numbers" (the leading digits) and "text" (the non-digit
# remainder). Whichever group does not match is filled with an empty string.
#
# NOTE: pop() removes "StationNumber" from statn_table_df itself, so this cell cannot be
# re-run without re-creating statn_table_df first.

# Raw string so that \d and \D reach the regex engine instead of being treated as
# (invalid) Python string escapes.
statn_nmbr_parts = statn_table_df.pop('StationNumber').str.extract(r'(?P<numbers>\d+)?(?P<text>\D+)?').fillna('')

statn_table_df_splt_StatnNmbr = statn_table_df.join(statn_nmbr_parts)
statn_table_df_splt_StatnNmbr

Unnamed: 0,Station name,Dailymeangage height(ft)2/26,Dailymeanstream- flow (ft3/s)2/26,numbers,text
0,Ada County,Ada County,Ada County,,Ada County
1,BOISE RIVER AT GLENWOOD BRIDGE NR BOISE ID,,271,13206000,
2,BOISE RIVER SOUTH CHANNEL AT EAGLE ID,,235,13206305,
3,"EAGLE DRAIN AT EAGLE, ID",,7.38,13206400,
4,Bannock County,Bannock County,Bannock County,,Bannock County
...,...,...,...,...,...
271,"PACIFIC CREEK AT MORAN, WY",,--,13011500,
272,BUFFALO FORK AB LAVA CREEK NR MORAN WY,,--,13011900,
273,"SNAKE RIVER AT MOOSE, WY",,849,13013650,
274,"GROS VENTRE RIVER AT ZENITH, WY",,--,13015000,


In [None]:
# # NEW CELL

# if statn_table_df_splt_StatnNmbr["text"][1] == "":
#     print("No County")

In [13]:
# Create a one-column dataframe holding only the "numbers" column of the
# "statn_table_df_splt_StatnNmbr" Dataframe.
# NOTE(review): pd.DataFrame(<Series>) may share memory with the source column, so the
# in-place replace below could also affect "statn_table_df_splt_StatnNmbr" — confirm.
statn_table_df_splt_StatnNmbr_nmbrs_clmn = pd.DataFrame(statn_table_df_splt_StatnNmbr["numbers"])

# Replace the empty strings (rows where no digits were extracted, i.e. the county header rows)
# with "NaN" so those rows can be dropped with dropna().
# https://www.kite.com/python/answers/how-to-drop-empty-rows-from-a-pandas-dataframe-in-python

nan_value = float("NaN")
statn_table_df_splt_StatnNmbr_nmbrs_clmn.replace("", nan_value, inplace=True)

In [None]:
# Count the Number of Null Values in the Dataframe
# https://stackoverflow.com/questions/26266362/how-to-count-the-nan-values-in-a-column-in-pandas-dataframe

statn_table_df_splt_StatnNmbr_nmbrs_clmn.isna().sum()

In [14]:
# Remove the "NaN" Null Values
statn_table_df_splt_StatnNmbr_nmbrs_clmn_no_nulls = statn_table_df_splt_StatnNmbr_nmbrs_clmn.dropna()
print(f"Number of Null Values: {statn_table_df_splt_StatnNmbr_nmbrs_clmn_no_nulls.isna().sum()}")
print("*********************************************************************************************************")
print(statn_table_df_splt_StatnNmbr_nmbrs_clmn_no_nulls.dtypes)

Number of Null Values: numbers    0
dtype: int64
*********************************************************************************************************
numbers    object
dtype: object


In [None]:
# # Convert the "numbers" column to an Integer Data Type

# statn_table_df_splt_StatnNmbr_nmbrs_clmn_no_nulls["numbers"] = statn_table_df_splt_StatnNmbr_nmbrs_clmn_no_nulls["numbers"].astype(int)

# print(statn_table_df_splt_StatnNmbr_nmbrs_clmn_no_nulls.dtypes)

In [None]:
statn_table_df_splt_StatnNmbr_nmbrs_clmn_no_nulls

In [15]:
bgn_date = "1990-01-01"
end_date = "1990-01-31"

In [21]:
# For every station number: open the station page, download its tab-separated daily data
# for the bgn_date..end_date window into Data/Idaho_Streamflow_Data/<station>.txt, then read
# the extended statistics table (min, max, median, mean, 25th and 75th percentile) and
# store those values in the station's row of the station table.

# Alias, not a copy: everything written through "statn_table_df_splt_StatnNmbr_test" below
# is written into "statn_table_df_splt_StatnNmbr" itself.
statn_table_df_splt_StatnNmbr_test = statn_table_df_splt_StatnNmbr

# Make sure the download folder exists before the first station file is written.
os.makedirs("Data/Idaho_Streamflow_Data", exist_ok=True)

for row in statn_table_df_splt_StatnNmbr_nmbrs_clmn_no_nulls.index:

    # Station number (a string of digits) for the current row
    statn_nmbr = statn_table_df_splt_StatnNmbr_nmbrs_clmn_no_nulls["numbers"][row]

    # Click the element whose text contains the station number
    # https://stackoverflow.com/questions/32874539/using-a-variable-in-xpath-in-python-selenium
    driver.find_element_by_xpath("//*[contains(text(),'" +statn_nmbr+"')]").click()

    # Select the Tab-separated Output format
    # NOTE(review): a recorded run of this cell stopped here with NoSuchElementException
    # because this radio button could not be located; consider skipping such stations.
    lst_all_statns = '//input[@type="radio" and @value="rdb"]'

    lst_all_statns_button = driver.find_element_by_xpath(lst_all_statns)
    lst_all_statns_button.click()

    # Enter Values for the Begin Date and End Date
    # Fill in Input Fills
    #     - https://stackoverflow.com/questions/25537567/how-to-open-website-and-fill-in-input-using-selenium-webdriver
    # Clear the Input Field
    #     - http://10minbasics.com/clear-fill-input-field-with-selenium/
    element = driver.find_element_by_name("begin_date")
    element.clear()
    element.send_keys(bgn_date)

    element = driver.find_element_by_name("end_date")
    element.clear()
    element.send_keys(end_date)

    # Press/Click the submit button
    #     - https://stackoverflow.com/questions/8871654/how-to-press-click-the-button-using-selenium-if-the-button-does-not-have-the-id
    #     - https://stackoverflow.com/questions/21322116/using-selenium-in-python-to-click-select-a-radio-button/21322160
    sbmt_bttn = '//input[@id="go_available_button"]'

    sbmt_bttn_button = driver.find_element_by_xpath(sbmt_bttn)
    sbmt_bttn_button.click()

    # Save the data file to This Computer
    # How to Open and Write to a File on This Computer
    #     - https://programminghistorian.org/en/lessons/working-with-web-pages
    # How to Change the Location of the File
    #     - https://www3.ntu.edu.sg/home/ehchua/programming/webprogramming/Python_FileText.html
    # The with-blocks close the HTTP response and the output file even if a read/write fails.
    with urllib.request.urlopen(driver.current_url) as response:
        webContent = response.read()

    fle_nm = "Data/Idaho_Streamflow_Data/" + statn_nmbr + ".txt"

    with open(fle_nm, 'wb') as f:
        f.write(webContent)

    # Go back to the original URL for the station
    driver.back()

    # Select the "Time-series: Current/Historical Observations" entry from the dropdown list;
    # this leads to a page which includes a table with extended streamflow statistics.
    select = Select(driver.find_element_by_id("select_data_1"))
    select.select_by_visible_text("Time-series:   Current/Historical Observations")

    # Get the extended year streamflow min, max, median, mean, 25th percentile, and 75th percentile
    crrnt_hstrcl_obsrvtns_url = driver.current_url
    crrnt_hstrcl_obsrvtns_html = pd.read_html(crrnt_hstrcl_obsrvtns_url)
    print(f'Station Number: {statn_nmbr}')
    print(f"Number of Tables on the current page: {len(crrnt_hstrcl_obsrvtns_html)}")

    if len(crrnt_hstrcl_obsrvtns_html) == 2:

        # The second table on the page is taken as the statistics table
        extndd_yrs_sttstcs = crrnt_hstrcl_obsrvtns_html[1]

        # Reference:
        #     - Find column whose name contains a specific string:
        #         - https://stackoverflow.com/questions/21285380/find-column-whose-name-contains-a-specific-string
        # These four column names are located by substring; "Median" and "Mean" are
        # addressed by their exact names further down.
        extndd_yrs_sttstcs_cols = [col_nm for col_nm in extndd_yrs_sttstcs.columns if 'Min' in col_nm]
        min_strmflw = extndd_yrs_sttstcs_cols[0]

        extndd_yrs_sttstcs_cols = [col_nm for col_nm in extndd_yrs_sttstcs.columns if 'Max' in col_nm]
        max_strmflw = extndd_yrs_sttstcs_cols[0]

        extndd_yrs_sttstcs_cols = [col_nm for col_nm in extndd_yrs_sttstcs.columns if '25th' in col_nm]
        _25th_prcntle_strmflw = extndd_yrs_sttstcs_cols[0]

        extndd_yrs_sttstcs_cols = [col_nm for col_nm in extndd_yrs_sttstcs.columns if '75th' in col_nm]
        _75th_prcntle_strmflw = extndd_yrs_sttstcs_cols[0]

        # References:
        #     - Return the Index label if some condition is satisfied over a column in Pandas Dataframe:
        #         - geeksforgeeks.org/return-the-index-label-if-some-condition-is-satisfied-over-a-column-in-pandas-dataframe/
        #     - Pandas update a cell:
        #         - https://kanoki.org/2019/04/12/pandas-how-to-get-a-cell-value-and-update-it/

        # Find the Index for the Station
        indx_lbl = statn_table_df_splt_StatnNmbr[statn_table_df_splt_StatnNmbr["numbers"] == statn_nmbr].index.tolist()
        print(f'Index No: {indx_lbl[0]}')

        # Write the first row of each extended-water-years statistic into the station's row of
        # the "statn_table_df_splt_StatnNmbr_test" Dataframe (a column is created on first use).
        statn_table_df_splt_StatnNmbr_test.at[indx_lbl[0], "extndd_yrs_min"] = extndd_yrs_sttstcs[min_strmflw][0]
        statn_table_df_splt_StatnNmbr_test.at[indx_lbl[0], "extndd_yrs_max"] = extndd_yrs_sttstcs[max_strmflw][0]
        statn_table_df_splt_StatnNmbr_test.at[indx_lbl[0], "extndd_yrs_median"] = extndd_yrs_sttstcs["Median"][0]
        statn_table_df_splt_StatnNmbr_test.at[indx_lbl[0], "extndd_yrs_mean"] = extndd_yrs_sttstcs["Mean"][0]
        statn_table_df_splt_StatnNmbr_test.at[indx_lbl[0], "25th_prcntle"] = extndd_yrs_sttstcs[_25th_prcntle_strmflw][0]
        statn_table_df_splt_StatnNmbr_test.at[indx_lbl[0], "75th_prcntle"] = extndd_yrs_sttstcs[_75th_prcntle_strmflw][0]

    elif len(crrnt_hstrcl_obsrvtns_html) < 2:
        print("No Extened Water Statistics")

    print("********************************************************************************")

    # Go back to the URL with the list of Stations and Counties
    driver.back()
    driver.back()

Station Number: 13206000
Number of Tables on the current page: 2
Index No: 1
********************************************************************************
Station Number: 13206305
Number of Tables on the current page: 2
Index No: 2
********************************************************************************
Station Number: 13206400
Number of Tables on the current page: 2
Index No: 3
********************************************************************************
Station Number: 13073000
Number of Tables on the current page: 2
Index No: 5
********************************************************************************
Station Number: 13075000
Number of Tables on the current page: 2
Index No: 6
********************************************************************************
Station Number: 13075500
Number of Tables on the current page: 2
Index No: 7
********************************************************************************
Station Number: 13075910
Number of Tables on the cur

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//input[@type="radio" and @value="rdb"]"}
  (Session info: chrome=88.0.4324.192)


# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

In [None]:
from selenium.webdriver import ActionChains

actionChains = ActionChains(driver)

In [None]:
# Save the data file to This Computer

# How to Open and Write to a File on This Computer
#     - https://programminghistorian.org/en/lessons/working-with-web-pages
# How to Change the Location of the File
#     - https://www3.ntu.edu.sg/home/ehchua/programming/webprogramming/Python_FileText.html

import urllib.request, urllib.error, urllib.parse
import os

# url='https://waterdata.usgs.gov/id/nwis/dv?cb_00060=on&format=rdb&site_no=13206000&referred_module=sw&period=&begin_date=1990-01-01&end_date=1990-12-31'

# Download the page the browser is currently on and store it under the station's number.
# The with-blocks close the HTTP response and the output file even if a read/write fails.
with urllib.request.urlopen(driver.current_url) as response:
    webContent = response.read()

output_fle_nm = "Data/Idaho_Streamflow_Data/" + statn_nmbr + ".txt"

with open(output_fle_nm, 'wb') as f:
    f.write(webContent)

In [None]:
driver.back()
driver.back()

In [None]:
crrnt_hstrcl_obsrvtns_url = driver.current_url

In [None]:
# https://stackoverflow.com/questions/5041008/how-to-find-elements-by-class

crrnt_hstrcl_obsrvtns_html = pd.read_html(crrnt_hstrcl_obsrvtns_url)
print(f"Number of Tables on the current page: {len(crrnt_hstrcl_obsrvtns_html)}")
print(type(crrnt_hstrcl_obsrvtns_html[1]))

extndd_yrs_sttstcs = crrnt_hstrcl_obsrvtns_html[1]
extndd_yrs_sttstcs

In [None]:
# Reference: 
#     - Find column whose name contains a specific string:
#         - https://stackoverflow.com/questions/21285380/find-column-whose-name-contains-a-specific-string

print(extndd_yrs_sttstcs.columns)
extndd_yrs_sttstcs_cols = [col_nm for col_nm in extndd_yrs_sttstcs.columns if 'Min' in col_nm]
min_strmflw = extndd_yrs_sttstcs_cols[0]

extndd_yrs_sttstcs_cols = [col_nm for col_nm in extndd_yrs_sttstcs.columns if 'Max' in col_nm]
max_strmflw = extndd_yrs_sttstcs_cols[0]

extndd_yrs_sttstcs_cols = [col_nm for col_nm in extndd_yrs_sttstcs.columns if '25th' in col_nm]
_25th_prcntle_strmflw = extndd_yrs_sttstcs_cols[0]

extndd_yrs_sttstcs_cols = [col_nm for col_nm in extndd_yrs_sttstcs.columns if '75th' in col_nm]
_75th_prcntle_strmflw = extndd_yrs_sttstcs_cols[0]

In [None]:
# Using the Median because it was the most drastic/lowest streamflow
extndd_yrs_sttstcs["Median"]

In [None]:
# References:
#     - Return the Index label if some condition is satisfied over a column in Pandas Dataframe:
#         - geeksforgeeks.org/return-the-index-label-if-some-condition-is-satisfied-over-a-column-in-pandas-dataframe/
#     - Pandas update a cell:
#         - https://kanoki.org/2019/04/12/pandas-how-to-get-a-cell-value-and-update-it/

statn_nmbr = "13206305"

# Find the Index for the Station  
indx_lbl = statn_table_df_splt_StatnNmbr[statn_table_df_splt_StatnNmbr["numbers"] == statn_nmbr].index.tolist()
print(indx_lbl[0])

# Write the Extended Water Years statistics into the station's row of the
# "statn_table_df_splt_StatnNmbr_test" Dataframe
# statn_table_df_splt_StatnNmbr_test = statn_table_df_splt_StatnNmbr.append({"extndd_yrs_min": indx_lbl}, ignore_index=True)
statn_table_df_splt_StatnNmbr_test.at[indx_lbl[0], "extndd_yrs_min"] = extndd_yrs_sttstcs[min_strmflw][0]

# statn_table_df_splt_StatnNmbr_test = statn_table_df_splt_StatnNmbr.append({"extndd_yrs_max": indx_lbl}, ignore_index=True)
statn_table_df_splt_StatnNmbr_test.at[indx_lbl[0], "extndd_yrs_max"] = extndd_yrs_sttstcs[max_strmflw][0]

# statn_table_df_splt_StatnNmbr_test = statn_table_df_splt_StatnNmbr.append({"extndd_yrs_median": indx_lbl}, ignore_index=True)
statn_table_df_splt_StatnNmbr_test.at[indx_lbl[0], "extndd_yrs_median"] = extndd_yrs_sttstcs["Median"][0]

# statn_table_df_splt_StatnNmbr_test = statn_table_df_splt_StatnNmbr.append({"extndd_yrs_mean": indx_lbl}, ignore_index=True)
statn_table_df_splt_StatnNmbr_test.at[indx_lbl[0], "extndd_yrs_mean"] = extndd_yrs_sttstcs["Mean"][0]

# statn_table_df_splt_StatnNmbr_test = statn_table_df_splt_StatnNmbr.append({"25th_prcntle": indx_lbl}, ignore_index=True)
statn_table_df_splt_StatnNmbr_test.at[indx_lbl[0], "25th_prcntle"] = extndd_yrs_sttstcs[_25th_prcntle_strmflw][0]

# statn_table_df_splt_StatnNmbr_test = statn_table_df_splt_StatnNmbr.append({"75th_prcntle": indx_lbl}, ignore_index=True)
statn_table_df_splt_StatnNmbr_test.at[indx_lbl[0], "75th_prcntle"] = extndd_yrs_sttstcs[_75th_prcntle_strmflw][0]

# I need to Do the above without knowing the column names!!!!

In [25]:
statn_table_df_splt_StatnNmbr_test.head(19)

Unnamed: 0,Station name,Dailymeangage height(ft)2/26,Dailymeanstream- flow (ft3/s)2/26,numbers,text,extndd_yrs_min,extndd_yrs_max,extndd_yrs_median,extndd_yrs_mean,25th_prcntle,75th_prcntle
0,Ada County,Ada County,Ada County,,Ada County,,,,,,
1,BOISE RIVER AT GLENWOOD BRIDGE NR BOISE ID,,271,13206000.0,,110.0,7020.0,274.0,1190.0,216.0,1270.0
2,BOISE RIVER SOUTH CHANNEL AT EAGLE ID,,235,13206305.0,,103.0,2570.0,219.0,410.0,167.0,273.0
3,"EAGLE DRAIN AT EAGLE, ID",,7.38,13206400.0,,6.11,23.0,7.7,9.9,6.1,13.0
4,Bannock County,Bannock County,Bannock County,,Bannock County,,,,,,
5,PORTNEUF RIVER AT TOPAZ ID,,118,13073000.0,,89.0,823.0,153.0,172.0,128.0,191.0
6,MARSH CREEK NR MCCAMMON ID,,53.2,13075000.0,,39.0,306.0,90.0,99.0,67.0,113.0
7,PORTNEUF RIVER AT POCATELLO ID,,210,13075500.0,,159.0,1210.0,300.0,333.0,249.0,372.0
8,PORTNEUF RIVER NR TYHEE ID,,383,13075910.0,,335.0,1470.0,501.0,546.0,430.0,559.0
9,Bear Lake County,Bear Lake County,Bear Lake County,,Bear Lake County,,,,,,


# ========================================

# Step 2 - Create a Dataframe for Each Station

# ========================================

## Scrape a webpage and create a BeautifulSoup object from the results

## 2.1 USGS' Science for a Changing World

# +++++++++++++++++++++++++++++++++++++++

In [None]:
# The function to Skip All of the Comments in the File
#     https://cmdlinetips.com/2018/01/3-ways-to-read-a-file-and-skip-initial-comments-in-python/

def is_comment(s):
    """Tell whether a line of text is a comment line.

    A line counts as a comment when its very first character is ``#``.
    """
    comment_marker = '#'
    return s.startswith(comment_marker)

In [None]:
# Column Names for the Dataframe
clmn_nms = ["agency", 
            "site_nmbr", 
            "date", 
            "streamflow_rate", 
            "approved/pending"]

In [None]:
# Location of the Files
input_fle_path = os.path.join("Data", "Idaho_Streamflow_Data")

In [None]:
# Dictionary of Data Types for the Dataframe
convert_dict = {
                "streamflow_rate": float
               } 

In [None]:
# List of Files in a Directory
#     - https://careerkarma.com/blog/python-list-files-in-directory/

input_fle_lst = os.listdir("Data/Idaho_Streamflow_Data")
print(input_fle_lst)
# input_fle_nm = input_fle_lst[0]
# print(input_fle_nm)

In [None]:
# How to Ignore Hidden Files
#     - https://stackoverflow.com/questions/15235823/how-to-ignore-hidden-files-in-python-functions

# For every visible regular file in the input folder, record its station number (the file
# name without its extension), the file name itself, and the name to use for its dataframe.
input_fle_dict = {"Station_Nmbr":[], "File_Name": [], "df_Name": []}

for input_fle_nm in os.listdir(input_fle_path):
    if not input_fle_nm.startswith('.') and os.path.isfile(os.path.join(input_fle_path, input_fle_nm)):

        # splitext() drops the extension whatever its length, instead of assuming
        # that the name always ends in exactly four characters such as ".txt".
        fle_stem = os.path.splitext(input_fle_nm)[0]

        # Append the file's details to the dictionary
        input_fle_dict["Station_Nmbr"].append(fle_stem)
        input_fle_dict["File_Name"].append(input_fle_nm)
        input_fle_dict["df_Name"].append("_" + fle_stem + "_df")
#         print(input_fle_nm)

In [None]:
statn_table_df_splt_StatnNmbr["text"].last_valid_index()

In [None]:
# Tally the rows whose "text" entry is not blank
count_1 = sum(1 for txt_val in statn_table_df_splt_StatnNmbr["text"] if txt_val != "")

print(count_1)

# +++++++++++++++++++++++++++++++++++++++

In [None]:
cnty_lst = [{}for x in range(51)]
# cnty_lst[2].append(2050)
# cnty_lst[1]
cnty_lst

In [None]:
# Create a list of list
#     - https://stackoverflow.com/questions/8713620/appending-items-to-a-list-of-lists-in-python


# Build one slot per county in the form [county_name, [station_dict, ...]].
# Rows with a non-empty "text" value are county headers; rows with an empty "text" value
# are stations belonging to the most recent county header.
# NOTE(review): 51 slots are pre-allocated; a table with more than 51 counties would raise
# an IndexError below.
cnty_lst = [[]for x in range(51)]

count_1 = 0  # position of the current row in the table
count_2 = 0  # slot in cnty_lst of the county currently being filled
count_3 = 0

statn_lst = []
new_dict = {}

txt_clmn = statn_table_df_splt_StatnNmbr["text"]
nmbrs_clmn = statn_table_df_splt_StatnNmbr["numbers"]
last_row_pstn = len(txt_clmn) - 1

for row in txt_clmn:

    # *********************************************************************************************
    #                               Step 1: If the row is Not Empty
    # *********************************************************************************************
    if row != "":
        print(row)
        cnty_lst[count_2].append(row)
        cnty_lst[count_2].append([])

    # *********************************************************************************************
    #                   Step 2: If the row is Empty (a station row)
    # *********************************************************************************************
    else:
        # Only look at the next row when there is one; the final row of the table has no
        # successor and is treated as the last station of its county.
        nxt_row_is_statn = count_1 < last_row_pstn and txt_clmn[count_1 + 1] == ""

        if nxt_row_is_statn:
            print(nmbrs_clmn[count_1])

        new_dict = {"Station_Nmbr": nmbrs_clmn[count_1],
                    "File_Name": nmbrs_clmn[count_1] + ".txt",
                    "df_Name": nmbrs_clmn[count_1] + "_df",
                    "Data": "",
                    "Avg_Streamflow": "",
                    "Prcnt_Below_Avg": ""}

        cnty_lst[count_2][1].append(dict(new_dict))

        # The county is complete once the next row is not another station: move to the next slot
        if not nxt_row_is_statn:
            count_2 = count_2 + 1

    # *********************************************************************************************
    #                               Step 3: Add 1 to the Count
    # *********************************************************************************************
    count_1 = count_1 + 1
    

In [None]:
# statn_lst

# cnty_lst[0][0].append(input_fle_dict)
# cnty_lst[0][1].append("test")
# print(cnty_lst)
# Walk into the first county entry one nesting level at a time:
# the entry itself, its station list, the first station, that station's number.
_frst_cnty = cnty_lst[0]
print(_frst_cnty)
print("********************************************************************")
_frst_cnty_statns = _frst_cnty[1]
print(_frst_cnty_statns)
print("********************************************************************")
_frst_statn = _frst_cnty_statns[0]
print(_frst_statn)
print("********************************************************************")
print(_frst_statn["Station_Nmbr"])

# +++++++++++++++++++++++++++++++++++++++

In [None]:
def _new_statn_dict(statn_nmbr):
    """Return a blank per-station record for the station number *statn_nmbr*."""
    return {"Station_Nmbr": statn_nmbr,
            "File_Name": statn_nmbr + ".txt",
            "df_Name": statn_nmbr + "_df",
            "Data": "",
            "Avg_Streamflow": "",
            "Prcnt_Below_Avg": ""}


# Group the stations under the county row that precedes them.
#   - a non-empty "text" value starts a new county (the value is the county name)
#   - an empty "text" value is a station row; its number is in the "numbers" column
# The result is a list of one-item dictionaries: {county name: [station dicts]}.
#
# The columns are read positionally, which matches the label-based lookups used
# before as long as the dataframe has its default 0..n-1 index.
text_rows = statn_table_df_splt_StatnNmbr["text"].tolist()
statn_nmbrs = statn_table_df_splt_StatnNmbr["numbers"].tolist()

# Derived from the data instead of the previously hard-coded 276 rows
last_row = len(text_rows) - 1

cnty_lst = []
statn_lst = []  # Stations collected for the county currently being read
new_dict = {}

for count_1, row in enumerate(text_rows):
    # Step 1: a county header row starts a fresh station list
    if row != "":
        statn_lst = []
        cnty_nm = row
        continue

    # Step 2: a station row is added to the current county
    new_dict = _new_statn_dict(statn_nmbrs[count_1])
    statn_lst.append(dict(new_dict))

    # Step 3: the county is complete when the table ends or the next row is a
    # new county header
    if count_1 == last_row or text_rows[count_1 + 1] != "":
        cnty_lst.append({cnty_nm: statn_lst})
# *********************************************************************************************
    

In [None]:
import pprint
pprint.pprint(cnty_lst)

In [None]:
# statn_lst
# cnty_lst = {cnty_nm: statn_lst}
# cnty_lst[0][0].append(input_fle_dict)
# cnty_lst[0][1].append("test")
# print(cnty_lst)
# print("********************************************************************")
# print(new_dict)
# print(statn_lst)
# Drill into entry 50 of cnty_lst: the county dictionary, its station list,
# the first station record and finally that record's station number.
_cnty_entry = cnty_lst[50]
pprint.pprint(_cnty_entry)
print("********************************************************************")
_teton_statns = _cnty_entry["Teton County, Wyoming"]
pprint.pprint(_teton_statns)
print("********************************************************************")
_teton_frst_statn = _teton_statns[0]
pprint.pprint(_teton_frst_statn)
print("********************************************************************")
pprint.pprint(_teton_frst_statn["Station_Nmbr"])

In [None]:
# Look up the first county's name (its only dictionary key) and use it to reach
# the file name of that county's first station.
key_list = list(cnty_lst[0].keys())
val_list = list(cnty_lst[0].values())
key_list[0]
print(cnty_lst[0][key_list[0]][0]["File_Name"])

# Path of that station's .txt file (the value is only displayed, not stored)
input_fle_path + "/" + cnty_lst[0][key_list[0]][0]["File_Name"]
# val_list

# position = val_list.index(1)


# NOTE(review): everything below is indented but has no enclosing `for` in this
# cell, so the cell cannot run as written (IndentationError). It is a copy of
# the body of the `for i, a_dict in enumerate(statn_data_lst_of_dicts):` loop
# found later in this file; it reads `i` and `input_fle_path_fr_lp`, neither of
# which is set in this cell. Presumably a loop over cnty_lst was going to be
# added here -- TODO confirm and either add the loop header or delete the copy.
# *********************************************************************************************
# *********************************************************************************************
#     Create the Dataframe to Store the Streamflow Data from the .txt File
    df = pd.DataFrame(columns = clmn_nms)
#     df_nm = "_" + statn_nm + "_df"
# *********************************************************************************************

# *********************************************************************************************
#                           Step 2 Create a Dataframe for the Streamflow
# *********************************************************************************************
# Loop Through the .txt File and Store the Data into a Dataframe
    with open(input_fle_path_fr_lp,'r') as fh:
        for curline in dropwhile(is_comment, fh):
    #         print(f"Index Number: {count} {curline}")
    #         count = count + 1



    # Split a String
    #     - https://www.geeksforgeeks.org/python-string-split/
    # Pandas Series
    #     - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html
            # Drop the last character (presumably the newline) and split on tabs
            to_append = curline[:-1].split("\t")
            a_series = pd.Series(to_append, index = clmn_nms)

    #             Dataframe
    #             Dataframe Name
            # NOTE(review): statn_nm is assigned but not used below
            statn_nm = input_fle_nm[:-4]

    #             Append Data to the Dataframe
            df= df.append(a_series, ignore_index=True)
    
    
    
    
#     print(input_fle_nm)
# Delete the first 2 Rows of the Dataframe Because they are not Data
    df = df.drop(index = [0, 1])
# Reset the Index so that it Starts with 0
    df = df.reset_index(drop = True)
# Change the Data Types of Each Column
    df = df.astype(convert_dict) 
# Change the Date Column to a datetime Data Type
    df['date']= pd.to_datetime(df['date'])
    
    # Mean of the station's streamflow column
    avg_strmflw = df["streamflow_rate"].mean(axis = 0)
    print(f"Average Streamflow: {avg_strmflw}")
#     print(df["streamflow_rate"])
#     print("********************************************************************")
    print (df)
    
    # count = number of rows whose streamflow is below the station's mean
    count = 0
    print(len(df))
    
    for i_df in range(len(df)):
#         print (df["streamflow_rate"][2])
        print (df["streamflow_rate"][i_df])
#         print (df_row)
        if df["streamflow_rate"][i_df] < avg_strmflw:
            count = count + 1
            
            print ("True")
        elif df["streamflow_rate"][i_df] > avg_strmflw:
            print ("False")
        print("********************************************************************")

    # Percentage of rows below the mean
    pct_blw_avg = (count / len(df) * 100)
    print (pct_blw_avg)
# Add a value into an empty dictionay element
#     - https://www.pluralsight.com/guides/manipulating-lists-dictionaries-python
    statn_data_lst_of_dicts[i].update({"Data": df})
    statn_data_lst_of_dicts[i].update({"Avg_Streamflow": avg_strmflw})
    statn_data_lst_of_dicts[i].update({"Prcnt_Below_Avg": pct_blw_avg})
#     input_fle_dict["Data"].append(df)

    # df_nm = df_nm.drop(index = [0, 1])
    # "_" + statn_nm + "_df" = df
    # df_nm
    # df
    # df.drop(index = [0, 1])

#### def find(name, path):
# Print the full path of every "13073000.txt" found anywhere under input_fle_path
for root, dirs, files in os.walk(input_fle_path):
    if "13073000.txt" not in files:
        continue
    _statn_fle_path = os.path.join(root, "13073000.txt")
    print (_statn_fle_path)

In [None]:
def find_all(name, path):
    """Return the full path of every file called *name* below *path*.

    The directory tree rooted at *path* is walked with ``os.walk``; each
    directory that contains a file named exactly *name* contributes one entry.

    Example: ``find_all("13073000.txt", input_fle_path)``

    Args:
        name: File name to look for (no directory part).
        path: Directory in which the search starts.

    Returns:
        A list of paths (``os.path.join(directory, name)``); empty when no
        file matches.
    """
    return [os.path.join(root, name)
            for root, _dirs, files in os.walk(path)
            if name in files]

In [None]:
# How to Ignore Hidden Files
#     - https://stackoverflow.com/questions/15235823/how-to-ignore-hidden-files-in-python-functions

# # *********************************************************************************************
# # Create a List of the Files in the Directory
# input_fle_dict = {"Station_Nmbr":[], "File_Name": [], "df_Name": [], "Data": [], "Avg_Streamflow": []}
# # *********************************************************************************************

# Create a List to Store/Save the Streamflow Data, Data will be Saved as a List of Dictionaries
statn_data_lst_of_dicts = []
input_fle_dict = {}



# # *********************************************************************************************
# Put the Station in a list by County Name
# First Find the station in the county  

if statn_table_df_splt_StatnNmbr["text"][1] == "":
    print("No County")
# # *********************************************************************************************



for input_fle_nm in os.listdir(input_fle_path):
# Skip the Hidden Files in the Directory
    if not input_fle_nm.startswith('.') and os.path.isfile(os.path.join(input_fle_path, input_fle_nm)):
# # *********************************************************************************************
# #         Append to the File Names to the Directory
#         input_fle_dict["Station_Nmbr"].append(input_fle_nm[:-4])
#         input_fle_dict["File_Name"].append(input_fle_nm)
#         input_fle_dict["df_Name"].append("_" + input_fle_nm[:-4] + "_df")
#         input_fle_dict["df_Name"].append("_" + input_fle_nm[:-4] + "_df")
# #         print(input_fle_nm)
# # *********************************************************************************************


        input_fle_dict = {"Station_Nmbr": input_fle_nm[:-4], 
                          "File_Name": input_fle_nm, 
                          "df_Name": "_" + input_fle_nm[:-4] + "_df", 
                          "Data": "", 
                          "Avg_Streamflow": "",
                          "Prcnt_Below_Avg": ""}

# Append to the File Names to the Directory
    statn_data_lst_of_dicts.append(dict(input_fle_dict))


In [None]:
# https://note.nkmk.me/en/python-list-clear-pop-remove-del/#:~:text=In%20Python%2C%20use%20list%20methods,with%20an%20index%20or%20slice.

del statn_data_lst_of_dicts[0]
statn_data_lst_of_dicts

In [None]:
# Station number of the second entry in the list
statn_data_lst_of_dicts[1]['Station_Nmbr']

In [None]:
# Print the file name stored for every station
# for i, station in enumerate(dict_nm[key]):
for i, a_dict in enumerate(statn_data_lst_of_dicts):
    print(statn_data_lst_of_dicts[i]['File_Name'])

In [None]:
# NOTE(review): statn_data_lst is not defined anywhere in the visible part of
# this file -- presumably statn_data_lst_of_dicts was meant; confirm.
statn_data_lst["Station_Nmbr"][0]
# input_fle_dict["File_Name"][0]

In [None]:
# NOTE(review): where input_fle_dict is built above, "df_Name" holds a single
# string, in which case this loop prints one character per line -- confirm intent.
for station in input_fle_dict["df_Name"]:
    print(station)

In [None]:
# NOTE(review): input_fle_lst is not defined in the visible part of this file.
input_fle_nm = input_fle_lst[0]
input_fle_nm

In [None]:
# https://stackoverflow.com/questions/40482738/how-to-name-dataframe-with-variables-in-pandas

# Experiment: build a dictionary of named dataframes ("name1" .. "name9").
# Every key is mapped to the same `df` object.
N = 10 # 5 in sample
dfs = {'name' + str(i):df for i in range(1,N)}
print (dfs)

In [None]:
dfs["name2"].head()

In [None]:
# Same experiment keyed by the items of input_fle_dict["df_Name"]; every key is
# again mapped to the same `df` object.
N = 10 # 5 in sample
# for input_fle_nm in input_fle_lst:
input_fle_nm =""
dfs = {input_fle_nm:df for input_fle_nm in input_fle_dict["df_Name"]}
# print (input_fle_nm)
print (dfs)

In [None]:
# File name of the third entry in the list
statn_data_lst_of_dicts[2]['File_Name']

In [None]:
# For every station: read its .txt file into a dataframe, compute the mean
# streamflow and the percentage of rows below that mean, and store all three
# in the station's dictionary.
for i, a_dict in enumerate(statn_data_lst_of_dicts):
# *********************************************************************************************
#                                   Step 1 Create Variables
# *********************************************************************************************
    # File path for this station's .txt file, which holds the streamflow data
    input_fle_path_fr_lp = os.path.join(input_fle_path, a_dict['File_Name'])

# *********************************************************************************************
#                           Step 2 Create a Dataframe for the Streamflow
# *********************************************************************************************
    # Read every line after the leading comment lines and split it on tabs.
    # The rows are collected in a list and turned into a dataframe once;
    # appending to the dataframe row by row copies it on every line, and
    # DataFrame.append no longer exists in pandas 2.x.
    # Only the newline is stripped, so a last line without one is not truncated.
    with open(input_fle_path_fr_lp, 'r') as fh:
        rows = [curline.rstrip("\n").split("\t")
                for curline in dropwhile(is_comment, fh)]

    # The first 2 rows after the comments are not data, so they are left out
    df = pd.DataFrame(rows[2:], columns = clmn_nms)
    # Change the data types of each column
    df = df.astype(convert_dict)
    # Change the date column to a datetime data type
    df['date'] = pd.to_datetime(df['date'])

# *********************************************************************************************
#                           Step 3 Average and Percent Below Average
# *********************************************************************************************
    avg_strmflw = df["streamflow_rate"].mean(axis = 0)
    print(f"Average Streamflow: {avg_strmflw}")
    print (df)
    print(len(df))

    # Number of rows whose streamflow is strictly below the station average
    count = int((df["streamflow_rate"] < avg_strmflw).sum())

    # A file without data rows has no meaningful percentage (avoids dividing by 0)
    if len(df) > 0:
        pct_blw_avg = (count / len(df) * 100)
    else:
        pct_blw_avg = float("nan")
    print (pct_blw_avg)

# Add a value into an empty dictionary element
#     - https://www.pluralsight.com/guides/manipulating-lists-dictionaries-python
    a_dict.update({"Data": df,
                   "Avg_Streamflow": avg_strmflw,
                   "Prcnt_Below_Avg": pct_blw_avg})

In [None]:
Next step: take the total number of streams that are below their 38-year average and turn it into a
percentage per day (calculate a percentage by county, then a total percentage for the state; use
weighted averaging for the county and for the state, so that bigger streams have more weight in the
percentage). This will tell us how many streams are below average per day, and we can relate that to
how many fires were reported that day and how many lightning strikes occurred that day.

In [None]:
# Display the whole list of station dictionaries
statn_data_lst_of_dicts

In [None]:
# print("_" + input_fle_nm + "_df")
# test = "_" + input_fle_nm
# test[:-4]
# input_fle_dict["df_Name"]
# _13073000_df.head()
# _13206000_df.drop(index = [0, 1])
# Display the dataframe most recently bound to `df`
df

In [None]:
# NOTE(review): the cells below index input_fle_dict["Data"] and
# input_fle_dict['Station_Nmbr'] by position. Where input_fle_dict is built in
# this file those entries are plain strings ("Data" starts as ""), so these
# cells look like leftovers from an earlier dictionary-of-lists layout -- confirm
# before relying on them.
input_fle_dict["Data"][0].append(df)

In [None]:
input_fle_dict["Data"][0].dtypes

In [None]:
index = 1

In [None]:
input_fle_dict['Station_Nmbr'][index]

In [None]:
input_fle_dict["Data"][index]

# @@@@@@@@@@@@@@@@@@@@@@@

In [None]:
from itertools import dropwhile

# NOTE: this rebinds input_fle_path to the path of a single station file; other
# cells in this file pass input_fle_path to os.listdir()/os.walk() as a directory.
input_fle_path = "Data/Idaho_Streamflow_Data/" + input_fle_nm

# Running index of the lines that follow the leading comment lines
count = 0

with open(input_fle_path,'r') as fh:
    _lnes_aftr_cmmnts = dropwhile(is_comment, fh)
    for curline in _lnes_aftr_cmmnts:
        print(f"Index Number: {count} {curline}")
        count += 1
        

In [None]:
curline

In [None]:
# Column labels used for every row read from a station's streamflow file
clmn_nms = "agency site_nmbr date streamflow_rate approved/pending".split()

# Empty dataframe with those columns; rows are added to it in a later cell
_13206000_df = pd.DataFrame(columns = clmn_nms)

_13206000_df

In [None]:
# Split a String
#     - https://www.geeksforgeeks.org/python-string-split/
# Pandas Series
#     - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html
# Concatenate (replacement for the removed DataFrame.append)
#     - https://pandas.pydata.org/docs/reference/api/pandas.concat.html

print(type(curline))
print(curline)
print((curline.split("\t")))
print(type(curline.split("\t")))

# Strip only the newline (not blindly the last character) and split on tabs
to_append = curline.rstrip("\n").split("\t")
a_series = pd.Series(to_append, index = clmn_nms)

# Add the series as one new row; DataFrame.append no longer exists in pandas 2.x
_13206000_df = pd.concat([_13206000_df, a_series.to_frame().T], ignore_index=True)
_13206000_df

# ========================================

# Step 3 - Get the Lighting Data from the National Centers for Enviromental Information (NCEI) National Oceanic and Atmospheric Administration (NOAA) Severe Weather Data Inventory

# ========================================

## Scrape a webpage and create a BeautifulSoup object from the results

# 3.1 Create the Webdriver

In [None]:
# from selenium import webdriver
# from selenium.webdriver.support.ui import Select

url = "https://www.ncdc.noaa.gov/severe-weather/severe-weather-data-inventory"

driver = webdriver.Chrome()
driver.get(url)

In [None]:
# https://stackoverflow.com/questions/12323403/how-do-i-find-an-element-that-contains-specific-text-in-selenium-webdriver-pyth
# https://selenium-python.readthedocs.io/locating-elements.html

driver.find_element_by_xpath("//*[contains(text(), 'Map Search')]").click()

In [None]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

year = "2001"


WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.XPATH, "(//input[@class='esri-input esri-search__input'])[1]"))).send_keys("Idaho, USA")
# time.sleep(5)
WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.XPATH, "(//*[@class='esri-search__submit-button esri-widget--button'])[1]"))).click()
# time.sleep(10)
# WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.XPATH, "(//*[@id='yearSelect']/option[text()=" + year + "])"))).click()
# WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.XPATH, "(//*[@class='custom-select swdi-select']/option[text()=" + dataset + "])"))).click()

# # https://stackoverflow.com/questions/7867537/how-to-select-a-drop-down-menu-value-with-selenium-using-python
# driver.find_element_by_xpath("//select[@id='yearSelect']/option[text()=" + year + "]").click()

In [None]:
WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.XPATH, "(//*[@id='yearSelect']/option[text()=" + year + "])"))).click()

In [None]:
# References:
#     - Get All the Options in the Dropdown List:
#         - https://www.edureka.co/community/53559/how-get-all-options-dropdown-using-python-selenium-webdriver
#     - Remove a List of Unwanted Characters from a String:
#         - https://www.geeksforgeeks.org/python-removing-unwanted-characters-from-string/
from selenium.webdriver.common.by import By

# Years to scrape: "1992" through "2015"
yrs_lghtnng_strks = [str(yr) for yr in range(1992, 2016)]
dataset = "Lightning Strikes"

# Text removed from each option label before it is split into date and count
bad_chars = ['(', 'events)']

# One {"date", "number_of_strikes"} dictionary per dropdown option. The rows
# are collected in a list and turned into a dataframe once at the end, because
# DataFrame.append no longer exists in pandas 2.x and copied the frame per row.
lghtnng_strks_rws = []

for yr in yrs_lghtnng_strks:
    # Select the year, then the dataset, pausing so the page can reload its lists
    WebDriverWait(driver,30).until(EC.visibility_of_element_located((By.XPATH, "(//*[@id='yearSelect']/option[text()=" + yr + "])"))).click()

    time.sleep(5)

    select = Select(driver.find_element(By.ID, "datasetSelect"))
    select.select_by_visible_text(dataset)

    time.sleep(15)

    # Every option of the date dropdown carries a date and an event count
    lghtnng_strks = driver.find_element(By.ID, "dateSelect")
    options = lghtnng_strks.find_elements(By.TAG_NAME, "option")

    for element in options:
        text = element.get_attribute("text")

        for bad_char in bad_chars:
            text = text.replace(bad_char, "")

        # After the clean-up the first word is the date, the second the count
        parts = text.split()
        if len(parts) < 2:
            # Not a "date count" option (e.g. an empty label): nothing to record
            continue

        lghtnng_strks_rws.append({"date": parts[0],
                                  "number_of_strikes": parts[1]})

lghtnng_strks_df = pd.DataFrame(lghtnng_strks_rws, columns = ["date", "number_of_strikes"])

In [None]:
test_df = lghtnng_strks_df
test_df.dtypes

In [None]:
# How many seconds of phone calls are recorded in total?
# print(test_df['number_of_strikes'].sum())
# test_df

# test_df.groupby(['month']).groups.keys()

# test_df.groupby([test_df["date"].dt.month]).sum().reset_index()


# Split the String into Just the Year-Month:
#     - https://stackoverflow.com/questions/26646191/pandas-groupby-month-and-year

def getYearMonth(s):
    """Return the first two "-"-separated fields of *s*, joined by "-".

    For a "year-month-day" string this is the "year-month" part.
    """
    flds = s.split("-")
    return f"{flds[0]}-{flds[1]}"

# New column holding the year-month part of every date string
test_df['YearMonth'] = test_df['date'].apply(getYearMonth)


In [None]:
print(test_df.dtypes)

# Change the Date Column to a datetime Data Type
# test_df['date']= pd.to_datetime(test_df['date'])
# or
# test_df.astype({'date': 'datetime64'})

# Change the "number_of_strikes" Column to an Integer ("int32") Data Type
test_df = test_df.astype({'number_of_strikes': 'int32'})
print("*******************************************")
print(test_df.dtypes)
test_df

In [None]:
test_YearMonth_df = test_df.groupby("YearMonth")["number_of_strikes"].sum()
test_YearMonth_df = pd.DataFrame(test_YearMonth_df)
test_YearMonth_df = test_YearMonth_df.reset_index()
test_YearMonth_df

In [None]:
from selenium.webdriver.common.by import By

# https://stackoverflow.com/questions/7867537/how-to-select-a-drop-down-menu-value-with-selenium-using-python
# Choose the dataset by its visible label in the "datasetSelect" dropdown.
# find_element(By.ID, ...) replaces find_element_by_id, which was removed in
# Selenium 4; this form also works with Selenium 3.
select = Select(driver.find_element(By.ID, "datasetSelect"))

select.select_by_visible_text(dataset)

In [None]:
test_YearMonth_df.describe()

In [None]:
# References:
#     Shading an area between two points in a matplotlib plot:
#         - https://stackoverflow.com/questions/3681872/shading-an-area-between-two-points-in-a-matplotlib-plot


# One bar per row of test_YearMonth_df
x_axis = np.arange(len(test_YearMonth_df))

plt.figure(figsize = (25, 20))

# Bar heights are the strike totals; the tick labels are the YearMonth strings
_strk_ttls = test_YearMonth_df["number_of_strikes"]
plt.bar(x_axis, _strk_ttls)
_YearMonth_lbls = test_YearMonth_df["YearMonth"]
plt.xticks(x_axis, _YearMonth_lbls, rotation = "vertical")

# Red horizontal reference line at y = 10 from x = 0 to x = 92
plt.hlines(10, 0, 92, alpha = 1, color = "red")

# Shaded x-ranges
plt.axvspan(0, 3, color='y', alpha=0.4, lw=0) # Highlighting the 1992 Lightning Strikes
plt.axvspan(4, 8, color='g', alpha=0.4, lw=0) # Highlighting the 1993 Lightning Strikes

In [None]:
# # https://www.edureka.co/community/53559/how-get-all-options-dropdown-using-python-selenium-webdriver

# lghtnng_strks = driver.find_element_by_id("dateSelect")

# options = [x for x in lghtnng_strks.find_elements_by_tag_name("option")]

# print(options)

# for element in options:
#     print (element.get_attribute("text").split(" "))

In [None]:
# References:
#     - Get All the Options in the Dropdown List:
#         - https://www.edureka.co/community/53559/how-get-all-options-dropdown-using-python-selenium-webdriver
#     - Remove a List of Unwanted Characters from a String:
#         - https://www.geeksforgeeks.org/python-removing-unwanted-characters-from-string/
from selenium.webdriver.common.by import By

# Read the options of the date dropdown as it is currently shown on the page.
# find_element(By.ID, ...) / find_elements(By.TAG_NAME, ...) replace the
# find_element(s)_by_* helpers, which were removed in Selenium 4.
lghtnng_strks = driver.find_element(By.ID, "dateSelect")

options = lghtnng_strks.find_elements(By.TAG_NAME, "option")

# Text removed from each option label before it is split into date and count
bad_chars = ['(', 'events)']

# Rows are collected in a list and turned into a dataframe once at the end;
# DataFrame.append no longer exists in pandas 2.x.
lghtnng_strks_rws = []

for element in options:
    text = element.get_attribute("text")

    for bad_char in bad_chars:
        text = text.replace(bad_char, "")

    # After the clean-up the first word is the date, the second the count
    parts = text.split()
    if len(parts) < 2:
        # Not a "date count" option (e.g. an empty label): nothing to record
        continue

    print (parts)
    print (parts[0])
    print (parts[1])
    print ("**************************")

    lghtnng_strks_rws.append({"date": parts[0],
                              "number_of_strikes": parts[1]})

lghtnng_strks_df = pd.DataFrame(lghtnng_strks_rws, columns = ["date", "number_of_strikes"])

In [None]:
lghtnng_strks_df

In [None]:
def select_dropdown_value(year):
    """Click the option labelled *year* in the page's ``yearSelect`` dropdown.

    Uses the module-level ``driver``. The option is matched on its exact text,
    so *year* is normally a string such as ``"2001"``. Previously the argument
    was ignored and the option '2001' was always clicked.

    Args:
        year: Text of the option to select.
    """
    # https://stackoverflow.com/questions/7867537/how-to-select-a-drop-down-menu-value-with-selenium-using-python
    from selenium.webdriver.common.by import By

    # find_element(By.XPATH, ...) replaces find_element_by_xpath (removed in Selenium 4)
    driver.find_element(By.XPATH, f"//select[@id='yearSelect']/option[text()='{year}']").click()

In [None]:
# https://pythonspot.com/selenium-textbox/

text_area = driver.find_element_by_class_name('esri-input esri-search__input')
text_area.send_keys("This text is send using Python code.")

In [None]:
# https://stackoverflow.com/questions/52873433/python-selenium-clicking-based-on-alt-attribute

driver.find_element_by_css_selector('[alt="id"]').click()

In [None]:
# Press/Click a Button Without an ID
#     - https://stackoverflow.com/questions/8871654/how-to-press-click-the-button-using-selenium-if-the-button-does-not-have-the-id

lst_all_statns = '//input[@type="radio" and @value="statelist"]'

button = driver.find_element_by_xpath(lst_all_statns)
button.click()

In [None]:
# https://stackoverflow.com/questions/7867537/how-to-select-a-drop-down-menu-value-with-selenium-using-python

select = Select(driver.find_element_by_id('select_display'))

# Select by visible text
# select.select_by_visible_text('Daily Stage and Streamflow')

# Select by value text
select.select_by_value('dailystagedischarge')

# +++++++++++++++++++++++++++++++++++++++

# ========================================

# Step 3 - Get the Mean Streamflow Rate for Each Station

# ========================================

## Scrape a webpage and create a BeautifulSoup object from the results

## 3.1 USGS' Science for a Changing World

In [None]:
# input_fle_dict = {"Station_Nmbr":[], "File_Name": [], "df_Name": [], "Data": [], "Avg_Streamflow": []}

# NOTE(review): this indexes input_fle_dict["Data"] by position, which fits the
# dictionary-of-lists layout in the commented line above rather than the
# one-dictionary-per-station layout built elsewhere in this file -- confirm.
input_fle_dict["Data"][1]

In [None]:
# Mean of the "streamflow_rate" column stored for the first station
# input_fle_dict["Data"][0]["streamflow_rate"].mean(axis = 0)
statn_data_lst_of_dicts[0]["Data"]["streamflow_rate"].mean(axis = 0)

In [None]:
input_fle_dict["Station_Nmbr"]

In [None]:
# Find the index of a dictionary within a list (I modified this codes since my dictionary isn't in a list)
#     - https://stackoverflow.com/questions/4391697/find-the-index-of-a-dict-within-a-list-by-matching-the-dicts-value

def find_avg (lst, key, value):
    """Print the *key* entry of every dictionary in *lst* and return ``-1``.

    Work in progress: the matching on *value* and the average-streamflow
    update are not implemented yet, so *value* is currently unused and the
    "not found" result ``-1`` is always returned.

    Previously ``return -1`` sat inside the loop, so only the first dictionary
    was visited and an empty *lst* returned ``None``.

    Args:
        lst: List of dictionaries.
        key: Key read from every dictionary (``KeyError`` if one lacks it).
        value: Value to look for (unused for now).

    Returns:
        ``-1``.
    """
    for a_dict_2 in lst:
        print(a_dict_2[key])
        # TODO: when a_dict_2[key] == value, store
        #       a_dict_2["Data"]["streamflow_rate"].mean(axis = 0) under
        #       "Avg_Streamflow" and return the index of the dictionary.
    return -1

In [None]:
input_fle_dict["Station_Nmbr"][0].append(10)

In [None]:
find_avg(statn_data_lst_of_dicts, "Station_Nmbr", "13206000")

In [None]:
# Sample records for trying out the "index of the first matching dictionary" idiom
lst = [
    {'id': '1234', 'name': 'Jason'},
    {'id': '2345', 'name': 'Tom'},
    {'id': '3456', 'name': 'Art'},
]

# Position of the first record named "Tom"; None when there is no such record
_tom_pstns = (pstn for pstn, rcrd in enumerate(lst) if rcrd["name"] == "Tom")
tom_index = next(_tom_pstns, None)
tom_index

# tom_index = next((index for (index, d) in enumerate(input_fle_dict) if d["Station_Nmbr"] == 13075910), None)
# print(tom_index)

In [None]:
# Type of every item obtained by iterating over input_fle_dict["Station_Nmbr"]
types1 = list(map(type, input_fle_dict["Station_Nmbr"]))
types1
# type(13075000)

In [None]:
# dicts = [{'id':'1234','name':'Jason'},
#          {'id':'2345','name':'Tom'},
#          {'id':'3456','name':'Art'}]

def find_index(dicts, key, value):
    """Return the first dictionary in *dicts* whose *key* entry equals *value*.

    Dictionaries that do not have *key* never match. Note that the matching
    dictionary itself is returned, not its position.

    Raises:
        ValueError: if no dictionary has the key/value combination.
    """
    # Unique placeholder: stands in for "key is missing" and for "no match"
    _mssng = object()
    mtch = next((d for d in dicts if d.get(key, _mssng) == value), _mssng)
    if mtch is _mssng:
        raise ValueError('no dict with the key and value combination found')
    return mtch

print (find_index(dicts, 'name', 'Tom'))
# 1
# find_index(dicts, 'name', 'Ensnare')
# ValueError: no dict with the key and value combination found

In [None]:
dicts

In [None]:
def find(lst, key, value):
    """Return the index of the first dictionary in *lst* whose *key* entry equals *value*.

    The whole list and the dictionary under inspection are printed on every
    step (debug output).

    Args:
        lst: List of dictionaries; every inspected one must contain *key*.
        key: Key to compare.
        value: Value to look for.

    Returns:
        The index of the first match, or ``-1`` when nothing matches.
    """
    for i, dic in enumerate(lst):
        print(lst)
        print(dic)
        print("********************************************************************")
        if dic[key] == value:
            # (a print that used to follow this return could never run and was removed)
            return i
    return -1


In [None]:
find(lst, "name", "Tom")

In [None]:
# Using list comprehension + enumerate()
# Key index in Dictionary
search_key = "13075000"

# (key, value) pairs of the dictionary. A stray "Station_Nmbr" token after this
# statement made the cell a syntax error; it is kept here as a comment only.
temp = list(input_fle_dict.items())  # Station_Nmbr

# Position of search_key among the dictionary's keys.
# NOTE(review): this searches the KEYS. Where input_fle_dict is built in this
# file its keys are field names such as "Station_Nmbr", so a station number
# would raise ValueError here -- presumably the values were meant; confirm.
res = list(input_fle_dict.keys()).index(search_key)
res

In [None]:
# Scratch loop over the items of input_fle_dict["File_Name"].
# NOTE(review): test_dict is not defined in the visible part of this file, and
# neither temp nor res is used after being computed -- confirm whether this
# cell is still needed.
for input_fle_nm in input_fle_dict["File_Name"]:
    
# Using list comprehension + enumerate() 
# Key index in Dictionary 
    temp = list(test_dict.items())  
    # Positions of the (key, value) pairs whose key equals search_key
    res = [idx for idx, key in enumerate(temp) if key[0] == search_key] 
    
    
    # Replaces the "streamflow_rate" entry of input_fle_dict["Data"][0] with the
    # mean of that entry, i.e. the original values are overwritten
    input_fle_dict["Data"][0]["streamflow_rate"] = input_fle_dict["Data"][0]["streamflow_rate"].mean(axis = 0)

# +++++++++++++++++++++++++++++++++++++++

In [None]:
# Read the station file and keep everything after its first 26 lines.
# The with-block closes the file again, which the bare open() call never did.
with open("Data/Idaho_Streamflow_Data/13206000.txt", "r") as file:
    lines = file.readlines()[26:]

print(type(lines))
print(lines)

pd.DataFrame(lines)

In [None]:
# Delete a Row from the List
#     - https://note.nkmk.me/en/python-list-clear-pop-remove-del/#:~:text=In%20Python%2C%20use%20list%20methods,with%20an%20index%20or%20slice.

# Remove the first remaining line
del lines[0:1]
lines
# print((lines[1].split("\t")))

# Split the second remaining line on tabs; only the newline is stripped, so a
# final line without one is not truncated
test = lines[1].rstrip("\n").split("\t")
test

# Add the values as one new row; pd.concat replaces DataFrame.append, which no
# longer exists in pandas 2.x
a_series = pd.Series(test, index = clmn_nms)
_13206000_df = pd.concat([_13206000_df, a_series.to_frame().T], ignore_index=True)
_13206000_df

In [None]:
# Read in the Text File and Convert to a Dataframe
data = pd.read_csv('Data/Idaho_Streamflow_Data/13206000.txt')
data

In [None]:
# Locate the positions of missing values; nothing is dropped by this cell.
import numpy as np
np.where(pd.isnull(statn_table_df_splt_StatnNmbr_nmbrs_clmn))
# statn_table_df_splt_StatnNmbr_nmbrs_clmn.isnull()

In [None]:
# Cast the "StationNumber" column of `statn_table_df` to str.
statn_table_df["StationNumber"] = statn_table_df["StationNumber"].astype(str)
# NOTE(review): this prints the dtypes of `statn_table`, not of the `statn_table_df` modified above — confirm which was intended.
print(statn_table.dtypes)

In [None]:
    if "County" in statn_table["StationNumber"][0]:
        print("true")
        statn_table_df_drp_cnty = statn_table_df.drop([0, 4], axis = 0)
statn_table_df_drp_cnty

In [None]:
# Delete the County Names from the Table
# The previous loop re-dropped a single row from the untouched table on every
# match, so only the last matching row ended up removed. The matching row
# labels are now collected first and dropped together.
county_rows = []
for statn_table_df_row in statn_table_df.index:
    # Rows whose StationNumber text contains "County" are the ones to remove.
    if "County" in statn_table_df["StationNumber"][statn_table_df_row]:
        county_rows.append(statn_table_df_row)

# With no matches this is an unmodified copy, so the name is always defined.
statn_table_df_drp_cnty = statn_table_df.drop(county_rows, axis = 0)

# Number of rows that matched and were dropped.
count = len(county_rows)
print(count)


In [None]:
# Show the first 15 rows of the table after the county rows were dropped.
statn_table_df_drp_cnty.head(15)
# statn_table_df_drp_cnty.tail(15)

In [None]:
# Print `statn_table_df_row`, the loop variable left over from the `for ... in statn_table_df.index` cell.
print(statn_table_df_row)

In [None]:
# Count how many times each "StationNumber" value occurs.
statn_table_df["StationNumber"].value_counts()

In [None]:
# Parse the page source currently held by the Selenium `driver` with the lxml parser.
soup_lxml = BS(driver.page_source, "lxml")

# Collect every <table> element of the parsed page and display them.
tables = soup_lxml.find_all("table")
tables

In [None]:
# Read the Tables with Pandas
# Passing literal HTML to read_html is deprecated since pandas 2.1, so the
# markup is wrapped in a StringIO buffer (also accepted by older versions).
from io import StringIO

dfs = pd.read_html(StringIO(str(tables)))

In [None]:
# Report how many tables were parsed, then the dtypes, row count and first rows of table 11.
# NOTE(review): index 11 is hard-coded and depends on the page layout — confirm it is still the intended table.
print(f"Number of Tables on the page: {len(dfs)}")
print("*********************************************************************************************************")
print(f"Data Types for the Table: {dfs[11].dtypes}")
print("*********************************************************************************************************")
print(f"Number of Rows in the dataframe: {len(dfs[11])}")
dfs[11].head()

In [None]:
# Print how many times each "USGSstationnumber" value occurs in table 11.
print(dfs[11]["USGSstationnumber"].value_counts())

In [None]:
# Press/Click a Button Without an ID
#     - https://stackoverflow.com/questions/8871654/how-to-press-click-the-button-using-selenium-if-the-button-does-not-have-the-id

# The find_element_by_* helpers were removed in Selenium 4.3; the
# find_element(By.<strategy>, value) form is used instead.
from selenium.webdriver.common.by import By

# XPath of the radio input whose value is "rdb".
tab_sprtd_rado = '//input[@type="radio" and @value="rdb"]'

button = driver.find_element(By.XPATH, tab_sprtd_rado)
button.click()

In [None]:
# Identify the location of chromedriver and store it as a variable.
# `!which chromedriver` is IPython shell syntax, so this cell only runs inside IPython/Jupyter.
chromedriver = !which chromedriver
print(type(chromedriver))
# First entry of the captured shell output.
chromedriver[0]

### 2.1.1 Retrieve the data/information on USGS' WaterWatch website

In [None]:
# Launch a Chrome browser through splinter, passing the chromedriver executable name.
executable_path = {"executable_path": "chromedriver"}
# OR
# executable_path = {"executable_path": chromedriver[0]}
# The line above works but the one below does not, presumably because `chromedriver`
# is an 'IPython.utils.text.SList' rather than a plain string — TODO confirm.
# executable_path = {"executable_path": chromedriver}

# headless = False is passed explicitly to the Browser call.
browser = Browser('chrome', **executable_path, headless = False)

In [None]:
# URL of page to be scraped
url = "https://waterwatch.usgs.gov/index.php?id=wwdrought"
# url = "https://www.usgs.gov/"
# Open the page in the splinter browser and keep a handle on the current window.
browser.visit(url)
window = browser.windows.current

In [None]:
# Take the HTML currently held by the splinter browser and parse it with
# BeautifulSoup's built-in "html.parser".
html = browser.html
soup = BS(html, "html.parser")

In [None]:
# Show the type of the parsed `soup` object.
type(soup)

In [None]:
# Print the prettified HTML of the parsed page (the USGS WaterWatch URL visited above).
print(soup.prettify())

In [None]:
# https://stackoverflow.com/questions/19392466/python-beautifulsoup-get-select-value-not-text

# Print every <option> element found in the parsed page, one per line.
for option in soup.find_all('option'):
    print(option)

In [None]:
# https://stackoverflow.com/questions/7867537/how-to-select-a-drop-down-menu-value-with-selenium-using-python

# The find_element_by_* helpers were removed in Selenium 4.3; the
# find_element(By.<strategy>, value) form is used instead.
from selenium.webdriver.common.by import By

# Wrap the drop-down whose id is "st" so options can be chosen by label.
select = Select(driver.find_element(By.ID, 'st'))

# Select by visible text
select.select_by_visible_text('Idaho')

In [None]:
# Fill in Input Fields
#     - https://stackoverflow.com/questions/25537567/how-to-open-website-and-fill-in-input-using-selenium-webdriver
# Clear the Input Field
#     - http://10minbasics.com/clear-fill-input-field-with-selenium/

# The find_element_by_* helpers were removed in Selenium 4.3; the
# find_element(By.<strategy>, value) form is used instead.
from selenium.webdriver.common.by import By

# Replace the contents of the input named "bdt" with 1990-01-01.
element = driver.find_element(By.NAME, "bdt")
element.clear()
element.send_keys("1990-01-01")

# Replace the contents of the input named "edt" with 1990-12-31.
element = driver.find_element(By.NAME, "edt")
element.clear()
element.send_keys("1990-12-31")

In [None]:
# Press/Click a Button Without an ID
#     - https://stackoverflow.com/questions/8871654/how-to-press-click-the-button-using-selenium-if-the-button-does-not-have-the-id

# The find_element_by_* helpers were removed in Selenium 4.3; the
# find_element(By.<strategy>, value) form is used instead.
from selenium.webdriver.common.by import By

# XPath of the submit input whose value is "GO".
NEXT_BUTTON_XPATH = '//input[@type="submit" and @value="GO"]'

button = driver.find_element(By.XPATH, NEXT_BUTTON_XPATH)
button.click()

In [None]:
# https://stackoverflow.com/questions/5041008/how-to-find-elements-by-class

# test = soup.find_all("div", class_= "ztable")
# test

# https://stackoverflow.com/questions/20522820/how-to-get-tbody-from-table-from-python-beautiful-soup
# Rows (<tr>) of the first <table> in the parsed page.
soup.find_all("table")[0].find_all("tr")

In [None]:
# Parse the page source currently held by the Selenium `driver` with the lxml parser.
soup_lxml = BS(driver.page_source, "lxml")

# Collect every <table> element of the parsed page and display them.
tables = soup_lxml.find_all("table")
tables

In [None]:
# Read the Tables with Pandas
# Passing literal HTML to read_html is deprecated since pandas 2.1, so the
# markup is wrapped in a StringIO buffer (also accepted by older versions).
from io import StringIO

dfs = pd.read_html(StringIO(str(tables)))

In [None]:
# Report how many tables were parsed, then the dtypes, row count and first rows of table 11.
# NOTE(review): index 11 is hard-coded and depends on the page layout — confirm it is still the intended table.
print(f"Number of Tables on the page: {len(dfs)}")
print("*********************************************************************************************************")
print(f"Data Types for the Table: {dfs[11].dtypes}")
print("*********************************************************************************************************")
print(f"Number of Rows in the dataframe: {len(dfs[11])}")
dfs[11].head()

In [None]:
# Print how many times each "USGSstationnumber" value occurs in table 11.
print(dfs[11]["USGSstationnumber"].value_counts())

In [None]:
# Fetch the WaterWatch page with requests, parse the body as JSON and print its 'Scty' entry.
# NOTE(review): the same URL is parsed as an HTML page elsewhere in this file
# (pd.read_html), so json.loads is likely to raise here — confirm the intended endpoint.
# NOTE(review): `data` is also the name bound by the pd.read_csv cell; running this cell rebinds it.
import requests, json
text = requests.get("https://waterwatch.usgs.gov/index.php?id=wwdrought").text
data = json.loads(text)
print(data['Scty'])

In [None]:
# Print the list of <td> cells of every <tr> in the parsed page, skipping the first two rows.
for tr in soup.find_all('tr')[2:]:
    tds = tr.find_all('td')
    print (tds)  # leftover template: "Name: %s, Surname: %s, Email: %s" % (tds[0].text, tds[1].text, tds[2].text)

# ========================================

# Step 1 - Scraping

# ========================================

# ========================================

# Step 1 - Scraping

# ========================================

### 1.1.2 Scrape the [NASA Mars News Site](https://mars.nasa.gov/news/) and collect the latest News Title and Paragraph Text. Assign the text to variables that you can reference later.

#### 1.1.2.1 Collect the latest News Title