# Data collection

## Species richness

The cell below opens a small form for manually entering the overall species richness of each state, as read from the geospatial map image of species richness over the United States, and saves the entries in tabular (CSV) form.

In [1]:
import tkinter as tk
from tkinter import ttk
import pandas as pd

class StateDataCollector:
    """Tkinter form for entering one species-richness score per US state.

    The window holds a state selector, a numeric entry, "Add Entry" and
    "Save to CSV" buttons, and a text area that lists the entries collected
    so far (and any validation errors).

    Attributes:
        root: The Tk root window passed to the constructor.
        data: Mapping of state name to its integer score (0-15).
        states: Alphabetically sorted list of the 50 state names offered
            in the selector.
    """

    def __init__(self, root):
        """Build the widgets inside *root*.

        Args:
            root: The ``tk.Tk`` (or toplevel) window that hosts the form.
        """
        self.root = root
        self.root.title("State Species Richness Data Collector")

        # Dictionary to store state:value pairs
        self.data = {}

        # Create the input frame
        input_frame = ttk.Frame(root, padding="10")
        input_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))

        # Create state selector
        self.states = sorted([
            'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
            'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
            'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
            'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
            'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
            'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
            'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
            'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
            'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
            'West Virginia', 'Wisconsin', 'Wyoming'
        ])

        self.state_var = tk.StringVar()
        state_label = ttk.Label(input_frame, text="State:")
        state_label.grid(row=0, column=0, padx=5, pady=5)
        state_combo = ttk.Combobox(input_frame, textvariable=self.state_var, values=self.states)
        state_combo.grid(row=0, column=1, padx=5, pady=5)

        # Create value input
        self.value_var = tk.StringVar()
        value_label = ttk.Label(input_frame, text="Species Richness (0-15):")
        value_label.grid(row=1, column=0, padx=5, pady=5)
        value_entry = ttk.Entry(input_frame, textvariable=self.value_var)
        value_entry.grid(row=1, column=1, padx=5, pady=5)

        # Create buttons
        button_frame = ttk.Frame(input_frame)
        button_frame.grid(row=2, column=0, columnspan=2, pady=10)

        add_button = ttk.Button(button_frame, text="Add Entry", command=self.add_entry)
        add_button.grid(row=0, column=0, padx=5)

        save_button = ttk.Button(button_frame, text="Save to CSV", command=self.save_data)
        save_button.grid(row=0, column=1, padx=5)

        # Create display area
        self.display = tk.Text(input_frame, height=10, width=40)
        self.display.grid(row=3, column=0, columnspan=2, pady=10)

    def add_entry(self):
        """Validate the current inputs and store them in ``self.data``.

        On success the entry list is redrawn and both inputs are cleared.
        On any validation failure an error line is appended to the display
        and nothing is stored. Re-adding a state overwrites its old value.
        """
        state = self.state_var.get().strip()
        raw_value = self.value_var.get().strip()

        if not state or not raw_value:
            self.display.insert(tk.END, "Error: Please choose a state and enter a value\n")
            return

        # The combobox is editable, so the text may be a typo or differ in
        # case; resolve it to the canonical spelling or reject it.
        canonical = {name.casefold(): name for name in self.states}.get(state.casefold())
        if canonical is None:
            self.display.insert(tk.END, f"Error: Unknown state '{state}'\n")
            return

        try:
            value = int(raw_value)
        except ValueError:
            self.display.insert(tk.END, "Error: Please enter a valid number\n")
            return

        if not 0 <= value <= 15:
            self.display.insert(tk.END, "Error: Value must be between 0 and 15\n")
            return

        self.data[canonical] = value
        self.update_display()
        # Clear inputs
        self.state_var.set('')
        self.value_var.set('')

    def update_display(self):
        """Replace the display contents with the entries sorted by state."""
        # Text indices are "line.column" strings; "1.0" is the very start.
        self.display.delete("1.0", tk.END)
        for state, value in sorted(self.data.items()):
            self.display.insert(tk.END, f"{state}: {value}\n")

    def save_data(self):
        """Write the collected entries to the raw-data CSV.

        The file has a ``state`` index column and a ``species_richness``
        column. Nothing is written when no entries exist, so an accidental
        click cannot replace an earlier file with an empty one. File-system
        errors are reported in the display instead of being raised from the
        Tk callback.
        """
        if not self.data:
            self.display.insert(tk.END, "\nError: No entries to save\n")
            return

        df = pd.DataFrame.from_dict(self.data, orient='index', columns=['species_richness'])
        df.index.name = 'state'
        try:
            df.to_csv('../data/raw_data/species_richness_by_state.csv')
        except OSError as exc:
            # Typically the relative target directory does not exist from
            # the current working directory.
            self.display.insert(tk.END, f"\nError: Could not save data: {exc}\n")
            return
        self.display.insert(tk.END, "\nData saved to species_richness_by_state.csv\n")

# Launch the collector window when this cell/script is executed directly.
if __name__ == "__main__":
    root = tk.Tk()
    app = StateDataCollector(root)
    # Blocks until the window is closed; app.root is the same Tk root.
    app.root.mainloop()

Alabama: 10
Arizona: 5
Arkansas: 8
California: 10
Colorado: 2
Connecticut: 5
Delaware: 5
Florida: 9
Georgia: 4
Idaho: 1
Illinois: 6
Indiana: 8
Iowa: 5
Kansas: 1
Kentucky: 9
Louisiana: 10
Maine: 2
Maryland: 6
Massachusetts: 4
Michigan: 3
Minnesota: 3
Mississippi: 8
Missouri: 7
Montana: 0
Nebraska: 1
Nevada: 2
New Hampshire: 4
New Jersey: 6
New Mexico: 1
New York: 9
North Carolina: 5
North Dakota: 1
Ohio: 4
Oklahoma: 2
Oregon: 2
Pennsylvania: 6
Rhode Island: 4
South Carolina: 7
South Dakota: 0
Tennessee: 11
Texas: 3
Utah: 3
Vermont: 4
Virginia: 10
Washington: 2
West Virginia: 10
Wisconsin: 6
Wyoming: 0

In [1]:
import os

# API token for the NOAA web-service requests in the cells below.
# Prefer supplying it through the NOAA_CDO_TOKEN environment variable so the
# credential does not have to live in the notebook.
# NOTE(review): the literal fallback is a credential committed to source; it
# should be rotated and removed once the environment variable is in use.
token = os.environ.get("NOAA_CDO_TOKEN", "YAvQVopzQifTCqEDAYkIsFexHiCisBIB")

In [2]:
# import pandas as pd
# import requests
# import time
# import logging
# from datetime import datetime
# from typing import Dict, List, Optional

# logging.basicConfig(
#     level=logging.INFO,
#     format='%(asctime)s - %(levelname)s - %(message)s'
# )

# class NOAAWeatherCollector:
#     def __init__(self, token: str, start_year: int = 2020, end_year: int = 2022):
#         self.base_url = "https://www.ncdc.noaa.gov/cdo-web/api/v2"
#         self.headers = {"token": token}
#         self.start_year = start_year
#         self.end_year = end_year

#     def get_weather_data(self, station_id: str, state: str) -> List[Dict]:
#         """Get weather data for a station"""
#         all_data = []
        
#         for year in range(self.start_year, self.end_year + 1):
#             logging.info(f"Fetching {year} data for station {station_id}")
            
#             params = {
#                 "datasetid": "GHCND",
#                 "stationid": station_id,
#                 "startdate": f"{year}-01-01",
#                 "enddate": f"{year}-12-31",
#                 "limit": 1000,
#                 "datatypeid": "TMAX,TMIN,PRCP"  # Temperature max/min and precipitation
#             }
            
#             try:
#                 response = requests.get(
#                     f"{self.base_url}/data",
#                     headers=self.headers,
#                     params=params
#                 )
                
#                 if response.status_code == 200:
#                     data = response.json()
#                     if 'results' in data:
#                         for record in data['results']:
#                             all_data.append({
#                                 'station_id': station_id,
#                                 'state': state,
#                                 'date': record['date'],
#                                 'datatype': record['datatype'],
#                                 'value': record['value']
#                             })
#                 elif response.status_code == 429:  # Rate limit
#                     wait_time = int(response.headers.get('Retry-After', 60))
#                     logging.warning(f"Rate limit hit. Waiting {wait_time} seconds...")
#                     time.sleep(wait_time)
#                 else:
#                     logging.error(f"Error fetching data: {response.status_code}")
                    
#             except Exception as e:
#                 logging.error(f"Error processing station {station_id}: {str(e)}")
            
#             time.sleep(1)  # Prevent rate limiting
            
#         return all_data

# def process_weather_data(df: pd.DataFrame) -> pd.DataFrame:
#     """Process weather data into yearly averages by state"""
#     # Ensure required columns exist
#     required_columns = ['date', 'datatype', 'value', 'state']
#     for col in required_columns:
#         if col not in df.columns:
#             raise KeyError(f"Missing required column: {col}")
    
#     # Convert date to datetime
#     df['date'] = pd.to_datetime(df['date'], errors='coerce')
#     if df['date'].isnull().any():
#         raise ValueError("Some 'date' values could not be converted to datetime.")
    
#     df['year'] = df['date'].dt.year
    
#     # Convert values (temperature in tenths of degrees C, precipitation in tenths of mm)
#     df['value'] = pd.to_numeric(df['value'], errors='coerce')
#     df.loc[df['datatype'].isin(['TMAX', 'TMIN']), 'value'] = df.loc[df['datatype'].isin(['TMAX', 'TMIN']), 'value'] / 10
#     df.loc[df['datatype'] == 'PRCP', 'value'] = df.loc[df['datatype'] == 'PRCP', 'value'] / 10
    
#     # Check for invalid or missing data
#     if df.isnull().any().any():
#         logging.warning("DataFrame contains missing or invalid values. Dropping rows...")
#         df = df.dropna()
    
#     # Calculate yearly averages by state
#     grouped = df.pivot_table(
#         index=['state', 'year'],
#         columns='datatype',
#         values='value',
#         aggfunc='mean'
#     ).reset_index()
    
#     # Rename columns
#     grouped.columns = ['state', 'year', 'avg_max_temp_c', 'avg_min_temp_c', 'total_precip_mm']
    
#     return grouped

# def main():
#     # Read stations from CSV
#     stations_df = pd.read_csv('us_weather_stations.csv')
    
#     # Initialize collector
#     token = "YAvQVopzQifTCqEDAYkIsFexHiCisBIB"
#     collector = NOAAWeatherCollector(token=token)
    
#     all_weather_data = []
    
#     # Process a sample of stations first (2 per state)
#     sample_stations = stations_df.groupby('state').head(2)
    
#     for _, station in sample_stations.iterrows():
#         try:
#             station_data = collector.get_weather_data(
#                 station_id=station['id'],
#                 state=station['state']
#             )
#             all_weather_data.extend(station_data)
#             logging.info(f"Collected {len(station_data)} records for {station['id']}")
            
#         except Exception as e:
#             logging.error(f"Error processing station {station['id']}: {str(e)}")
#             continue
    
#     if all_weather_data:
#         # Convert to DataFrame
#         weather_df = pd.DataFrame(all_weather_data)
        
#         # Save raw data
#         weather_df.to_csv('weather_data_raw.csv', index=False)
#         logging.info("Saved raw weather data")
        
#         # Process and save aggregated data
#         state_yearly_df = process_weather_data(weather_df)
#         state_yearly_df.to_csv('weather_data_by_state.csv', index=False)
#         logging.info("Saved processed weather data")
        
#         # Display sample of results
#         print("\nSample of processed weather data:")
#         print(state_yearly_df.head())
#     else:
#         logging.error("No weather data collected")

# if __name__ == "__main__":
#     try:
#         main()
#     except Exception as e:
#         logging.error(f"An error occurred: {str(e)}")
#         raise

In [3]:
# import requests
# import csv
# from bs4 import BeautifulSoup

# # Define the URL and parameters
# url = "https://www.ncei.noaa.gov/access/monitoring/climate-at-a-glance/statewide/time-series"
# params = {
#     "dataType": "avgTemp",
#     "area": "statewide",
#     "time_scale": "monthly",
#     "month": "6",
#     "state": "alabama",
#     "year": "2002"
# }

# def state_code_to_state(code):
#     # Mapping of state codes to state names
#     state_map = {
#         1: "alabama", 2: "alaska", 3: "arizona", 4: "arkansas", 5: "california",
#         6: "colorado", 7: "connecticut", 8: "delaware", 9: "florida", 10: "georgia",
#         11: "hawaii", 12: "idaho", 13: "illinois", 14: "indiana", 15: "iowa",
#         16: "kansas", 17: "kentucky", 18: "louisiana", 19: "maine", 20: "maryland",
#         21: "massachusetts", 22: "michigan", 23: "minnesota", 24: "mississippi", 25: "missouri",
#         26: "montana", 27: "nebraska", 28: "nevada", 29: "new-hampshire", 30: "new-jersey",
#         31: "new-mexico", 32: "new-york", 33: "north-carolina", 34: "north-dakota", 35: "ohio",
#         36: "oklahoma", 37: "oregon", 38: "pennsylvania", 39: "rhode-island", 40: "south-carolina",
#         41: "south-dakota", 42: "tennessee", 43: "texas", 44: "utah", 45: "vermont",
#         46: "virginia", 47: "washington", 48: "west-virginia", 49: "wisconsin", 50: "wyoming"
#     }
#     return state_map[code]

# # Create an empty list to store the data
# data = []

# # Loop through all 50 states
# for state_code in range(1, 51):
#     params["state"] = state_code_to_state(state_code)
    
#     # Loop through the weather parameters
#     for param in ["avgTemp", "minTemp", "precip", "cooling-degree-days"]:
#         params["dataType"] = param
        
#         # Loop through the years from 2002 to 2022
#         for year in range(2002, 2023):
#             params["year"] = year
            
#             # Make the request and parse the HTML
#             response = requests.get(url, params=params, timeout=60)
#             soup = BeautifulSoup(response.content, "html.parser")
            
#             # Find the relevant data elements
#             state_name = soup.find("h2").text.split(":")[0]
#             value = float(soup.find("td", {"class": f"{param.replace('-', '-')}"}).text)
            
#             # Add the data to the list
#             data.append({
#                 "State": state_name,
#                 "Year": year,
#                 "Month": "June",
#                 f"{param.title().replace('-', '_')}_june_value": value
#             })

# # Write the data to a CSV file
# with open("climate_data.csv", "w", newline="") as csvfile:
#     fieldnames = ["State", "Year", "Month", "Avg_temp_june_value", "Min_temp_june_value", "Precipitation_june_value", "Avg_cooling_degree_days_june"]
#     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

#     writer.writeheader()
#     for row in data:
#         writer.writerow(row)



In [4]:
# import requests
# import csv
# from bs4 import BeautifulSoup

# # Define the URL and parameters
# url = "https://www.ncei.noaa.gov/access/monitoring/climate-at-a-glance/statewide/time-series"
# params = {
#     "dataType": "avgTemp",
#     "area": "statewide",
#     "time_scale": "monthly",
#     "month": "6",
#     "state": "alabama",
#     "year": "2002"
# }

# def state_code_to_state(code):
#     # Mapping of state codes to state names
#     state_map = {
#         1: "alabama", 2: "alaska", 3: "arizona", 4: "arkansas", 5: "california",
#         6: "colorado", 7: "connecticut", 8: "delaware", 9: "florida", 10: "georgia",
#         11: "hawaii", 12: "idaho", 13: "illinois", 14: "indiana", 15: "iowa",
#         16: "kansas", 17: "kentucky", 18: "louisiana", 19: "maine", 20: "maryland",
#         21: "massachusetts", 22: "michigan", 23: "minnesota", 24: "mississippi", 25: "missouri",
#         26: "montana", 27: "nebraska", 28: "nevada", 29: "new-hampshire", 30: "new-jersey",
#         31: "new-mexico", 32: "new-york", 33: "north-carolina", 34: "north-dakota", 35: "ohio",
#         36: "oklahoma", 37: "oregon", 38: "pennsylvania", 39: "rhode-island", 40: "south-carolina",
#         41: "south-dakota", 42: "tennessee", 43: "texas", 44: "utah", 45: "vermont",
#         46: "virginia", 47: "washington", 48: "west-virginia", 49: "wisconsin", 50: "wyoming"
#     }
#     return state_map[code]

# # Create an empty list to store the data
# data = []

# # Loop through all 50 states
# for state_code in range(1, 51):
#     params["state"] = state_code_to_state(state_code)
    
#     # Loop through the weather parameters
#     for param in ["avgTemp", "minTemp", "precip", "cooling-degree-days"]:
#         params["dataType"] = param
        
#         # Loop through the years from 2002 to 2022
#         for year in range(2002, 2023):
#             params["year"] = year
            
#             # Make the request and parse the HTML
#             response = requests.get(url, params=params)
#             soup = BeautifulSoup(response.content, "html.parser")
            
#             # Find the relevant data elements
#             state_name = soup.find("h2")
#             if state_name:
#                 state_name = state_name.text.split(":")[0]
#             else:
#                 state_name = "Unknown"
            
#             value_element = soup.find("td", {"class": f"{param.replace('-', '-')}"})
#             if value_element:
#                 value = float(value_element.text)
#             else:
#                 value = float("nan")
            
#             # Add the data to the list
#             data.append({
#                 "State": state_name,
#                 "Year": year,
#                 "Month": "June",
#                 f"{param.title().replace('-', '_')}_june_value": value
#             })

# # Write the data to a CSV file
# with open("climate_data.csv", "w", newline="") as csvfile:
#     fieldnames = ["State", "Year", "Month", "Avg_temp_june_value", "Min_temp_june_value", "Precipitation_june_value", "Avg_cooling_degree_days_june"]
#     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

#     writer.writeheader()
#     for row in data:
#         writer.writerow(row)
