In [2]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import requests
from sklearn.linear_model import LinearRegression

In [3]:
# Global Init Variables
# Feature flags that switch the slow, network-bound data-collection cells on or
# off when the notebook is run top to bottom.
GET_TEMPERATURES = True  # gates the cell that writes temps.csv and observations.csv
GET_HUMIDITY = True  # gates the cell that writes humidity_data.csv
GET_WEATHER = True  # presumably meant to gate the weather_data.csv fetch — TODO confirm it is checked there

In [4]:
# Load the two scraped observation sets and tag each one with its
# powdery-mildew label
true_observations_df = pd.read_json(
    '../INaturalist_Scraper/Data/true_observations.json'
).assign(PowderyMildew=True)
false_observations_df = pd.read_json(
    '../INaturalist_Scraper/Data/false_observations.json'
).assign(PowderyMildew=False)

# Stack both sets, keep only rows without missing values, and discard the
# 'date' column because 'date_string' is the better source for the date
observations_df = (
    pd.concat([true_observations_df, false_observations_df], ignore_index=True)
    .dropna()
    .drop(columns='date')
)

In [None]:
# Helper that splits an ISO-format timestamp string into its calendar parts
def extract_date_components(date_string):
    """Return ``(year, month, day)`` parsed from an ISO-format date string.

    Any 'Z' in the input is rewritten to '+00:00' before the text is handed to
    ``datetime.datetime.fromisoformat``.
    """
    iso_text = date_string.replace('Z', '+00:00')
    parsed = datetime.datetime.fromisoformat(iso_text)
    return (parsed.year, parsed.month, parsed.day)

# Derive flat longitude/latitude and year/month/day columns for every
# observation (coordinates are indexed as [0] = longitude, [1] = latitude)
lons = [pair[0] for pair in observations_df['coordinates']]
lats = [pair[1] for pair in observations_df['coordinates']]

# Parse each date string once, then fan the parts out into separate lists
date_parts = [extract_date_components(text) for text in observations_df['date_string']]
years = [parts[0] for parts in date_parts]
months = [parts[1] for parts in date_parts]
days = [parts[2] for parts in date_parts]

observations_df['longitude'] = lons
observations_df['latitude'] = lats
observations_df['year'] = years
observations_df['month'] = months
observations_df['day'] = days

# Show a random handful of rows as a sanity check
observations_df.sample(10)


In [6]:
# Functions to fetch weather data
def get_temps(lon, lat, day, month, year, timeout=30):
    """Fetch hourly ``T2M`` temperatures around a date, converted to Fahrenheit.

    Queries the NASA POWER hourly point endpoint for a window that runs from
    7 days before to 7 days after the given date.

    Args:
        lon: Longitude sent to the API.
        lat: Latitude sent to the API.
        day: Day of month at the centre of the window.
        month: Month at the centre of the window.
        year: Year at the centre of the window.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict mapping each timestamp key in the response to the temperature
        in Fahrenheit, formatted as a string with two decimals. Returns
        ``None`` (after printing a message) when the request fails or the
        response cannot be parsed, so callers can retry.
    """
    url = "https://power.larc.nasa.gov/api/temporal/hourly/point"

    month_str = str(month).zfill(2)  # zero-pad to two digits
    day_str = str(day).zfill(2)      # same as above
    centre = datetime.datetime.strptime(f"{year}-{month_str}-{day_str}", "%Y-%m-%d")
    half_window = datetime.timedelta(days=7)

    parameters = {
        "parameters": "T2M",
        "community": "AG",
        "latitude": lat,
        "longitude": lon,
        "start": (centre - half_window).strftime("%Y%m%d"),
        "end": (centre + half_window).strftime("%Y%m%d"),
        "format": "JSON"
    }

    # The request and the JSON decoding live inside the try block so that a
    # network failure or a non-JSON error page yields None instead of raising.
    try:
        response = requests.get(url, params=parameters, timeout=timeout)
        readings = response.json()['properties']['parameter']['T2M']
        # NOTE(review): POWER is believed to mark missing readings with a fill
        # value (-999); those are converted like any other number — confirm
        # whether they should be filtered out downstream.
        return {
            key: '{0:.2f}'.format((temperature * (9/5)) + 32)  # convert to Fahrenheit
            for key, temperature in readings.items()
        }
    except (requests.RequestException, KeyError, TypeError, ValueError, AttributeError):
        print(f"Error getting temperature data for {year}-{month_str}-{day_str} at {lat},{lon}")
        return None


In [7]:
def get_humidity(lon, lat, day, month, year, timeout=30):
    """Fetch hourly ``RH2M`` humidity readings around a date.

    Queries the NASA POWER hourly point endpoint for a window that runs from
    7 days before to 7 days after the given date.

    Args:
        lon: Longitude sent to the API.
        lat: Latitude sent to the API.
        day: Day of month at the centre of the window.
        month: Month at the centre of the window.
        year: Year at the centre of the window.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict mapping each timestamp key in the response to the humidity
        value formatted as a string with two decimals. Returns ``None``
        (after printing a message) when the request fails or the response
        cannot be parsed, so callers can retry.
    """
    url = "https://power.larc.nasa.gov/api/temporal/hourly/point"

    month_str = str(month).zfill(2)  # zero-pad to two digits
    day_str = str(day).zfill(2)      # same as above
    centre = datetime.datetime.strptime(f"{year}-{month_str}-{day_str}", "%Y-%m-%d")
    half_window = datetime.timedelta(days=7)

    parameters = {
        "parameters": "RH2M",
        "community": "AG",
        "latitude": lat,
        "longitude": lon,
        "start": (centre - half_window).strftime("%Y%m%d"),
        "end": (centre + half_window).strftime("%Y%m%d"),
        "format": "JSON"
    }

    # The request and the JSON decoding live inside the try block so that a
    # network failure or a non-JSON error page yields None instead of raising.
    try:
        response = requests.get(url, params=parameters, timeout=timeout)
        readings = response.json()['properties']['parameter']['RH2M']
        return {
            key: '{0:.2f}'.format(humidity)
            for key, humidity in readings.items()
        }
    except (requests.RequestException, KeyError, TypeError, ValueError, AttributeError):
        print(f"Error getting humidity data for {year}-{month_str}-{day_str} at {lat},{lon}")
        return None


In [None]:
def get_general_weather(lon, lat, day, month, year, param_list, timeout=30):
    """Fetch several hourly NASA POWER parameters around a date in one request.

    Queries the hourly point endpoint for a window that runs from 7 days
    before to 7 days after the given date. Values are returned as received;
    no formatting or unit conversion is applied.

    Args:
        lon: Longitude sent to the API.
        lat: Latitude sent to the API.
        day: Day of month at the centre of the window.
        month: Month at the centre of the window.
        year: Year at the centre of the window.
        param_list: Sequence of parameter codes (for example ``'T2M'``); sent
            to the API as a comma-separated string.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict mapping each requested parameter code to the series the API
        returned for it. Returns ``None`` (after printing a message) when the
        request fails, the response cannot be parsed, or any requested
        parameter is missing, so callers can retry.
    """
    url = "https://power.larc.nasa.gov/api/temporal/hourly/point"

    month_str = str(month).zfill(2)  # zero-pad to two digits
    day_str = str(day).zfill(2)      # same as above
    centre = datetime.datetime.strptime(f"{year}-{month_str}-{day_str}", "%Y-%m-%d")
    half_window = datetime.timedelta(days=7)

    parameters = {
        "parameters": ",".join(param_list),
        "community": "AG",
        "latitude": lat,
        "longitude": lon,
        "start": (centre - half_window).strftime("%Y%m%d"),
        "end": (centre + half_window).strftime("%Y%m%d"),
        "format": "JSON"
    }

    # The request and the JSON decoding live inside the try block so that a
    # network failure or a non-JSON error page yields None instead of raising.
    try:
        response = requests.get(url, params=parameters, timeout=timeout)
        series_by_param = response.json()['properties']['parameter']
        return {param: series_by_param[param] for param in param_list}
    except (requests.RequestException, KeyError, TypeError, ValueError, AttributeError):
        print(f"Error getting data for {year}-{month_str}-{day_str} at {lat},{lon}")
        return None

In [None]:
# Fetch temperature data for every observation and save it

if GET_TEMPERATURES:
    temp_frames = []  # one frame per observation, concatenated once after the loop

    for i, row in observations_df.iterrows():
        # One initial request plus up to three retries; the observation is
        # skipped if every attempt returns None.
        temp = None
        for _attempt in range(4):
            temp = get_temps(row['longitude'], row['latitude'], row['day'], row['month'], row['year'])
            if temp is not None:
                break
        if temp is None:
            continue

        temp_frames.append(pd.DataFrame({
            'Datetime': list(temp.keys()),
            'Temperature': list(temp.values()),
            'ID': [row['id']] * len(temp)
        }))

        # Print progress whenever the row's index label is a multiple of 10
        if i % 10 == 0:
            print(f"Processed {i}")

    # A single concat avoids re-copying the accumulated frame on every
    # iteration; fall back to an empty, correctly-labelled frame when nothing
    # could be fetched.
    if temp_frames:
        temps_df = pd.concat(temp_frames, ignore_index=True)
    else:
        temps_df = pd.DataFrame(columns=['Datetime', 'Temperature', 'ID'])

    temps_df.to_csv('temps.csv', index=False)

    # Save the processed observations alongside the temperature data
    observations_df.to_csv('observations.csv', index=False)
    print(f"Saved processed dataframe with {len(observations_df)} observations to observations.csv")
    


In [None]:
if GET_HUMIDITY:
    humidity_frames = []  # one frame per observation, concatenated once after the loop

    for i, row in observations_df.iterrows():
        # One initial request plus up to three retries; the observation is
        # skipped if every attempt returns None.
        humidity = None
        for _attempt in range(4):
            humidity = get_humidity(row['longitude'], row['latitude'], row['day'], row['month'], row['year'])
            if humidity is not None:
                break
        if humidity is None:
            continue

        humidity_frames.append(pd.DataFrame({
            'Datetime': list(humidity.keys()),
            'Humidity': list(humidity.values()),
            'ID': [row['id']] * len(humidity)
        }))

        # Print progress whenever the row's index label is a multiple of 10
        if i % 10 == 0:
            print(f"Processed {i}")

    # A single concat avoids re-copying the accumulated frame on every
    # iteration; fall back to an empty, correctly-labelled frame when nothing
    # could be fetched.
    if humidity_frames:
        humidities_df = pd.concat(humidity_frames, ignore_index=True)
    else:
        humidities_df = pd.DataFrame(columns=['Datetime', 'Humidity', 'ID'])

    humidities_df.to_csv('humidity_data.csv', index=False)


In [None]:
# NASA POWER parameter code -> output column name. Insertion order fixes both
# the order of the requested parameters and the CSV column order.
WEATHER_COLUMNS = {
    'T2M': 'Temperature',              # Temperature at 2 Meters
    'RH2M': 'Humidity',                # Relative Humidity at 2 Meters
    'WS2M': 'Wind Speed',              # Wind Speed at 2 Meters
    'T2MDEW': 'Dew/Frost Point',       # Dew/Frost Point at 2 Meters
    'T2MWET': 'Wet Bulb Temperature',  # Wet Bulb Temperature at 2 Meters
    'QV2M': 'Specific Humidity',       # Specific Humidity at 2 Meters
}

# Guarded by the GET_WEATHER flag so this fetch can be switched off like the
# temperature and humidity fetches.
if GET_WEATHER:
    weather_params = list(WEATHER_COLUMNS)
    weather_frames = []  # one frame per observation, concatenated once after the loop

    for i, row in observations_df.iterrows():
        # One initial request plus up to three retries; the observation is
        # skipped if every attempt returns None.
        weather_data = None
        for _attempt in range(4):
            weather_data = get_general_weather(row['longitude'], row['latitude'], row['day'], row['month'], row['year'], weather_params)
            if weather_data is not None:
                break
        if weather_data is None:
            continue

        # Timestamps are taken from the T2M series; the other series are
        # attached positionally, exactly as returned by get_general_weather.
        dates = list(weather_data['T2M'].keys())
        columns = {'Datetime': dates, 'ID': [row['id']] * len(dates)}
        for param, column_name in WEATHER_COLUMNS.items():
            columns[column_name] = list(weather_data[param].values())
        weather_frames.append(pd.DataFrame(columns))

        # Print progress whenever the row's index label is a multiple of 10
        if i % 10 == 0:
            print(f"Processed {i}")

    # A single concat avoids re-copying the accumulated frame on every
    # iteration; fall back to an empty, correctly-labelled frame when nothing
    # could be fetched.
    if weather_frames:
        dataframe = pd.concat(weather_frames, ignore_index=True)
    else:
        dataframe = pd.DataFrame(columns=['Datetime', 'ID', *WEATHER_COLUMNS.values()])

    dataframe.to_csv('weather_data.csv', index=False)