In [2]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import requests
from sklearn.linear_model import LinearRegression

In [3]:
# Global Init Variables
GET_TEMPERATURES = True
GET_HUMIDITY = True


In [4]:
# read both observation files
true_observations_df = pd.read_json('../INaturalist_Scraper/Data/true_observations.json')
false_observations_df = pd.read_json('../INaturalist_Scraper/Data/false_observations.json')

# add powdery mildew bool feature for each collection
true_observations_df['PowderyMildew'] = True
false_observations_df['PowderyMildew'] = False

# concat collections
observations_df = pd.concat([true_observations_df, false_observations_df], ignore_index=True)

# Drop rows with missing values
observations_df = observations_df.dropna()

# remove date, we date_string is better
observations_df.drop('date', axis=1, inplace=True)

In [5]:
# Function to parse date string and extract components
def extract_date_components(date_string):
    import datetime
    # Parse ISO format date string
    dt = datetime.datetime.fromisoformat(date_string.replace('Z', '+00:00'))
    return dt.year, dt.month, dt.day

# Extract relevant data for true observations
lons = []
lats = []
years = []
months = []
days = []

for _, row in observations_df.iterrows():
    lons.append(row['coordinates'][0])
    lats.append(row['coordinates'][1])
    year, month, day = extract_date_components(row['date_string'])
    years.append(year)
    months.append(month)
    days.append(day)
    
observations_df['longitude'] = lons
observations_df['latitude'] = lats
observations_df['year'] = years
observations_df['month'] = months
observations_df['day'] = days

observations_df.sample(10)


Unnamed: 0,id,date_string,coordinates,PowderyMildew,longitude,latitude,year,month,day
306,186009057,2023-10-03T15:48:00+13:00,"[172.642075, -43.6010138889]",False,172.642075,-43.601014,2023,10,3
173,210921745,2024-04-28T14:58:54+01:00,"[-0.3670084382, 53.7289734935]",False,-0.367008,53.728973,2024,4,28
180,18047775,2018-11-03T14:52:00+13:00,"[169.4713824526, -46.5434026055]",False,169.471382,-46.543403,2018,11,3
73,188080439,2023-10-18T10:36:00+02:00,"[6.9456797552000005, 46.9988705082]",True,6.94568,46.998871,2023,10,18
28,116868714,2022-05-13T14:52:16-04:00,"[-73.95166084, 40.7702097249]",True,-73.951661,40.77021,2022,5,13
57,159198970,2023-05-02T15:09:00+02:00,"[15.8573016667, 46.8729116667]",True,15.857302,46.872912,2023,5,2
317,77564096,2021-05-03T12:31:00+12:00,"[172.64147615, -43.5878140167]",False,172.641476,-43.587814,2021,5,3
99,42241980,2020-04-13T16:18:21+02:00,"[-1.6724583333, 55.1538805]",True,-1.672458,55.15388,2020,4,13
295,213368300,2024-04-28T18:41:00+12:00,"[172.63369145, -43.5797742667]",False,172.633691,-43.579774,2024,4,28
59,45885758,2020-05-14T11:56:29-04:00,"[-73.9512248152, 40.7708008262]",True,-73.951225,40.770801,2020,5,14


In [6]:
# Functions to fetch weather data
def get_temps(lon, lat, day, month, year):
    url = "https://power.larc.nasa.gov/api/temporal/hourly/point"
    
    month_str = str(month).zfill(2)  # make sure is 2 length
    day_str = str(day).zfill(2)      # same as above
    start_date = f"{year}-{month_str}-{day_str}"

    start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d") 
    end_date = start_date + datetime.timedelta(days=7)
    start_date -= datetime.timedelta(days=7)

    start_date = start_date.strftime("%Y%m%d")
    end_date = end_date.strftime("%Y%m%d")
    
    temp_dict = {}
    parameters = {
        "parameters": "T2M",
        "community": "AG",
        "latitude": lat,
        "longitude": lon,
        "start": start_date,
        "end": end_date,
        "format": "JSON"
    }

    response = requests.get(url, params=parameters)
    data = response.json()
    try:
        for key, temperature in data['properties']['parameter']['T2M'].items():
            data['properties']['parameter']['T2M'][key] = '{0:.2f}'.format((temperature * (9/5)) + 32) # convert to Fahrenheit
    except:
        print(f"Error getting temperature data for {year}-{month_str}-{day_str} at {lat},{lon}")
        return None
    
    return data['properties']['parameter']['T2M']


In [7]:
def get_humidity(lon, lat, day, month, year):
    url = "https://power.larc.nasa.gov/api/temporal/hourly/point"
    
    month_str = str(month).zfill(2)  # make sure is 2 length
    day_str = str(day).zfill(2)      # same as above
    start_date = f"{year}-{month_str}-{day_str}"

    start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d") 
    end_date = start_date + datetime.timedelta(days=7)
    start_date -= datetime.timedelta(days=7)

    start_date = start_date.strftime("%Y%m%d")
    end_date = end_date.strftime("%Y%m%d")
    
    humidity_dict = {}
    parameters = {
        "parameters": "RH2M",
        "community": "AG",
        "latitude": lat,
        "longitude": lon,
        "start": start_date,
        "end": end_date,
        "format": "JSON"
    }

    response = requests.get(url, params=parameters)
    data = response.json()
    try:
        for key, humidity in data['properties']['parameter']['RH2M'].items():
            data['properties']['parameter']['RH2M'][key] = '{0:.2f}'.format(humidity)
    except:
        print(f"Error getting humidity data for {year}-{month_str}-{day_str} at {lat},{lon}")
        return None
    
    return data['properties']['parameter']['RH2M']


In [11]:
def get_general_weather(lon, lat, day, month, year, param_list):
    url = "https://power.larc.nasa.gov/api/temporal/hourly/point"
    
    month_str = str(month).zfill(2)  # make sure is 2 length
    day_str = str(day).zfill(2)      # same as above
    start_date = f"{year}-{month_str}-{day_str}"

    start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d") 
    end_date = start_date + datetime.timedelta(days=7)
    start_date -= datetime.timedelta(days=7)

    start_date = start_date.strftime("%Y%m%d")
    end_date = end_date.strftime("%Y%m%d")
    
    parameters = {
        "parameters": ",".join(param_list),
        "community": "AG",
        "latitude": lat,
        "longitude": lon,
        "start": start_date,
        "end": end_date,
        "format": "JSON"
    }
    curr_data = {}

    response = requests.get(url, params=parameters)
    data = response.json()
    try:
        for param in param_list:
            curr_data[param] = data['properties']['parameter'][param]
    except:
        print(f"Error getting data for {year}-{month_str}-{day_str} at {lat},{lon}")
        return None
    
    return curr_data

{'T2M': {'2019122500': 3.51, '2019122501': 4.07, '2019122502': 4.73, '2019122503': 5.26, '2019122504': 5.65, '2019122505': 5.89, '2019122506': 6.23, '2019122507': 6.53, '2019122508': 7.22, '2019122509': 8.25, '2019122510': 9.46, '2019122511': 10.54, '2019122512': 10.96, '2019122513': 10.78, '2019122514': 10.44, '2019122515': 9.42, '2019122516': 8.08, '2019122517': 7.56, '2019122518': 7.32, '2019122519': 6.96, '2019122520': 6.66, '2019122521': 6.33, '2019122522': 6.1, '2019122523': 5.9, '2019122600': 5.65, '2019122601': 5.01, '2019122602': 4.0, '2019122603': 3.42, '2019122604': 3.47, '2019122605': 3.29, '2019122606': 2.53, '2019122607': 1.56, '2019122608': 3.79, '2019122609': 5.7, '2019122610': 7.83, '2019122611': 9.99, '2019122612': 11.39, '2019122613': 11.82, '2019122614': 11.71, '2019122615': 10.92, '2019122616': 7.53, '2019122617': 5.42, '2019122618': 4.84, '2019122619': 4.33, '2019122620': 4.04, '2019122621': 3.81, '2019122622': 3.5, '2019122623': 2.95, '2019122700': 2.41, '2019122

In [9]:
# Add temperature data to dataframe and save 

if GET_TEMPERATURES:
    temp_data = []
    temps_df = pd.DataFrame(columns=['Datetime', 'Temperature', 'ID'])
    
    for i, row in observations_df.iterrows():
        temp = get_temps(row['longitude'], row['latitude'], row['day'], row['month'], row['year'])
        # if invalid data, try 4 more times to get data, if fail, skip
        if temp is None:
            for j in range(3):
                temp = get_temps(row['longitude'], row['latitude'], row['day'], row['month'], row['year'])
                if temp is not None:
                    break
        if temp is None:
            continue
        dates = list(temp.keys())
        temperature_values = list(temp.values())
        id_vals = [row['id']] * len(dates)
        
        temp_df = pd.DataFrame({
            'Datetime': dates,
            'Temperature': temperature_values,
            'ID': id_vals
        })
        
        temps_df = pd.concat([temps_df, temp_df], ignore_index=True)
        # Print progress every 10 rows
        if i % 10 == 0:
            print(f"Processed {i}")
    
    temps_df.to_csv('temps.csv', index=False)

    # Save to CSV file
    observations_df.to_csv('observations.csv', index=False)
    print(f"Saved processed dataframe with {len(observations_df)} observations to observations.csv")
    


Processed 0
Processed 10
Processed 20
Processed 30
Processed 40
Processed 50
Processed 60
Processed 70
Processed 80
Processed 90
Processed 100
Processed 110
Processed 120
Processed 130
Processed 140
Processed 150
Processed 160
Processed 170
Processed 180
Processed 190
Processed 200
Processed 210
Processed 220
Processed 230
Processed 240
Processed 250
Error getting temperature data for 1987-05-05 at 41.2955379923,-81.4316247254
Error getting temperature data for 1987-05-05 at 41.2955379923,-81.4316247254
Error getting temperature data for 1987-05-05 at 41.2955379923,-81.4316247254
Error getting temperature data for 1987-05-05 at 41.2955379923,-81.4316247254
Processed 270
Processed 280
Processed 290
Processed 300
Processed 310
Processed 320
Saved processed dataframe with 322 observations to test.csv


In [18]:
if GET_HUMIDITY:
    humidity_data = []
    humidities_df = pd.DataFrame(columns=['Datetime', 'Humidity', 'ID'])
    
    for i, row in observations_df.iterrows():
        humidity = get_humidity(row['longitude'], row['latitude'], row['day'], row['month'], row['year'])
        # if invalid data, try 4 more times to get data, if fail, skip
        if humidity is None:
            for j in range(3):
                humidity = get_humidity(row['longitude'], row['latitude'], row['day'], row['month'], row['year'])
                if humidity is not None:
                    break
        if humidity is None:
            continue
        dates = list(humidity.keys())
        humidity_values = list(humidity.values())
        id_vals = [row['id']] * len(dates)
        humidity_df = pd.DataFrame({
            'Datetime': dates,
            'Humidity': humidity_values,
            'ID': id_vals
        })
        humidities_df = pd.concat([humidities_df, humidity_df], ignore_index=True)
        # Print progress every 10 rows
        if i % 10 == 0:
            print(f"Processed {i}")
    
    humidities_df.to_csv('humidity_data.csv', index=False)


Processed 0
Processed 10
Processed 20
Processed 30
Processed 40
Processed 50
Processed 60
Processed 70
Processed 80
Processed 90
Processed 100
Processed 110
Processed 120
Processed 130
Processed 140
Processed 150
Processed 160
Processed 170
Processed 180
Processed 190
Processed 200
Processed 210
Processed 220
Processed 230
Processed 240
Processed 250
Error getting humidity data for 1987-05-05 at 41.2955379923,-81.4316247254
Error getting humidity data for 1987-05-05 at 41.2955379923,-81.4316247254
Error getting humidity data for 1987-05-05 at 41.2955379923,-81.4316247254
Error getting humidity data for 1987-05-05 at 41.2955379923,-81.4316247254
Processed 270
Processed 280
Processed 290
Processed 300
Processed 310
Processed 320


In [14]:
"""
T2M	Temperature at 2 Meters
RH2M	Relative Humidity at 2 Meters
WS2M	Wind Speed at 2 Meters
T2MDEW	Dew/Frost Point at 2 Meters
T2MWET	Wet Bulb Temperature at 2 Meters
QV2M	Specific Humidity at 2 Meters
"""
data = []
dataframe = pd.DataFrame(columns=['Datetime','ID', 'Temperature', 'Humidity', 'Wind Speed', 'Dew/Frost Point', 'Wet Bulb Temperature', 'Specific Humidity'])

for i, row in observations_df.iterrows():
    weather_data = get_general_weather(row['longitude'], row['latitude'], row['day'], row['month'], row['year'], ['T2M', 'RH2M', 'WS2M', 'T2MDEW', 'T2MWET', 'QV2M'])
    if weather_data is None:
        for j in range(3):
            weather_data = get_general_weather(row['longitude'], row['latitude'], row['day'], row['month'], row['year'], ['T2M', 'RH2M', 'WS2M', 'T2MDEW', 'T2MWET', 'QV2M'])
            if weather_data is not None:
                break
    if weather_data is None:    
        continue
    dates = list(weather_data['T2M'].keys())
    id_vals = [row['id']] * len(dates)
    temp_values = list(weather_data['T2M'].values())
    humidity_values = list(weather_data['RH2M'].values())
    wind_speed_values = list(weather_data['WS2M'].values())
    dew_frost_values = list(weather_data['T2MDEW'].values())
    wet_bulb_values = list(weather_data['T2MWET'].values())
    specific_humidity_values = list(weather_data['QV2M'].values())
    
    curr_df = pd.DataFrame({
        'Datetime': dates,
        'ID': id_vals,
        'Temperature': temp_values,
        'Humidity': humidity_values,
        'Wind Speed': wind_speed_values,
        'Dew/Frost Point': dew_frost_values,
        'Wet Bulb Temperature': wet_bulb_values,
        'Specific Humidity': specific_humidity_values
    })
    
    dataframe = pd.concat([dataframe, curr_df], ignore_index=True)
    # Print progress every 10 rows
    if i % 10 == 0:
        print(f"Processed {i}")
    
dataframe.to_csv('weather_data.csv', index=False)

  dataframe = pd.concat([dataframe, curr_df], ignore_index=True)


Processed 0
Processed 10
Processed 20
Processed 30
Processed 40
Processed 50
Processed 60
Processed 70
Processed 80
Processed 90
Processed 100
Processed 110
Processed 120
Processed 130
Processed 140
Processed 150
Processed 160
Processed 170
Processed 180
Processed 190
Processed 200
Processed 210
Processed 220
Processed 230
Processed 240
Processed 250
Error getting data for 1987-05-05 at 41.2955379923,-81.4316247254
{'header': 'The POWER Hourly API failed to complete your request; please review the errors below and the POWER Docs (https://power.larc.nasa.gov/docs/).', 'messages': ['Please provide a correct start date. Your start date out of range. The data starts at 2001/01/01.']}
Error getting data for 1987-05-05 at 41.2955379923,-81.4316247254
{'header': 'The POWER Hourly API failed to complete your request; please review the errors below and the POWER Docs (https://power.larc.nasa.gov/docs/).', 'messages': ['Please provide a correct start date. Your start date out of range. The data 

TypeError: 'NoneType' object is not subscriptable