In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

import requests
from sklearn.linear_model import LinearRegression


In [2]:
# Global Init Variables
# Switches for the weather-fetch step, which issues one HTTP request per
# observation row:
#   GET_TEMPERATURES - when True, call get_temps for every row and write
#                      observations_with_temps.csv
#   GET_HUMIDITY     - when True, call get_humidity for every row and write
#                      observations_with_humidity.csv
# With both set to False the fetch step does nothing.
GET_TEMPERATURES = False
GET_HUMIDITY = False


In [3]:
# Load the two observation collections from the INaturalist_Scraper data directory
true_observations_df = pd.read_json('../INaturalist_Scraper/Data/true_observations.json')
false_observations_df = pd.read_json('../INaturalist_Scraper/Data/false_observations.json')

# Tag every row with the collection it came from (boolean PowderyMildew label)
true_observations_df['PowderyMildew'] = True
false_observations_df['PowderyMildew'] = False

# Stack both collections with a fresh index, discard rows that have any
# missing value, then remove the 'date' column: 'date_string' is the column
# used for date handling from here on.
observations_df = (
    pd.concat([true_observations_df, false_observations_df], ignore_index=True)
    .dropna()
    .drop(columns='date')
)

In [4]:
def extract_date_components(date_string):
    """Split an ISO-8601 timestamp string into its calendar parts.

    Parameters
    ----------
    date_string : str
        ISO-format timestamp, e.g. '2023-08-05T10:09:48-07:00'. A trailing
        'Z' is accepted and treated as the '+00:00' offset.

    Returns
    -------
    tuple of int
        (year, month, day) exactly as written in the string; no time zone
        conversion is applied.
    """
    from datetime import datetime as _datetime

    # Spell a 'Z' suffix as an explicit UTC offset before parsing
    normalized = date_string.replace('Z', '+00:00')
    moment = _datetime.fromisoformat(normalized)
    return (moment.year, moment.month, moment.day)

# Derive flat longitude/latitude and year/month/day columns for every
# observation (both PowderyMildew=True and PowderyMildew=False rows).

# 'coordinates' holds [longitude, latitude] pairs
lons = [pair[0] for pair in observations_df['coordinates']]
lats = [pair[1] for pair in observations_df['coordinates']]

# Parse each timestamp once, then split the (year, month, day) tuples
date_parts = [extract_date_components(stamp) for stamp in observations_df['date_string']]
years = [parts[0] for parts in date_parts]
months = [parts[1] for parts in date_parts]
days = [parts[2] for parts in date_parts]

observations_df['longitude'] = lons
observations_df['latitude'] = lats
observations_df['year'] = years
observations_df['month'] = months
observations_df['day'] = days

# Spot-check ten random rows
observations_df.sample(10)


Unnamed: 0,id,date_string,coordinates,PowderyMildew,longitude,latitude,year,month,day
1290,177615841,2023-08-05T10:09:48-07:00,"[-122.0981850475, 47.9128670532]",False,-122.098185,47.912867,2023,8,5
630,31718157,2019-08-29T11:24:12+02:00,"[-2.9207187538, 53.3734190866]",False,-2.920719,53.373419,2019,8,29
1150,161652324,2023-05-14T10:32:25+03:00,"[37.7569416667, 55.7701333333]",False,37.756942,55.770133,2023,5,14
559,160767639,2023-05-08T14:32:16+02:00,"[14.3221916667, 50.0478221667]",False,14.322192,50.047822,2023,5,8
1320,77857287,2021-04-30T15:24:16+02:00,"[13.1553416667, 52.5485333333]",False,13.155342,52.548533,2021,4,30
904,114557249,2022-05-01T14:52:00+12:00,"[172.5554206, -43.5604603]",False,172.555421,-43.56046,2022,5,1
1041,210324842,2024-04-27T16:13:00+02:00,"[13.5557527778, 52.5801166667]",False,13.555753,52.580117,2024,4,27
707,16998192,2018-09-26T11:09:26-07:00,"[-123.1875134376, 49.139600254]",False,-123.187513,49.1396,2018,9,26
542,114787611,2022-05-02T15:23:28+02:00,"[14.4693641661, 50.0916976928]",False,14.469364,50.091698,2022,5,2
940,158241682,2023-04-30T15:27:09+02:00,"[16.2112138889, 48.1364194444]",False,16.211214,48.136419,2023,4,30


In [5]:
# Functions to fetch weather data
def _fetch_daily_mean(parameter, label, lon, lat, day, month, year, convert=None, timeout=30):
    """Return the mean of one NASA POWER hourly parameter for a single day.

    Parameters
    ----------
    parameter : str
        POWER parameter code to request (e.g. "T2M" or "RH2M").
    label : str
        Human-readable name of the quantity, used only in the error message.
    lon, lat : float
        Longitude and latitude of the point to query.
    day, month, year : int
        Calendar date to query; the same date is sent as both start and end.
    convert : callable, optional
        Applied to every valid hourly reading before averaging.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    float or None
        Mean of the valid hourly readings, or None (after printing an error
        message) when the request fails, the response has an unexpected
        shape, or it contains no valid readings.
    """
    url = "https://power.larc.nasa.gov/api/temporal/hourly/point"

    month_str = str(month).zfill(2)  # zero-pad to two digits
    day_str = str(day).zfill(2)      # zero-pad to two digits
    start_date = int(f"{year}{month_str}{day_str}")

    query = {
        "parameters": parameter,
        "community": "AG",
        "latitude": lat,
        "longitude": lon,
        "start": start_date,
        "end": start_date,
        "format": "JSON",
    }

    try:
        # The request and the JSON decoding are inside the try block so that
        # a network failure or a non-JSON reply yields None for this row
        # instead of aborting the caller's loop over all observations.
        response = requests.get(url, params=query, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        # POWER marks missing hours with a fill value (reported in the
        # response header, -999 by default); those must not be averaged.
        fill_value = data.get('header', {}).get('fill_value', -999)
        readings = [
            value
            for value in data['properties']['parameter'][parameter].values()
            if value is not None and value != fill_value
        ]
        if not readings:
            raise ValueError("no valid hourly readings")

        if convert is not None:
            readings = [convert(value) for value in readings]

        # Divide by the number of readings actually present, not a fixed 24
        return sum(readings) / len(readings)
    except (requests.RequestException, KeyError, TypeError, ValueError, AttributeError):
        print(f"Error getting {label} data for {year}-{month_str}-{day_str} at {lat},{lon}")
        return None


def get_temps(lon, lat, day, month, year):
    """Return the mean 2 m temperature (T2M) for one day at one location.

    Every hourly reading is converted with the Celsius-to-Fahrenheit formula
    before averaging. Hours carrying the API fill value are ignored.

    Parameters
    ----------
    lon, lat : float
        Longitude and latitude of the observation.
    day, month, year : int
        Calendar date of the observation.

    Returns
    -------
    float or None
        Daily mean temperature, or None if the data could not be retrieved
        (an error message is printed in that case).
    """
    return _fetch_daily_mean(
        "T2M", "temperature", lon, lat, day, month, year,
        convert=lambda celsius: (celsius * (9/5)) + 32,  # convert to Fahrenheit
    )


def get_humidity(lon, lat, day, month, year):
    """Return the mean 2 m relative humidity (RH2M) for one day at one location.

    Hours carrying the API fill value are ignored.

    Parameters
    ----------
    lon, lat : float
        Longitude and latitude of the observation.
    day, month, year : int
        Calendar date of the observation.

    Returns
    -------
    float or None
        Daily mean of the hourly RH2M readings, or None if the data could
        not be retrieved (an error message is printed in that case).
    """
    return _fetch_daily_mean("RH2M", "humidity", lon, lat, day, month, year)

In [6]:
# Optionally add weather columns to the observations and save each result as CSV.
# Each job: (enabled flag, fetch function, new column name, output CSV path).
# Jobs run in order on the same frame, so when both flags are on the humidity
# stage works on the rows that survived the temperature stage and its CSV
# also carries the 'Temp' column.
weather_jobs = [
    (GET_TEMPERATURES, get_temps, 'Temp', 'observations_with_temps.csv'),
    (GET_HUMIDITY, get_humidity, 'Humidity', 'observations_with_humidity.csv'),
]

for enabled, fetch_value, column, csv_path in weather_jobs:
    if not enabled:
        continue

    # One fetch call per observation row
    fetched = []
    for i, row in observations_df.iterrows():
        fetched.append(
            fetch_value(row['longitude'], row['latitude'], row['day'], row['month'], row['year'])
        )

        # Progress message whenever the row's index label is a multiple of 10
        if i % 10 == 0:
            print(f"Processed {i}")

    # Attach the fetched values as a new column
    observations_df[column] = fetched

    # Rows whose fetch returned None hold a missing value and are removed
    observations_df = observations_df.dropna(subset=[column])

    # Save to CSV file
    observations_df.to_csv(csv_path, index=False)
    print(f"Saved processed dataframe with {len(observations_df)} observations to {csv_path}")


In [7]:
# Preview the first rows of the observations table after the optional weather-fetch step
observations_df.head()

Unnamed: 0,id,date_string,coordinates,PowderyMildew,longitude,latitude,year,month,day
0,48688154,2020-06-06T14:17:01-04:00,"[-73.9520673641, 40.7700205967]",True,-73.952067,40.770021,2020,6,6
1,206356226,2024-04-09T19:08:00-04:00,"[-76.3073433611, 38.9860498889]",True,-76.307343,38.98605,2024,4,9
2,162498461,2023-05-19T16:44:24+02:00,"[12.9216153547, 48.6943073404]",True,12.921615,48.694307,2023,5,19
3,116476822,2022-05-10T16:37:09-04:00,"[-73.95191313710001, 40.7701800205]",True,-73.951913,40.77018,2022,5,10
5,161092221,2023-05-11T15:45:23+01:00,"[-0.3697666526, 53.7434492925]",True,-0.369767,53.743449,2023,5,11


In [8]:
# Load the per-observation humidity and temperature tables from their CSV files
hum_df = pd.read_csv('observations_with_humidity.csv')
temp_df = pd.read_csv('observations_with_temps.csv')

# Keep only observations present in both tables, matched on the observation id.
# NOTE(review): 'id' is the only join key, so every other column the two files
# share comes out twice with '_x' (humidity file) and '_y' (temperature file)
# suffixes. Merging just temp_df[['id', 'Temp']] would avoid the duplicates,
# but confirm nothing downstream relies on the suffixed names first.
weather_df = hum_df.merge(temp_df, how='inner', on=['id'])


Unnamed: 0,id,date_string_x,coordinates_x,PowderyMildew_x,longitude_x,latitude_x,year_x,month_x,day_x,Humidity,date_string_y,coordinates_y,PowderyMildew_y,longitude_y,latitude_y,year_y,month_y,day_y,Temp
0,48688154,2020-06-06T14:17:01-04:00,"[-73.9520673641, 40.7700205967]",True,-73.952067,40.770021,2020,6,6,78.492917,2020-06-06T14:17:01-04:00,"[-73.9520673641, 40.7700205967]",True,-73.952067,40.770021,2020,6,6,72.44525
1,206356226,2024-04-09T19:08:00-04:00,"[-76.3073433611, 38.9860498889]",True,-76.307343,38.98605,2024,4,9,82.80375,2024-04-09T19:08:00-04:00,"[-76.3073433611, 38.9860498889]",True,-76.307343,38.98605,2024,4,9,59.2625
2,162498461,2023-05-19T16:44:24+02:00,"[12.9216153547, 48.6943073404]",True,12.921615,48.694307,2023,5,19,78.535417,2023-05-19T16:44:24+02:00,"[12.9216153547, 48.6943073404]",True,12.921615,48.694307,2023,5,19,53.86175
3,116476822,2022-05-10T16:37:09-04:00,"[-73.95191313710001, 40.7701800205]",True,-73.951913,40.77018,2022,5,10,58.924167,2022-05-10T16:37:09-04:00,"[-73.95191313710001, 40.7701800205]",True,-73.951913,40.77018,2022,5,10,52.541
4,161092221,2023-05-11T15:45:23+01:00,"[-0.3697666526, 53.7434492925]",True,-0.369767,53.743449,2023,5,11,88.689583,2023-05-11T15:45:23+01:00,"[-0.3697666526, 53.7434492925]",True,-0.369767,53.743449,2023,5,11,52.5305


In [None]:
# Preview the first rows of the table read from observations_with_humidity.csv
hum_df.head()

In [None]:
# Preview the first rows of the table read from observations_with_temps.csv
temp_df.head()