In [None]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import requests
from sklearn.linear_model import LinearRegression

In [2]:
# Global Init Variables
GET_TEMPERATURES = True
GET_HUMIDITY = True


In [3]:
# read both observation files
true_observations_df = pd.read_json('../INaturalist_Scraper/Data/true_observations.json')
false_observations_df = pd.read_json('../INaturalist_Scraper/Data/false_observations.json')

# add powdery mildew bool feature for each collection
true_observations_df['PowderyMildew'] = True
false_observations_df['PowderyMildew'] = False

# concat collections
observations_df = pd.concat([true_observations_df, false_observations_df], ignore_index=True)

# Drop rows with missing values
observations_df = observations_df.dropna()

# remove date, we date_string is better
observations_df.drop('date', axis=1, inplace=True)

In [4]:
# Function to parse date string and extract components
def extract_date_components(date_string):
    import datetime
    # Parse ISO format date string
    dt = datetime.datetime.fromisoformat(date_string.replace('Z', '+00:00'))
    return dt.year, dt.month, dt.day

# Extract relevant data for true observations
lons = []
lats = []
years = []
months = []
days = []

for _, row in observations_df.iterrows():
    lons.append(row['coordinates'][0])
    lats.append(row['coordinates'][1])
    year, month, day = extract_date_components(row['date_string'])
    years.append(year)
    months.append(month)
    days.append(day)
    
observations_df['longitude'] = lons
observations_df['latitude'] = lats
observations_df['year'] = years
observations_df['month'] = months
observations_df['day'] = days

observations_df.sample(10)


Unnamed: 0,id,date_string,coordinates,PowderyMildew,longitude,latitude,year,month,day
229,44799106,2020-05-03T12:12:00+01:00,"[-2.8536102022, 53.3690479674]",False,-2.85361,53.369048,2020,5,3
307,35652836,2019-11-14T09:59:17+13:00,"[172.6634286014, -43.5935721845]",False,172.663429,-43.593572,2019,11,14
54,229935338,2024-07-17T12:35:16-04:00,"[-80.0098351997, 40.4231022]",True,-80.009835,40.423102,2024,7,17
300,116002014,2022-05-01T12:09:00+12:00,"[172.6415343167, -43.5876706]",False,172.641534,-43.587671,2022,5,1
141,42458100,2020-04-17T12:47:00+02:00,"[9.7560836551, 52.3416611288]",True,9.756084,52.341661,2020,4,17
263,46234674,2020-04-21T17:01:51+02:00,"[-1.06378617, 53.9571]",False,-1.063786,53.9571,2020,4,21
100,45916993,2020-05-14T17:15:26-04:00,"[-73.9523271193, 40.7704007998]",True,-73.952327,40.770401,2020,5,14
173,210921745,2024-04-28T14:58:54+01:00,"[-0.3670084382, 53.7289734935]",False,-0.367008,53.728973,2024,4,28
171,209264676,2024-04-25T12:48:40+02:00,"[8.8088271953, 50.8071629378]",False,8.808827,50.807163,2024,4,25
62,110641260,2022-04-07T11:40:15+08:00,"[118.7802939, 32.1205746997]",True,118.780294,32.120575,2022,4,7


In [5]:
# Functions to fetch weather data
def get_temps(lon, lat, day, month, year):
    url = "https://power.larc.nasa.gov/api/temporal/hourly/point"
    
    month_str = str(month).zfill(2)  # make sure is 2 length
    day_str = str(day).zfill(2)      # same as above
    start_date = f"{year}-{month_str}-{day_str}"

    start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d") 
    end_date = start_date + datetime.timedelta(days=7)
    start_date -= datetime.timedelta(days=7)

    start_date = start_date.strftime("%Y%m%d")
    end_date = end_date.strftime("%Y%m%d")
    
    temp_dict = {}
    parameters = {
        "parameters": "T2M",
        "community": "AG",
        "latitude": lat,
        "longitude": lon,
        "start": start_date,
        "end": end_date,
        "format": "JSON"
    }

    response = requests.get(url, params=parameters)
    data = response.json()
    try:
        for key, temperature in data['properties']['parameter']['T2M'].items():
            data['properties']['parameter']['T2M'][key] = '{0:.2f}'.format((temperature * (9/5)) + 32) # convert to Fahrenheit
    except:
        print(f"Error getting temperature data for {year}-{month_str}-{day_str} at {lat},{lon}")
        return None
    
    return data['properties']['parameter']['T2M']

# def get_humidity(lon, lat, day, month, year):
#     url = "https://power.larc.nasa.gov/api/temporal/hourly/point"
    
#     month_str = str(month).zfill(2)  # make sure is 2 length
#     day_str = str(day).zfill(2)      # same as above
#     start_date = int(f"{year}{month_str}{day_str}")
    
#     parameters = {
#         "parameters": "RH2M",
#         "community": "AG",
#         "latitude": lat,
#         "longitude": lon,
#         "start": start_date,
#         "end": start_date,
#         "format": "JSON"
#     }

#     response = requests.get(url, params=parameters)
#     data = response.json()
    
#     humidity_total = 0
#     try:
#         for humidity in data['properties']['parameter']['RH2M'].values():
#             humidity_total += humidity
#         humidity_avg = humidity_total / 24
#         return humidity_avg
#     except:
#         print(f"Error getting humidity data for {year}-{month_str}-{day_str} at {lat},{lon}")
#         return None

In [10]:
def get_humidity(lon, lat, day, month, year):
    url = "https://power.larc.nasa.gov/api/temporal/hourly/point"
    
    month_str = str(month).zfill(2)  # make sure is 2 length
    day_str = str(day).zfill(2)      # same as above
    start_date = f"{year}-{month_str}-{day_str}"

    start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d") 
    end_date = start_date + datetime.timedelta(days=7)
    start_date -= datetime.timedelta(days=7)

    start_date = start_date.strftime("%Y%m%d")
    end_date = end_date.strftime("%Y%m%d")
    
    humidity_dict = {}
    parameters = {
        "parameters": "RH2M",
        "community": "AG",
        "latitude": lat,
        "longitude": lon,
        "start": start_date,
        "end": end_date,
        "format": "JSON"
    }

    response = requests.get(url, params=parameters)
    data = response.json()
    try:
        for key, humidity in data['properties']['parameter']['RH2M'].items():
            data['properties']['parameter']['RH2M'][key] = '{0:.2f}'.format(humidity)# convert to Fahrenheit
    except:
        print(f"Error getting humidity data for {year}-{month_str}-{day_str} at {lat},{lon}")
        return None
    
    return data['properties']['parameter']['RH2M']


In [9]:
# Add temperature data to dataframe and save 

if GET_TEMPERATURES:
    temp_data = []
    temps_df = pd.DataFrame(columns=['Datetime', 'Temperature', 'ID'])
    
    for i, row in observations_df.iterrows():
        temp = get_temps(row['longitude'], row['latitude'], row['day'], row['month'], row['year'])
        # if invalid data, try 4 more times to get data, if fail, skip
        if temp is None:
            for j in range(3):
                temp = get_temps(row['longitude'], row['latitude'], row['day'], row['month'], row['year'])
                if temp is not None:
                    break
        if temp is None:
            continue
        dates = list(temp.keys())
        temperature_values = list(temp.values())
        id_vals = [row['id']] * len(dates)
        
        temp_df = pd.DataFrame({
            'Datetime': dates,
            'Temperature': temperature_values,
            'ID': id_vals
        })
        
        temps_df = pd.concat([temps_df, temp_df], ignore_index=True)
        # Print progress every 10 rows
        if i % 10 == 0:
            print(f"Processed {i}")
    
    temps_df.to_csv('temps.csv', index=False)

    # Save to CSV file
    observations_df.to_csv('observations.csv', index=False)
    print(f"Saved processed dataframe with {len(observations_df)} observations to observations.csv")
    



# if GET_HUMIDITY:
    
#         # Call get_temps for each row
#     humidities = []
#     for i, row in observations_df.iterrows():
#         humidity = get_humidity(row['longitude'], row['latitude'], row['day'], row['month'], row['year'])
#         humidities.append(humidity)
        
#         # Print progress every 10 rows
#         if i % 10 == 0:
#             print(f"Processed {i}")

#     # Add temps to dataframe
#     observations_df['Humidity'] = humidities

#     # Drop empty rows
#     observations_df = observations_df.dropna(subset=['Humidity'])

#     # Save to CSV file
#     observations_df.to_csv('test_2.csv', index=False)
#     print(f"Saved processed dataframe with {len(observations_df)} observations to test_2.csv")


Processed 0
Processed 10
Processed 20
Processed 30
Processed 40
Processed 50
Processed 60
Processed 70
Processed 80
Processed 90
Processed 100
Processed 110
Processed 120
Processed 130
Processed 140
Processed 150
Processed 160
Processed 170
Processed 180
Processed 190
Processed 200
Processed 210
Processed 220
Processed 230
Processed 240
Processed 250
Error getting temperature data for 1987-05-05 at 41.2955379923,-81.4316247254
Error getting temperature data for 1987-05-05 at 41.2955379923,-81.4316247254
Error getting temperature data for 1987-05-05 at 41.2955379923,-81.4316247254
Error getting temperature data for 1987-05-05 at 41.2955379923,-81.4316247254
Processed 270
Processed 280
Processed 290
Processed 300
Processed 310
Processed 320
Saved processed dataframe with 322 observations to test.csv


In [16]:
if GET_HUMIDITY:
    humidity_data = []
    humidities_df = pd.DataFrame(columns=['Datetime', 'Humidity', 'ID'])
    
    for i, row in observations_df.iterrows():
        humidity = get_humidity(row['longitude'], row['latitude'], row['day'], row['month'], row['year'])
        # if invalid data, try 4 more times to get data, if fail, skip
        if humidity is None:
            for j in range(3):
                humidity = get_humidity(row['longitude'], row['latitude'], row['day'], row['month'], row['year'])
                if humidity is not None:
                    break
        if humidity is None:
            continue
        dates = list(humidity.keys())
        humidity_values = list(humidity.values())
        id_vals = [row['id']] * len(dates)
        humidity_df = pd.DataFrame({
            'Datetime': dates,
            'Humidity': humidity_values,
            'ID': id_vals
        })
        humidities_df = pd.concat([humidities_df, humidity_df], ignore_index=True)
        # Print progress every 10 rows
        if i % 10 == 0:
            print(f"Processed {i}")
        if i == 1:
            break
    
    humidities_df.to_csv('humidity_data.csv', index=False)


Processed 0
