In [1]:
import torch
import pandas as pd
import numpy as np
import argparse
import geopandas as gpd
from pandas.tseries.offsets import DateOffset
from shapely.geometry import Polygon
import os
from datetime import datetime
from collections import namedtuple
import ast
import pickle
from dataset_constructor_funcs import df_to_tensor, df_to_y_tensor, compute_adjacency_matrix
from crane import main

In [2]:

temporal_res = '2monthly'
context_size = 5
map_size = 'small'
box_length_m = 500
tsteps_to_study = 32


In [3]:
gdf, dataset_specs = main(temporal_res=temporal_res, context_size=context_size, map_size=map_size, box_length_m=box_length_m, tsteps_to_study=tsteps_to_study)

<class 'geopandas.geodataframe.GeoDataFrame'>
<class 'geopandas.geodataframe.GeoDataFrame'>


  gdf[DATE_NAME_TRANSLATOR[temporal_res]] = pd.factorize(list(zip(gdf['season'], gdf[DATE_NAME_TRANSLATOR[temporal_res]])))[0] + 1
  gdf[DATE_NAME_TRANSLATOR[temporal_res]] = pd.factorize(list(zip(gdf['season'], gdf[DATE_NAME_TRANSLATOR[temporal_res]])))[0] + 1
  gdf[DATE_NAME_TRANSLATOR[temporal_res]] = pd.factorize(list(zip(gdf['season'], gdf[DATE_NAME_TRANSLATOR[temporal_res]])))[0] + 1
  gdf[DATE_NAME_TRANSLATOR[temporal_res]] = pd.factorize(list(zip(gdf['season'], gdf[DATE_NAME_TRANSLATOR[temporal_res]])))[0] + 1
  gdf[DATE_NAME_TRANSLATOR[temporal_res]] = pd.factorize(list(zip(gdf['season'], gdf[DATE_NAME_TRANSLATOR[temporal_res]])))[0] + 1
  gdf[DATE_NAME_TRANSLATOR[temporal_res]] = pd.factorize(list(zip(gdf['season'], gdf[DATE_NAME_TRANSLATOR[temporal_res]])))[0] + 1
  gdf[DATE_NAME_TRANSLATOR[temporal_res]] = pd.factorize(list(zip(gdf['season'], gdf[DATE_NAME_TRANSLATOR[temporal_res]])))[0] + 1
  gdf[DATE_NAME_TRANSLATOR[temporal_res]] = pd.factorize(list(zip(gdf['season'], gd

<class 'geopandas.geodataframe.GeoDataFrame'>
<class 'geopandas.geodataframe.GeoDataFrame'>
<class 'geopandas.geodataframe.GeoDataFrame'>


  combined_gdf['geoid'] = pd.factorize(list(zip(combined_gdf['lat'], combined_gdf['long'])))[0]


gdf columns Index(['geoid', 'bimonth_id', 'counts', 'lat', 'long', 'season_indicator',
       'year'],
      dtype='object')
gdf index RangeIndex(start=0, stop=42816, step=1)


In [4]:
gdf

Unnamed: 0,geoid,bimonth_id,counts,lat,long,season_indicator,year
0,0,148,0.0,28.004994,-97.037430,3,2000
1,1,148,0.0,28.009505,-97.037348,3,2000
2,2,148,0.0,28.014016,-97.037267,3,2000
3,3,148,0.0,28.018527,-97.037185,3,2000
4,4,148,0.0,28.023038,-97.037103,3,2000
...,...,...,...,...,...,...,...
42811,1333,179,0.0,28.220609,-96.687125,5,2011
42812,1334,179,0.0,28.220523,-96.682034,5,2011
42813,1335,179,0.0,28.220437,-96.676942,5,2011
42814,1336,179,2.0,28.224946,-96.676845,5,2011


In [6]:
dataset_specs

{'lookback': 5,
 'time_name': 'bimonth_id',
 'space_name': 'geoid',
 'target_name': 'counts',
 'static': ['lat', 'long'],
 'dynamic': [],
 'temporal': ['season_indicator', 'year'],
 'latlong': True}

In [11]:
gdf_new = gdf.set_index([f'geoid', 'bimonth_id'], drop=True)
gdf_new

Unnamed: 0_level_0,Unnamed: 1_level_0,counts,lat,long,season_indicator,year
geoid,bimonth_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,148,0.0,28.004994,-97.037430,3,2000
1,148,0.0,28.009505,-97.037348,3,2000
2,148,0.0,28.014016,-97.037267,3,2000
3,148,0.0,28.018527,-97.037185,3,2000
4,148,0.0,28.023038,-97.037103,3,2000
...,...,...,...,...,...,...
1333,179,0.0,28.220609,-96.687125,5,2011
1334,179,0.0,28.220523,-96.682034,5,2011
1335,179,0.0,28.220437,-96.676942,5,2011
1336,179,2.0,28.224946,-96.676845,5,2011


In [105]:

"""
Functions that take a T x S x F tensor and output x, y, adjacency matrix
"""
def df_to_tensor(df, type_='dynamic', lookback=5, time_name='bimonth_id', space_name='geoid', target_name='counts', static=None, dynamic=None, temporal=None, latlong=True, ):
    """
    Converts a dataframe into a torch tensor of shape (T, S, F + lookback), where:
      - T: number of timesteps (based on the temporal id column)
      - S: number of spatial bins (rows for each timestep)
      - F: number of features (columns defined by 'features' param, default ['season_indicator', 'year', 'lat', 'long'])
      - lookback: number of lagged counts to include, from t-1 to t-lookback

    For each spatial tract s at timestep t, the lagged count columns are taken from the
    'target_name' column at the same spatial tract in previous timesteps. Missing lags are filled with NaN.

    Parameters:
      df: the input dataframe (expected to include the target_name column)
      lookback: number of past timesteps to include as features
      time_name: column name for temporal id
      space_name: column name for spatial id
      target_name: column name for counts
      features: list of feature column names; if None, defaults to ['season_indicator', 'year', 'lat', 'long'] depending on latlong flag
      latlong: whether to include 'lat' and 'long' in default features

    Returns:
      A torch tensor of shape (T, S, F + lookback)
    """    

    if type_ == 'dynamic':

      feats = [space_name, time_name, target_name] + dynamic

    elif type_ == 'static':

      feats = [space_name] + static
    
    elif type_ == 'temporal':
      
      feats = [time_name] + temporal

    else: 
      raise ValueError('dataset type must be dynamic, static, or temporal')
  
    print('feats', feats)
    df = df[feats]

    sys.exit(0)

    # Determine feature columns
    if features is None:
        feature_cols = [target_name]
    else:
        feature_cols = features

    # Determine the name of the temporal id column
    id_col = time_name

    # Get sorted list of unique timesteps
    timesteps = sorted(df[id_col].unique())

    tensor_list = []
    prev_counts_list = []  # list to hold counts tensors from previous timesteps

    for t in timesteps:
        sub_df = df[df[id_col] == t].copy().sort_values(by=space_name)
        features_tensor = torch.tensor(sub_df[feature_cols].values, dtype=torch.float)

        # Build list of lagged count tensors
        lag_tensors = []
        for k in range(1, lookback + 1):
            if len(prev_counts_list) >= k:
                lag_tensors.append(prev_counts_list[k - 1])
            else:
                # Backfill missing lags with NaN for the first lookback timesteps
                lag_tensors.append(torch.full((features_tensor.shape[0], 1), float('nan'), dtype=torch.float))
        lag_tensor = torch.cat(lag_tensors, dim=1)  # shape: (S, lookback)

        # Concatenate features with lag tensors
        combined_tensor = torch.cat([features_tensor, lag_tensor], dim=1)  # shape: (S, F + lookback)
        tensor_list.append(combined_tensor)

        # Update prev_counts_list with current counts tensor
        current_counts = sub_df[target_name].values
        current_counts_tensor = torch.tensor(current_counts, dtype=torch.float).unsqueeze(1)
        prev_counts_list.insert(0, current_counts_tensor)
        if len(prev_counts_list) > lookback:
            prev_counts_list.pop()

    # Stack along time dimension
    result_tensor = torch.stack(tensor_list, dim=0)  # shape: (T, S, F + lookback)
    return result_tensor


In [89]:
dataset_specs

{'lookback': 5,
 'time_name': 'bimonth_id',
 'space_name': 'geoid',
 'target_name': 'counts',
 'static': ['lat', 'long'],
 'dynamic': [],
 'temporal': ['season_indicator', 'year'],
 'latlong': True}

In [90]:
gdf

Unnamed: 0,geoid,bimonth_id,counts,lat,long,season_indicator,year
0,0,148,0.0,28.004994,-97.037430,3,2000
1,1,148,0.0,28.009505,-97.037348,3,2000
2,2,148,0.0,28.014016,-97.037267,3,2000
3,3,148,0.0,28.018527,-97.037185,3,2000
4,4,148,0.0,28.023038,-97.037103,3,2000
...,...,...,...,...,...,...,...
42811,1333,179,0.0,28.220609,-96.687125,5,2011
42812,1334,179,0.0,28.220523,-96.682034,5,2011
42813,1335,179,0.0,28.220437,-96.676942,5,2011
42814,1336,179,2.0,28.224946,-96.676845,5,2011


In [91]:
dataset_specs

{'lookback': 5,
 'time_name': 'bimonth_id',
 'space_name': 'geoid',
 'target_name': 'counts',
 'static': ['lat', 'long'],
 'dynamic': [],
 'temporal': ['season_indicator', 'year'],
 'latlong': True}

In [92]:
idxs = []

In [93]:
idxs

[]

In [74]:
import torch
import pandas as pd
import numpy as np
import argparse
import geopandas as gpd
from pandas.tseries.offsets import DateOffset
from shapely.geometry import Polygon
import os
from datetime import datetime
from collections import namedtuple
import ast
import pickle
from dataset_constructor_funcs import df_to_tensor, df_to_y_tensor, compute_adjacency_matrix
from crane import main

In [106]:
dataset_specs

{'lookback': 5,
 'time_name': 'bimonth_id',
 'space_name': 'geoid',
 'target_name': 'counts',
 'static': ['lat', 'long'],
 'dynamic': [],
 'temporal': ['season_indicator', 'year'],
 'latlong': True}

In [107]:
df_to_tensor(gdf, type_='temporal', **dataset_specs)

feats ['bimonth_id', 'season_indicator', 'year']


NameError: name 'sys' is not defined

# ARCHIVE

## Supported temporal scales: 

In [6]:
"""
Global variables
"""

# defining date range
DATE_RANGE_TRANSLATOR = {  
    'daily': 'D',
    'weekly': 'W',
    'biweekly': '2W',
    'monthly': 'ME',
    '2monthly': '2ME',
}

# how much temporal buffer to give based on resolution
DATE_OFFSET_TRANSLATOR = {  
    'daily': 1,
    'weekly': 7,
    'biweekly': 14,
    'monthly': 30,
    '2monthly': 60,
}
# naming the temporal column
DATE_NAME_TRANSLATOR = {  
    'daily': 'day',
    'weekly': 'week',
    'biweekly': 'biweek',
    'monthly': 'month',
    '2monthly': 'bimonth',
    '3monthly': 'trimonth',
    'seasonal': 'season'
}

MONTH_PAIRS_TRANSLATOR = {
    '2monthly': ["02-28", "04-30", "10-20", "12-25"],
    '3monthly': ["01-31", "04-30", "10-20"]
}

MAP_SIZE_TRANSLATOR = {
    'medium': {
        'y_left_lower_line': 0,
        'y_right_lower_line': 0.45,
        'y_left_upper_line': 0.35,
        'y_right_upper_line': 0.95
    },
    'small': {
        'y_left_lower_line': 0.06,
        'y_right_lower_line': 0.72,
        'y_left_upper_line': 0.3,
        'y_right_upper_line': 0.5
    }
}

SEASONAL_TRANSLATOR = {
    9: 0,
    10: 1,
    11: 2,
    12: 3,
    1: 4,
    2: 5,
    3: 6, 
    4: 7
}

# meters per degree lat or long
METERS_PER_DEGREE = 111111

In [7]:
context_size=5
box_length_m=500
map_size='small'
years_through_2011=60
tsteps_to_study=32

In [8]:

# read and turn into a geopandas dataframe
df = pd.read_csv('../../data/raw/aerial_surv/WHCR_Aerial_Observations_1950_2011.txt', encoding='latin1', sep='\t')

In [9]:
# Function to generate a grid of boxes
def generate_grid(bbox, spacing, crs):
    """
    Generate box grid based on min x, min y, max x, and max y (LONG/LAT)
    Spacing: Space between each box in degrees
    Crs: Coordinate reference system
    """
    METERS_PER_DEGREE = 111111

    if crs.to_string() == 'EPSG:26914':
        spacing = spacing * METERS_PER_DEGREE

    minx, miny, maxx, maxy = bbox
    x_coords = np.arange(minx, maxx, spacing)
    y_coords = np.arange(miny, maxy, spacing)

    grid = []
    for x in x_coords:
        for y in y_coords:
            grid.append(Polygon([(x, y), (x + spacing, y), (x + spacing, y + spacing), (x, y + spacing), (x, y)]))
    return gpd.GeoDataFrame({'geometry': grid}, crs=crs)


def sort_date_to_range(input_date, temporal_res):
    """
    Determines the range in which a given date falls based on month-day pairs.
    
    Parameters:
    - input_date (datetime): The date to evaluate.
    
    Returns:
    - str: The range in the format "year-month-day_to_year-month-day".
    """
    # set possible dates for range
    yr = input_date.year
    curr_year = [datetime.strptime(f"{yr}-{md}", "%Y-%m-%d") for md in MONTH_PAIRS_TRANSLATOR[temporal_res]]
    last_year = [datetime.strptime(f"{yr - 1}-{md}", "%Y-%m-%d") for md in MONTH_PAIRS_TRANSLATOR[temporal_res]]
    next_year = [datetime.strptime(f"{yr + 1}-{md}", "%Y-%m-%d") for md in MONTH_PAIRS_TRANSLATOR[temporal_res]]

    # extract correct range
    all_dates = last_year + curr_year + next_year
    idx = np.searchsorted(all_dates, input_date)
    correct_range = f"{all_dates[idx - 1].date()}_to_{all_dates[idx].date()}"

    return correct_range


def set_date_range_2_3mo(year, temporal_res):

    # Specify the year

    if temporal_res == '2monthly':
        # Create a date range for the specific dates
        dates = [
            pd.Timestamp(year, 10, 20),    # October 20
            pd.Timestamp(year, 12, 24),    # December 24
            pd.Timestamp(year + 1, 2, 27),  # Feb 28
            pd.Timestamp(year + 1, 4, 30)      # April 30
        ]
    elif temporal_res == '3monthly':
        # Create a date range for the specific dates
        dates = [
            pd.Timestamp(year, 10, 20),    # October 20
            pd.Timestamp(year + 1, 1, 31),    # December 31
            pd.Timestamp(year + 1, 4, 30)      # April 30
        ]

    # Convert to a Pandas DateTimeIndex
    return pd.DatetimeIndex(dates)

def determine_season_year(date):
    
    # return nan if between april 31 and oct 19
    if (5 <= date.month <= 9) or (date.month == 10 and date.day < 20):
        return np.nan

    return date.year if date.month >= 7 else date.year - 1

import pandas as pd

def date_range_gap(curr_drange, last_drange):
    """
    Given two date-range strings of the form:
      - "YYYY-02-28_to_YYYY-04-30" (spring; order 2)
      - "YYYY-12-25_to_YYYY-02-28" (winter; order 1, assigned to the end year)
      - "YYYY-10-20_to_YYYY-12-25" (fall; order 3)
    returns the number of “steps” between them.
    """
    def parse_range(s):
        start_str, end_str = s.split("_to_")
        sy, sm, sd = start_str.split("-")
        ey, em, ed = end_str.split("-")
        # For winter period, we assign the order to the ending year.
        if sm == "12" and sd == "25":
            order = 1
            year = int(ey)
        elif sm == "02" and sd == "28":
            order = 2
            year = int(sy)  # spring: both dates in the same year
        elif sm == "10" and sd == "20":
            order = 3
            year = int(sy)  # fall: both dates in the same year
        else:
            raise ValueError("Unexpected date range format: " + s)
        return year, order

    # Parse each range
    last_year, last_order = parse_range(last_drange)
    curr_year, curr_order = parse_range(curr_drange)

    # “Linearize” the three periods per year by assigning each an index.
    last_index = last_year * 3 + (last_order - 1)
    curr_index = curr_year * 3 + (curr_order - 1)
    return curr_index - last_index

def is_valid_bimonth_name(name):
    """
    Returns True if the bimonth name is one of the three allowed formats:
      - "YYYY-02-28_to_YYYY-04-30"
      - "YYYY-12-25_to_YYYY-02-28"
      - "YYYY-10-20_to_YYYY-12-25"
    Otherwise returns False.
    """

    try:

        parts = name.split("_to_")
        if len(parts) != 2:
            return False
        start, end = parts
        s_year, s_month, s_day = start.split("-")
        e_year, e_month, e_day = end.split("-")
        # Spring: 02-28 -> 04-30; same year.
        if s_month == "02" and s_day == "28" and e_month == "04" and e_day == "30" and s_year == e_year:
            return True
        # Winter: 12-25 -> 02-28; note: the years differ (the winter period “belongs” to the year of Feb 28).
        if s_month == "12" and s_day == "25" and e_month == "02" and e_day == "28":
            return True
        # Fall: 10-20 -> 12-25; same year.
        if s_month == "10" and s_day == "20" and e_month == "12" and e_day == "25" and s_year == e_year:
            return True
        return False


    except Exception:
        return False

def recalibrate_bimonth_ids(gdf, col_id='bimonth', col_name='bimonth_name'):
    """
    Given a GeoDataFrame (or DataFrame) with columns:
      - "bimonth" (integer ID) and
      - "bimonth_name" (date range string),
    this function does three things:
      1. Drops any row whose bimonth_name is not valid.
      2. Checks the gap between successive rows in both the 'bimonth' and 'bimonth_name'
         columns.
      3. Re-assigns (re-indexes) the bimonth IDs so that for each row, its ID equals
         the previous row's ID plus the gap implied by the date range names.
         (For example, if the name gap is 2 but the IDs jump by 3, the new ID will be
          previous_ID + 2.)
    """
    # Step 1: Drop rows with invalid id.
    if col_id == 'bimonth':
        invalid_indices = []
        for idx, row in gdf.iterrows():
            name = row[col_name]
            if not is_valid_bimonth_name(name):
                invalid_indices.append(idx)
        if invalid_indices:
            print(f"Dropping rows with invalid {col_name} at indices:", invalid_indices)
            gdf = gdf.drop(index=invalid_indices).reset_index(drop=True)

    gdf = gdf.sort_values(col_id).reset_index(drop=True)

    # Step 2 & 3: Walk through the rows and reassign bimonth IDs.
    new_ids = []
    for i, row in gdf.iterrows():
        if i == 0:
            # For the first row, we can either keep the original ID or start anew.
            new_id = row[col_id]
            new_ids.append(new_id)
        else:
            prev_name = gdf.loc[i - 1, col_name]
            curr_name = row[col_name]
            try:
                expected_gap = date_range_gap(curr_name, prev_name)
            except ValueError as err:
                # If parsing fails for some reason, skip this row.
                print(f"Error parsing row {i}: {err}. Dropping row.")
                continue
            # Calculate new ID as previous new ID plus the expected gap.
            new_id = new_ids[i - 1] + expected_gap
            # (Optional) Report if the original gap did not match.
            original_gap = row[col_id] - gdf.loc[i - 1, col_id]
            # if original_gap != expected_gap:
                # print(f"Row {i}: original ID gap ({original_gap}) does not match expected ({expected_gap}). Resetting ID.")
            new_ids.append(new_id)
    gdf[col_id] = new_ids
    return gdf


def reindex_consecutive(col):
    # Get the unique values in sorted order.
    sorted_unique = sorted(col.unique())
    # Create a mapping: original value -> consecutive integer starting at 1.
    mapping = {old_val: new_val for new_val, old_val in enumerate(sorted_unique, start=1)}
    # Replace each value in the column with its new consecutive number.
    return col.map(mapping)


In [10]:
df_test = pd.DataFrame({'lat': [1, 1, 2, 2, 3, 3], 'long': ['a', 'a', 'b', 'b', 'c', 'c']})
df_test.groupby('lat').first()

Unnamed: 0_level_0,long
lat,Unnamed: 1_level_1
1,a
2,b
3,c


In [11]:
def read_asurv(df, temporal_res):

    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.X, df.Y), crs='EPSG:26914')

    # cut years based on function parameter
    gdf = gdf[gdf['Year'].isin(gdf['Year'].unique()[-years_through_2011:])]

    # add time resolution
    gdf['date'] = pd.to_datetime(df[['Year', 'Month', 'Day']])

    gdf = gdf[gdf['Month'].isin([10, 11, 12, 1, 2, 3, 4])]

    if temporal_res == 'seasonal':

        gdf['season'] = gdf['date'].apply(determine_season_year)
        gdf['season_name'] = gdf['season']

    elif temporal_res == '2monthly':

        gdf['season'] = gdf['date'].apply(determine_season_year)
        gdf = gdf.dropna(subset='season')
        gdf['season'] = gdf['season'].astype('int')

        all_dates = pd.DatetimeIndex([])
        
        for szn in gdf['season'].unique():

            # set month ID
            curr_dates = set_date_range_2_3mo(year=szn, temporal_res=temporal_res)
            gdf.loc[gdf['season'] == szn, DATE_NAME_TRANSLATOR[temporal_res]] = np.searchsorted(curr_dates, gdf[gdf['season'] == szn]['date'])
            gdf = gdf.sort_values(by=['season', DATE_NAME_TRANSLATOR[temporal_res]]).reset_index(drop=True)
            gdf[DATE_NAME_TRANSLATOR[temporal_res]] = pd.factorize(list(zip(gdf['season'], gdf[DATE_NAME_TRANSLATOR[temporal_res]])))[0] + 1

            all_dates = np.concatenate((all_dates, curr_dates))

        gdf[f'{DATE_NAME_TRANSLATOR[temporal_res]}_name'] = gdf['date'].apply(lambda d: sort_date_to_range(d, temporal_res))
        
        # only keep dates that fall in the designated 2 month range. So, nothing in the summertime (april to october)
        def extract_mo_day(date_rnge):
            return date_rnge.split('_')[0].split('-')[1] + '-' + date_rnge.split('_')[0].split('-')[2]

        valid_idxs = gdf[f'{DATE_NAME_TRANSLATOR[temporal_res]}_name'].apply(lambda rnge: extract_mo_day(rnge) != '04-30')
        gdf = gdf[valid_idxs]

        gdf = recalibrate_bimonth_ids(gdf)
            
    else:

        all_dates = pd.date_range(start=gdf['date'].min() - DateOffset(days=DATE_OFFSET_TRANSLATOR[temporal_res]), end=gdf['date'].max() + DateOffset(days=DATE_OFFSET_TRANSLATOR[temporal_res]), freq=DATE_RANGE_TRANSLATOR[temporal_res])
        gdf[DATE_NAME_TRANSLATOR[temporal_res]] = np.searchsorted(all_dates, gdf['date'])  
        # add names for weeks for data clarity
        bin_names = {i + 1: f'{all_dates[i].date()}_to_{all_dates[i + 1].date()}' for i in range(len(all_dates) - 1)}

        gdf['season'] = gdf['date'].apply(determine_season_year)
        gdf = gdf.dropna(subset=['season'])

        gdf[f'{DATE_NAME_TRANSLATOR[temporal_res]}_name'] = gdf[DATE_NAME_TRANSLATOR[temporal_res]].map(bin_names)
        gdf[DATE_NAME_TRANSLATOR[temporal_res]] = reindex_consecutive(gdf[DATE_NAME_TRANSLATOR[temporal_res]])
        

    gdf['count'] = gdf['WHITE'].fillna(0) + gdf['JUVE'].fillna(0) + gdf['UNK'].fillna(0) 

    complete_idx_square = True
    keep_geometry_col = False
    save_shp_folder = False

    if keep_geometry_col:
        columns_of_interest = ['date', f"{DATE_NAME_TRANSLATOR[temporal_res]}", f"{DATE_NAME_TRANSLATOR[temporal_res]}_name", 'X', 'Y', 'season', 'count', 'geometry']
    else:
        columns_of_interest = ['date', f"{DATE_NAME_TRANSLATOR[temporal_res]}", f"{DATE_NAME_TRANSLATOR[temporal_res]}_name", 'X', 'Y', 'season', 'count']

    return gdf[columns_of_interest]

In [12]:
temporal_res='seasonal'
gdf = read_asurv(df, temporal_res='seasonal')

In [13]:
type(gdf)

pandas.core.frame.DataFrame

In [14]:
all_gdfs = []
for tstep in np.sort(gdf[f'{DATE_NAME_TRANSLATOR[temporal_res]}_name'].unique()):
    
    filtered_gdf = gdf[gdf[f'{DATE_NAME_TRANSLATOR[temporal_res]}_name'] == tstep]
    # Read in bounding box from data folder
    with open("../../data/raw/aerial_surv/boxes_total_bounds.txt", "r") as file:
        bounds = file.read()
    
    min_x, min_y, max_x, max_y = ast.literal_eval(bounds)

    # MAKE SURE we are in the CRS that measures by meters, not lat/long
    assert (filtered_gdf.crs.to_string() == 'EPSG:26914')
    grid_cells = []
    for x in np.arange(min_x, max_x, box_length_m):
        for y in np.arange(min_y, max_y, box_length_m):
            grid_cells.append(Polygon([
                (x, y),
                (x + box_length_m, y),
                (x + box_length_m, y + box_length_m),
                (x, y + box_length_m)
            ]))

    # Create a GeoDataFrame for the grid
    full_grid = gpd.GeoDataFrame({'geometry': grid_cells}, crs=filtered_gdf.crs)

    # Perform a spatial join to count the number of points in each grid cell
    joined = gpd.sjoin(filtered_gdf, full_grid, how='left', predicate='within')
    counts = joined.groupby('index_right').agg({'count': 'sum'})

    # Add the counts to the grid GeoDataFrame
    full_grid['counts'] = counts
    # print(filtered_gdf.shape, filtered_gdf[f'{DATE_NAME_TRANSLATOR[temporal_res]}'].shape, type(filtered_gdf[f'{DATE_NAME_TRANSLATOR[temporal_res]}']))
    full_grid[f'{DATE_NAME_TRANSLATOR[temporal_res]}_id'] = filtered_gdf[f'{DATE_NAME_TRANSLATOR[temporal_res]}'].unique()[0]
    full_grid[f'{DATE_NAME_TRANSLATOR[temporal_res]}_name'] = tstep

    # add unique date
    gdf[gdf[f'{DATE_NAME_TRANSLATOR[temporal_res]}_name'] == tstep]['date'].unique()[0]

    full_grid['counts'] = full_grid['counts'].fillna(0)

    # add indicator
    full_grid['season_indicator'] = SEASONAL_TRANSLATOR[datetime.strptime(tstep.split('_')[0], '%Y-%m-%d').month]

    full_grid['year'] = datetime.strptime(tstep.split('_')[0], '%Y-%m-%d').year
    
    # print(f"unique counts for {tstep}: {full_grid['counts'].unique()}")
    all_gdfs.append(full_grid)

combined_gdf = pd.concat(all_gdfs)
# print('shape of combined GDF', combined_gdf.shape[0])

# create lat long columns and save to CSV
combined_gdf.set_index([f'{DATE_NAME_TRANSLATOR[temporal_res]}_id', 'geometry'], drop=True, inplace=True)

combined_gdf['geometry_col'] = combined_gdf.index.get_level_values(1)
combined_gdf = combined_gdf.set_geometry('geometry_col')

centers = combined_gdf.geometry.centroid
centers_latlong = centers.to_crs('EPSG:4326')
combined_gdf['lat'] = centers_latlong.y
combined_gdf['long'] = centers_latlong.x

if map_size != 'full':

    # print(combined_gdf)
    y_left_lower_line = MAP_SIZE_TRANSLATOR[map_size]['y_left_lower_line']
    y_right_lower_line = MAP_SIZE_TRANSLATOR[map_size]['y_right_lower_line']
    y_left_upper_line = MAP_SIZE_TRANSLATOR[map_size]['y_left_upper_line']
    y_right_upper_line = MAP_SIZE_TRANSLATOR[map_size]['y_right_upper_line']

    combined_gdf = cut_gpd_water(cut_gpd_water(combined_gdf, y1_up=y_left_lower_line, y2_up=y_right_lower_line, less=True), y1_up=y_left_upper_line, y2_up=y_right_upper_line, less=False)



AttributeError: 'DataFrame' object has no attribute 'crs'