In [48]:
import ee
from datetime import datetime
import geopandas as gpd
import pandas as pd
import numpy as np
import folium
from shapely import wkt
import geemap
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

In [None]:
# Authenticate with Earth Engine (if needed)
def authenticate_ee():
    try:
        ee.Initialize()
        logging.info("Earth Engine API initialized successfully.")
    except ee.EEException:
        logging.warning("Authentication required. Please follow the instructions.")
        ee.Authenticate()
        ee.Initialize()
authenticate_ee()

In [11]:
CSB_ASSET = 'projects/nass-csb/assets/csb1623/CSBIL1623'

def get_fields_for_mclean():
    """Filter fields for McLean County, Illinois."""
    # Load the CSBAL1623 FeatureCollection
    field_boundaries = ee.FeatureCollection(CSB_ASSET)

    # Load the TIGER counties dataset and filter for McLean County in Illinois
    counties = ee.FeatureCollection('TIGER/2018/Counties')
    mclean_county = counties.filter(
        ee.Filter.And(
            ee.Filter.eq('NAME', 'McLean'),  # County name
            ee.Filter.eq('STATEFP', '17')   # Illinois FIPS code
        )
    )

    # Filter the field boundaries for features within McLean County and limit to 5000 features
    mclean_fields = field_boundaries.filterBounds(mclean_county.geometry()).limit(5000)
    return mclean_fields

def calculate_indices(sentinel_image):
    """Calculate NDVI, EVI, GNDVI, and SAVI from Sentinel-2 data."""
    ndvi = sentinel_image.normalizedDifference(['B8', 'B4']).rename('NDVI')
    evi = sentinel_image.expression(
        '2.5 * ((B8 - B4) / (B8 + 6 * B4 - 7.5 * B2 + 1))',
        {'B8': sentinel_image.select('B8'), 'B4': sentinel_image.select('B4'), 'B2': sentinel_image.select('B2')}
    ).rename('EVI')
    gndvi = sentinel_image.normalizedDifference(['B8', 'B3']).rename('GNDVI')
    savi = sentinel_image.expression(
        '((B8 - B4) / (B8 + B4 + 0.5)) * 1.5',
        {'B8': sentinel_image.select('B8'), 'B4': sentinel_image.select('B4')}
    ).rename('SAVI')

    return ndvi.addBands([evi, gndvi, savi])

def get_satellite_indices_fast(fields_fc, start_date, end_date):
    """Get satellite indices for all fields in the FeatureCollection."""
    # Filter Sentinel-2 data for the specified weeklong range
    sentinel = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED') \
        .filterDate(start_date, end_date) \
        .filterBounds(fields_fc.geometry()) \
        .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 10)) \
        .median()  # Composite for the weeklong range

    # Calculate satellite indices
    indices = calculate_indices(sentinel)

    # Use reduceRegions to calculate mean indices for all fields at once
    zonal_stats = indices.reduceRegions(
        collection=fields_fc,
        reducer=ee.Reducer.mean(),
        scale=10
    )

    return zonal_stats

def feature_collection_to_dataframe(fc):
    """Convert a FeatureCollection to a Pandas DataFrame."""
    features = fc.getInfo()['features']
    data = [
        {
            **f['properties'],  # Field properties
            'geometry': f['geometry']  # GeoJSON geometry
        }
        for f in features
    ]
    return pd.DataFrame(data)


# Define the weeklong date range in July
start_date = '2022-07-10'
end_date = '2022-07-17'

# Get McLean County fields
mclean_fields = get_fields_for_mclean()

# Get satellite indices for all fields using reduceRegions
zonal_stats = get_satellite_indices_fast(mclean_fields, start_date, end_date)

# Convert to DataFrame
df = feature_collection_to_dataframe(zonal_stats)

In [30]:
df

Unnamed: 0,ACRES,ASD,CDL2016,CDL2017,CDL2018,CDL2019,CDL2020,CDL2021,CDL2022,CDL2023,...,NDVI,SAVI,STATEASD,STATEFIPS,Shape_Area,Shape_Leng,endyear,fid,startyear,geometry
0,10.29,40,1,1,1,5,5,1,5,1,...,0.901347,1.351892,1740,17,41633.197383,1144.679065,2023,473554,2016,"{'type': 'Polygon', 'coordinates': [[[-88.8810..."
1,4.84,40,5,1,1,5,5,1,5,1,...,0.861463,1.292056,1740,17,19606.222406,711.658699,2023,473605,2016,"{'type': 'Polygon', 'coordinates': [[[-88.9219..."
2,70.88,40,5,1,1,1,5,1,5,1,...,0.845274,1.267773,1740,17,286852.759480,3120.525328,2023,473612,2016,"{'type': 'Polygon', 'coordinates': [[[-88.9219..."
3,6.57,40,1,1,1,1,5,1,5,1,...,0.617462,0.926069,1740,17,26571.757732,1046.591030,2023,473649,2016,"{'type': 'Polygon', 'coordinates': [[[-88.8713..."
4,39.43,40,1,1,1,1,1,1,5,1,...,0.923212,1.384709,1740,17,159557.501086,1629.543342,2023,473697,2016,"{'type': 'Polygon', 'coordinates': [[[-88.9428..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,22.84,40,1,5,1,5,1,5,1,5,...,0.929548,1.394178,1740,17,92436.675551,1534.271639,2023,67433,2016,"{'type': 'Polygon', 'coordinates': [[[-89.1047..."
4996,4.24,40,1,5,1,5,1,5,5,36,...,0.851090,1.276489,1740,17,17144.655228,714.254829,2023,67434,2016,"{'type': 'Polygon', 'coordinates': [[[-89.0954..."
4997,16.40,40,1,5,1,5,1,5,1,5,...,0.884378,1.326428,1740,17,66378.488939,1378.767695,2023,67437,2016,"{'type': 'Polygon', 'coordinates': [[[-89.0952..."
4998,73.77,40,229,5,1,1,1,5,1,1,...,0.857586,1.286247,1740,17,298557.001153,2550.963309,2023,67445,2016,"{'type': 'Polygon', 'coordinates': [[[-89.0481..."


In [35]:
data = df[['CDL2022','EVI','GNDVI','NDVI','SAVI']]
data.columns = data.columns.str.lower().str.replace(' ', '_')
data.cdl2022 = data.cdl2022.map(lambda x: 1 if x == 1 else 0)
data.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.cdl2022 = data.cdl2022.map(lambda x: 1 if x == 1 else 0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.fillna(0, inplace=True)


In [36]:
def train_test_val(df, y, test=0.2, val=0.2, random_state=1):
    df_full_train, df_test = train_test_split(df, test_size=test, random_state=random_state)
    df_train, df_val = train_test_split(df_full_train, test_size= val/(1-test) , random_state=random_state)
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
    y_train = df_train[y].values
    y_val = df_val[y].values
    y_test = df_test[y].values
    del df_train[y]
    del df_val[y]
    del df_test[y]
    return df_train, df_val, df_test, y_train, y_val, y_test

In [40]:
df_train, df_val, df_test, y_train, y_val, y_test = train_test_val(data, 'cdl2022')
# One-hot encoding of variables
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [41]:
# Training logistic regression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=2)
model.fit(X_train, y_train)

# Calculating accuracy
y_pred = model.predict_proba(X_val)[:, 1]
corn = (y_pred) >= 0.5

In [43]:
fpr, tpr, thresholds = roc_curve(y_val, y_pred)
auc(fpr,tpr)

0.6107272508714989