In [None]:
import os
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
def clean_tree_data(df: pd.DataFrame, state_name: str) -> pd.DataFrame:
    """Clean a raw tree-observation table and print summary statistics.

    Steps, in order:
      1. Parse ``observed_on`` as datetimes; unparseable values become ``NaT``.
      2. Lower-case and strip ``species_guess``, then collapse the known
         tree-of-heaven spellings into the single label ``'tree-of-heaven'``.
      3. Drop rows dated after the current local time (``NaT`` rows fail the
         comparison and are dropped here as well).
      4. Drop rows with a missing value in any of the key columns.
      5. Keep the first row for each ``id`` and renumber the index from 0.

    The input frame is never modified; all work happens on a copy, so the
    calling cell can be re-run and always sees the same raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw observations. Must contain the columns ``id``, ``observed_on``,
        ``latitude``, ``longitude``, ``place_county_name`` and
        ``species_guess``. ``positional_accuracy`` is optional and is only
        used for the printed summary.
    state_name : str
        Label shown in the header of the printed summary.

    Returns
    -------
    pandas.DataFrame
        The cleaned observations with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    important_columns = ['id', 'observed_on', 'latitude', 'longitude', 'place_county_name', 'species_guess']
    tree_of_heaven_names = ['tree of heaven', 'tree-of-heaven', 'Tree-of-Heaven', 'Ailanthus altissima', 'Götterbaum', 'ailante glanduleux']
    # species_guess is lower-cased below, so compare against lower-cased aliases.
    tree_of_heaven_aliases = {name.lower() for name in tree_of_heaven_names}

    # Work on a private copy: the caller's frame stays untouched, and the
    # column assignments below never write into a slice of another frame.
    df = df.copy()

    # Column-level normalisation happens before any row filtering so every
    # assignment targets the frame we own.
    df['observed_on'] = pd.to_datetime(df['observed_on'], errors='coerce')
    df['species_guess'] = df['species_guess'].str.lower().str.strip()
    df.loc[df['species_guess'].isin(tree_of_heaven_aliases), 'species_guess'] = 'tree-of-heaven'

    # Remove future dates (NaT compares False, so unparseable dates go too).
    current_date = datetime.now()
    df = df[df['observed_on'] <= current_date]

    # Remove incomplete rows, then duplicate ids, then renumber the index.
    df = (
        df.dropna(subset=important_columns)
        .drop_duplicates(subset='id')
        .reset_index(drop=True)
    )

    _print_tree_summary(df, state_name)

    return df


def _print_tree_summary(df: pd.DataFrame, state_name: str) -> None:
    """Print headline statistics for an already-cleaned observation frame."""
    print(f"\n--- {state_name} Tree Data Statistics ---")
    print(f"Total number of tree observations: {len(df)}")
    print(f"Date range: {df['observed_on'].min()} to {df['observed_on'].max()}")
    print(f"Number of unique counties: {df['place_county_name'].nunique()}")
    print("\nTop 5 counties by number of observations:")
    print(df['place_county_name'].value_counts().head())
    print("\nNumber of observations by species:")
    print(df['species_guess'].value_counts())
    # positional_accuracy is not one of the required columns, so its absence
    # should not abort the function after the cleaning has already succeeded.
    if 'positional_accuracy' in df.columns:
        print(f"\nAverage positional accuracy: {df['positional_accuracy'].mean():.2f}")
        print(f"Median positional accuracy: {df['positional_accuracy'].median():.2f}")
    else:
        print("\nPositional accuracy: column not present")


In [None]:
# Load the raw Maryland observations and run them through clean_tree_data.
# The data directory defaults to the original absolute location, but it can be
# redirected by setting the AIR_MAP_DATA_DIR environment variable so the
# notebook also runs on machines where that path does not exist.
md_tree_csv = Path(
    os.environ.get('AIR_MAP_DATA_DIR', '/Users/vasantsaladi/Documents/GitHub/air_map/ML/Data')
) / 'marylandtreeml.csv'
md_tree_data = pd.read_csv(md_tree_csv)
md_tree_data_cleaned = clean_tree_data(md_tree_data, "Maryland")

In [None]:
# Load the raw Virginia observations and run them through clean_tree_data.
# The data directory defaults to the original absolute location, but it can be
# redirected by setting the AIR_MAP_DATA_DIR environment variable so the
# notebook also runs on machines where that path does not exist.
va_tree_csv = Path(
    os.environ.get('AIR_MAP_DATA_DIR', '/Users/vasantsaladi/Documents/GitHub/air_map/ML/Data')
) / 'virginiatreeml.csv'
va_tree_data = pd.read_csv(va_tree_csv)
va_tree_data_cleaned = clean_tree_data(va_tree_data, "Virginia")