In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
def clean_tree_data(df, state_name, extra_species_names=None):
    """Clean a table of tree observations and print summary statistics.

    The cleaning steps, in order:

    1. Parse ``observed_on`` as datetimes (unparseable values become ``NaT``).
    2. Keep only rows whose ``observed_on`` is not later than the current
       local time (rows with ``NaT`` fail this comparison and are dropped).
    3. Lower-case and strip ``species_guess`` and fold known tree-of-heaven
       aliases into the single label ``'tree-of-heaven'``.
    4. Drop rows with missing values in the required columns.
    5. Drop duplicate ``id`` values (first occurrence wins) and reset the index.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw observations. Must contain the columns ``id``, ``observed_on``,
        ``latitude``, ``longitude``, ``place_county_name`` and
        ``species_guess``. ``positional_accuracy`` is optional and only used
        for the printed statistics.
    state_name : str
        Label used in the header of the printed statistics.
    extra_species_names : iterable of str, optional
        Additional aliases (matched case-insensitively, ignoring surrounding
        whitespace) to fold into ``'tree-of-heaven'``. Defaults to ``None``,
        which keeps the built-in alias list only.

    Returns
    -------
    pandas.DataFrame
        The cleaned observations with a fresh ``0..n-1`` index.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    # Work on a copy so the caller's raw frame stays untouched and re-running
    # the calling cell gives the same result every time.
    df = df.copy()

    # Convert 'observed_on' to datetime
    df['observed_on'] = pd.to_datetime(df['observed_on'], errors='coerce')

    # Remove future dates. The explicit .copy() makes the filtered frame
    # independent, so the column assignments below do not write into a slice
    # (avoids SettingWithCopyWarning).
    current_date = datetime.now()
    df = df[df['observed_on'] <= current_date].copy()

    # Standardize species names
    tree_of_heaven_names = ['tree of heaven', 'tree-of-heaven', 'Tree-of-Heaven', 'Ailanthus altissima', 'Götterbaum', 'ailante glanduleux']
    if extra_species_names is not None:
        tree_of_heaven_names.extend(extra_species_names)
    # Normalise the aliases the same way the column is normalised.
    known_aliases = sorted({name.lower().strip() for name in tree_of_heaven_names})
    df['species_guess'] = df['species_guess'].str.lower().str.strip()
    df.loc[df['species_guess'].isin(known_aliases), 'species_guess'] = 'tree-of-heaven'

    # Remove rows with missing values in important columns
    important_columns = ['id', 'observed_on', 'latitude', 'longitude', 'place_county_name', 'species_guess']
    df = df.dropna(subset=important_columns)

    # Remove duplicate entries
    df = df.drop_duplicates(subset='id')

    # Reset the index
    df = df.reset_index(drop=True)

    print(f"\n--- {state_name} Tree Data Statistics ---")
    print(f"Total number of tree observations: {len(df)}")
    print(f"Date range: {df['observed_on'].min()} to {df['observed_on'].max()}")
    print(f"Number of unique counties: {df['place_county_name'].nunique()}")
    print("\nTop 5 counties by number of observations:")
    print(df['place_county_name'].value_counts().head())
    print("\nNumber of observations by species:")
    print(df['species_guess'].value_counts())
    # 'positional_accuracy' is not one of the required columns, so do not let
    # its absence crash the function after all the cleaning work is done.
    if 'positional_accuracy' in df.columns:
        print(f"\nAverage positional accuracy: {df['positional_accuracy'].mean():.2f}")
        print(f"Median positional accuracy: {df['positional_accuracy'].median():.2f}")
    else:
        print("\nPositional accuracy: column not present in the data")

    return df


In [3]:
# Maryland: read the raw observation export, then run it through the shared cleaner
# (which also prints the summary statistics shown below this cell).
md_tree_data = pd.read_csv(
    filepath_or_buffer='/Users/vasantsaladi/Documents/GitHub/air_map/ML/Data/marylandtreeml.csv',
)
md_tree_data_cleaned = clean_tree_data(df=md_tree_data, state_name="Maryland")


--- Maryland Tree Data Statistics ---
Total number of tree observations: 1619
Date range: 2011-06-18 00:00:00 to 2024-10-03 00:00:00
Number of unique counties: 23

Top 5 counties by number of observations:
place_county_name
Montgomery            277
Baltimore City, MD    258
Baltimore             169
Prince George's       155
Frederick             144
Name: count, dtype: int64

Number of observations by species:
species_guess
tree-of-heaven         1608
albero del paradiso       5
айлант высочайший         3
ailalt                    2
árbol del cielo           1
Name: count, dtype: int64

Average positional accuracy: 110.33
Median positional accuracy: 10.00


In [4]:
# Virginia: read the raw observation export, then run it through the shared cleaner
# (which also prints the summary statistics shown below this cell).
va_tree_data = pd.read_csv(
    filepath_or_buffer='/Users/vasantsaladi/Documents/GitHub/air_map/ML/Data/virginiatreeml.csv',
)
va_tree_data_cleaned = clean_tree_data(df=va_tree_data, state_name="Virginia")


--- Virginia Tree Data Statistics ---
Total number of tree observations: 2450
Date range: 2013-08-10 00:00:00 to 2024-10-02 00:00:00
Number of unique counties: 112

Top 5 counties by number of observations:
place_county_name
Fairfax           517
Arlington         174
Loudoun           152
Richmond city     151
Prince William    113
Name: count, dtype: int64

Number of observations by species:
species_guess
tree-of-heaven         2431
айлант высочайший         8
albero del paradiso       4
árbol del cielo           2
vascular plants           2
dicots                    2
staghorn sumac            1
Name: count, dtype: int64

Average positional accuracy: 85.19
Median positional accuracy: 9.00


In [6]:
# Persist both cleaned frames as CSV; index=False keeps the positional index
# out of the written files.
md_tree_data_cleaned.to_csv(
    path_or_buf='/Users/vasantsaladi/Documents/GitHub/air_map/ML/Data/maryland_tree_data_cleaned.csv',
    index=False,
)
va_tree_data_cleaned.to_csv(
    path_or_buf='/Users/vasantsaladi/Documents/GitHub/air_map/ML/Data/virginia_tree_data_cleaned.csv',
    index=False,
)
# Confirmation message for the reader of the notebook output.
print(
    "\nCleaned data saved to 'ML/Data/maryland_tree_data_cleaned.csv' and 'ML/Data/virginia_tree_data_cleaned.csv'"
)


Cleaned data saved to 'ML/Data/maryland_tree_data_cleaned.csv' and 'ML/Data/virginia_tree_data_cleaned.csv'
