In [2]:
# !git clone https://github.com/kailas93/Tree_species.git

Cloning into 'Tree_species'...
remote: Enumerating objects: 2750, done.[K
remote: Counting objects: 100% (69/69), done.[K
remote: Compressing objects: 100% (68/68), done.[K
remote: Total 2750 (delta 0), reused 69 (delta 0), pack-reused 2681 (from 1)[K
Receiving objects: 100% (2750/2750), 75.42 MiB | 45.11 MiB/s, done.
Resolving deltas: 100% (54/54), done.
Filtering content: 100% (64/64), 1.15 GiB | 68.60 MiB/s, done.


In [3]:
import pandas as pd
import glob
import os
import zipfile

# # # Unzip the dataset
# zip_path = "/content/archive(1).zip"
extract_dir = "5M_trees"

# with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#     zip_ref.extractall(extract_dir)

# Define which columns to keep
selected_columns = [
    'common_name', 'scientific_name', 'city', 'state',
    'longitude_coordinate', 'latitude_coordinate', 'address', 'condition',
    'native', 'height_binned_M', 'diameter_breast_height_binned_CM',
    'location_type', 'zipcode', 'neighborhood', 'location_name', 'ward',
    'district', 'overhead_utility', 'diameter_breast_height_CM', 'height_M'
]

# Merge all CSVs except metadata
exclude_files = {'Column_Headers_Dryad.csv', 'README_Dryad.txt'}
csv_files = [f for f in glob.glob(os.path.join(extract_dir, "*.csv")) if os.path.basename(f) not in exclude_files]

df_list = []
for file in csv_files:
    df = pd.read_csv(file, low_memory=False)
    filtered_df = df[selected_columns].copy()
    df_list.append(filtered_df)

merged_df = pd.concat(df_list, ignore_index=True)
merged_df.insert(0, 'tree_id', ['tree_' + str(i) for i in range(1, len(merged_df) + 1)])


In [4]:
merged_df.isnull().sum()

tree_id                                   0
common_name                          892786
scientific_name                      529298
city                                   1899
state                                    31
longitude_coordinate                 751709
latitude_coordinate                  751578
address                             1254392
condition                           3038500
native                                    0
height_binned_M                     4996887
diameter_breast_height_binned_CM    1574755
location_type                       3469760
zipcode                             4770156
neighborhood                        5132935
location_name                       5310350
ward                                5384826
district                            5509706
overhead_utility                    5017235
diameter_breast_height_CM           2785979
height_M                            4951094
dtype: int64

In [5]:
# Drop columns with more than 3,038,500 missing values
threshold = 3038501
merged_df = merged_df.loc[:, merged_df.isnull().sum() <= threshold]


In [6]:
merged_df = merged_df.drop(columns=['diameter_breast_height_binned_CM'])


In [7]:
merged_df = merged_df.dropna(subset=[
    'common_name',
    'scientific_name',
    'longitude_coordinate',
    'latitude_coordinate',
    'condition',
    'diameter_breast_height_CM','address', 'city'
])


In [8]:
merged_df.isnull().sum()

tree_id                      0
common_name                  0
scientific_name              0
city                         0
state                        0
longitude_coordinate         0
latitude_coordinate          0
address                      0
condition                    0
native                       0
diameter_breast_height_CM    0
dtype: int64

In [9]:
# Step 0: Remove tree species with < 2 samples
species_counts = merged_df['common_name'].value_counts()
valid_species = species_counts[species_counts >= 3].index.tolist()

# Keep only valid species
filtered_df = merged_df[merged_df['common_name'].isin(valid_species)].copy()

# Verify the filtering worked
assert filtered_df['common_name'].value_counts().min() >= 3, "Still has species with < 2 samples!"

# Continue with filtered data
data = filtered_df.copy()


In [10]:
data['common_name'].value_counts()

common_name
London planetree             87070
Honeylocust                  75267
Norway maple                 65983
Pin oak                      64479
Callery pear                 62088
                             ...  
Neem tree                        3
Burgundy ussurian pear           3
Mancana ash                      3
White shield osage orange        3
Arizona ash                      3
Name: count, Length: 1497, dtype: int64

In [11]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from collections import Counter

# Load data
df = filtered_df.copy()
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from collections import Counter

# Load data
df = merged_df.copy()

# Optional: Simplify to genus
df['genus'] = df['scientific_name'].apply(lambda x: x.split()[0])

# Encode categorical variables (native, city, state)
df['native_encoded'] = df['native'].astype('category').cat.codes
df['city_encoded'] = df['city'].astype('category').cat.codes
df['state_encoded'] = df['state'].astype('category').cat.codes

# Features to use
feature_cols = ['latitude_coordinate', 'longitude_coordinate', 'diameter_breast_height_CM',
                'native_encoded', 'city_encoded', 'state_encoded']
X = df[feature_cols]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit Nearest Neighbors model
nn_model = NearestNeighbors(n_neighbors=50, algorithm='ball_tree')  # can tune n_neighbors
nn_model.fit(X_scaled)

# Prediction function
def recommend_species(lat, lon, diameter_cm, native, city, state, top_n=5):
    # Encode input
    native_code = df['native'].astype('category').cat.categories.get_loc(native)
    city_code = df['city'].astype('category').cat.categories.get_loc(city)
    state_code = df['state'].astype('category').cat.categories.get_loc(state)

    input_features = np.array([[lat, lon, diameter_cm, native_code, city_code, state_code]])
    input_scaled = scaler.transform(input_features)

    distances, indices = nn_model.kneighbors(input_scaled)

    # Get common names or genera from neighbors
    neighbors = df.iloc[indices[0]]
    species_counts = Counter(neighbors['common_name'])  # or use 'genus'

    # Top-N species
    top_species = species_counts.most_common(top_n)
    return top_species
# Optional: Simplify to genus
df['genus'] = df['scientific_name'].apply(lambda x: x.split()[0])

# Encode categorical variables (native, city, state)
df['native_encoded'] = df['native'].astype('category').cat.codes
df['city_encoded'] = df['city'].astype('category').cat.codes
df['state_encoded'] = df['state'].astype('category').cat.codes

# Features to use
feature_cols = ['latitude_coordinate', 'longitude_coordinate', 'diameter_breast_height_CM',
                'native_encoded', 'city_encoded', 'state_encoded']
X = df[feature_cols]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit Nearest Neighbors model
nn_model = NearestNeighbors(n_neighbors=50, algorithm='ball_tree')  # can tune n_neighbors
nn_model.fit(X_scaled)

# Prediction function
def recommend_species(lat, lon, diameter_cm, native, city, state, top_n=5):
    # Encode input
    native_code = df['native'].astype('category').cat.categories.get_loc(native)
    city_code = df['city'].astype('category').cat.categories.get_loc(city)
    state_code = df['state'].astype('category').cat.categories.get_loc(state)

    input_features = np.array([[lat, lon, diameter_cm, native_code, city_code, state_code]])
    input_scaled = scaler.transform(input_features)

    distances, indices = nn_model.kneighbors(input_scaled)

    # Get common names or genera from neighbors
    neighbors = df.iloc[indices[0]]
    species_counts = Counter(neighbors['common_name'])  # or use 'genus'

    # Top-N species
    top_species = species_counts.most_common(top_n)
    return top_species

In [12]:
# Example usage
recommendation = recommend_species(
    lat=38.2274,
    lon=-85.8009,
    diameter_cm=1.2,
    native='naturally_occurring',
    city='Louisville',
    state='Kentucky',
    top_n=5
)

for species, count in recommendation:
    print(f"{species} (seen {count} times nearby)")

Bur oak (seen 4 times nearby)
American yellowwood (seen 4 times nearby)
Eastern hophornbeam (seen 4 times nearby)
White oak (seen 3 times nearby)
Shingle oak (seen 3 times nearby)




In [15]:
from tqdm import tqdm
from collections import defaultdict

def evaluate_recommender(X_scaled, df, model, top_k=5, sample_size=1000):
    correct = 0
    ranks = []

    for i in tqdm(range(sample_size)):
        x_query = X_scaled[i].reshape(1, -1)
        distances, indices = model.kneighbors(x_query)

        # exclude itself
        neighbor_indices = [idx for idx in indices[0] if idx != i][:top_k]
        true_species = df.iloc[i]['common_name']
        neighbor_species = df.iloc[neighbor_indices]['common_name'].tolist()

        if true_species in neighbor_species:
            correct += 1
            ranks.append(neighbor_species.index(true_species) + 1)
        else:
            ranks.append(0)

    hit_rate = correct / sample_size
    mean_rank = sum([1/r for r in ranks if r > 0]) / sample_size

    print(f"Top-{top_k} Hit Rate: {hit_rate:.4f}")
    print(f"Mean Reciprocal Rank: {mean_rank:.4f}")
    return hit_rate, mean_rank

# Run evaluation on a 1000-sample subset
evaluate_recommender(X_scaled, df, nn_model, top_k=5, sample_size=1000)


100%|█████████████████████████████████████| 1000/1000 [00:00<00:00, 2199.88it/s]

Top-5 Hit Rate: 0.4990
Mean Reciprocal Rank: 0.3479





(0.499, 0.3479333333333333)

In [16]:
# Save scaler and model
import joblib
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(nn_model, 'nn_model.joblib')

# Also save the dataframe with encoded columns (needed for categories and lookup)
df.to_pickle('tree_data.pkl')

print("Saved scaler, model and data!")

Saved scaler, model and data!


In [17]:
def get_common_locations_for_species(tree_name, top_n=10):
    """
    Given a tree common name, return the top N most frequent locations.
    """
    species_df = df[df['common_name'] == tree_name]
    
    if species_df.empty:
        return f"No records found for species: {tree_name}"
    
    # You can group by city/state or full address
    location_counts = species_df.groupby(['city', 'state']) \
                                .size().reset_index(name='count') \
                                .sort_values(by='count', ascending=False) \
                                .head(top_n)
    
    return location_counts


In [18]:
# Example tree name
tree_name = 'Bur oak'
top_locations = get_common_locations_for_species(tree_name, top_n=5)

print(f"Top locations where '{tree_name}' is commonly found:")
print(top_locations)


Top locations where 'Bur oak' is commonly found:
            city                 state  count
5  Washington DC  District of Columbia    760
4       New York              New York    515
3     Louisville              Kentucky    490
0         Aurora              Colorado    239
2         Durham        North Carolina     69
