### Create geo samples for testing on the leafsnap predictions

### IMPORTANT Run half of train_test_geo_leafsnap.ipynb first

In [1]:
import scipy
import numpy as np
import scipy.spatial as spatial
import matplotlib.pyplot as plt
import random 
import pandas as pd
import os
from mpl_toolkits.basemap import Basemap
from sklearn.datasets import fetch_species_distributions
from sklearn.neighbors import KernelDensity
import copy

from tqdm import tqdm

import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.parallel
import torch.optim as optim
import torchvision
import torchvision.models as models

from pyproj import Proj, transform

from scipy.stats import norm
import warnings
from sklearn.neighbors import KNeighborsClassifier
import imblearn
import re

import utils


#copied from Kevin Barnes/kbarnes3: https://gist.github.com/kbarnes3/3fb7d353e9bdd3efccd5

import ctypes
import platform

# Flag values that are OR-ed together and handed to
# kernel32.SetThreadExecutionState by _set_thread_execution().
ES_CONTINUOUS = 0x80000000
ES_SYSTEM_REQUIRED = 0x00000001


def _set_thread_execution(state):
    """Forward ``state`` to the Win32 ``SetThreadExecutionState`` call.

    Goes through ``ctypes.windll``, so it can only be called on Windows.
    """
    kernel32 = ctypes.windll.kernel32
    kernel32.SetThreadExecutionState(state)


def prevent_standby():
    """Set the ES_CONTINUOUS | ES_SYSTEM_REQUIRED execution state on Windows.

    Does nothing on any other platform.
    """
    if platform.system() != 'Windows':
        return
    _set_thread_execution(ES_CONTINUOUS | ES_SYSTEM_REQUIRED)


def allow_standby():
    """Reset the execution state to plain ES_CONTINUOUS on Windows.

    Does nothing on any other platform.
    """
    on_windows = platform.system() == 'Windows'
    if not on_windows:
        return
    _set_thread_execution(ES_CONTINUOUS)


def long_running(func):
    """Decorator that brackets ``func`` with prevent_standby()/allow_standby().

    Args:
        func: the callable to wrap.

    Returns:
        A wrapper with the same signature that returns whatever ``func``
        returns. Exceptions raised by ``func`` propagate unchanged.
    """
    import functools

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def inner(*args, **kwargs):
        prevent_standby()
        try:
            return func(*args, **kwargs)
        finally:
            # Always release the execution state, even when func raises.
            allow_standby()
    return inner




In [None]:

# Model run identifier; used below to build the
# saved_output/<model_name>_validating_output/ paths.
model_name = "resnet18_pretrained2911"

In [None]:
def convert_utm_to_latlon(df, zone_number, zone_letter):
    """Convert the UTM coordinates in ``df`` to WGS84 longitude/latitude.

    Args:
        df: DataFrame with 'Østkoordinat' (used as x) and 'Nordkoordinat'
            (used as y) columns.
        zone_number: UTM zone number.
        zone_letter: UTM band letter; letters before 'N' (case-insensitive)
            select the southern hemisphere.

    Returns:
        A new DataFrame with 'Longitude' and 'Latitude' columns, one row per
        input row, on a fresh default index.
    """
    # Local import: Transformer replaces the deprecated module-level
    # pyproj.transform() function.
    from pyproj import Transformer

    utm_proj = Proj(proj='utm', zone=zone_number, ellps='WGS84',
                    south=(zone_letter.upper() < 'N'))
    lonlat_proj = Proj(proj='latlong', datum='WGS84')
    # always_xy=True keeps the (x, y) -> (lon, lat) argument order.
    transformer = Transformer.from_proj(utm_proj, lonlat_proj, always_xy=True)
    lon, lat = transformer.transform(df['Østkoordinat'].values,
                                     df['Nordkoordinat'].values)

    return pd.DataFrame({'Longitude': lon, 'Latitude': lat})

# Load the location records and keep only id, UTM coordinates and species name.
# .copy() makes the column selection an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
dat = pd.read_csv('dataset/location_data/location_top_185.csv')
dat = dat[['Id','Østkoordinat','Nordkoordinat','Vitenskapelig navn']].copy()

# One [Østkoordinat, Nordkoordinat] pair per record.
dat['points'] = [[i, j] for i, j in zip(dat.Østkoordinat, dat.Nordkoordinat)]

# The conversion must run after `dat` is loaded; calling it before the
# read_csv above fails with a NameError on a fresh kernel.
df_latlon = convert_utm_to_latlon(dat, 33, 'N')

dat['lat'] = df_latlon['Latitude']
dat['long'] = df_latlon['Longitude']

# Bounding box as (lat, long) corners:
# lower left, upper left, upper right, lower right.
lat_long_oslo = [(58.998141, 9.574585), (60.351413, 9.574585), (60.351413, 12.540894),(58.998141,12.540894)]

# Keep only the records strictly inside the bounding box.
dat = dat.loc[
    (dat['lat'] > lat_long_oslo[0][0])
    & (dat['lat'] < lat_long_oslo[1][0])
    & (dat['long'] > lat_long_oslo[0][1])
    & (dat['long'] < lat_long_oslo[2][1])
]

dat.to_csv('dataset/top185_in_oslo_area.csv')






# Oslo area: corner order of lat_long_oslo, each given as (lat, long):
# lower left, upper left, upper right, lower right




# Coordinate matrix of all records, in the same row order as `dat`.
points = dat[['Østkoordinat', 'Nordkoordinat']].to_numpy()

# KD-tree over those coordinates; query results are row positions in `points`.
point_tree = spatial.cKDTree(points)

# Distinct species names and one integer class index per name.
science_names = dat['Vitenskapelig navn'].unique()
indexes = list(range(len(science_names)))

# Shuffle the class indexes with a fixed seed.
random.seed(10)
random.shuffle(indexes)

# Look-up tables in both directions: name -> index and index -> name.
names_mapping = dict(zip(science_names, indexes))
index_mapping = dict(zip(indexes, science_names))

def distance_between_points(point, list_of_points):
    """Return the Euclidean distance from ``point`` to every listed point.

    Only the first two coordinates of each point are used. The result is a
    list in the same order as ``list_of_points``.
    """
    px, py = point[0], point[1]
    distances = []
    for other in list_of_points:
        dx = px - other[0]
        dy = py - other[1]
        distances.append(np.sqrt(np.power(dx, 2) + np.power(dy, 2)))
    return distances



def get_points_within(df_row, distance=1000):
    """Return the Ids of the records within ``distance`` of ``df_row``.

    Args:
        df_row: one-row DataFrame with 'Id', 'Østkoordinat' and
            'Nordkoordinat'; the coordinates are truncated to int.
        distance: search radius passed to the module-level ``point_tree``.

    Returns:
        List of 'Id' values from the global ``dat``, with the first
        occurrence of the query record's own Id removed.

    Raises:
        ValueError: if the query record's Id is not among the hits.
    """
    own_id = int(df_row['Id'].iloc[0])
    centre = [int(df_row['Østkoordinat'].iloc[0]), int(df_row['Nordkoordinat'].iloc[0])]
    hit_positions = point_tree.query_ball_point([centre], distance)[0]
    hit_ids = list(dat.iloc[hit_positions]['Id'])
    hit_ids.remove(own_id)
    return hit_ids
    
def sample_plant_position(plant, df):
    """Draw one random record of the given species from ``df``.

    Args:
        plant: species name (str), or an integer class index that is
            translated through the module-level ``index_mapping``.
        df: DataFrame with a 'Vitenskapelig navn' column to sample from.

    Returns:
        A one-row DataFrame, or None when ``plant`` is neither a str nor
        an integer.
    """
    if isinstance(plant, str):
        species = plant
    elif isinstance(plant, (int, np.integer)):
        species = index_mapping[plant]
    else:
        return None
    # Sample from the frame that was passed in, not from the global `dat`.
    return df[df['Vitenskapelig navn'] == species].sample(1)

def kernel_density_estimate_value(point_row, dat, bandwidth=500):
    """Evaluate a kernel density estimate of ``dat`` at ``point_row``.

    Args:
        point_row: one-row DataFrame with 'Østkoordinat' and 'Nordkoordinat'.
        dat: DataFrame of records to fit the density on. A row carrying the
            same index label as ``point_row`` is left out of the fit.
        bandwidth: bandwidth handed to ``KernelDensity``.

    Returns:
        The density at the point (``exp`` of the estimator's log-score), or
        0 when no records remain to fit on.
    """
    coord_columns = ['Østkoordinat', 'Nordkoordinat']
    # errors='ignore' turns the drop into a no-op when the label is absent,
    # which avoids materialising list(dat.index) on every call.
    fit_points = dat.drop(index=point_row.index[0], errors='ignore')[coord_columns].to_numpy()
    if len(fit_points) == 0:
        return 0
    kde = KernelDensity(bandwidth=bandwidth)
    kde.fit(fit_points)
    np_point = np.array([[point_row['Østkoordinat'].iloc[0], point_row['Nordkoordinat'].iloc[0]]])
    return np.exp(kde.score_samples(np_point))[0]

def get_points_within_square(point, dat, side_length=3000):
    """Return the rows of ``dat`` inside an axis-aligned square around ``point``.

    Args:
        point: one-row DataFrame with 'Østkoordinat' and 'Nordkoordinat'.
        dat: DataFrame with the same two coordinate columns.
        side_length: maximum offset from the point along each axis; the
            bounds are exclusive.

    Returns:
        The matching rows of ``dat``, in their original order.
    """
    east = point['Østkoordinat'].iloc[0]
    north = point['Nordkoordinat'].iloc[0]
    inside = (
        (dat['Østkoordinat'] > east - side_length)
        & (dat['Østkoordinat'] < east + side_length)
        & (dat['Nordkoordinat'] > north - side_length)
        & (dat['Nordkoordinat'] < north + side_length)
    )
    return dat[inside]

# within_square =get_points_within_square(v,dat)

# within_square







def get_points_within(df_row, distance=1000):
    """Ids of all records lying within ``distance`` of the given record.

    ``df_row`` is a one-row DataFrame; its integer-truncated coordinates are
    looked up in the module-level ``point_tree`` and the hits are mapped to
    'Id' values of the global ``dat``. The record's own Id is removed from
    the result (first occurrence; ValueError if it is not among the hits).
    """
    query_id = int(df_row['Id'].iloc[0])
    east = int(df_row['Østkoordinat'].iloc[0])
    north = int(df_row['Nordkoordinat'].iloc[0])
    positions = point_tree.query_ball_point([[east, north]], distance)[0]
    nearby = dat.iloc[positions]
    ids = list(nearby['Id'])
    ids.remove(query_id)
    return ids
    
def sample_plant_position(plant, df):
    """Return one randomly chosen row of ``df`` for the requested species.

    Args:
        plant: a species name (str) or an integer class index, which is
            resolved through the module-level ``index_mapping``.
        df: DataFrame holding a 'Vitenskapelig navn' column.

    Returns:
        One-row DataFrame; None if ``plant`` is neither str nor integer.
    """
    if isinstance(plant, str):
        name = plant
    elif isinstance(plant, (int, np.integer)):
        name = index_mapping[plant]
    else:
        return None
    # Use the `df` argument; the global `dat` is no longer read here.
    candidates = df[df['Vitenskapelig navn'] == name]
    return candidates.sample(1)




def get_knn_classifier(samples_pos_list, dat, n=1000):
    """Fit a KNN classifier on ``dat`` with the sampled records held out.

    Args:
        samples_pos_list: one-row DataFrames; rows of ``dat`` with the same
            index label are dropped before fitting.
        dat: DataFrame with 'Vitenskapelig navn' and 'points' columns.
        n: number of neighbours for ``KNeighborsClassifier``.

    Returns:
        The classifier, fitted on the ``KMeansSMOTE``-resampled points, with
        class labels taken from the module-level ``names_mapping``.
    """
    # Build the label set once instead of list(dat.index) for every sample.
    known_labels = set(dat.index)
    sample_pos_indexes = [sp.index[0] for sp in samples_pos_list if sp.index[0] in known_labels]

    dat_removed_samples = dat.drop(index=sample_pos_indexes)
    category = np.array([names_mapping[k] for k in dat_removed_samples['Vitenskapelig navn']])
    points_np = np.array(list(dat_removed_samples['points']))

    oversample = imblearn.over_sampling.KMeansSMOTE()
    # catch_warnings() restores the caller's warning filters afterwards
    # instead of overwriting them with "default".
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        X, y = oversample.fit_resample(points_np, category)

    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(X, y)
    return neigh

In [None]:

def augment_output(output, augment):
    """Re-weight the softmax of ``output`` and return log-probabilities.

    Args:
        output: tensor of scores; softmax is taken over dim 1.
        augment: multiplicative weights, broadcastable against ``output``.

    Returns:
        Log of the weighted probabilities, re-normalised so that every row
        (dim 1) sums to one.
    """
    probs = torch.nn.functional.softmax(output, dim=1)
    weighted = probs * augment
    # Normalise each sample separately; dividing by the sum over the whole
    # batch leaves the rows un-normalised when the batch holds more than one sample.
    weighted = weighted / torch.sum(weighted, dim=1, keepdim=True)
    return torch.log(weighted)

def augment_output2(output, augment, zero_tensor_kde001):
    """Add geographic weights to the softmax of ``output``; return log-probabilities.

    Args:
        output: tensor of scores; softmax is taken over dim 1.
        augment: additive weights, broadcastable against ``output``.
        zero_tensor_kde001: mask tensor; wherever it is non-zero,
            ``augment.min()`` (scaled by the mask value) is added to ``augment``.

    Returns:
        Log of (softmax + adjusted weights), re-normalised so that every row
        (dim 1) sums to one.
    """
    probs = torch.nn.functional.softmax(output, dim=1)
    augment = augment + zero_tensor_kde001 * augment.min()
    combined = probs + augment
    # Normalise per sample rather than by the sum over the whole batch.
    combined = combined / torch.sum(combined, dim=1, keepdim=True)

    return torch.log(combined)

def kde_augmentet_output(sample_pos_list, output, dat, bandwidth):
    """Combine ``output`` with per-species KDE weights around each sample.

    For every sample position, the records of ``dat`` inside the default
    square around it are grouped by species, and a kernel density value is
    computed per species at the sample position. The resulting weight matrix
    and a mask of species without a value are passed to ``augment_output2``.

    Args:
        sample_pos_list: one-row DataFrames, one per row of ``output``.
        output: model output tensor; only ``output.size(0)`` is read here.
        dat: DataFrame of location records.
        bandwidth: KDE bandwidth.

    Returns:
        The tensor returned by ``augment_output2``.
    """
    n_rows = output.size(0)
    kde_weights = np.zeros((n_rows, 185))
    # 1 where a species got no KDE value for that sample, 0 where it did.
    no_value_mask = np.ones((n_rows, 185))

    for row, sample_pos in enumerate(sample_pos_list):
        nearby = get_points_within_square(sample_pos, dat)
        for plant_name in nearby.value_counts('Vitenskapelig navn').index:
            same_species = nearby[nearby['Vitenskapelig navn'] == plant_name]
            col = names_mapping[plant_name]
            kde_weights[row][col] = kernel_density_estimate_value(
                sample_pos, same_species, bandwidth=bandwidth)
            no_value_mask[row][col] = 0

    return augment_output2(output, torch.tensor(kde_weights), torch.tensor(no_value_mask))
    
# Zero-centred normal distributions with scale 250 and 500 respectively.
stats_distance1 = norm(loc=0, scale=250)
stats_distance2 = norm(loc=0, scale=500)

In [None]:
## Build 20 random species <-> class-index assignments (one per configuration).
## From here on names_mapping / index_mapping are lists of dicts.

random.seed(10)
names_mapping, index_mapping = [], []
for i in range(20):
    # Each pass reshuffles `indexes` in place and snapshots the pairing.
    random.shuffle(indexes)
    names_mapping.append(dict(zip(science_names, indexes)))
    index_mapping.append(dict(zip(indexes, science_names)))


In [None]:
# Scan the saved outputs to find the highest batch number (max_b) and the
# highest epoch number (max_e). File names look like output_e<epoch>_b<batch>.
output_files = [ i for i in os.listdir(f'saved_output/{model_name}_validating_output/') if 'output' in i]
output_files
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
trailing_digits = re.compile(r'\d+$')
max_b = 0
max_e = 0
for output_file in output_files:
    of = trailing_digits.findall(output_file)
    if not of:
        # Not a batch file (no trailing batch number): nothing to read from it.
        continue
    max_b = max(max_b, int(of[0]))
    a = output_file.split('_')
    # Second '_'-separated field is 'e<epoch>'; strip the leading letter.
    max_e = max(max_e, int(a[1][1:]))
    


In [None]:

def sample_plant_position(plant, df, index):
    """Draw one random record of the given species from ``df``.

    Args:
        plant: species name (str), or an integer class index that is
            translated through ``index_mapping[index]``.
        df: DataFrame with a 'Vitenskapelig navn' column to sample from.
        index: which mapping configuration in the module-level
            ``index_mapping`` list to use for integer ``plant`` values.

    Returns:
        A one-row DataFrame, or None when ``plant`` is neither a str nor
        an integer.
    """
    if isinstance(plant, str):
        species = plant
    elif isinstance(plant, (int, np.integer)):
        species = index_mapping[index][plant]
    else:
        return None
    # Sample from the frame that was passed in, not from the global `dat`.
    return df[df['Vitenskapelig navn'] == species].sample(1)



# Re-create the 20 mapping configurations from the same fixed seed.
random.seed(10)
names_mapping = []
index_mapping = []
for i in range(20):
    random.shuffle(indexes)
    name_index_pairs = list(zip(science_names, indexes))
    names_mapping.append({name: idx for name, idx in name_index_pairs})
    index_mapping.append({idx: name for name, idx in name_index_pairs})

    
@long_running
def create_samples_and_points_around(start_batch):
    """Save, per batch and configuration, which species occur near each sample.

    For every batch ``b >= start_batch`` and each of the 20 mapping
    configurations, one location is sampled per target. A weight matrix of
    shape (batch size, 185) is filled with 0.1, and set to 1 for every species
    recorded within ``meter`` of that sample's location. The matrix is saved
    under ``saved_output/geo_functions/``.

    Args:
        start_batch: first batch number to process; earlier batches are skipped.
    """
    for e in range(1):#max_e+1):
        for b in tqdm(range(max_b+1)):
            if b < start_batch:
                # Skip before loading, so skipped batches cost no file I/O.
                continue
            output = torch.load(f'saved_output/{model_name}_validating_output/output_e{e}_b{b}')
            target = torch.load(f'saved_output/{model_name}_validating_output/target_b{b}')

            for i in tqdm(range(20)):

                for meter in [2000]:#500,600,700,800,900,1000,1100,1200,1300,1400,1500]:
                    random.seed(11)
                    # DataFrame.sample draws from NumPy's global RNG, so that
                    # is the generator that has to be seeded for repeatability.
                    np.random.seed(11)
                    weight_tensor = np.ones((output.size(0),185))*0.1
                    sample_pos_list = [
                        sample_plant_position(int(target[j]), dat, i)
                        for j in range(output.size(0))
                    ]

                    # enumerate() ties each sample to its own row; reusing the
                    # index left over from the sampling loop wrote every
                    # sample's species into the last row.
                    for j, sample_pos in enumerate(sample_pos_list):
                        points_in_area_list = get_points_within(sample_pos, meter)
                        points_in_area = dat[dat['Id'].isin(points_in_area_list)]

                        for name in points_in_area['Vitenskapelig navn'].unique():
                            weight_tensor[j][names_mapping[i][name]] = 1

                    weight_tensor = torch.from_numpy(weight_tensor)

                    torch.save(weight_tensor,f'saved_output/geo_functions/sampled_{meter}m_batch{b}_cofiguration{i}')


create_samples_and_points_around(0)

In [None]:
#
#
#
#
#

def get_points_within_square(point, dat, side_length=3000):
    """Select the records of ``dat`` that lie in a square centred on ``point``.

    A record is kept when both its 'Østkoordinat' and its 'Nordkoordinat'
    differ from the point's by strictly less than ``side_length``. ``point``
    is a one-row DataFrame; the selected rows keep their original order.
    """
    keep = np.ones(len(dat), dtype=bool)
    for column in ('Østkoordinat', 'Nordkoordinat'):
        centre = point[column].iloc[0]
        above_lower = dat[column] > centre - side_length
        below_upper = dat[column] < centre + side_length
        keep &= (above_lower & below_upper).to_numpy()
    return dat[keep]

# Three mapping configurations for the KDE run, from the same fixed seed.
random.seed(10)
names_mapping, index_mapping = [], []
for i in range(3):
    random.shuffle(indexes)
    names_mapping.append(dict(zip(science_names, indexes)))
    index_mapping.append(dict(zip(indexes, science_names)))
    
@long_running
def kde_samples_output(batch,configuration):
    """Save per-species KDE weight matrices for every batch and configuration.

    For every batch ``b >= batch`` and each of the 3 mapping configurations,
    one location is sampled per target. For each square size and bandwidth,
    a (batch size, 185) matrix is filled with the kernel density of every
    species found in the square around each sample, evaluated at that sample.
    The matrix is saved under ``saved_output/geo_functions/``.

    Args:
        batch: first batch number to process; earlier batches are skipped.
        configuration: accepted for compatibility; not used by this function.
    """
    for e in range(1):#max_e+1):
        for b in tqdm(range(max_b+1)):
            if b < batch:
                # Skip before loading, so skipped batches cost no file I/O.
                continue
            output = torch.load(f'saved_output/{model_name}_validating_output/output_e{e}_b{b}')
            target = torch.load(f'saved_output/{model_name}_validating_output/target_b{b}')

            for i in tqdm(range(3)):
                random.seed(11)
                # DataFrame.sample draws from NumPy's global RNG, so that is
                # the generator that has to be seeded for repeatability.
                np.random.seed(11)
                sample_pos_list = [
                    sample_plant_position(int(target[j]), dat, i)
                    for j in range(output.size(0))
                ]

                for side_len in [1000,2000,3000,4000]:
                    # One neighbourhood per sample (the sample itself excluded).
                    # Computing it once from the last sampled position made
                    # every row use the same square.
                    neighbourhoods = []
                    for sample_pos in sample_pos_list:
                        within_square = get_points_within_square(sample_pos, dat, side_length=side_len)
                        neighbourhoods.append(
                            within_square[within_square.Id != sample_pos['Id'].iloc[0]])

                    for bandwidth in [3000,4000]:

                        weight_tensor_kde = np.zeros((output.size(0),185))
                        for j, (sample_pos, within_square) in enumerate(zip(sample_pos_list, neighbourhoods)):

                            for plant_name in within_square.value_counts('Vitenskapelig navn').index:

                                within_square_ = within_square[within_square['Vitenskapelig navn']==plant_name]

                                pj = names_mapping[i][plant_name]
                                pj_value = kernel_density_estimate_value(sample_pos,within_square_,bandwidth = bandwidth)
                                weight_tensor_kde[j][pj] = pj_value

                        weight_tensor_kde = torch.from_numpy(weight_tensor_kde)
                        # Print the path that is actually written.
                        save_path = f'saved_output/geo_functions/kde_side_len{side_len}m_bandwitdth{bandwidth}_batch{b}_cofiguration{i}'
                        print(save_path)
                        torch.save(weight_tensor_kde, save_path)
                            
# kde_samples_output(0,0)

In [None]:
def get_knn_classifier(samples_pos_list, dat, n=1000, configuration=None):
    """Fit a KNN classifier on ``dat`` with the sampled records held out.

    Args:
        samples_pos_list: one-row DataFrames; rows of ``dat`` with the same
            index label are dropped before fitting.
        dat: DataFrame with 'Vitenskapelig navn' and 'points' columns.
        n: number of neighbours for ``KNeighborsClassifier``.
        configuration: index into ``names_mapping`` when that global is a
            list of mappings (as built in the cells above). With None,
            ``names_mapping`` is used directly as a single dict.

    Returns:
        The classifier, fitted on the ``KMeansSMOTE``-resampled points.
    """
    # names_mapping is a list of dicts in this part of the notebook; indexing
    # it with a species name raises TypeError, hence the explicit selection.
    mapping = names_mapping if configuration is None else names_mapping[configuration]

    # Build the label set once instead of list(dat.index) for every sample.
    known_labels = set(dat.index)
    sample_pos_indexes = [sp.index[0] for sp in samples_pos_list if sp.index[0] in known_labels]

    dat_removed_samples = dat.drop(index=sample_pos_indexes)
    category = np.array([mapping[k] for k in dat_removed_samples['Vitenskapelig navn']])
    points_np = np.array(list(dat_removed_samples['points']))

    oversample = imblearn.over_sampling.KMeansSMOTE()
    # catch_warnings() restores the caller's warning filters afterwards
    # instead of overwriting them with "default".
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        X, y = oversample.fit_resample(points_np, category)

    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(X, y)
    return neigh

def get_knn_classifier_smote(samples_pos_list, dat, n_list, configuration):
    """Fit one KNN classifier per entry of ``n_list`` on resampled data.

    Args:
        samples_pos_list: one-row DataFrames; rows of ``dat`` with the same
            index label are dropped before fitting.
        dat: DataFrame with 'Vitenskapelig navn' and 'points' columns.
        n_list: neighbour counts; one classifier is built for each.
        configuration: index of the mapping in the module-level
            ``names_mapping`` list used to turn species names into labels.

    Returns:
        List of fitted ``KNeighborsClassifier`` objects, in ``n_list`` order,
        all trained on the same ``KMeansSMOTE``-resampled data.
    """
    mapping = names_mapping[configuration]

    # Build the label set once instead of list(dat.index) for every sample.
    known_labels = set(dat.index)
    sample_pos_indexes = [sp.index[0] for sp in samples_pos_list if sp.index[0] in known_labels]

    dat_removed_samples = dat.drop(index=sample_pos_indexes)
    category = np.array([mapping[k] for k in dat_removed_samples['Vitenskapelig navn']])
    points_np = np.array(list(dat_removed_samples['points']))

    oversample = imblearn.over_sampling.KMeansSMOTE()
    # catch_warnings() restores the caller's warning filters afterwards
    # instead of overwriting them with "default".
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        X, y = oversample.fit_resample(points_np, category)

    neigh = []
    for n in n_list:
        knn = KNeighborsClassifier(n_neighbors=n)
        knn.fit(X, y)
        neigh.append(knn)
    return neigh

In [None]:
# Bare expression: as the last line of a notebook cell this displays the
# currently bound get_knn_classifier function object.
get_knn_classifier

In [None]:
# Draw one random record for class index 0 under mapping configuration 0.
sample_pos = sample_plant_position(0,dat,0)

In [None]:
# Neighbour counts: one KNN classifier is fitted for each value.
n_list = [10,15,20]#25,50,75]#[100,200,500,1000,1500,2000,2500,3000]

# n_list = [1000

# Three mapping configurations for the KNN run, from the same fixed seed.
random.seed(10)
names_mapping, index_mapping = [], []
for i in range(3):
    random.shuffle(indexes)
    names_mapping.append(dict(zip(science_names, indexes)))
    index_mapping.append(dict(zip(indexes, science_names)))
    
@long_running
def knn_smote_samples_output(batch,n_list):
    """Save KNN class-probability matrices for every batch and configuration.

    For every batch ``b >= batch`` and each of the 3 mapping configurations,
    one location is sampled per target and one KNN classifier is fitted per
    entry of ``n_list``. For each classifier, a (batch size, 185) matrix of
    predicted class probabilities at the sampled locations is saved under
    ``saved_output/geo_functions/``.

    Args:
        batch: first batch number to process; earlier batches are skipped.
        n_list: neighbour counts, one classifier and output file per value.
    """
    for e in range(1):#max_e+1):
        for b in tqdm(range(max_b+1)):
            if b < batch:
                # Skip before loading, so skipped batches cost no file I/O.
                continue
            output = torch.load(f'saved_output/{model_name}_validating_output/output_e{e}_b{b}')
            target = torch.load(f'saved_output/{model_name}_validating_output/target_b{b}')

            for i in tqdm(range(3)):
                random.seed(11)
                # DataFrame.sample draws from NumPy's global RNG, so that is
                # the generator that has to be seeded for repeatability.
                np.random.seed(11)
                sample_pos_list = [
                    sample_plant_position(int(target[j]), dat, i)
                    for j in range(output.size(0))
                ]

                knns = get_knn_classifier_smote(sample_pos_list,dat,n_list,i)

                # Pair every neighbour count with its own classifier; always
                # using knns[0] saved the same output for every n.
                for n_neighbors, knn in zip(n_list, knns):
                    weight_tensor_knn = np.zeros((output.size(0),185))
                    for j, sample_pos in enumerate(sample_pos_list):
                        # predict_proba columns follow knn.classes_, so place
                        # them by label; labels never seen stay at 0.
                        proba = knn.predict_proba(list(sample_pos['points']))[0]
                        weight_tensor_knn[j, knn.classes_] = proba

                    weight_tensor_knn = torch.from_numpy(weight_tensor_knn)
                    # Print the path that is actually written.
                    save_path = f'saved_output/geo_functions/knn_smote_n_{n_neighbors}_batch{b}_cofiguration{i}'
                    print(save_path)
                    torch.save(weight_tensor_knn, save_path)

knn_smote_samples_output(14,n_list)

In [None]:
#multinomial logistic regression:

from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression

def sample_plant_position(plant, df):
    """Pick one random row of ``df`` belonging to the requested species.

    Args:
        plant: species name (str), or an integer class index resolved via
            the module-level ``index_mapping`` dict.
        df: DataFrame with a 'Vitenskapelig navn' column.

    Returns:
        One-row DataFrame; None when ``plant`` is neither str nor integer.
    """
    if isinstance(plant, str):
        wanted = plant
    elif isinstance(plant, (int, np.integer)):
        wanted = index_mapping[plant]
    else:
        return None
    # Filter the frame given by the caller instead of the global `dat`.
    matches = df['Vitenskapelig navn'] == wanted
    return df[matches].sample(1)

def get_points_within_df(df_row, distance=1000):
    """Return the records of ``dat`` within ``distance`` of ``df_row``.

    Args:
        df_row: one-row DataFrame with 'Id', 'Østkoordinat' and
            'Nordkoordinat'; the coordinates are truncated to int.
        distance: search radius passed to the module-level ``point_tree``.

    Returns:
        DataFrame of the matching rows of the global ``dat``, without any
        row whose 'Id' equals the query record's Id.
    """
    own_id = int(df_row['Id'].iloc[0])
    centre = [int(df_row['Østkoordinat'].iloc[0]), int(df_row['Nordkoordinat'].iloc[0])]
    hit_positions = point_tree.query_ball_point([centre], distance)[0]
    nearby = dat.iloc[hit_positions]

    is_query_record = nearby['Id'].isin([own_id])
    return nearby[~is_query_record]

def relative_count_in_from_df(df_, name_to_index_dict, n_classes=185):
    """Return the relative frequency of every class among the rows of ``df_``.

    Args:
        df_: DataFrame with a 'Vitenskapelig navn' column.
        name_to_index_dict: maps each species name to its class index.
        n_classes: length of the returned vector (default 185).

    Returns:
        List of length ``n_classes``; entry ``k`` is the share of rows whose
        species maps to index ``k``. All zeros when ``df_`` has no rows.

    Raises:
        KeyError: if a species name is missing from ``name_to_index_dict``.
    """
    list_science_names = list(df_['Vitenskapelig navn'])
    counts = [0] * n_classes
    for name in list_science_names:
        counts[name_to_index_dict[name]] += 1

    total = len(list_science_names)
    if total == 0:
        return counts
    # Divide once per class instead of accumulating 1/total per row, which
    # avoids the rounding drift of repeated float additions.
    return [count / total for count in counts]

# Back to a single species <-> class-index mapping (plain dicts, not lists).
science_names = dat['Vitenskapelig navn'].unique()
indexes = list(range(len(science_names)))

# Shuffle the class indexes with a fixed seed.
random.seed(10)
random.shuffle(indexes)

names_mapping = dict(zip(science_names, indexes))
index_mapping = dict(zip(indexes, science_names))

# Search radii and sample count for the logistic-regression runs below.
radius = 1000
radius_list = [250, 500, 750, 1000, 2000]
samples_per_species = 100

@long_running
def create_multinomial_logistic_regressions(radius_list, samples_per_species=100,
                                            output_dir='C:/Users/vjosv/master/Deep-Leafsnap/saved_models'):
    """Fit and save one multinomial logistic regression per search radius.

    For every radius, ``samples_per_species`` locations are sampled for each
    of the 185 class indexes. The feature vector of a sample is the relative
    species frequency among the records within the radius; the label is the
    class index. Coefficients and intercepts are saved with ``np.save``.

    Args:
        radius_list: search radii, one model per entry.
        samples_per_species: number of sampled locations per class index.
        output_dir: directory for the saved arrays; defaults to the path that
            was previously hard-coded.
    """
    for radius in radius_list:
        random.seed(1810)
        # DataFrame.sample draws from NumPy's global RNG, so that is the
        # generator that has to be seeded for repeatability.
        np.random.seed(1810)

        target_list = []
        x_list = []
        for i in tqdm(range(185)):
            # Accumulate across all species. Resetting the sample lists per
            # species left features for the last species only, which no
            # longer matched the length of target_list.
            for j in range(samples_per_species):
                sample = sample_plant_position(i, dat)
                neighbours = get_points_within_df(sample, radius)
                target_list.append(i)
                x_list.append(relative_count_in_from_df(neighbours, names_mapping))

        model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
        model.fit(x_list, target_list)

        shape = np.array(x_list).shape
        np.save(f'{output_dir}/model_coefficients_r{radius}_shape{shape}', model.coef_)

        np.save(f'{output_dir}/model_intercept_r{radius}_shape{shape}',model.intercept_)

create_multinomial_logistic_regressions(radius_list)

create_multinomial_logistic_regressions(radius_list,200)
# for b in tqdm(range(max_b+1)):
#     output = torch.load(f'saved_output/{model_name}_validating_output/output_e{e}_b{b}')
#     target = torch.load(f'saved_output/{model_name}_validating_output/target_b{b}')
#     sample_pos_list=[]
#     weight_tensors = [] 
#     if b >= batch: 
#         for i in tqdm(range(3)):
#             random.seed(11)
#             sample_pos_list = []
#             # if b>=batch and i !=configuration:

#             for j in range(output.size(0)):
#                 sample_pos = sample_plant_position(int(target[j]),dat,i)

#                 sample_pos_list.append(sample_pos)

