In [1]:
import os 
import math
import pandas as pd
import numpy as np
from tqdm import tqdm

# Load the dataset

### Cross

In [2]:
data_cross = pd.read_csv('../data/stdl_cross.csv')
data_cross.head(3)

Unnamed: 0,RELI,EAST,NORTH,LU4,LC4,LU3,LC3,LU2,LC2,LU1,LC1,training
0,74222228,2742200,1222800,242,21,242,21,242,21,242,21,0
1,75392541,2753900,1254100,301,41,301,41,301,41,301,41,0
2,73712628,2737100,1262800,223,46,223,46,223,46,223,46,0


In [3]:
nbr_train_cross = data_cross[['training']].sum()
print('Number of training data: {}\nNumber of all data: {}\nRatio: {:.2f}%'.format(int(nbr_train_cross), len(data_cross), float(100 * nbr_train_cross/len(data_cross))))

Number of training data: 348868
Number of all data: 1380491
Ratio: 25.27%


### Square

In [4]:
data_squre = pd.read_csv('../data/stdl_square.csv')
data_squre.head(3)

Unnamed: 0,RELI,EAST,NORTH,LU4,LC4,LU3,LC3,LU2,LC2,LU1,LC1,training
0,74222228,2742200,1222800,242,21,242,21,242,21,242,21,0
1,75392541,2753900,1254100,301,41,301,41,301,41,301,41,0
2,74422546,2744200,1254600,301,41,301,41,301,41,301,41,0


In [5]:
nbr_train_square = data_squre[['training']].sum()
print('Number of training data: {}\nNumber of all data: {}\nRatio: {:.2f}%'.format(int(nbr_train_square), len(data_squre), float(100 * nbr_train_square/len(data_squre))))

Number of training data: 348868
Number of all data: 2070419
Ratio: 16.85%


### All points

In [6]:
data_all = pd.read_csv('../data/arealstatistik_all_points_only_classes.csv')
data_all.head(3)

Unnamed: 0,RELI,EAST,NORTH,LU4,LC4,LU3,LC3,LU2,LC2,LU1,LC1
0,74222228,2742200,1222800,242,21,242,21,242,21,242,21
1,76582227,2765800,1222700,301,45,301,45,301,45,301,45
2,72042228,2720400,1222800,222,21,222,21,222,21,222,21


In [7]:
print('Number of all data: {}'.format(len(data_all)))

Number of all data: 4163496


# Data Preparation

## Describe Labels

The labels of land cover and land usage have 27 and 46 classes respectively. This is the dataframe for inspecting the detailed explanation of each class.

In [8]:
prinp_domain = ['Artificial_areas'] * 7 + ['Grass_and_herb_vegetation'] + ['Brush_vegetation'] * 5 + ['Tree_vegetation'] * 7 + ['Bare_land'] * 3 + ['Watery areas'] * 4

basic_cat = ['Consolidated_surfaces', 'Buildings', 'Greenhouses', 'Gardens_with_border_and_patch_structures', 'Lawns',\
             'Trees_in_artificial_areas', 'Mix_of_small_structures', 'Grass_and_herb_vegetation', 'Shrubs', 'Brush_meadows',\
             'Short-stem_fruit_trees', 'Vines', 'Permanent_garden_plants_and_brush_crops', 'Closed_forest', 'Forest_edges',\
             'Forest_strips', 'Open_forest', 'Brush_forest', 'Linear_woods', 'Clusters_of_trees', 'Solid_rock', 'Granular_soil',\
             'Rocky_areas', 'Water', 'Glacier_perpetual_snow', 'Wetlands', 'Reedy_marshes']
                          
labels_int = list(range(11, 18)) + [21] + list(range(31, 36)) + list(range(41, 48)) + list(range(51, 54)) + list(range(61, 65))
labels_str = ['LC' + str(lab_int) for lab_int in labels_int]

lc04_dict = {
    'Principal_domain': prinp_domain,
    'Basic_category': basic_cat,
    'Label': labels_str
}

LC04_df = pd.DataFrame(lc04_dict, index=labels_int)
# LC04_df.to_csv('./data/Land_Cover_04_label.csv', index=True)
LC04_df.head(5)

Unnamed: 0,Principal_domain,Basic_category,Label
11,Artificial_areas,Consolidated_surfaces,LC11
12,Artificial_areas,Buildings,LC12
13,Artificial_areas,Greenhouses,LC13
14,Artificial_areas,Gardens_with_border_and_patch_structures,LC14
15,Artificial_areas,Lawns,LC15


In [9]:
prinp_domain = ['Settlement_and_urban areas'] * 26 + ['Agricultural_areas'] * 9 + ['Forest_areas'] * 4 + ['Unproductive_areas'] * 7
classes = [' Building_areas'] * 8 + ['Transport_surfaces'] * 5 + ['Special_urban_areas'] * 7 + ['Recreational_areas_and_cemeteries'] * 6 +\
['Orchards_vineyards_horticulture'] * 3 + ['Arable_and_Grassland'] * 3 + ['Alpine_grazing_areas'] * 3 + ['Forest_not_agricultural'] * 4 +\
['Lakes_and_rivers'] * 3 + ['Unproductive_land'] * 4

basic_cat = ['Industrial_and_commercial_areas_greater_than_1_ha', 'Industrial_and_commercial_areas_less_than_1_ha', 'Residential_areas_one_and_two-family_houses',\
             'Residential_areas_terraced_houses', 'Residential_areas_blocks_of_flats', 'Public_buildings_and_surroundings', 'Agricultural_buildings_and_surroundings',\
             'Unspecified_buildings_and_surroundings', 'Motorways', 'Roads', 'Parking_areas', 'Railway_surfaces', 'Airports_and_airfields', 'Energy_supply_plants',\
             'Waste_water_treatment_plants', 'Other_supply_or_waste_treatment_plants', 'Dumps', 'Quarries_mines', 'Construction_sites', 'Unexploited_urban_areas',\
             'Public_parks', 'Sports_facilities', 'Golf_courses', 'Camping_areas', 'Garden_allotments', 'Cemeteries', 'Orchards', 'Vineyards', 'Horticulture',\
             'Arable_land_in_general', 'Semi-natural_grassland_in_general', 'Farm_pastures_in_general', 'Alpine_meadows_in_general', 'Alpine_pastures_in_general',\
             'Alpine_sheep_grazing_pastures_in_general', 'Forest', 'Afforestation', 'Lumbering_areas', 'Damaged_forest', 'Lakes', 'Rivers_streams',\
             'Flood_protection_structures', 'Unused', 'Avalanche_and_rockfall_protection_structures', 'Alpine_sports_facilities', 'Landscape_interventions'
            ]
                          
labels_int = list(range(101, 109)) + list(range(121, 126)) + list(range(141, 148)) + list(range(161, 167)) + list(range(201, 204)) + list(range(221, 224)) +\
             list(range(241, 244)) + list(range(301, 305)) + list(range(401, 404)) + list(range(421, 425))
labels_str = ['LU' + str(lab_int) for lab_int in labels_int]

lu04_dict = {
    'Principal_domain': prinp_domain,
    'classes': classes,
    'Basic_category': basic_cat,
    'Label': labels_str
}

LU04_df = pd.DataFrame(lu04_dict, index=labels_int)
# LU04_df.to_csv('./data/Land_Usage_04_label.csv', index=True)
LU04_df.head(5)

Unnamed: 0,Principal_domain,classes,Basic_category,Label
101,Settlement_and_urban areas,Building_areas,Industrial_and_commercial_areas_greater_than_1_ha,LU101
102,Settlement_and_urban areas,Building_areas,Industrial_and_commercial_areas_less_than_1_ha,LU102
103,Settlement_and_urban areas,Building_areas,Residential_areas_one_and_two-family_houses,LU103
104,Settlement_and_urban areas,Building_areas,Residential_areas_terraced_houses,LU104
105,Settlement_and_urban areas,Building_areas,Residential_areas_blocks_of_flats,LU105


## Build Dataset with neighbour records

In [10]:
# choose which dataset to build
data_origin = data_all
data_origin.head()

Unnamed: 0,RELI,EAST,NORTH,LU4,LC4,LU3,LC3,LU2,LC2,LU1,LC1,training
0,74222228,2742200,1222800,242,21,242,21,242,21,242,21,0
1,75392541,2753900,1254100,301,41,301,41,301,41,301,41,0
2,74422546,2744200,1254600,301,41,301,41,301,41,301,41,0
3,73462540,2734600,1254000,222,21,222,21,222,21,222,21,0
4,76542541,2765400,1254100,221,21,221,21,221,21,221,21,0


In [12]:
# select the training rows and reset index
train_data = data_origin
train_data.set_index('RELI', drop=True, inplace=True)

# change the numerical labels to string in case of regression w.r.t. numerical relationship 
pd.set_option('mode.chained_assignment', None)
train_data[['LU1', 'LU2', 'LU3', 'LU4']] ='LU' + train_data[['LU1', 'LU2', 'LU3', 'LU4']].astype('str')
train_data[['LC1', 'LC2', 'LC3', 'LC4']] ='LC' + train_data[['LC1', 'LC2', 'LC3', 'LC4']].astype('str')
data_all[['LU1', 'LU2', 'LU3', 'LU4']] ='LU' + data_all[['LU1', 'LU2', 'LU3', 'LU4']].astype('str')
data_all[['LC1', 'LC2', 'LC3', 'LC4']] ='LC' + data_all[['LC1', 'LC2', 'LC3', 'LC4']].astype('str')

train_data.head()

Unnamed: 0_level_0,EAST,NORTH,LU4,LC4,LU3,LC3,LU2,LC2,LU1,LC1,training
RELI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
73012621,2730100,1262100,LU107,LC11,LU107,LC11,LU107,LC11,LU107,LC11,1
71932615,2719300,1261500,LU124,LC21,LU124,LC21,LU124,LC21,LU124,LC21,1
74942622,2749400,1262200,LU122,LC11,LU122,LC11,LU122,LC11,LU122,LC11,1
73102619,2731000,1261900,LU201,LC21,LU201,LC21,LU201,LC44,LU201,LC44,1
73342625,2733400,1262500,LU222,LC21,LU223,LC21,LU223,LC21,LU223,LC21,1


In [13]:
# init the dataframe
train_set = []
cols = ['RELI', 'EAST', 'NORTH', 'LU4', 'LC4', 'LU3', 'LC3', 'LU2', 'LC2', 'LU1', 'LC1',
       'nbr1_RELI', 'nbr1_EAST', 'nbr1_NORTH', 'nbr1_LU4',
       'nbr1_LC4', 'nbr1_LU3', 'nbr1_LC3', 'nbr1_LU2', 'nbr1_LC2', 'nbr1_LU1',
       'nbr1_LC1', 'nbr2_RELI', 'nbr2_EAST', 'nbr2_NORTH', 'nbr2_LU4',
       'nbr2_LC4', 'nbr2_LU3', 'nbr2_LC3', 'nbr2_LU2', 'nbr2_LC2', 'nbr2_LU1',
       'nbr2_LC1', 'nbr3_RELI', 'nbr3_EAST', 'nbr3_NORTH', 'nbr3_LU4',
       'nbr3_LC4', 'nbr3_LU3', 'nbr3_LC3', 'nbr3_LU2', 'nbr3_LC2', 'nbr3_LU1',
       'nbr3_LC1', 'nbr4_RELI', 'nbr4_EAST', 'nbr4_NORTH', 'nbr4_LU4',
       'nbr4_LC4', 'nbr4_LU3', 'nbr4_LC3', 'nbr4_LU2', 'nbr4_LC2', 'nbr4_LU1',
       'nbr4_LC1', 'nbr5_RELI', 'nbr5_EAST', 'nbr5_NORTH', 'nbr5_LU4',
       'nbr5_LC4', 'nbr5_LU3', 'nbr5_LC3', 'nbr5_LU2', 'nbr5_LC2', 'nbr5_LU1',
       'nbr5_LC1', 'nbr6_RELI', 'nbr6_EAST', 'nbr6_NORTH', 'nbr6_LU4',
       'nbr6_LC4', 'nbr6_LU3', 'nbr6_LC3', 'nbr6_LU2', 'nbr6_LC2', 'nbr6_LU1',
       'nbr6_LC1', 'nbr7_RELI', 'nbr7_EAST', 'nbr7_NORTH', 'nbr7_LU4',
       'nbr7_LC4', 'nbr7_LU3', 'nbr7_LC3', 'nbr7_LU2', 'nbr7_LC2', 'nbr7_LU1',
       'nbr7_LC1', 'nbr8_RELI', 'nbr8_EAST', 'nbr8_NORTH', 'nbr8_LU4',
       'nbr8_LC4', 'nbr8_LU3', 'nbr8_LC3', 'nbr8_LU2', 'nbr8_LC2', 'nbr8_LU1',
       'nbr8_LC1']

In [None]:
# iterate all the neighbours and but the training set
for idx, line in tqdm(train_data.iterrows()):
    # sanity check
    assert line['training'] == 1
    
    # loop to search for Eight neighbors of the tile 
    # neighbor 1
    neighbor_1 = data_all.query('RELI == {}'.format(idx + 1))
    if neighbor_1.empty:
        continue

    # neighbor 2
    neighbor_2 = data_all.query('RELI == {}'.format(idx - 1))
    if neighbor_2.empty:
        continue
    
    # neighbor 3
    neighbor_3 = data_all.query('RELI == {}'.format(idx + 10000))
    if neighbor_3.empty:
        continue
    
    # neighbor 4
    neighbor_4 = data_all.query('RELI == {}'.format(idx - 10000))
    if neighbor_4.empty:
        continue
    
    # neighbor 5
    neighbor_5 = data_all.query('RELI == {}'.format(idx + 1 + 10000))
    if neighbor_5.empty:
        continue
    
    # neighbor 6
    neighbor_6 = data_all.query('RELI == {}'.format(idx + 1 -10000))
    if neighbor_6.empty:
        continue
    
    # neighbor 7
    neighbor_7 = data_all.query('RELI == {}'.format(idx - 1 + 10000))
    if neighbor_7.empty:
        continue
    
    # neighbor 8
    neighbor_8 = data_all.query('RELI == {}'.format(idx - 1 - 10000))
    if neighbor_8.empty:
        continue
    
    new_line = [idx] + line.tolist() +neighbor_1.values.ravel().tolist() + neighbor_2.values.ravel().tolist() +\
                neighbor_3.values.ravel().tolist() + neighbor_4.values.ravel().tolist() + neighbor_5.values.ravel().tolist() +\
                neighbor_6.values.ravel().tolist() + neighbor_7.values.ravel().tolist() + neighbor_8.values.ravel().tolist()
    train_set.append(new_line)

60834it [1:27:21, 10.78it/s]

In [18]:
# create the dataframe and name the index
dataset = pd.DataFrame(train_set)
dataset.columns = cols
dataset.set_index('RELI', inplace=True)
dataset.head()

In [None]:
# save the dataset
dataset.to_csv('../data/all_data_with_neighbour.csv')

# Extract Cross & Square part

In [None]:
dataset = pd.
index_cross = dat