In [29]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [3]:
plt.rc('xtick', labelsize=6)
plt.rc('ytick', labelsize=7)

In [10]:
def LookupTable(value,type='class'):
    try:
        classes = {
            'A00':[0,0],'A10':[0,1],'A11':[0,1],'A12':[0,1],'A13':[0,1],'A20':[0,2],
            'A21':[0,3],'A22':[0,1],'A30':[0,1],'B00':[3,0],'B10':[3,1],'B11':[3,1],
            'B12':[3,1],'B13':[3,1],'B14':[3,1],'B15':[3,1],'B16':[3,1],'B17':[3,1],
            'B18':[3,1],'B19':[3,1],'B20':[3,1],'B21':[3,1],'B22':[3,1],'B23':[3,1],
            'B30':[3,1],'B31':[3,1],'B32':[3,1],'B33':[3,1],'B34':[3,1],'B35':[3,1],
            'B36':[3,1],'B37':[3,1],'B40':[3,1],'B41':[3,1],'B42':[3,1],'B43':[3,1],
            'B44':[3,1],'B45':[3,1],'B50':[2,1],'B51':[2,1],'B52':[2,1],'B53':[2,1],
            'B54':[3,1],'B55':[2,1],'B70':[3,1],'B71':[3,1],'B72':[3,1],'B73':[3,1],
            'B74':[3,1],'B75':[3,1],'B76':[3,1],'B77':[3,1],'B80':[3,2],'B81':[3,1],
            'B82':[3,1],'B83':[0,1],'B84':[3,3],'C00':[0,0],'C10':[0,1],'C20':[0,1],
            'C21':[0,1],'C22':[0,1],'C23':[0,1],'C30':[0,1],'C31':[0,1],'C32':[0,1],
            'C33':[0,1],'D00':[1,0],'D10':[1,2],'D20':[1,2],'E00':[1,0],'E10':[1,2],
            'E20':[1,2],'E30':[0,1],'F00':[0,0],'F10':[0,1],'F20':[0,1],'F30':[0,3],
            'F40':[0,1],'G10':[0,1],'G11':[0,1],'G12':[0,1],'G20':[0,1],'G21':[0,1],
            'G22':[0,1],'G30':[0,1],'G50':[0,1],'H00':[1,0],'H10':[1,1],'H11':[1,1],
            'H12':[1,1],'H20':[0,2],'H21':[1,1],'H22':[0,1],'H23':[0,1]
        }
        mainClass = {
            '0':'Other',
            '1':'Natural and Semi-natural grass',
            '2':'Seeded grass',
            '3':'Crops and other related agricultural practices',
       }
        if type == 'class':
            result = mainClass[str(classes[value][0])]
        elif type == 'weight':
            result = classes[value][1]
    except:
        if type == 'class':
            result = 'Not classified'
        elif type == 'weight':
            result = -1
        
    return result

### Get data from LUCAS LULC data
#### Landuse: meaning the socioeconomic use of land (for instance, agriculture, forestry, recreation or residential use).
#### Landcover: for instance crops, grass, broad-leaved forest, or built-up are

In [11]:
originalData = '../datasets/03_LUCAS/lucas_harmo_uf.csv'
df = pd.read_csv(originalData,sep=',',low_memory=False)
df

Unnamed: 0,id,point_id,year,nuts0,nuts1,nuts2,nuts3,th_lat,th_long,office_pi,...,photo_east,photo_west,transect,revisit,th_gps_dist,file_path_gisco_north,file_path_gisco_south,file_path_gisco_east,file_path_gisco_west,file_path_gisco_point
0,1252839,26381958,2015,PT,PT1,PT17,,38.757771,-9.476111,,...,Photo taken,Photo taken,D20;61;51;61;C22;21;61,3,104.912224,https://gisco-services.ec.europa.eu/lucas/phot...,https://gisco-services.ec.europa.eu/lucas/phot...,https://gisco-services.ec.europa.eu/lucas/phot...,https://gisco-services.ec.europa.eu/lucas/phot...,https://gisco-services.ec.europa.eu/lucas/phot...
1,24355,26421958,2015,PT,PT1,PT17,,38.766957,-9.431407,1.0,...,Not relevant,Not relevant,PI;D20;61;D20;PI,1,,,,,,
2,42407,26441958,2015,PT,PT1,PT17,,38.771543,-9.409051,1.0,...,Not relevant,Not relevant,PI;C10;61;C10;61;C10;61;C10;61;C10;PI,2,,,,,,
3,1257765,26461768,2015,PT,PT1,PT15,,37.112502,-8.906504,,...,Photo taken,Photo taken,D10;61;D20,4,0.384871,https://gisco-services.ec.europa.eu/lucas/phot...,https://gisco-services.ec.europa.eu/lucas/phot...,https://gisco-services.ec.europa.eu/lucas/phot...,https://gisco-services.ec.europa.eu/lucas/phot...,https://gisco-services.ec.europa.eu/lucas/phot...
4,1249169,26461958,2015,PT,PT1,PT17,,38.776123,-9.386692,,...,Photo taken,Photo taken,PI;C10;PI;C10;61;C10;PI;C10;PI,1,95.260485,https://gisco-services.ec.europa.eu/lucas/phot...,https://gisco-services.ec.europa.eu/lucas/phot...,https://gisco-services.ec.europa.eu/lucas/phot...,https://gisco-services.ec.europa.eu/lucas/phot...,https://gisco-services.ec.europa.eu/lucas/phot...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1351288,207428,45883932,2018,SE,SE2,SE23,SE232,58.412369,14.563829,1.0,...,Photo not taken,Photo not taken,,2,,,,,,
1351289,224760,31822336,2018,ES,ES1,ES13,ES130,43.168171,-4.060611,1.0,...,Photo not taken,Photo not taken,,1,,,,,,
1351290,242545,34041762,2018,ES,ES5,ES52,ES521,38.374810,-0.459298,1.0,...,Photo not taken,Photo not taken,,1,,,,,,
1351291,192650,49323404,2018,PL,PL6,PL61,PL616,53.394034,19.212378,1.0,...,Photo not taken,Photo not taken,,1,,,,,,


In [12]:
(round((df.isnull().sum()/df.shape[0])*100,2)).head(50)

id                       0.00
point_id                 0.00
year                     0.00
nuts0                    0.12
nuts1                    0.12
nuts2                    0.12
nuts3                   75.00
th_lat                   0.00
th_long                  0.00
office_pi               70.06
ex_ante                 75.00
survey_date              0.00
car_latitude            75.00
car_ew                  75.00
car_longitude           75.00
gps_proj                 0.00
gps_prec                 0.00
gps_altitude            12.46
gps_lat                  0.00
gps_ew                   0.00
gps_long                 0.00
obs_dist                 7.38
obs_direct               0.00
obs_type                 0.00
obs_radius              25.01
letter_group             0.00
lc1                      0.00
lc1_label                0.00
lc1_spec                12.46
lc1_spec_label          12.46
lc1_perc                12.46
lc2                      0.00
lc2_label               11.37
lc2_spec  

In [13]:
df.duplicated().sum()

0

In [14]:
len(df.lc1.unique())

81

In [15]:
df['gpw_lulc_class'] = df['lc1'].apply(lambda x: LookupTable(x,'class'))
df['sample_weight'] = df['lc1'].apply(lambda x: LookupTable(x,'weight'))

In [16]:
df[df.gpw_lulc_class == 'Not classified']['lc1'].unique()

array(['BX1', 'BX2', '8'], dtype=object)

In [17]:
df.gpw_lulc_class.value_counts()

gpw_lulc_class
Other                                             600920
Natural and Semi-natural grass                    365691
Crops and other related agricultural practices    344936
Seeded grass                                       25237
Not classified                                     14509
Name: count, dtype: int64

In [23]:
newdf = df[['th_long','th_lat','year','lc1_label','gpw_lulc_class','sample_weight']].copy()
newdf.rename(columns={
    'th_long':'longitude',
    'th_lat':'latitude',
    'year':'reference_year',
    'lc1_label':'original_lulc_class'},inplace=True)
newdf['dataset_name'] = 'Lucas'
newdf['observation'] = ''
newdf

Unnamed: 0,longitude,latitude,reference_year,original_lulc_class,gpw_lulc_class,sample_weight,dataset_name,observation
0,-9.476111,38.757771,2015,Shrubland without tree cover,Natural and Semi-natural grass,2,Lucas,
1,-9.431407,38.766957,2015,Shrubland without tree cover,Natural and Semi-natural grass,2,Lucas,
2,-9.409051,38.771543,2015,Broadleaved woodland,Other,1,Lucas,
3,-8.906504,37.112502,2015,Shrubland with sparse tree cover,Natural and Semi-natural grass,2,Lucas,
4,-9.386692,38.776123,2015,Broadleaved woodland,Other,1,Lucas,
...,...,...,...,...,...,...,...,...
1351288,14.563829,58.412369,2018,Inland fresh water bodies,Other,1,Lucas,
1351289,-4.060611,43.168171,2018,Non built-up linear features,Other,1,Lucas,
1351290,-0.459298,38.374810,2018,Non built-up linear features,Other,1,Lucas,
1351291,19.212378,53.394034,2018,Inland fresh water bodies,Other,1,Lucas,


In [24]:
newdf.describe()

Unnamed: 0,longitude,latitude,reference_year,sample_weight
count,1351293.0,1351293.0,1351293.0,1351293.0
mean,132.6705,244.7568,2012.987,1.251475
std,18206.95,28665.22,4.042502,0.542407
min,-10.55242,34.57025,2006.0,-1.0
25%,0.4294553,43.03737,2009.0,1.0
50%,11.24675,48.12544,2015.0,1.0
75%,18.86955,52.91932,2018.0,2.0
max,3514000.0,4932000.0,2018.0,3.0


In [30]:
newdf['observation'] = np.where(newdf.longitude >=1.5e+06,'Outliers',newdf['observation'])
newdf

Unnamed: 0,longitude,latitude,reference_year,original_lulc_class,gpw_lulc_class,sample_weight,dataset_name,observation
0,-9.476111,38.757771,2015,Shrubland without tree cover,Natural and Semi-natural grass,2,Lucas,
1,-9.431407,38.766957,2015,Shrubland without tree cover,Natural and Semi-natural grass,2,Lucas,
2,-9.409051,38.771543,2015,Broadleaved woodland,Other,1,Lucas,
3,-8.906504,37.112502,2015,Shrubland with sparse tree cover,Natural and Semi-natural grass,2,Lucas,
4,-9.386692,38.776123,2015,Broadleaved woodland,Other,1,Lucas,
...,...,...,...,...,...,...,...,...
1351288,14.563829,58.412369,2018,Inland fresh water bodies,Other,1,Lucas,
1351289,-4.060611,43.168171,2018,Non built-up linear features,Other,1,Lucas,
1351290,-0.459298,38.374810,2018,Non built-up linear features,Other,1,Lucas,
1351291,19.212378,53.394034,2018,Inland fresh water bodies,Other,1,Lucas,


In [32]:
newdf[newdf['observation'] != 'Outliers']

Unnamed: 0,longitude,latitude,reference_year,original_lulc_class,gpw_lulc_class,sample_weight,dataset_name,observation
0,-9.476111,38.757771,2015,Shrubland without tree cover,Natural and Semi-natural grass,2,Lucas,
1,-9.431407,38.766957,2015,Shrubland without tree cover,Natural and Semi-natural grass,2,Lucas,
2,-9.409051,38.771543,2015,Broadleaved woodland,Other,1,Lucas,
3,-8.906504,37.112502,2015,Shrubland with sparse tree cover,Natural and Semi-natural grass,2,Lucas,
4,-9.386692,38.776123,2015,Broadleaved woodland,Other,1,Lucas,
...,...,...,...,...,...,...,...,...
1351288,14.563829,58.412369,2018,Inland fresh water bodies,Other,1,Lucas,
1351289,-4.060611,43.168171,2018,Non built-up linear features,Other,1,Lucas,
1351290,-0.459298,38.374810,2018,Non built-up linear features,Other,1,Lucas,
1351291,19.212378,53.394034,2018,Inland fresh water bodies,Other,1,Lucas,


In [None]:
crs = "EPSG:4326"
newdf = gpd.GeoDataFrame(newdf, geometry=gpd.points_from_xy(newdf.longitude, newdf.latitude),crs=crs)
newdf.plot()

In [None]:
newdf.iloc[:,2:].head(5)

In [21]:
srcout = 
newdf.iloc[:,2:].to_parquet(srcout)

NameError: name 'srcout' is not defined