In [None]:
import sys
!conda install -c conda-forge --yes --prefix {sys.prefix} tsfresh

In [None]:
!pip install soiltexture

In [12]:
from glob import glob
import pandas as pd
import geopandas as gpd
import numpy as np
import soiltexture

In [44]:
filepath_ts = r'C:/Users/USER/Desktop/Master_Irrigation/03_GIS/soil_classification/download_ts/grassland/'
files = glob(filepath_ts + '*')

In [45]:
# Load all files into one Dataframe
gdf = pd.concat([gpd.read_file(x, ignore_index=True) for x in files])

# Set datetime type
gdf['date'] = gdf['date'].astype('datetime64[ns]')

In [46]:
# Load LUCAS TOPSOIL DATABASE
df_lts = pd.read_csv(r'C:/Users/USER/Desktop/Master_Irrigation/03_GIS/soil_classification/LTS_grassland.csv').drop(['Unnamed: 0', 'geometry'], axis=1)
gdf_lts = gpd.GeoDataFrame(df_lts, geometry=gpd.points_from_xy(df_lts.GPS_LONG, df_lts.GPS_LAT))

# Join LTS data to S1, S2, Era5 Data
gdf = pd.merge(gdf, gdf_lts, on='POINT_ID', how='left', suffixes=('', '_y')).rename({'geometry_y' : 'geometry'})
print(len(gdf))

268300


In [47]:
# Mask rows without particel size distribution
gdf = gdf[(gdf.sand > 0) & (gdf.silt > 0) & (gdf.clay > 0)]
print(len(gdf))

257511


In [48]:
# Create USDA classification
USDA_class = list()
FAO_class = list()
INTERNATIONAL_class = list()
ISSS_class = list()
for index, row in gdf.iterrows():
    USDA_class.append(soiltexture.getTexture(row.sand, row.clay, classification='USDA'))
    FAO_class.append(soiltexture.getTexture(row.sand, row.clay, classification='FAO'))
    INTERNATIONAL_class.append(soiltexture.getTexture(row.sand, row.clay, classification='INTERNATIONAL'))
    ISSS_class.append(soiltexture.getTexture(row.sand, row.clay, classification='ISSS'))
                      
gdf['USDA'] = USDA_class
gdf['FAO'] = FAO_class
gdf['INTERNATIONAL'] = INTERNATIONAL_class
gdf['ISSS'] = ISSS_class
print(len(gdf))

257511


In [49]:
to_numerical = {'FAO':     {'medium': int(0), 'coarse': int(1), 'fine' : int(2)}}
gdf['FAO_nr'] = gdf.replace(to_numerical)['FAO']
gdf['FAO_nr'] = gdf['FAO_nr']
print(len(gdf))

257511


In [56]:
# Subset Data
#gdf_subset_1 = gdf.loc[:,['POINT_ID', 'VV','VH', 'orbit', 'platform', 'NDVI','day_of_year', 'FAO_nr', 
#                          'dewpoint_2m_temperature', 'maximum_2m_air_temperature', 'mean_2m_air_temperature',
#                          'minimum_2m_air_temperature', 'surface_pressure', 'total_precipitation', 
#                          'u_component_of_wind_10m', 'v_component_of_wind_10m']]

# Drop rows (axis=0) with na values
gdf.dropna(subset=['FAO_nr'], axis=0, inplace=True)

# One Hot Encoding
gdf['FAO_nr'] = gdf['FAO_nr'].astype('int64')
#df_sub = pd.DataFrame(pd.get_dummies(gdf_subset_1, columns=['platform', 'orbit', 'FAO_nr']))
print(len(gdf))

253790


In [36]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters

In [57]:
gdf_fe = gdf.loc[:,['POINT_ID', 'date','VV', 'VH', 'angle','Aerosols', 'Blue', 'Green', 'Red', 'RedEdge1', 'RedEdge2', 'RedEdge3',
       'RedEdge4', 'NIR', 'WaterVapor', 'Cirrus', 'SWIR1', 'SWIR2','dewpoint_2m_temperature',
       'maximum_2m_air_temperature', 'mean_2m_air_temperature',
       'minimum_2m_air_temperature', 'surface_pressure', 'total_precipitation',
       'u_component_of_wind_10m', 'v_component_of_wind_10m', 'NDVI']]

In [58]:
gdf_fe.dropna(inplace=True)

In [59]:
gdf_fe = gdf_fe[gdf_fe['date'].dt.year.isin([2018])]

In [None]:
X_filtered_2 = extract_relevant_features(df, y, column_id='id', column_sort='time',
                                         default_fc_parameters=extraction_settings)

In [60]:
extraction_settings = ComprehensiveFCParameters()

X = extract_features(gdf_fe, column_id='POINT_ID', column_sort='date',
                     default_fc_parameters=extraction_settings,
                     # we impute = remove all NaN features automatically
                     impute_function=impute)

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 10/10 [06:29<00:00, 38.90s/it]


In [63]:
y = gdf.groupby('POINT_ID')['FAO_nr'].first()
y

POINT_ID
50243144    1
50243176    1
50243222    1
50243402    0
50243682    0
           ..
53983768    0
54121890    0
54221858    0
54782100    0
54862104    0
Name: FAO_nr, Length: 318, dtype: int64

In [65]:
X_filtered = select_features(X, y)

In [67]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [68]:
X_full_train, X_full_test, y_train, y_test = train_test_split(X, y, test_size=.4)
X_filtered_train, X_filtered_test = X_full_train[X_filtered.columns], X_full_test[X_filtered.columns]

In [69]:
classifier_full = DecisionTreeClassifier()
classifier_full.fit(X_full_train, y_train)
print(classification_report(y_test, classifier_full.predict(X_full_test)))

              precision    recall  f1-score   support

           0       0.66      0.57      0.61        80
           1       0.48      0.48      0.48        44
           2       0.07      0.25      0.11         4

    accuracy                           0.53       128
   macro avg       0.40      0.43      0.40       128
weighted avg       0.58      0.53      0.55       128



In [70]:
classifier_filtered = DecisionTreeClassifier()
classifier_filtered.fit(X_filtered_train, y_train)
print(classification_report(y_test, classifier_filtered.predict(X_filtered_test)))

              precision    recall  f1-score   support

           0       0.70      0.57      0.63        80
           1       0.53      0.55      0.54        44
           2       0.00      0.00      0.00         4

    accuracy                           0.55       128
   macro avg       0.41      0.37      0.39       128
weighted avg       0.62      0.55      0.58       128



https://github.com/blue-yonder/tsfresh/blob/main/notebooks/examples/01%20Feature%20Extraction%20and%20Selection.ipynb