In [4]:
import warnings
warnings.filterwarnings('ignore')

# Import common GIS tools
import numpy as np
import xarray as xr
import pandas as pd


import matplotlib.pyplot as plt


# Feature Engineering
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Machine Learning
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, accuracy_score,classification_report,confusion_matrix


# Import Planetary Computer tools
import pystac_client
import planetary_computer as pc
import odc
from distributed import Client

import os

# Local Dask client running in threads inside this process (processes=False).
client = Client(processes=False)

# SECURITY: never commit the Planetary Computer subscription key to the notebook.
# The key is read from the PC_SDK_SUBSCRIPTION_KEY environment variable; when it
# is not set, no key is configured here. Any key that was previously committed
# should be considered leaked and be rotated.
_pc_subscription_key = os.environ.get("PC_SDK_SUBSCRIPTION_KEY")
if _pc_subscription_key:
    pc.settings.set_subscription_key(_pc_subscription_key)
from odc.stac import stac_load
import stackstac
#from odc.algo import to_rgba
from tqdm import tqdm
tqdm.pandas()

ModuleNotFoundError: No module named 'sklearn'

# Reading Raw GeoLoc Data

In [4]:
# Submission template: the locations that need a predicted class.
test_file = pd.read_csv('challenge_1_submission_template.csv')
# Only the last expression of a cell is rendered automatically, so the preview
# of the template has to be displayed explicitly or it is silently discarded.
display(test_file.head())
# Labelled training locations ("Latitude and Longitude", "Class of Land").
train_file = pd.read_csv('Crop_Location_Data.csv')
train_file.head()

Unnamed: 0,Latitude and Longitude,Class of Land
0,"(10.323727047081501, 105.2516346045924)",Rice
1,"(10.322364360592521, 105.27843410554115)",Rice
2,"(10.321455902933202, 105.25254306225168)",Rice
3,"(10.324181275911162, 105.25118037576274)",Rice
4,"(10.324635504740822, 105.27389181724476)",Rice


# Connecting to Microsoft Planetary Computer Hub

In [5]:
# Open the Planetary Computer STAC API. `pc.sign_inplace` is passed as the
# client's `modifier` -- presumably so that items returned by searches carry
# signed asset URLs (TODO confirm against the planetary_computer docs).
# `catalog` is used as a global by the feature-extraction functions below.
catalog = pystac_client.Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1",
    modifier=pc.sign_inplace,
)

# Functions for feature engineering
- These functions are responsible for retrieving the features necessary for training
- There are two satellite data sources available, Sentinel-1 and Sentinel-2, which provide radar and optical data respectively
- Different vegetation indices such as RVI and NDVI can be derived from these time-series data to act as features for training
- After much experimentation and research, the **variance** of these vegetation indices was found to provide the best features for training

In [6]:
def get_sentinel1_data(latlong,time_slice,assets):
    '''
    Returns the variance of the Radar Vegetation Index (RVI) series for a location.

    Sentinel-1 RTC scenes intersecting a small box centred on the point are
    loaded, VV and VH are averaged over the box for each acquisition, the RVI
    is computed from those averages, and the variance of the resulting RVI
    values is returned.

    Attributes:
    latlong - Either a string such as "(10.32, 105.25)" or a 2-element
              sequence (latitude, longitude)
    time_slice - Timeframe passed to the STAC search as `datetime`
    assets - Currently unused and kept only so existing callers keep working;
             the RVI formula needs both "vv" and "vh", so exactly those two
             bands are always loaded

    Returns:
    The result of calling `.var()` on the RVI series.

    Relies on the module-level `catalog` STAC client.
    '''

    box_size_deg = 0.0004 # Surrounding box in degrees

    # Accept the "(lat, lon)" string stored in the CSV files as well as an
    # already-parsed (lat, lon) sequence.
    if isinstance(latlong, str):
        parts = latlong.replace('(','').replace(')','').replace(' ','').split(',')
    else:
        parts = latlong
    lat, lon = float(parts[0]), float(parts[1])

    half_box = box_size_deg/2
    # Bounding box in (min_lon, min_lat, max_lon, max_lat) order
    bounds = (lon-half_box, lat-half_box, lon+half_box, lat+half_box)

    search = catalog.search(collections=["sentinel-1-rtc"], bbox=bounds, datetime=time_slice)
    items = list(search.get_all_items())
    res = 10  # meters per pixel
    scale = res / 111320.0 # degrees per pixel for crs=4326

    data = stac_load(items,bands=["vv", "vh"], patch_url=pc.sign, bbox=bounds, crs="EPSG:4326", resolution=scale,chunks={"x": 2048, "y": 2048})
    # Spatial mean over the box, leaving one VV / VH value per acquisition
    mean = data.mean(dim=['latitude','longitude']).compute()
    dop = (mean.vv / (mean.vv + mean.vh))
    rvi = (np.sqrt(dop))*((4*mean.vh)/(mean.vv + mean.vh))

    return rvi.var()

    

In [7]:
def get_sentinel2_data(latlong, time_slice):
    '''
    Returns the variance of the NDVI series for a location.

    Sentinel-2 L2A scenes intersecting a small box centred on the point are
    loaded, pixels flagged by the scene classification (SCL) band as unusable
    are masked out, the median red / nir value over the box is taken for each
    acquisition, NDVI is computed from those medians, and the variance of the
    resulting NDVI values is returned.

    Attributes:
    latlong - Either a string such as "(10.32, 105.25)" or a 2-element
              sequence (latitude, longitude)
    time_slice - Timeframe passed to the STAC search as `datetime`

    Returns:
    The result of calling `.var()` on the NDVI series.

    Relies on the module-level `catalog` STAC client.
    '''

    box_size_deg = 0.0009 # Surrounding box in degrees

    # Accept the "(lat, lon)" string stored in the CSV files as well as an
    # already-parsed (lat, lon) sequence.
    if isinstance(latlong, str):
        parts = latlong.replace('(','').replace(')','').replace(' ','').split(',')
    else:
        parts = latlong
    lat, lon = float(parts[0]), float(parts[1])

    half_box = box_size_deg/2
    # Bounding box in (min_lon, min_lat, max_lon, max_lat) order
    bounds = (lon-half_box, lat-half_box, lon+half_box, lat+half_box)

    search = catalog.search(collections=["sentinel-2-l2a"], bbox=bounds, datetime=time_slice)
    items = list(search.get_all_items())

    # Define the pixel resolution for the final product
    # Define the scale according to our selected crs, so we will use degrees
    res = 20  # meters per pixel
    scale = res / 111320.0 # degrees per pixel for CRS:4326

    xx = stac_load(items, bands=["red", "nir", "SCL"], crs="EPSG:4326", resolution=scale, chunks={"x": 2048, "y": 2048}, dtype="uint16", patch_url=pc.sign, bbox=bounds )

    # Apply cloud mask ... NO Clouds, NO Cloud Shadows and NO Water pixels
    # SCL classes dropped: 0 no data, 1 saturated/defective, 3 cloud shadow,
    # 6 water, 8/9 cloud (medium/high probability), 10 thin cirrus.
    # (Previously `cleaned_data` was referenced without ever being defined,
    # so the function raised NameError.)
    usable = ~xx.SCL.isin([0, 1, 3, 6, 8, 9, 10])
    # `where` turns masked pixels into NaN (float), so they are skipped by the
    # median instead of being counted as zeros.
    cleaned_data = xx[["red", "nir"]].where(usable)

    mean_clean = cleaned_data.median(dim=['longitude','latitude']).compute()
    ndvi_mean_clean = (mean_clean.nir-mean_clean.red)/(mean_clean.nir+mean_clean.red)

    return ndvi_mean_clean.var()

In [9]:
def combine_two_datasets(dataset1,dataset2):
    '''
    Returns the two datasets joined side by side (column-wise).
    Rows are aligned on the index; the columns of dataset2 follow those of dataset1.
    Attributes:
    dataset1 - Dataset whose columns come first
    dataset2 - Dataset whose columns come second
    '''
    frames = [dataset1, dataset2]
    return pd.concat(frames, axis="columns")

# Reading feature data from previously downloaded data
- I have already used the functions to retrieve the features and stored them in a Pandas Dataframe exported to CSV
- This is so that I don't need to re-download the satellite data

In [8]:
# Load the previously exported feature tables from CSV: one RVI and one NDVI
# file each for the training locations and for the submission locations.
train_rvi_data_three_crop = pd.read_csv("train_rvi_data_three_crop.csv")
submission_rvi_data_three_crop = pd.read_csv("submission_rvi_data_three_crop.csv")
train_ndvi_data_three_crop = pd.read_csv("train_ndvi_data_three_crop_updated.csv")
submission_ndvi_data_three_crop = pd.read_csv("submission_ndvi_data_three_crop_updated.csv")

# Formatting of Features into dataframes for training

In [10]:
from sklearn.model_selection import cross_val_score

# Binary target: 1 where the land class is 'Rice', 0 for every other class.
Y = pd.DataFrame(
    [1 if land_class == 'Rice' else 0 for land_class in train_file["Class of Land"]],
    columns = ['Class of Land'],
)
# Feature matrix: the RVI and NDVI feature tables placed side by side.
X = combine_two_datasets(train_rvi_data_three_crop, train_ndvi_data_three_crop)
X

Unnamed: 0,rvi,ndvi
0,0.087064,0.052746
1,0.103715,0.058047
2,0.098501,0.055300
3,0.083776,0.050447
4,0.091845,0.052428
...,...,...
595,0.017634,0.067580
596,0.016936,0.065668
597,0.015935,0.065318
598,0.011380,0.072052


# Training + Evaluation

In [11]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline

# The scaler is part of the cross-validated pipeline so that, in every fold,
# it is fit on that fold's training rows only. Fitting it on all of X up front
# leaks statistics of the held-out rows into training and overwrites X.
sc = StandardScaler()
log = LogisticRegressionCV()
scaled_log = make_pipeline(sc, log)
# scikit-learn expects a 1-D label array rather than an (n, 1) DataFrame.
print(cross_val_score(scaled_log, X, np.ravel(Y), cv = 10).mean())

0.9799999999999999


# Testing (Submission)

In [12]:
# Rebuild the binary target (1 = 'Rice', 0 = anything else) from the raw labels.
rice_flags = [int(land_class == 'Rice') for land_class in train_file["Class of Land"]]
Y = pd.DataFrame(rice_flags, columns = ['Class of Land'])
# Unscaled training features and the matching features for the submission locations.
X = combine_two_datasets(train_rvi_data_three_crop, train_ndvi_data_three_crop)
submission = combine_two_datasets(submission_rvi_data_three_crop, submission_ndvi_data_three_crop)
submission

Unnamed: 0,rvi,ndvi
0,0.109877,0.049697
1,0.062935,0.068655
2,0.083541,0.055181
3,0.030374,0.032616
4,0.094482,0.073626
...,...,...
245,0.045191,0.025755
246,0.031131,0.033803
247,0.029789,0.031764
248,0.102824,0.060775


In [13]:
from sklearn.linear_model import LogisticRegression
# Fit the scaler on the training features only and reuse it for the submission
# features so both are standardised with the same statistics.
sc = StandardScaler()
X = sc.fit_transform(X)
submission = sc.transform(submission)
log = LogisticRegression()
# Pass the labels as a 1-D array; an (n, 1) DataFrame makes scikit-learn emit a
# DataConversionWarning and reshape it implicitly.
model = log.fit(X, np.ravel(Y))

In [14]:
# Predict a class for each row of the scaled submission features and wrap the
# predictions in a Series.
final_predictions = model.predict(submission)
final_prediction_series = pd.Series(final_predictions)

In [15]:
# Translate the numeric predictions (1 / 0) into label strings before building
# the frame. Writing strings into an integer column with `.loc` relies on
# implicit upcasting, which pandas has deprecated.
target_labels = final_prediction_series.map({1: 'Rice', 0: 'Non Rice'})
submission_df = pd.DataFrame({'id':test_file['id'].values, 'target':target_labels.values})
# Fraction of submission rows predicted as 'Rice'
(submission_df['target'] == 'Rice').mean()

0.4

In [16]:
# Render the submission table (ids with their predicted labels) for a visual check.
display(submission_df)

Unnamed: 0,id,target
0,"(10.18019073690894, 105.32022315786804)",Rice
1,"(10.561107033461816, 105.12772097986661)",Rice
2,"(10.623790611954897, 105.13771401411867)",Rice
3,"(10.583364246115156, 105.23946127195805)",Non Rice
4,"(10.20744446668854, 105.26844107128906)",Rice
...,...,...
245,"(10.308283266873062, 105.50872812216863)",Non Rice
246,"(10.582910017285496, 105.23991550078767)",Non Rice
247,"(10.581547330796518, 105.23991550078767)",Non Rice
248,"(10.629241357910818, 105.15315779432643)",Rice


In [25]:
# Write the submission to CSV; index=False leaves the row index out of the file.
submission_df.to_csv("submission_log_rvi_ndvi(unfiltered)_3crop.csv",index = False)