> In this notebook, wheat data is undersampled so that the wheat:mustard ratio is approximately 70:30. Both the undersampling and the train/validation split are stratified, so that the proportions of the crop_name, sowing-period and harvest-period combinations stay constant across the datasets.

# Import Modules and Data

In [9]:
from glob import glob
import geopandas as gp
import numpy as np
import pandas as pd
from copy import deepcopy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from IPython.display import display
import random
import os

# Show up to 100 rows when a DataFrame is displayed.
pd.options.display.max_rows = 100

import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# pandas/sklearn warnings that may point at real problems.
warnings.filterwarnings("ignore")

# Move the working directory one level up exactly once per kernel session
# (presumably so the relative `data_files/...` paths used below resolve).
# `kernel_is_loaded` acts as the "already done" flag, so re-running this cell
# does not keep climbing the directory tree.
if 'kernel_is_loaded' not in globals():
    # os.path.dirname handles the platform's path separator and a top-level
    # working directory, unlike splitting the path on '/'.
    os.chdir(os.path.dirname(os.getcwd()))
    kernel_is_loaded = True

In [10]:
# Columns that are not used as features for now; removed from every table.
unused_columns = ['latitude', 'longitude', 'state_name', 'district', 'taluka_name', 'sowing_year', 'harvest_year']

# Train/validation pool for each crop.
wheat_train_val = pd.read_csv(r'data_files/data_share/preprocessed_wheat.csv').drop(columns=unused_columns)
mustard_train_val = pd.read_csv(r'data_files/data_share/preprocessed_mustard.csv').drop(columns=unused_columns)

# Separate ("new") files that are later combined into the test set.
wheat_test = pd.read_csv(r'data_files/data_share/preprocessed_new_wheat.csv').drop(columns=unused_columns)
mustard_test = pd.read_csv(r'data_files/data_share/preprocessed_new_mustard.csv').drop(columns=unused_columns)

# Undersampling - (70:30)

In [11]:
# Count rows per sowing-period x harvest-period combination (with row/column
# totals), first for wheat and then for mustard, to spot combinations that
# occur only once.
for crop_frame in (wheat_train_val, mustard_train_val):
    display(pd.crosstab(crop_frame['sowing_period'], crop_frame['harvest_period'], margins=True))

harvest_period,feb_1f,feb_2f,mar_1f,mar_2f,All
sowing_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dec_1f,0,0,1987,3103,5090
dec_2f,0,0,19,169,188
nov_1f,108,1402,1925,181,3616
nov_2f,8,1056,4435,1750,7249
oct_2f,55,67,28,14,164
All,171,2525,8394,5217,16307


harvest_period,feb_1f,feb_2f,jan_2f,mar_1f,mar_2f,All
sowing_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
nov_1f,28,288,0,376,18,710
nov_2f,1,2,0,13,1,17
oct_2f,146,319,10,60,4,539
All,175,609,10,449,23,1266


In [12]:
# Drop sowing/harvest combinations that occur only once.
#
# A stratified split needs at least two rows in every stratum, so single-row
# combinations have to go. Instead of hard-coding the combinations read off the
# tables above, keep only the rows whose (sowing_period, harvest_period) pair
# appears more than once; this stays correct if the input data changes.
period_keys = ['sowing_period', 'harvest_period']

# duplicated(keep=False) is True for every row whose key combination occurs at
# least twice, i.e. False exactly for the single-row combinations.
wheat_train_val = wheat_train_val[wheat_train_val.duplicated(subset=period_keys, keep=False)]
mustard_train_val = mustard_train_val[mustard_train_val.duplicated(subset=period_keys, keep=False)]


In [13]:
# Re-tabulate the sowing-period x harvest-period counts after the filtering
# step to confirm which combinations were removed.
wheat_counts = pd.crosstab(wheat_train_val['sowing_period'], wheat_train_val['harvest_period'], margins=True)
mustard_counts = pd.crosstab(mustard_train_val['sowing_period'], mustard_train_val['harvest_period'], margins=True)

display(wheat_counts, mustard_counts)

harvest_period,feb_1f,feb_2f,mar_1f,mar_2f,All
sowing_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dec_1f,0,0,1987,3103,5090
dec_2f,0,0,19,169,188
nov_1f,108,1402,1925,181,3616
nov_2f,8,1056,4435,1750,7249
oct_2f,55,67,28,14,164
All,171,2525,8394,5217,16307


harvest_period,feb_1f,feb_2f,jan_2f,mar_1f,mar_2f,All
sowing_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
nov_1f,28,288,0,376,18,710
nov_2f,0,2,0,13,0,15
oct_2f,146,319,10,60,4,539
All,174,609,10,449,22,1264


In [14]:
# Number of wheat rows to keep for a 70:30 wheat:mustard ratio. int() truncates,
# so the achieved ratio is at or just below 70:30.
target_wheat = int(mustard_train_val.shape[0] * 0.7/0.3)

# Fraction of the wheat rows that is kept (informational).
split_per = target_wheat/wheat_train_val.shape[0]

if target_wheat < wheat_train_val.shape[0]:
    # Stratified undersampling: the "test" part of the split is the sample we
    # keep. Passing the absolute row count (an int) instead of a fraction
    # avoids sklearn's internal float rounding, so exactly `target_wheat` rows
    # are selected.
    _, wheat_train_val = train_test_split(wheat_train_val, test_size=target_wheat,
                                stratify=wheat_train_val[['sowing_period', 'harvest_period']], random_state=0)
# Otherwise wheat is already at or below the target size and is kept as is;
# this also makes re-running the cell harmless instead of raising.

# Achieved ratio next to the target ratio.
display(wheat_train_val.shape[0]/mustard_train_val.shape[0], 0.7/0.3)

# Creating train_val and test data sets
train_val = pd.concat([wheat_train_val, mustard_train_val], ignore_index=True)
test = pd.concat([wheat_test, mustard_test], ignore_index=True)

2.3330696202531644

2.3333333333333335

# Train_val Split

In [15]:
# Sowing-period x harvest-period counts (with totals) for the combined
# wheat + mustard pool, to spot combinations left with a single row.
period_counts = pd.crosstab(train_val['sowing_period'], train_val['harvest_period'], margins=True)
display(period_counts)

harvest_period,feb_1f,feb_2f,jan_2f,mar_1f,mar_2f,All
sowing_period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dec_1f,0,0,0,359,561,920
dec_2f,0,0,0,3,31,34
nov_1f,48,542,0,724,51,1365
nov_2f,1,193,0,815,316,1325
oct_2f,156,331,10,65,7,569
All,205,1066,10,1966,966,4213


In [16]:
# Remove strata that are left with a single row after undersampling: a
# stratified split needs at least two rows per stratum. The check uses the same
# three columns that the train/validation split stratifies on, rather than a
# hard-coded combination that depends on the data and the random seed.
# duplicated(keep=False) is False exactly for rows whose combination is unique.
train_val = train_val[train_val.duplicated(subset=['crop_name', 'sowing_period', 'harvest_period'], keep=False)]

In [17]:
# 80/20 train/validation split, stratified on crop and sowing/harvest period so
# both parts keep the same mix of combinations.
strata = train_val[['crop_name', 'sowing_period', 'harvest_period']]
train, val = train_test_split(train_val, test_size=0.2, stratify=strata, random_state=0)

# Sizes of both parts, followed by their class proportions.
train_class_share = train.crop_name.value_counts(normalize=True)
val_class_share = val.crop_name.value_counts(normalize=True)
display(train.shape, val.shape, train_class_share, val_class_share)

(3369, 15)

(843, 15)

crop_name
Wheat      0.700208
Mustard    0.299792
Name: proportion, dtype: float64

crop_name
Wheat      0.698695
Mustard    0.301305
Name: proportion, dtype: float64

# Label Encoding

In [18]:
def encode_labels(frame):
    """Return a copy of `frame` with a binary label and without the period columns.

    `crop_name` becomes 1 for 'Wheat' and 0 for every other value, and the
    `sowing_period` / `harvest_period` columns are dropped. The input frame is
    left untouched.

    Raises:
        KeyError: if the period columns are missing (e.g. the frame has already
            been encoded), so an accidental second pass fails loudly instead of
            silently overwriting the labels.
    """
    return (
        frame
        .assign(crop_name=(frame['crop_name'] == 'Wheat').astype('int64'))
        .drop(columns=['sowing_period', 'harvest_period'])
    )


# All three frames are encoded before any name is rebound, so a failure leaves
# train/val/test exactly as they were.
train, val, test = (encode_labels(frame) for frame in (train, val, test))

train.head(3)

Unnamed: 0,oct_2f,nov_1f,nov_2f,dec_1f,dec_2f,jan_1f,jan_2f,feb_1f,feb_2f,mar_1f,mar_2f,apr_1f,crop_name
1904,142,125,130,139,175,182.0,186,188,188,184,155.0,122,1
1782,153,127,117,133,161,176.0,180,188,192,185,137.0,115,1
3543,134,151,180,188,187,185.0,195,175,137,120,118.0,114,0


# File Export

In [19]:
# Write each split to its CSV file, without the index column.
exports = [
    (train, r'data_files/data_share/train-4.csv'),
    (test, r'data_files/data_share/test-4.csv'),
    (val, r'data_files/data_share/val-4.csv'),
]
for split_frame, csv_path in exports:
    split_frame.to_csv(csv_path, index=False)

In [20]:
# Per-class row counts of crop_name for the train, validation and test sets.
tuple(split.crop_name.value_counts() for split in (train, val, test))

(crop_name
 1    2359
 0    1010
 Name: count, dtype: int64,
 crop_name
 1    589
 0    254
 Name: count, dtype: int64,
 crop_name
 1    5760
 0     708
 Name: count, dtype: int64)

In [21]:
# Number of rows in the train, test and validation sets (in that order).
tuple(len(split) for split in (train, test, val))

(3369, 6468, 843)