In [1]:
# Importing the required modules

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [3]:
# Importing the dataset as a dataframe

In [4]:
# Read the raw table (soil measurements plus ndvi/arvi/evi/gci/ndwi columns)
# from 'sentinel.csv' in the current working directory.
soil_df = pd.read_csv(filepath_or_buffer='sentinel.csv')

In [5]:
# Preview the first five rows to sanity-check what was loaded.
soil_df.head(n=5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,District,Mandal,Village,Latitude,Longitude,SoilType,pH,EC,...,B,Fe,Cu,Mn,Crop,ndvi,arvi,evi,gci,ndwi
0,0,0,Anantapur,Penukonda,Gonipeta,14.08,77.69,Mixed soil,6.19,0.07,...,0.17,8.89,0.51,15.24,G.Nut,0.312311,0.090286,0.53508,1.609966,-0.445978
1,1,1,Anantapur,Penukonda,Gonipeta,14.09,77.69,Redsoil,8.4,0.33,...,0.57,3.24,0.44,6.9,G.Nut,0.127815,-0.068606,0.187032,0.845618,-0.297165
2,2,2,Anantapur,Penukonda,Gonipeta,14.09,77.69,Mixed soil,7.1,0.11,...,0.19,5.54,0.42,8.34,G.Nut,0.127815,-0.068606,0.187032,0.845618,-0.297165
3,3,3,Anantapur,Penukonda,Gonipeta,14.1,77.7,Sandi soil,8.3,0.21,...,0.21,1.79,0.67,4.17,Horsegram,0.140598,-0.065553,0.1982,0.984709,-0.329918
4,4,4,Anantapur,Penukonda,Gonipeta,14.09,77.69,Mixed soil,6.4,0.06,...,0.22,22.26,0.45,9.2,Horsegram,0.127815,-0.068606,0.187032,0.845618,-0.297165


In [6]:
# Datatype of each column

In [7]:
# Show the dtype pandas inferred for every column at load time.
# OC, S and B come back as `object` here; they are coerced to numeric in a
# later cell.
soil_df.dtypes

Unnamed: 0        int64
Unnamed: 0.1      int64
District         object
Mandal           object
Village          object
Latitude        float64
Longitude       float64
SoilType         object
pH              float64
EC              float64
OC               object
P               float64
K                 int64
Ca                int64
Mg                int64
S                object
Zn              float64
B                object
Fe              float64
Cu              float64
Mn              float64
Crop             object
ndvi            float64
arvi            float64
evi             float64
gci             float64
ndwi            float64
dtype: object

In [8]:
# Column labels exactly as read from the CSV, before any drop / rename /
# reorder step is applied.
soil_df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'District', 'Mandal', 'Village',
       'Latitude', 'Longitude', 'SoilType', 'pH', 'EC', 'OC', 'P', 'K', 'Ca',
       'Mg', 'S', 'Zn', 'B', 'Fe', 'Cu', 'Mn', 'Crop', 'ndvi', 'arvi', 'evi',
       'gci', 'ndwi'],
      dtype='object')

In [9]:
# Dropping the leftover 'Unnamed' integer columns, which are not soil features

In [11]:
# 'Unnamed: 0' and 'Unnamed: 0.1' are int64 columns that came in with the CSV;
# they are removed before any further processing.
# NOTE(review): they look like row indexes saved by earlier to_csv calls - confirm.
columns_to_be_dropped = ['Unnamed: 0', 'Unnamed: 0.1']
soil_df = soil_df.drop(columns=columns_to_be_dropped)

In [12]:
# Renaming some of the columns

In [13]:
# Upper-case the five index columns (ndvi, arvi, evi, gci, ndwi).
# The result is rebound to `soil_df` instead of using inplace=True so that this
# cell follows the same `soil_df = soil_df.<op>(...)` pattern as the drop,
# reorder and dropna cells; rename() leaves every other column untouched.
soil_df = soil_df.rename(
    columns={
        'ndvi': 'NDVI',
        'arvi': 'ARVI',
        'evi': 'EVI',
        'gci': 'GCI',
        'ndwi': 'NDWI',
    }
)

In [14]:
# Rearranging the columns

In [15]:
# Reorder the columns: coordinates and place names first, then the soil
# measurements, then the upper-cased index columns, with 'Crop' moved to the
# very end.
soil_df = soil_df[[
    # coordinates and administrative location
    'Latitude', 'Longitude', 'District', 'Mandal', 'Village',
    # soil type and measured soil properties
    'SoilType', 'pH', 'EC', 'OC',
    'P', 'K', 'Ca', 'Mg', 'S',
    'Zn', 'B', 'Fe', 'Cu', 'Mn',
    # index columns renamed in the previous step
    'NDVI', 'ARVI', 'EVI', 'GCI', 'NDWI',
    # crop label, kept last
    'Crop',
]]

In [16]:
# Modifying the datatypes of certain columns

In [17]:
# K, Ca and Mg were read as int64; cast them to float so that every
# measurement column shares one dtype.
for int_column in ('K', 'Ca', 'Mg'):
    soil_df[int_column] = soil_df[int_column].astype('float')

# S, B and OC were read as object. errors='coerce' turns any value that cannot
# be parsed as a number into NaN instead of raising.
for object_column in ('S', 'B', 'OC'):
    soil_df[object_column] = pd.to_numeric(soil_df[object_column], errors='coerce')

In [18]:
# Dropping rows with missing values

In [19]:
# Discard every row that has at least one missing value. This also removes the
# rows whose S, B or OC entry could not be parsed and was coerced to NaN.
soil_df = soil_df.dropna(axis=0, how='any')

In [20]:
# Columns of the dataframe after preprocessing

In [21]:
# Re-check the dtypes after cleaning: every measurement column should now be
# float64, with only District, Mandal, Village, SoilType and Crop left as
# object.
soil_df.dtypes

Latitude     float64
Longitude    float64
District      object
Mandal        object
Village       object
SoilType      object
pH           float64
EC           float64
OC           float64
P            float64
K            float64
Ca           float64
Mg           float64
S            float64
Zn           float64
B            float64
Fe           float64
Cu           float64
Mn           float64
NDVI         float64
ARVI         float64
EVI          float64
GCI          float64
NDWI         float64
Crop          object
dtype: object

In [22]:
# Descriptive statistics of the dataframe

In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the cleaned frame.
# NOTE(review): EVI has a max of ~500.9 against a 75th percentile of ~1.74,
# which looks like outliers - confirm whether these rows should be kept.
soil_df.describe()

Unnamed: 0,Latitude,Longitude,pH,EC,OC,P,K,Ca,Mg,S,Zn,B,Fe,Cu,Mn,NDVI,ARVI,EVI,GCI,NDWI
count,4564.0,4564.0,4564.0,4564.0,4564.0,4564.0,4564.0,4564.0,4564.0,4564.0,4564.0,4564.0,4564.0,4564.0,4564.0,4564.0,4564.0,4564.0,4564.0,4564.0
mean,15.945184,80.530859,7.49718,0.504709,0.480874,16.822927,161.499343,2458.232033,535.847064,49.849908,1.126378,1.195449,29.038473,2.604652,18.791507,0.413328,0.233421,1.435249,1.910122,-0.440457
std,1.740843,2.031554,1.043035,1.067671,0.257423,18.610285,138.101414,2164.436861,479.390388,118.737146,1.399054,1.016308,39.265997,3.118959,19.139737,0.215426,0.209352,7.65898,1.230773,0.174658
min,12.81,75.28,4.15,0.01,0.01,0.45,12.0,46.0,4.0,0.1,0.03,0.03,0.04,0.03,0.01,-0.990654,-0.50911,-15.729614,-0.998182,-0.79616
25%,14.53,78.97,6.8,0.12,0.29,5.24,83.0,863.0,191.0,4.8075,0.42,0.48,5.79,0.94,8.1175,0.247253,0.062255,0.510231,1.120627,-0.546648
50%,15.56,80.09,7.82,0.23,0.44,11.11,130.0,1703.5,386.5,11.63,0.72,0.91,12.455,1.5,12.955,0.393643,0.197222,1.000337,1.638293,-0.450292
75%,17.73,82.5,8.2725,0.53,0.63,21.78,198.0,3217.75,739.0,36.73,1.27,1.62,35.07,2.7,21.9425,0.584499,0.395253,1.741673,2.411581,-0.359103
max,18.8,84.19,9.89,27.5,2.64,361.94,2409.0,13304.0,5363.0,2539.58,23.92,13.73,354.7,20.18,256.88,0.89392,0.806669,500.909091,7.811594,0.99637


In [24]:
# Saving the cleaned dataframe as a csv file called 'Cleaned_Soil.csv'

In [25]:
# Persist the cleaned frame to 'Cleaned_Soil.csv' in the working directory.
# index=False keeps the row index out of the file. Without it the index is
# written as an unnamed first column and comes back as 'Unnamed: 0' on the next
# read_csv - the same kind of column this notebook had to drop from
# 'sentinel.csv'. After dropna() the index is only the surviving original row
# numbers, so nothing is lost by omitting it.
# NOTE(review): any downstream code that drops 'Unnamed: 0' from this file by
# name will need that step removed.
soil_df.to_csv('Cleaned_Soil.csv', index=False)

In [26]:
# List of the distinct districts present in the cleaned data

In [27]:
# Distinct District values, in order of first appearance, as a plain list.
soil_df['District'].unique().tolist()

['Anantapur',
 'Chittoor',
 'East Godavari',
 'Guntur',
 'Kadapa',
 'Krishna',
 'Kurnool',
 'Nellore',
 'Prakasam',
 'Srikakulam',
 'Visakhapatnam',
 'Vizianagaram',
 'West Godavari']

In [28]:
# Final column layout after the drop, rename and reorder steps.
soil_df.columns

Index(['Latitude', 'Longitude', 'District', 'Mandal', 'Village', 'SoilType',
       'pH', 'EC', 'OC', 'P', 'K', 'Ca', 'Mg', 'S', 'Zn', 'B', 'Fe', 'Cu',
       'Mn', 'NDVI', 'ARVI', 'EVI', 'GCI', 'NDWI', 'Crop'],
      dtype='object')