In [1]:
# Importing the required modules

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [3]:
# Importing the dataset as a dataframe

In [4]:
# Location of the raw soil survey export, relative to the notebook.
RAW_SOIL_CSV = 'Soil.csv'

# Load the raw export; dtypes are whatever read_csv infers.
soil_df = pd.read_csv(RAW_SOIL_CSV)

In [5]:
# Preview the first five rows of the freshly loaded frame.
soil_df.head(n=5)

Unnamed: 0,Sl no,Date,Farmer No,Macro/ Micro nutrient,Farmer Name,District,Mandal,Village,Latitude,Longitude,...,Exch-K,Avail-Ca,Avail-Mg,Avail-S,Avail-Zn,Avail-B,Avail-Fe,Avail-Cu,Avail-Mn,Time
0,1,01-01-2015,1910,RK2276,P.Krishna Naik,Anantapur,Penukonda,Gonipeta,14.08,77.69,...,41,587,101,5.16,0.3,0.17,8.89,0.51,15.24,01-01-2015
1,2,01-01-2015,1911,RK2277,Kallu Thippe Naik,Anantapur,Penukonda,Gonipeta,14.09,77.69,...,102,811,261,9.91,0.36,0.57,3.24,0.44,6.9,01-01-2015
2,3,01-01-2015,1912,RK2278,P.Duble Bai,Anantapur,Penukonda,Gonipeta,14.09,77.69,...,46,582,48,3.77,0.37,0.19,5.54,0.42,8.34,01-01-2015
3,4,01-01-2015,1913,RK2279,H.Marekka (Kamma),Anantapur,Penukonda,Gonipeta,14.1,77.7,...,35,3048,52,4.14,0.23,0.21,1.79,0.67,4.17,01-01-2015
4,5,01-01-2015,1914,RK2280,M.Alevelamma,Anantapur,Penukonda,Gonipeta,14.09,77.69,...,76,511,84,1.45,0.36,0.22,22.26,0.45,9.2,01-01-2015


In [6]:
# Datatype of each column

In [7]:
# Inspect the dtypes inferred by read_csv. Note that the measurement columns
# OC, Avail-S and Avail-B come back as `object` rather than a numeric dtype,
# so they need an explicit numeric conversion before any numeric analysis.
soil_df.dtypes

Sl no                      int64
Date                      object
Farmer No                 object
Macro/ Micro nutrient     object
Farmer Name               object
District                  object
Mandal                    object
Village                   object
Latitude                 float64
Longitude                float64
Survey No.                object
Soil type                 object
Fathers Name              object
Extent\n(AC)              object
Crop before               object
pH                       float64
EC                       float64
OC                        object
Avail-P                  float64
Exch-K                     int64
Avail-Ca                   int64
Avail-Mg                   int64
Avail-S                   object
Avail-Zn                 float64
Avail-B                   object
Avail-Fe                 float64
Avail-Cu                 float64
Avail-Mn                 float64
Time                      object
dtype: object

In [8]:
# Dropping the unimportant features

In [9]:
# Columns that are not carried forward into the cleaned dataset.
columns_to_be_dropped = [
    'Sl no',
    'Date',
    'Farmer No',
    'Macro/ Micro nutrient',
    'Farmer Name',
    'Survey No.',
    'Fathers Name',
    'Extent\n(AC)',  # this header really contains a newline character
    'Time',
]

# drop() returns a new frame; the listed labels are removed from the columns.
soil_df = soil_df.drop(columns=columns_to_be_dropped)

In [10]:
# Renaming some of the columns

In [11]:
# Shorten the verbose source headers in one pass.
# A single non-mutating rename() replaces four in-place calls: the result is
# the same set of column names, but the cell now produces a new frame instead
# of mutating the existing one. Labels that are absent from the frame are
# ignored by rename() (its default), so re-running the cell is harmless.
soil_df = soil_df.rename(columns={
    # descriptive columns
    'Crop before': 'Crop',
    'Soil type': 'SoilType',
    # nutrient measurements -> element symbols
    'Avail-P': 'P',
    'Exch-K': 'K',
    'Avail-Ca': 'Ca',
    'Avail-Mg': 'Mg',
    'Avail-S': 'S',
    'Avail-Zn': 'Zn',
    'Avail-B': 'B',
    'Avail-Fe': 'Fe',
    'Avail-Cu': 'Cu',
    'Avail-Mn': 'Mn',
})

In [12]:
# Rearranging the columns

In [13]:
# Desired column layout: location first, then soil type and chemistry,
# with the crop column last.
column_order = [
    'Latitude', 'Longitude', 'District', 'Mandal', 'Village', 'SoilType',
    'pH', 'EC', 'OC',
    'P', 'K', 'Ca', 'Mg', 'S', 'Zn', 'B', 'Fe', 'Cu', 'Mn',
    'Crop',
]

# Selecting with a list both reorders the columns and raises KeyError if any
# expected column is missing.
soil_df = soil_df[column_order]

In [14]:
# Modifying the datatypes of certain columns

In [15]:
# K, Ca and Mg were read as integers; store them as floats like the other
# measurement columns.
soil_df = soil_df.astype({'K': 'float', 'Ca': 'float', 'Mg': 'float'})

# S, B and OC were read as object. Convert them to numbers; any entry that
# cannot be parsed becomes NaN (errors='coerce') instead of raising.
soil_df = soil_df.assign(
    S=pd.to_numeric(soil_df['S'], errors='coerce'),
    B=pd.to_numeric(soil_df['B'], errors='coerce'),
    OC=pd.to_numeric(soil_df['OC'], errors='coerce'),
)

In [16]:
# Dropping rows with missing values

In [17]:
# Discard every row that has a missing value in any column. The surviving rows
# keep their original index labels, so the index may no longer be contiguous.
soil_df = soil_df.dropna(axis=0, how='any')

In [18]:
# Columns of the dataframe after preprocessing

In [19]:
# Verify the result of the cleaning steps: all measurement columns should now
# be float64, leaving only District, Mandal, Village, SoilType and Crop as
# object columns.
soil_df.dtypes

Latitude     float64
Longitude    float64
District      object
Mandal        object
Village       object
SoilType      object
pH           float64
EC           float64
OC           float64
P            float64
K            float64
Ca           float64
Mg           float64
S            float64
Zn           float64
B            float64
Fe           float64
Cu           float64
Mn           float64
Crop          object
dtype: object

In [20]:
# Descriptive statistics of the dataframe

In [21]:
# Summary statistics (count, mean, std, min, quartiles, max). With default
# arguments only the numeric columns are summarised, so the object columns
# (District, Mandal, Village, SoilType, Crop) do not appear in the table.
soil_df.describe()

Unnamed: 0,Latitude,Longitude,pH,EC,OC,P,K,Ca,Mg,S,Zn,B,Fe,Cu,Mn
count,4568.0,4568.0,4568.0,4568.0,4568.0,4568.0,4568.0,4568.0,4568.0,4568.0,4568.0,4568.0,4568.0,4568.0,4568.0
mean,15.944831,80.529853,7.497154,0.504538,0.480779,16.812774,161.487084,2459.675569,535.914186,49.827496,1.125858,1.195267,29.033584,2.60354,18.789297
std,1.740956,2.032334,1.043438,1.067247,0.257344,18.605805,138.070049,2166.450116,479.291102,118.691247,1.398581,1.016041,39.255834,3.117829,19.132041
min,12.81,75.28,4.15,0.01,0.01,0.45,12.0,46.0,4.0,0.1,0.03,0.03,0.04,0.03,0.01
25%,14.53,78.97,6.8,0.12,0.29,5.23,83.0,863.0,191.0,4.8075,0.42,0.48,5.79,0.94,8.135
50%,15.555,80.09,7.82,0.23,0.44,11.11,130.0,1703.5,387.0,11.625,0.72,0.91,12.455,1.5,12.96
75%,17.73,82.5,8.28,0.53,0.63,21.765,198.0,3226.5,739.25,36.73,1.27,1.62,35.07,2.6925,21.9425
max,18.8,84.19,9.89,27.5,2.64,361.94,2409.0,13304.0,5363.0,2539.58,23.92,13.73,354.7,20.18,256.88


In [22]:
# Saving the cleaned dataframe as a csv file called 'Cleaned_Soil.csv'

In [23]:
# Persist the cleaned frame. The row index is written as the first (unnamed)
# column, which is to_csv's default and is spelled out here on purpose.
# NOTE(review): a plain pd.read_csv('Cleaned_Soil.csv') will surface that
# index as an 'Unnamed: 0' column; read it back with index_col=0, or switch to
# index=False once it is confirmed that no consumer relies on the index column.
soil_df.to_csv('Cleaned_Soil.csv', index=True)

In [24]:
# List of unique districts

In [25]:
# Distinct district names, in order of first appearance in the cleaned data.
soil_df['District'].unique().tolist()

['Anantapur',
 'Chittoor',
 'East Godavari',
 'Guntur',
 'Kadapa',
 'Krishna',
 'Kurnool',
 'Nellore',
 'Prakasam',
 'Srikakulam',
 'Visakhapatnam',
 'Vizianagaram',
 'West Godavari']

In [26]:
# Show the final column labels, in order, of the cleaned frame.
soil_df.columns

Index(['Latitude', 'Longitude', 'District', 'Mandal', 'Village', 'SoilType',
       'pH', 'EC', 'OC', 'P', 'K', 'Ca', 'Mg', 'S', 'Zn', 'B', 'Fe', 'Cu',
       'Mn', 'Crop'],
      dtype='object')