# Preprocessing 

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
# Read the four source CSV files (paths are relative to the notebook's working
# directory) into one DataFrame each, in a fixed order.
crop_data, rainfall_data, nutrient_data, price_data = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Crop Data.csv',
        'Rainfall.csv',
        'Crop Nutrient.csv',
        'Crop Price.csv',
    )
)

In [3]:
# Show the column names of the crop table as loaded from disk.
crop_data.columns

Index(['State', 'District', 'Crop', 'Year', 'Season', 'Area', 'Production',
       'Yield', 'Crop_encoded'],
      dtype='object')

In [4]:
# Show the column names of the rainfall table as loaded from disk.
rainfall_data.columns

Index(['Year', 'State', 'District', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
       'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Annual'],
      dtype='object')

In [5]:
# Show the column names of the crop-nutrient table as loaded from disk.
nutrient_data.columns

Index(['N', 'P', 'K', 'Temperature', 'Humidity', 'pH', 'Rainfall', 'Crop',
       'Crop_encoded'],
      dtype='object')

In [6]:
# Show the column names of the crop-price table as loaded from disk.
price_data.columns

Index(['Year', 'State', 'District', 'Rice', 'Wheat', 'Jowar', 'Pearl Millet',
       'Maize', 'Barley', 'Chickpea', 'Groundnut', 'Sesamum', 'Sugarcane',
       'Cotton'],
      dtype='object')

In [7]:
# List the distinct years present in the crop table (order of first appearance).
crop_data['Year'].unique()

array([1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
       2009, 2010, 2011, 1997, 2012, 2013, 2014, 2015, 2016, 2017, 2018,
       2019, 2020], dtype=int64)

In [8]:
# List the distinct years present in the rainfall table (order of first appearance).
rainfall_data['Year'].unique()

array([2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
       2015, 2003], dtype=int64)

In [9]:
# List the distinct years present in the crop-price table (order of first appearance).
price_data['Year'].unique()

array([2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015], dtype=int64)

### Handling Missing Values

In [10]:
# Replace missing values in every numeric column with that column's mean.
# Non-numeric columns are not touched because mean(numeric_only=True) skips them.
# NOTE(review): this also mean-fills numeric identifier-like columns such as
# 'Year' — confirm that is intended.
crop_data = crop_data.fillna(crop_data.mean(numeric_only=True))
rainfall_data = rainfall_data.fillna(rainfall_data.mean(numeric_only=True))
nutrient_data = nutrient_data.fillna(nutrient_data.mean(numeric_only=True))
price_data = price_data.fillna(price_data.mean(numeric_only=True))

### Data Type Conversion

In [11]:
# Convert Production and Area to numeric.
# errors='coerce' turns every unparseable entry into NaN. This cell runs after
# the notebook's missing-value imputation, so those NaNs would otherwise be
# saved unfilled; fill them here with the column mean, matching the strategy
# used for the other numeric columns.
for numeric_col in ['Production', 'Area']:
    crop_data[numeric_col] = pd.to_numeric(crop_data[numeric_col], errors='coerce')
    # If the whole column is NaN the mean is NaN and this fill is a no-op.
    crop_data[numeric_col] = crop_data[numeric_col].fillna(crop_data[numeric_col].mean())

In [12]:
# Optional: make sure the 'Year' column is stored as an integer in every table
# that has one (the nutrient table has no 'Year' column).
for yearly_frame in (crop_data, rainfall_data, price_data):
    yearly_frame['Year'] = yearly_frame['Year'].astype(int)

### Encoding Categorical Columns

In [13]:
# Encode crop names as integer labels in a 'Crop_encoded' column.
# Each table gets its own encoder: re-fitting a single shared LabelEncoder
# discards the mapping learned from the first table, so crop_data codes could
# no longer be decoded with inverse_transform.
crop_le = LabelEncoder()
crop_data['Crop_encoded'] = crop_le.fit_transform(crop_data['Crop'])

nutrient_le = LabelEncoder()
nutrient_data['Crop_encoded'] = nutrient_le.fit_transform(nutrient_data['Crop'])

# Keep the original name bound to the encoder fitted last (on nutrient_data),
# which is the state `le` was left in before.
le = nutrient_le

# NOTE(review): the two encoders are fitted independently, so the same crop can
# receive different codes in crop_data and nutrient_data. Join the tables on
# 'Crop', not on 'Crop_encoded', unless a shared encoding is introduced.

### Feature Scaling for Nutrient Data

In [14]:
# Min-max scale the selected nutrient/climate features on a copy, leaving
# nutrient_data itself unchanged.
# NOTE(review): 'Rainfall' is not in this list and stays unscaled — confirm
# that is intentional.
scaled_columns = ['N', 'P', 'K', 'pH', 'Temperature', 'Humidity']

scaler = MinMaxScaler()
nutrient_scaled = nutrient_data.copy()
scaled_values = scaler.fit_transform(nutrient_data[scaled_columns])
nutrient_scaled[scaled_columns] = scaled_values

### Save Cleaned Datasets 

In [15]:
# Write each processed table to CSV without the index column.
# NOTE(review): these are the same file names the notebook reads its inputs
# from, so running this cell overwrites the source data with the processed
# version. Consider writing to separate output files once downstream consumers
# of these paths have been checked.
outputs_by_path = {
    'Crop Data.csv': crop_data,
    'Rainfall.csv': rainfall_data,
    'Crop Nutrient.csv': nutrient_scaled,
    'Crop Price.csv': price_data,
}
for csv_path, table in outputs_by_path.items():
    table.to_csv(csv_path, index=False)