In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# Title banner: marks the start of this notebook's printed output.
print("NOTEBOOK 04: FEATURE ENGINEERING")

NOTEBOOK 04: FEATURE ENGINEERING


In [3]:
# Read the cleaned crop CSV from the processed-data folder into `df`.
df = pd.read_csv(filepath_or_buffer="../data/processed/crop_data_cleaned.csv")

In [9]:
# Confirm the load: shape, column names and a preview of the first five rows.
# A single print with a newline separator emits the same lines as separate prints.
print(
    "\n Data Loaded Successfully!",
    f"Shape:{df.shape}",
    f"\n Original Features: {list(df.columns)}",
    "\nFirst 5 Rows",
    df.head(),
    sep="\n",
)


 Data Loaded Successfully!
Shape:(2200, 8)

 Original Features: ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label']

First 5 Rows
    N   P   K  temperature   humidity        ph    rainfall label
0  90  42  43    20.879744  82.002744  6.502985  202.935536  rice
1  85  58  41    21.770462  80.319644  7.038096  226.655537  rice
2  60  55  44    23.004459  82.320763  7.840207  263.964248  rice
3  74  35  40    26.491096  80.158363  6.980401  242.864034  rice
4  78  42  42    20.130175  81.604873  7.628473  262.717340  rice


In [10]:
# Name of the column to predict, and the seven raw input columns in file order.
target = 'label'
original_features = [
    'N', 'P', 'K',
    'temperature', 'humidity',
    'ph', 'rainfall',
]

print(
    f"\n Original Features: {len(original_features)}",
    f"Target: {target}",
    sep="\n",
)


 Original Features: 7
Target: label


# Feature Engineering Strategy

In [11]:
# Section banner for the feature-engineering plan that follows.
print("FEATURE ENGINEERING STRATEGY")

FEATURE ENGINEERING STRATEGY


In [12]:
# Human-readable outline of the planned features, kept as one string and printed below.
# NOTE(review): some names in this outline differ from the columns this notebook
# actually creates (e.g. `temp_range_category` here vs `temp_category` in the code)
# — confirm which spelling is intended.
engineering_plan = """
 FEATURE ENGINEERING PLAN:

1. NUTRIENT RATIOS (NPK Interactions)
   • N_to_P_ratio: Nitrogen to Phosphorus ratio
   • N_to_K_ratio: Nitrogen to Potassium ratio
   • P_to_K_ratio: Phosphorus to Potassium ratio
   • NPK_sum: Total nutrient content
   • NPK_product: Multiplicative interaction

2. CLIMATE FEATURES
   • temp_humidity_interaction: Temperature × Humidity
   • climate_index: Combined climate factor
   • temp_range_category: Temperature zones (Cool/Moderate/Hot)
   • humidity_category: Humidity levels (Low/Medium/High)

3. SOIL FEATURES
   • ph_category: pH classification (Acidic/Neutral/Alkaline)
   • ph_squared: Non-linear pH effects
   • nutrient_ph_interaction: NPK × pH interactions

4. WATER FEATURES
   • rainfall_category: Rainfall zones (Low/Medium/High)
   • water_stress_index: Temperature/Rainfall ratio
   • moisture_index: Rainfall × Humidity

5. COMPOSITE INDICES
   • growing_condition_index: Overall suitability score
   • resource_availability: Combined N+P+K+Rainfall
   • environmental_stress: Temperature + low humidity

Total New Features: ~20
Total Features After Engineering: ~27
"""

# Echo the plan into the cell output.
print(engineering_plan)


 FEATURE ENGINEERING PLAN:

1. NUTRIENT RATIOS (NPK Interactions)
   • N_to_P_ratio: Nitrogen to Phosphorus ratio
   • N_to_K_ratio: Nitrogen to Potassium ratio
   • P_to_K_ratio: Phosphorus to Potassium ratio
   • NPK_sum: Total nutrient content
   • NPK_product: Multiplicative interaction

2. CLIMATE FEATURES
   • temp_humidity_interaction: Temperature × Humidity
   • climate_index: Combined climate factor
   • temp_range_category: Temperature zones (Cool/Moderate/Hot)
   • humidity_category: Humidity levels (Low/Medium/High)

3. SOIL FEATURES
   • ph_category: pH classification (Acidic/Neutral/Alkaline)
   • ph_squared: Non-linear pH effects
   • nutrient_ph_interaction: NPK × pH interactions

4. WATER FEATURES
   • rainfall_category: Rainfall zones (Low/Medium/High)
   • water_stress_index: Temperature/Rainfall ratio
   • moisture_index: Rainfall × Humidity

5. COMPOSITE INDICES
   • growing_condition_index: Overall suitability score
   • resource_availability: Combined N+P+K+Rain

# Create NPK Ratio Features

In [13]:
# Section banner for the NPK ratio features.
print("FEATURE ENGINEERING 1: NPK RATIOS")

FEATURE ENGINEERING 1: NPK RATIOS


In [14]:
# Engineer features on an independent copy so the loaded frame `df` is not modified.
df_fe = df.copy(deep=True)

In [15]:
# NPK ratio features. RATIO_EPSILON is added to every denominator so that a zero
# value cannot cause a division by zero; the trade-off is that each ratio is
# shifted very slightly (e.g. 99/15 is stored as ~6.599996 rather than 6.6).
RATIO_EPSILON = 1e-5

# Pairwise nutrient ratios.
df_fe["N_to_P_ratio"] = df_fe["N"] / (df_fe["P"] + RATIO_EPSILON)
df_fe["N_to_K_ratio"] = df_fe["N"] / (df_fe["K"] + RATIO_EPSILON)
df_fe["P_to_K_ratio"] = df_fe["P"] / (df_fe["K"] + RATIO_EPSILON)

# Additive and multiplicative combinations of the three nutrients.
df_fe["NPK_sum"] = df_fe["N"] + df_fe["P"] + df_fe["K"]
df_fe["NPK_product"] = df_fe["N"] * df_fe["P"] * df_fe["K"]

# Share of each nutrient in the N + P + K total.
df_fe["N_dominance"] = df_fe["N"] / (df_fe["NPK_sum"] + RATIO_EPSILON)
df_fe["P_dominance"] = df_fe["P"] / (df_fe["NPK_sum"] + RATIO_EPSILON)
df_fe["K_dominance"] = df_fe["K"] / (df_fe["NPK_sum"] + RATIO_EPSILON)

npk_features = [
    "N_to_P_ratio", "N_to_K_ratio", "P_to_K_ratio",
    "NPK_sum", "NPK_product",
    "N_dominance", "P_dominance", "K_dominance",
]

# The count is taken from the list itself so the message cannot drift out of
# sync when a feature is added or removed.
print(f"\n Created {len(npk_features)} NPK ratio features.")
for feat in npk_features:
    print(f"  {feat}: Range [{df_fe[feat].min():.2f}, {df_fe[feat].max():.2f}]")


 Created 8 NPK ratio features.
  N_to_P_ratio: Range [0.00, 23.80]
  N_to_K_ratio: Range [0.00, 9.33]
  P_to_K_ratio: Range [0.09, 6.00]
  NPK_sum: Range [17.00, 385.00]
  NPK_product: Range [0.00, 1149720.00]
  N_dominance: Range [0.00, 0.74]
  P_dominance: Range [0.03, 0.82]
  K_dominance: Range [0.07, 0.88]


In [None]:
# Display the frame to inspect the newly added NPK columns.
df_fe

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label,N_to_P_ratio,N_to_K_ratio,P_to_K_ratio,NPK_sum,NPK_product,N_dominance,P_dominance,K_dominance
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice,2.142857,2.093023,0.976744,175,162540,0.514286,0.240000,0.245714
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice,1.465517,2.073170,1.414634,184,202130,0.461956,0.315217,0.222826
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice,1.090909,1.363636,1.250000,159,145200,0.377358,0.345912,0.276730
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice,2.114285,1.850000,0.875000,149,103600,0.496644,0.234899,0.268456
4,78,42,42,20.130175,81.604873,7.628473,262.717340,rice,1.857142,1.857142,1.000000,162,137592,0.481481,0.259259,0.259259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,107,34,32,26.774637,66.413269,6.780064,177.774507,coffee,3.147058,3.343749,1.062500,173,116416,0.618497,0.196532,0.184971
2196,99,15,27,27.417112,56.636362,6.086922,127.924610,coffee,6.599996,3.666665,0.555555,141,40095,0.702128,0.106383,0.191489
2197,118,33,30,24.131797,67.225123,6.362608,173.322839,coffee,3.575756,3.933332,1.100000,181,116820,0.651934,0.182320,0.165746
2198,117,32,34,26.272418,52.127394,6.758793,127.175293,coffee,3.656249,3.441175,0.941176,183,127296,0.639344,0.174863,0.185792


# Create Climate Interaction Features

In [18]:
# Section banner for the climate features (capitalisation matches the other section banners).
print("FEATURE ENGINEERING 2: CLIMATE FEATURES")

fEATURE ENGINEERING 2: CLIMATE FEATURES


In [19]:
# Row-wise product of temperature and humidity.
df_fe["temp_humidity_interaction"] = df_fe["temperature"].mul(df_fe["humidity"])

In [20]:
# Product of (temperature / 50) and (humidity / 100), computed row by row.
df_fe["climate_index"] = df_fe["temperature"].div(50).mul(df_fe["humidity"].div(100))

In [None]:
def categorize_temperature(temp):
    """Map a temperature reading to a zone label.

    Returns "Cool" for values below 15, "Moderate" for values from 15 up to
    (but not including) 30, and "Hot" for everything else. A NaN input fails
    both comparisons and is therefore also labelled "Hot".
    """
    if temp < 15:
        return "Cool"
    if temp < 30:
        return "Moderate"
    return "Hot"

# Label every row's temperature with its zone ("Cool" / "Moderate" / "Hot").
df_fe["temp_category"] = df_fe["temperature"].map(categorize_temperature)

In [22]:
def categorize_humidity(hum):
    """Map a humidity reading to a level label.

    Returns "Low" for values below 40, "Medium" for values from 40 up to
    (but not including) 70, and "High" for everything else. A NaN input fails
    both comparisons and is therefore also labelled "High".
    """
    if hum < 40:
        return "Low"
    if hum < 70:
        return "Medium"
    return "High"

# Label every row's humidity with its level ("Low" / "Medium" / "High").
df_fe["humidity_category"] = df_fe["humidity"].map(categorize_humidity)

In [24]:
# Heat stress index: temperature divided by (humidity + 1).
# NOTE(review): the +1 presumably keeps the denominator away from zero — confirm.
df_fe["heat_stress_index"] = df_fe["temperature"].div(df_fe["humidity"].add(1))

In [26]:
# Summarise the climate features: value range for numeric columns, distinct
# labels for everything else.
climate_features = [
    "temp_humidity_interaction",
    "climate_index",
    "temp_category",
    "humidity_category",
    "heat_stress_index",
]

# The count comes from the list so the message stays correct if the list changes.
print(f"\n Created {len(climate_features)} climate_based features.")

for feat in climate_features:
    column = df_fe[feat]
    # Branch on "is numeric" rather than "dtype == object": a label column stored
    # with a category or string dtype would otherwise reach the numeric branch
    # and fail on the `.2f` format.
    if pd.api.types.is_numeric_dtype(column):
        print(f" {feat}: Range [{column.min():.2f}, {column.max():.2f}]")
    else:
        print(f" {feat}: Categories {column.unique()}")


 Created 5 climate_based features.
 temp_humidity_interaction: Range [247.61, 4073.16]
 climate_index: Range [0.05, 0.81]
 temp_category: Categories ['Moderate' 'Hot' 'Cool']
 humidity_category: Categories ['High' 'Medium' 'Low']
 heat_stress_index: Range [0.11, 1.36]


# Create Soil pH Features

In [30]:
# Section banner for the soil pH features.
print("FEATURE ENGINEERING 3: SOIL pH FEATURES")

FEATURE ENGINEERING 3: SOIL pH FEATURES


In [31]:
def categorize_ph(ph):
    """Map a pH reading to a soil class label.

    Returns "Acidic" for values below 5.5, "Neutral" for values from 5.5 up to
    (but not including) 7.5, and "Alkaline" for everything else. A NaN input
    fails both comparisons and is therefore also labelled "Alkaline".
    """
    if ph < 5.5:
        return "Acidic"
    if ph < 7.5:
        return "Neutral"
    return "Alkaline"

# Class label for every row's pH ("Acidic" / "Neutral" / "Alkaline").
df_fe["ph_category"] = df_fe["ph"].map(categorize_ph)

# pH squared, as a non-linear term.
df_fe["ph_squared"] = df_fe["ph"].pow(2)

# Absolute distance of pH from 7.0.
df_fe["ph_deviation"] = (df_fe["ph"] - 7.0).abs()

# Each nutrient multiplied by pH.
df_fe["N_ph_interaction"] = df_fe["N"].mul(df_fe["ph"])
df_fe["P_ph_interaction"] = df_fe["P"].mul(df_fe["ph"])
df_fe["K_ph_interaction"] = df_fe["K"].mul(df_fe["ph"])

# Summarise the soil pH features: value range for numeric columns, distinct
# labels for everything else.
ph_features = [
    "ph_category",
    "ph_squared",
    "ph_deviation",
    "N_ph_interaction",
    "P_ph_interaction",
    "K_ph_interaction",
]

# The count comes from the list so the message stays correct if the list changes.
print(f"\n Created {len(ph_features)} soil pH based features.")

for feat in ph_features:
    column = df_fe[feat]
    # Branch on "is numeric" rather than "dtype == object": a label column stored
    # with a category or string dtype would otherwise reach the numeric branch
    # and fail on the `.2f` format.
    if pd.api.types.is_numeric_dtype(column):
        print(f" {feat}: Range [{column.min():.2f}, {column.max():.2f}]")
    else:
        print(f" {feat}: Categories {column.unique()}")


 Created 6 soil pH based features.
 ph_category: Categories ['Neutral' 'Alkaline' 'Acidic']
 ph_squared: Range [12.28, 98.71]
 ph_deviation: Range [0.00, 3.50]
 N_ph_interaction: Range [0.00, 1071.29]
 P_ph_interaction: Range [27.76, 940.23]
 K_ph_interaction: Range [32.81, 1325.92]


# Create Rainfall/Water Features

In [34]:
# Section banner for the rainfall/water features (capitalisation matches the other section banners).
print("FEATURE ENGINEERING 4: RAINFALL/WATER FEATURES")

FEaTURE ENGINEERING 4: RAINFALL/WATER FEATURES


In [37]:
def categorize_rainfall(rain):
    """Map a rainfall reading to a zone label.

    Returns "Low" for values below 50, "Medium" for values from 50 up to
    (but not including) 150, and "High" for everything else. A NaN input fails
    both comparisons and is therefore also labelled "High".
    """
    if rain < 50:
        return "Low"
    if rain < 150:
        return "Medium"
    return "High"

# Zone label for every row's rainfall ("Low" / "Medium" / "High").
df_fe["rainfall_category"] = df_fe["rainfall"].map(categorize_rainfall)

# Water stress index: temperature divided by (rainfall + 1), so it grows with
# higher temperature and lower rainfall.
df_fe["water_stress_index"] = df_fe["temperature"].div(df_fe["rainfall"].add(1))

# Moisture index: rainfall multiplied by (humidity / 100).
df_fe["moisture_index"] = df_fe["rainfall"].mul(df_fe["humidity"].div(100))

# Rainfall per degree: rainfall divided by (temperature + 1).
df_fe["rainfall_per_temp"] = df_fe["rainfall"].div(df_fe["temperature"].add(1))

# Water availability score: (rainfall / 400) multiplied by (humidity / 100).
df_fe["water_availability"] = df_fe["rainfall"].div(400).mul(df_fe["humidity"].div(100))

In [38]:
# Summarise the water features: value range for numeric columns, distinct
# labels for everything else.
water_features = ['rainfall_category', 'water_stress_index', 'moisture_index',
                  'rainfall_per_temp', 'water_availability']

# The count comes from the list so the message stays correct if the list changes.
print(f"\nCreated {len(water_features)} water-based features:")

for feat in water_features:
    column = df_fe[feat]
    # Branch on "is numeric" rather than "dtype == object": a label column stored
    # with a category or string dtype would otherwise reach the numeric branch
    # and fail on the `.2f` format.
    if pd.api.types.is_numeric_dtype(column):
        print(f"  • {feat}: Range [{column.min():.2f}, {column.max():.2f}]")
    else:
        print(f"  • {feat}: Categories = {column.unique()}")


Created 5 water-based features:
  • rainfall_category: Categories = ['High' 'Medium' 'Low']
  • water_stress_index: Range [0.07, 1.40]
  • moisture_index: Range [9.39, 253.22]
  • rainfall_per_temp: Range [0.66, 13.55]
  • water_availability: Range [0.02, 0.63]
