# Predictive Modelling for Agriculture

## Read the data into a pandas DataFrame and perform exploratory data analysis

In [10]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [11]:
# Load the soil measurements; the relative path is resolved against the current working directory.
crops = pd.read_csv(filepath_or_buffer="soil_measures.csv")

In [12]:
# Count the missing values in each column (axis=0 sums down the rows).
crops.isna().sum(axis=0)

N       0
P       0
K       0
ph      0
crop    0
dtype: int64

In [13]:
# List the distinct target labels; bracket access avoids any clash between the
# column name and DataFrame attributes.
crops["crop"].unique()

array(['rice', 'maize', 'chickpea', 'kidneybeans', 'pigeonpeas',
       'mothbeans', 'mungbean', 'blackgram', 'lentil', 'pomegranate',
       'banana', 'mango', 'grapes', 'watermelon', 'muskmelon', 'apple',
       'orange', 'papaya', 'coconut', 'cotton', 'jute', 'coffee'],
      dtype=object)

In [14]:
# Target: the crop label. Features: every remaining column.
y = crops["crop"]
X = crops.drop(columns=["crop"])

## Split the data

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
(X_train, X_test, y_train, y_test) = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Evaluate feature performance

In [16]:
# Maps each feature name to the F1-score of the model trained on that feature alone.
feature_performance = dict()

In [17]:
# Fit one single-feature logistic regression per column of the training data and
# record its weighted F1-score on the held-out test set.
for feature in X_train.columns:
    # `multi_class` is deliberately not passed: the argument is deprecated since
    # scikit-learn 1.5, and per the scikit-learn docs the default solver already
    # fits a multinomial model when the target has more than two classes.
    log_reg = LogisticRegression()
    log_reg.fit(X_train[[feature]], y_train)
    y_pred = log_reg.predict(X_test[[feature]])

    # "weighted" averages the per-class F1-scores, weighting each class by its
    # number of true instances in y_test.
    f1 = metrics.f1_score(y_test, y_pred, average="weighted")

    feature_performance[feature] = f1
    print(f"F1-score for {feature}: {f1}")

F1-score for N: 0.09149868209906838
F1-score for P: 0.14761942909728204
F1-score for K: 0.23896974566001802
F1-score for ph: 0.04532731061152114


## Create the best_predictive_feature variable

In [18]:
# Pick the feature with the highest F1-score programmatically instead of hard-coding
# it, so the result stays correct if the data or the scores change.
best_feature = max(feature_performance, key=feature_performance.get)
best_predictive_feature = {best_feature: feature_performance[best_feature]}
best_predictive_feature

{'K': 0.23896974566001802}