In [9]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Read the soil measurements (features plus the `crop` label) into a DataFrame.
crops = pd.read_csv("./soil_measures.csv")

#### <b> Predição 1 </b> - Com base nas features (N, P, K, pH), que representam a composição do solo, o modelo tentará prever qual será o tipo de plantação (café, arroz, banana, etc.).

In [10]:
# Separate the label column from the remaining (feature) columns.
y = crops["crop"]
X = crops.drop(columns=["crop"])

In [11]:
# Distinct crop labels, in order of first appearance.
pd.unique(y)

array(['rice', 'maize', 'chickpea', 'kidneybeans', 'pigeonpeas',
       'mothbeans', 'mungbean', 'blackgram', 'lentil', 'pomegranate',
       'banana', 'mango', 'grapes', 'watermelon', 'muskmelon', 'apple',
       'orange', 'papaya', 'coconut', 'cotton', 'jute', 'coffee'],
      dtype=object)

In [12]:
# One indicator column per crop label (one-hot encoding of the target).
y_dummie = pd.get_dummies(data=y)

In [13]:
# Hold out part of the data for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_dummie,
    random_state=42,
)

In [14]:
# Standardise the soil features. The scaler is fitted on the training split
# only and then applied to both splits, so the test data does not influence
# the scaling parameters.
# (StandardScaler is imported with the other dependencies in the first cell.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# NOTE(review): y_train / y_test come from splitting `y_dummie`, which is
# already one-hot encoded, so this second get_dummies call looks redundant.
# Neither result is used in the cells shown here -- confirm against the rest
# of the notebook before removing.
y_train_dummie = pd.get_dummies(y_train)
y_test_dummie = pd.get_dummies(y_test)

In [16]:
# One-vs-rest evaluation: for each crop column of the one-hot target, fit a
# binary logistic regression on the scaled soil features and score it on the
# held-out split. Results are stored as {crop name: weighted F1}.
feature_performance = {}

for target_values in y_train.columns:
    logreg = LogisticRegression()
    # scikit-learn expects a 1-D target. A (n_samples, 1) column vector is only
    # accepted through an implicit conversion that raises a
    # DataConversionWarning, so pass the column as a flat array.
    logreg.fit(X_train_scaled, y_train[target_values].to_numpy())
    y_pred = logreg.predict(X_test_scaled)
    # NOTE(review): average='weighted' averages the F1 of the positive and the
    # negative class by support. With 22 crops the negative class presumably
    # dominates each one-vs-rest target, so these scores may look high even
    # when the crop itself is rarely detected -- consider average='binary'.
    f1_score_result = metrics.f1_score(y_test[target_values].to_numpy(), y_pred, average='weighted')
    feature_performance[target_values] = f1_score_result
    print(f"F1-score for {target_values}: {f1_score_result}")

# (crop, score) pairs from best to worst score.
feature_performance_ordered = sorted(feature_performance.items(), key=lambda x: x[1], reverse=True)

F1-score for apple: 0.9454867894030473
F1-score for banana: 0.9838026748665049
F1-score for blackgram: 0.9393928118393234
F1-score for chickpea: 0.9300794885366852
F1-score for coconut: 0.9392669883918091
F1-score for coffee: 0.934031209893279
F1-score for cotton: 0.9885372908388075
F1-score for grapes: 0.9606520959172742
F1-score for jute: 0.9269812759467931
F1-score for kidneybeans: 0.9356598790150805
F1-score for lentil: 0.9686657051317341
F1-score for maize: 0.9368097330179115
F1-score for mango: 0.9299234481443951
F1-score for mothbeans: 0.934031209893279
F1-score for mungbean: 0.9296631115625529
F1-score for muskmelon: 0.9569550996134405
F1-score for orange: 0.9884696651985031
F1-score for papaya: 0.9162717918190323
F1-score for pigeonpeas: 0.9222229845626072
F1-score for pomegranate: 0.9375028480291637
F1-score for rice: 0.9323467230443974
F1-score for watermelon: 0.9717168787856818


#### <b> Predição 2 </b> - Com base em cada uma das features (N, P, K, pH), treinadas individualmente, o modelo tentará prever a coluna target ("crop").

In [17]:
# Rebuild the feature matrix and the raw (string) crop label for the
# multi-class experiment.
y = crops["crop"]
X = crops.drop(columns="crop")

In [18]:
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [21]:
# Fit one multi-class logistic regression per soil feature (using that feature
# alone) and record its weighted F1 on the held-out split as
# {feature name: score}.
features_names = X.columns.unique()
feature_performance = {}

for fn in features_names:
    # `multi_class` is intentionally not passed: it is deprecated since
    # scikit-learn 1.5 and scheduled for removal, and the default lbfgs solver
    # already fits a multinomial model for multi-class targets.
    model = LogisticRegression()
    # Double brackets keep the single feature as a 2-D (n_samples, 1) frame;
    # the target stays 1-D, which is the shape scikit-learn expects.
    model.fit(X_train[[fn]], y_train)
    y_pred = model.predict(X_test[[fn]])
    feature_performance[fn] = metrics.f1_score(y_test, y_pred, average='weighted')
    print(f"F1-score for {fn}: {feature_performance[fn]}")

# (feature, score) pairs from best to worst score.
best_predictive_feature = sorted(feature_performance.items(), key=lambda x: x[1], reverse=True)
    

F1-score for N: 0.09576259237897118
F1-score for P: 0.12412171549213565
F1-score for K: 0.23186703393490551
F1-score for ph: 0.06551334017644783


In [31]:
# Show the (feature, score) pair with the highest weighted F1.
max(feature_performance.items(), key=lambda pair: pair[1])

('K', 0.23186703393490551)