In [13]:
import pandas as pd

In [14]:
# Load both inputs: the soil dataset and the sensor readings that the later
# cells label and combine for training.
df_soil, df_sensor = (
    pd.read_csv(csv_name)
    for csv_name in ("soil_dataset_final.csv", "soil_data.csv")
)

In [15]:
# Plain-text dump of the raw sensor frame, before any columns are dropped.
print(df_sensor)

           time  moisture  temperature  ec   ph  nitrogen  phosphorus  \
0    1770359658       0.0         22.0   0  6.2         0           0   
1    1770359658       0.0         22.0   0  6.2         0           0   
2    1770359658       0.0         22.0   0  6.2         0           0   
3    1770359658       0.0         22.0   0  6.2         0           0   
4    1770359659       0.0         22.0   0  6.2         0           0   
..          ...       ...          ...  ..  ...       ...         ...   
889  1770360319       0.0         22.2   0  6.1         0           0   
890  1770360319       0.0         22.2   0  6.1         0           0   
891  1770360320       0.0         22.2   0  6.1         0           0   
892  1770360320       0.0         22.2   0  6.1         0           0   
893  1770360320       0.0         22.2   0  6.1         0           0   

     potassium  salinity  tds  
0            0       0.0    0  
1            0       0.0    0  
2            0       0.0   

In [16]:
# Remove the sensor columns that are not used as model features, leaving
# moisture, ec, ph, salinity and tds.
# The frame is reassigned (no inplace=True) and missing columns are ignored so
# that re-running this cell does not raise KeyError once the columns are gone.
df_sensor = df_sensor.drop(
    columns=["time", "temperature", "nitrogen", "phosphorus", "potassium"],
    errors="ignore",
)
df_sensor.head()

Unnamed: 0,moisture,ec,ph,salinity,tds
0,0.0,0,6.2,0.0,0
1,0.0,0,6.2,0.0,0
2,0.0,0,6.2,0.0,0
3,0.0,0,6.2,0.0,0
4,0.0,0,6.2,0.0,0


In [17]:
# Drop the 'OC' column and rename 'EC'/'pH' so the soil frame uses the same
# lower-case column names as the sensor frame.
# Written as a non-inplace chain with errors="ignore" so the cell can be re-run
# without a KeyError after 'OC' has already been removed.
# NOTE(review): the preview shows ph values such as 77.0 here versus 6.2 in the
# sensor frame - confirm both sources use the same units before they are combined.
df_soil = (
    df_soil
    .drop(columns=["OC"], errors="ignore")
    .rename(columns={"EC": "ec", "pH": "ph"})
)
df_soil.head()

Unnamed: 0,ph,ec,salinity,tds,moisture
0,77.0,220.0,243.0,241.0,165.3
1,77.0,220.0,218.0,239.2,168.4
2,77.0,218.0,218.0,236.0,162.8
3,77.0,224.0,260.0,247.0,170.7
4,77.0,217.0,264.0,243.8,173.0


In [19]:
# Put the feature columns in the same order as the sensor frame.
# The reordered frame has to be assigned back: a bare reindex call returns a
# new frame and leaves df_soil itself unchanged.
# Any other columns are kept after the features, so re-running this cell later
# does not discard columns added by subsequent cells.
feature_order = ["moisture", "ec", "ph", "salinity", "tds"]
other_columns = [col for col in df_soil.columns if col not in feature_order]
df_soil = df_soil[feature_order + other_columns]
df_soil

Unnamed: 0,moisture,ec,ph,salinity,tds
0,165.3,220.0,77.0,243.0,241.0
1,168.4,220.0,77.0,218.0,239.2
2,162.8,218.0,77.0,218.0,236.0
3,170.7,224.0,77.0,260.0,247.0
4,173.0,217.0,77.0,264.0,243.8
...,...,...,...,...,...
2612229,216.3,145.0,53.0,410.0,235.2
2612230,183.7,151.0,52.0,389.0,226.0
2612231,181.5,203.0,53.0,413.0,257.6
2612232,162.6,150.0,52.0,378.0,218.0


Model Begin

In [20]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import joblib


In [21]:
# Feature columns present in both the soil and the sensor frames; every
# scaling, clustering and model-fitting cell selects exactly these.
features = [
    "moisture",
    "ec",
    "ph",
    "salinity",
    "tds",
]

In [22]:
# Standardize the soil features for clustering. The fitted scaler is kept in
# its own variable because later cells reuse it on the sensor frame and save it.
scaler = StandardScaler().fit(df_soil[features])
X_soil_scaled = scaler.transform(df_soil[features])


In [25]:
"""
  To create soil classes

  Class 0 → Saline Soil
  Class 1 → Balanced Soil
  Class 2 → Dry Soil
  Class 3 → Acidic Soil

"""

kmeans = KMeans(n_clusters=4, random_state=42)

df_soil["soil_class"] = kmeans.fit_predict(X_soil_scaled)


In [32]:
# Continue with a seeded 25% random subset of the labelled soil rows.
# NOTE(review): df_soil is overwritten with its own sample, so every re-run of
# this cell shrinks the frame by another factor of four.
SOIL_SAMPLE_FRACTION = 0.25
df_soil = df_soil.sample(frac=SOIL_SAMPLE_FRACTION, random_state=42)
df_soil.shape

(19592, 6)

In [33]:
"""
  Model Pretraining
"""

rf_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

rf_model.fit(df_soil[features], df_soil["soil_class"])


In [34]:
"""
  Predicting classes for sensor data
"""
X_sensor_scaled = scaler.transform(df_sensor[features])

df_sensor["soil_class"] = rf_model.predict(df_sensor[features])


In [35]:
"""
  Combining Datasets
"""

combined_df = pd.concat([
    df_soil[features + ["soil_class"]],
    df_sensor[features + ["soil_class"]]
], ignore_index=True)


In [36]:
"""
  Now we create sample weights so that our sensor data is weighed more
  over the dataset taken for training. This helps us train the model better as
  per local contraints and significant amount of data.
"""

soil_weights = np.ones(len(df_soil))
sensor_weights = np.ones(len(df_sensor)) * 3.0

sample_weights = np.concatenate([soil_weights, sensor_weights])


In [37]:
# Adapted model: a 300-tree random forest refitted on the combined soil and
# sensor rows, with the per-row weights that emphasise the sensor data.
# NOTE(review): the sensor labels in combined_df are rf_model's own
# predictions, so the extra weight reinforces those predictions - confirm this
# is the intended adaptation scheme.
model_adapted = RandomForestClassifier(random_state=42, n_estimators=300)

model_adapted.fit(
    X=combined_df[features],
    y=combined_df["soil_class"],
    sample_weight=sample_weights,
)


In [38]:
# Persist the adapted forest and the fitted scaler under relative file names.
# NOTE(review): model_adapted was fitted on unscaled feature columns, so any
# consumer loading both files should confirm whether inputs are meant to pass
# through the scaler before predict.
joblib.dump(model_adapted, "soil_model.pkl")
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']