In [1]:
# ===============================
# 1. IMPORT LIBRARIES
# ===============================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# ===============================
# 2. LOAD DATASET
# ===============================
# Read the raw price records into a DataFrame bound to `data`.
data = pd.read_csv(
    '/content/Agriculture_price_dataset.csv'
)

# Sanity check: print the (rows, columns) tuple, then the first five records.
print("Dataset shape:", data.shape)
print(data.head(5))

# ===============================
# 3. DATA CLEANING
# ===============================
# Report how many values are missing in each column before anything is dropped.
print("Missing values:\n", data.isnull().sum())

# A row without a target price cannot be used for training or evaluation.
data = data.dropna(subset=['Modal_Price'])

# Missing categories become an explicit 'Unknown' level so the columns can
# still be label-encoded later. A per-column dict fills all of them in one
# call and returns a new frame.
categorical_cols = ['STATE', 'District Name', 'Market Name', 'Commodity', 'Variety']
data = data.fillna({name: 'Unknown' for name in categorical_cols})

# Parse 'Price Date'; values that cannot be parsed become NaT instead of raising.
# NOTE(review): the preview only shows dates such as 6/6/2023, which does not
# reveal whether the file is month-first or day-first. No `format`/`dayfirst`
# is passed here, so confirm the parsing matches the source data.
data['Price Date'] = pd.to_datetime(data['Price Date'], errors='coerce')

# Make the data loss visible: coercion followed by dropna would otherwise
# discard rows silently.
unparsed_dates = int(data['Price Date'].isna().sum())
if unparsed_dates:
    print(f"Dropping {unparsed_dates} rows with a missing or unparseable 'Price Date'")
data = data.dropna(subset=['Price Date'])

# Calendar features derived from the parsed date.
data['Month'] = data['Price Date'].dt.month
data['Year'] = data['Price Date'].dt.year

# ===============================
# 4. DATA PREPROCESSING
# ===============================
# One LabelEncoder per categorical feature; each is fitted on its own column.
le_state, le_district, le_commodity, le_variety = (
    LabelEncoder(), LabelEncoder(), LabelEncoder(), LabelEncoder()
)

# (new encoded column, source column, encoder) - processed in this order so
# the encoded columns are appended to `data` in the same sequence as before.
for enc_col, src_col, encoder in (
    ('State_enc', 'STATE', le_state),
    ('District_enc', 'District Name', le_district),
    ('Commodity_enc', 'Commodity', le_commodity),
    ('Variety_enc', 'Variety', le_variety),
):
    data[enc_col] = encoder.fit_transform(data[src_col])

# Model inputs: the four encoded categoricals plus the two date parts.
feature_columns = ['State_enc', 'District_enc', 'Commodity_enc', 'Variety_enc', 'Month', 'Year']
X = data[feature_columns]

# Regression target.
y = data['Modal_Price']

# ===============================
# 5. SPLIT DATA INTO TRAIN & TEST
# ===============================
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# ===============================
# 6. BUILD AND TRAIN MODEL
# ===============================
# Hyperparameters are collected in one place and unpacked into the constructor.
forest_params = {'n_estimators': 200, 'random_state': 42, 'n_jobs': -1}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

# ===============================
# 7. MODEL EVALUATION
# ===============================
# Predict on the held-out rows and score the predictions against the truth.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean Squared Error:", round(mse, 2))
print("R2 Score:", round(r2, 2))

# Optional: scatter of predicted vs actual prices, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, ax=ax)
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Actual vs Predicted Crop Prices")
plt.show()

# ===============================
# 8. SAVE MODEL AND ENCODERS
# ===============================
# Output filename -> object to persist. Dict order is insertion order, so the
# files are written in the sequence listed here.
artifacts = {
    "crop_price_model.pkl": model,
    "le_state.pkl": le_state,
    "le_district.pkl": le_district,
    "le_commodity.pkl": le_commodity,
    "le_variety.pkl": le_variety,
}
for filename, obj in artifacts.items():
    joblib.dump(obj, filename)

print("Model and encoders saved successfully!")

# ===============================
# 9. OPTIONAL: MARKET ANALYSIS
# ===============================
# Example: Price trend of a crop
crop_name = "Tomato"  # replace with your crop

# Mean modal price per calendar month for the chosen commodity. The original
# 'Commodity' text column is used here, not the encoded one.
crop_df = data[data['Commodity'] == crop_name].groupby('Month')['Modal_Price'].mean()

if crop_df.empty:
    # No rows matched (for example a misspelt crop name): say so instead of
    # drawing a blank chart.
    print(f"No price records found for commodity '{crop_name}'.")
else:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.lineplot(x=crop_df.index, y=crop_df.values, marker='o', ax=ax)
    ax.set_title(f'Monthly Price Trend of {crop_name}')
    ax.set_xlabel("Month")
    ax.set_ylabel("Average Modal Price")
    plt.show()


           STATE District Name        Market Name Commodity           Variety  \
0    Maharashtra        nashik  Lasalgaon(Niphad)     Wheat  Maharashtra 2189   
1    Maharashtra        satara              Patan    Tomato             Other   
2  Uttar Pradesh      mainpuri              Bewar    Potato             Local   
3      Rajasthan   chittorgarh          Nimbahera     Wheat             Other   
4      Rajasthan    pratapgarh         Pratapgarh     Onion             Other   

  Grade  Min_Price  Max_Price  Modal_Price Price Date  
0   FAQ     2172.0     2399.0       2300.0   6/6/2023  
1   FAQ     1000.0     1500.0       1250.0   6/6/2023  
2   FAQ      800.0      820.0        810.0   6/6/2023  
3   FAQ     2040.0     2668.0       2300.0   6/6/2023  
4   FAQ      476.0     1043.0        617.0   6/6/2023  


In [None]:
# ===============================
# 2. LOAD DATASET
# ===============================
# Read the CSV again; this rebinds `data` to a freshly loaded frame.
data = pd.read_csv(
    '/content/Agriculture_price_dataset.csv'
)

# Preview: (rows, columns) first, then the leading five records.
print("Dataset shape:", data.shape)
print(data.head(5))


STATE            0
District Name    0
Market Name      0
Commodity        0
Variety          0
Grade            0
Min_Price        0
Max_Price        0
Modal_Price      0
Price Date       0
dtype: int64


In [None]:
# Build the working frame `df` from the loaded `data`; `df` was otherwise never
# defined. The CSV preview shows the columns STATE / District Name /
# Modal_Price / Price Date, while the cells below index `df` by 'State',
# 'District' and 'Modal Price', so those three are renamed here.
# rename() returns a new frame, leaving `data` itself unchanged.
# NOTE(review): confirm these are the intended column names for this pipeline.
df = data.rename(columns={
    'STATE': 'State',
    'District Name': 'District',
    'Modal_Price': 'Modal Price',
})

# Convert 'Price Date' to datetime format (the dataset has no 'Arrival Date' column)
df['Price Date'] = pd.to_datetime(df['Price Date'])

# Extract 'Month' and 'Year' from 'Price Date'
df['Month'] = df['Price Date'].dt.month
df['Year'] = df['Price Date'].dt.year


In [None]:
from sklearn.preprocessing import LabelEncoder

# `le` maps column name -> the LabelEncoder fitted on that column.
# Re-fitting one shared encoder on each column in turn would leave it holding
# only the classes of the last column ('Variety'), so the codes of the other
# columns could not be mapped back to labels from the saved encoder.
le = {}

# Encode categorical columns in place; the integer codes are the same as a
# fresh fit_transform per column would produce.
# NOTE: re-running this cell would re-fit the encoders on the integer codes,
# so re-create `df` first if the cell has to be executed again.
for column in ['State', 'District', 'Commodity', 'Variety']:
    le[column] = LabelEncoder()
    df[column] = le[column].fit_transform(df[column])


In [None]:
# Model inputs: the four label-encoded categoricals plus the two date parts.
feature_names = ['State', 'District', 'Commodity', 'Variety', 'Month', 'Year']
X = df[feature_names]

# Regression target.
y = df['Modal Price']


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Hyperparameters gathered in one dict and unpacked into the constructor.
forest_kwargs = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_kwargs)

# Fit on the training split. Kept as the final expression so the notebook
# still displays the fitted estimator as the cell output.
model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out features.
y_pred = model.predict(X_test)

# Mean squared error between the true and predicted target values.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')


In [None]:
import joblib

# Persist the trained regressor.
# NOTE: 'crop_price_model.pkl' is the same filename used for the model saved
# in section 8 of the first cell, so this write replaces that file.
joblib.dump(value=model, filename='crop_price_model.pkl')

# Persist the object bound to `le` by the encoding cell. Kept as the final
# expression so the notebook still shows the written path as the cell output.
joblib.dump(value=le, filename='label_encoder.pkl')
