In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np


In [2]:

# Reload the raw CSV and give its unwieldy headers short, identifier-friendly
# names (same mapping as in Week 1).
# The keys must match the CSV headers character for character, including the
# backtick and the trailing space in the yield header.
new_columns = {
    'Cost of Cultivation (`/Hectare) A2+FL': 'Cost_Cultivation_A2_FL',
    'Cost of Cultivation (`/Hectare) C2': 'Cost_Cultivation_C2',
    'Cost of Production (`/Quintal) C2': 'Cost_Production_C2',
    'Yield (Quintal/ Hectare) ': 'Yield_Quintal_Hectare'
}

# errors='raise' turns a header mismatch into an immediate KeyError naming the
# missing label; the default ('ignore') would skip it silently and the failure
# would only show up later when a renamed column is accessed.
# Chaining (instead of inplace=True) keeps the cell a single expression that
# always rebuilds `df` from the file.
df = pd.read_csv('datafile (1).csv').rename(columns=new_columns, errors='raise')

In [3]:
# Feature matrix: every column of df except the yield target.
# NOTE(review): Cost_Cultivation_C2 is per hectare and Cost_Production_C2 is
# per quintal (per the original CSV headers), so their ratio may come close to
# the yield itself -- confirm these features do not leak the target.
X = df.drop(columns=['Yield_Quintal_Hectare'])

# Target vector: the yield column.
Y = df['Yield_Quintal_Hectare']

# Text-valued columns that need encoding before they can be fed to a model.
categorical_features = ['Crop', 'State']

In [4]:
# Expand each categorical column into indicator columns. drop_first=True omits
# the first level of every category, so that level is represented by all of
# its indicators being False.
X_encoded = pd.get_dummies(
    X,
    columns=categorical_features,
    drop_first=True,
)

# Preview the encoded frame under a banner line.
print("--- Encoded Features Head (X_encoded) ---", X_encoded.head(), sep="\n")

# The encoded frame is wider than X: one indicator column per retained
# Crop / State level replaces the two original text columns.
print(f"Total features after encoding: {X_encoded.shape[1]}")

--- Encoded Features Head (X_encoded) ---
   Cost_Cultivation_A2_FL  Cost_Cultivation_C2  Cost_Production_C2  \
0                 9794.05             23076.74             1941.55   
1                10593.15             16528.68             2172.46   
2                13468.82             19551.90             1898.30   
3                17051.66             24171.65             3670.54   
4                17130.55             25270.26             2775.80   

   Crop_COTTON  Crop_GRAM  Crop_GROUNDNUT  Crop_MAIZE  Crop_MOONG  Crop_PADDY  \
0        False      False           False       False       False       False   
1        False      False           False       False       False       False   
2        False      False           False       False       False       False   
3        False      False           False       False       False       False   
4        False      False           False       False       False       False   

   Crop_RAPESEED AND MUSTARD  ...  State_Haryana  

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_encoded,
    Y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up on each side of the split.
print(
    f"Training set size: {len(X_train)} rows",
    f"Testing set size: {len(X_test)} rows",
    sep="\n",
)

Training set size: 39 rows
Testing set size: 10 rows


In [6]:
# Build a 100-tree random forest (seeded for repeatability) and fit it on the
# training split. fit() returns the estimator itself, so `model` is the
# fitted regressor.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, Y_train)

# Confirmation banner marking the end of training.
print("\n--- Model Training Complete ---")


--- Model Training Complete ---


In [7]:
# Predict yield for the held-out rows.
Y_pred = model.predict(X_test)

# RMSE: square root of the mean squared prediction error, expressed in the
# same units as the target.
rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))

# R-squared on the held-out rows (proportion of target variance explained),
# computed by the estimator's own score() method.
r_squared = model.score(X_test, Y_test)

# Emit the banner and both metrics, one per line, rounded to two decimals.
print(
    f"\n--- Model Evaluation ---",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R-Squared Score: {r_squared:.2f}",
    sep="\n",
)


--- Model Evaluation ---
Root Mean Squared Error (RMSE): 62.55
R-Squared Score: 0.96
