# ML Model Training Notebook

This notebook trains three ML models for salary, house price, and crop yield predictions.

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
import joblib
import os

## Create output directory

In [None]:
# Trained artifacts are written under models/; exist_ok lets the notebook be re-run
# without failing when the folder is already there.
os.makedirs(name='models', exist_ok=True)

## 1. Salary Prediction Model (Linear Regression)

In [None]:
# Read the salary dataset from disk, then display its first rows as a quick sanity check
salary_data = pd.read_csv(filepath_or_buffer='data/salary_data.csv')
salary_data.head(n=5)

In [None]:
# Feature matrix and target: the list selector keeps X a one-column DataFrame,
# while y is the salary column as a Series.
X = salary_data.loc[:, ['years_experience']]
y = salary_data.loc[:, 'salary']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit an ordinary least-squares line on the training portion (fit() returns the estimator)
salary_model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows
y_pred = salary_model.predict(X_test)
print(f"Salary Model R² Score: {r2_score(y_test, y_pred):.4f}")
print(f"Salary Model MAE: ${mean_absolute_error(y_test, y_pred):,.2f}")

# Persist the fitted estimator
joblib.dump(salary_model, 'models/salary_model.pkl')

## 2. House Price Model (Random Forest)

In [None]:
# Read the housing dataset from disk, then display its first rows as a quick sanity check
house_data = pd.read_csv(filepath_or_buffer='data/house_data.csv')
house_data.head(n=5)

In [None]:
# Encode the categorical location column as integer codes
location_map = {'rural': 0, 'suburban': 1, 'urban': 2}
house_data['location_encoded'] = house_data['location'].map(location_map)

# Series.map leaves NaN for every value that is not a key of location_map
# (unexpected spelling/casing, or a missing location). Fail fast with the
# offending values instead of training on rows that the saved mapping
# cannot reproduce at prediction time.
unmapped_rows = house_data['location_encoded'].isna()
if unmapped_rows.any():
    unknown_locations = house_data.loc[unmapped_rows, 'location'].unique().tolist()
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) in house_data have a location "
        f"not covered by location_map: {unknown_locations}"
    )

# Prepare features and target
X = house_data[['area', 'bedrooms', 'location_encoded']]
y = house_data['price']

# Split data: 20% held out for evaluation, fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model: 100-tree random forest, seeded so reruns give the same forest
house_model = RandomForestRegressor(n_estimators=100, random_state=42)
house_model.fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = house_model.predict(X_test)
print(f"House Model R² Score: {r2_score(y_test, y_pred):.4f}")
print(f"House Model MAE: ${mean_absolute_error(y_test, y_pred):,.2f}")

# Save the model together with the location map, so whoever loads the model
# can apply the same encoding to new inputs
joblib.dump(house_model, 'models/house_model.pkl')
joblib.dump(location_map, 'models/location_map.pkl')

## 3. Crop Yield Model (Decision Tree)

In [None]:
# Read the crop dataset from disk, then display its first rows as a quick sanity check
crop_data = pd.read_csv(filepath_or_buffer='data/crop_data.csv')
crop_data.head(n=5)

In [None]:
# Two predictors (rainfall, temperature) and the yield column as the target
X = crop_data.loc[:, ['rainfall', 'temperature']]
y = crop_data.loc[:, 'yield']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit a seeded decision tree on the training portion (fit() returns the estimator)
crop_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Score the held-out rows; MAE is printed as a plain number, without currency formatting
y_pred = crop_model.predict(X_test)
print(f"Crop Model R² Score: {r2_score(y_test, y_pred):.4f}")
print(f"Crop Model MAE: {mean_absolute_error(y_test, y_pred):.4f}")

# Persist the fitted estimator
joblib.dump(crop_model, 'models/crop_model.pkl')

## Summary

All models have been trained and saved to the `models/` folder:
- `salary_model.pkl` - Linear Regression
- `house_model.pkl` - Random Forest
- `crop_model.pkl` - Decision Tree
- `location_map.pkl` - Location encoding mapping