Ridge regressor 1

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Load the dataset
data = pd.read_csv("yield_df.csv") 

# Inspect the dataset
print(data.head())

# Encode categorical features
# One-hot encode 'Area' and 'Item' columns
encoder = OneHotEncoder(sparse_output=False, drop='first')  # Use 'sparse_output' instead of 'sparse'
 # Drop first to avoid multicollinearity
categorical_features = encoder.fit_transform(data[['Area', 'Item']])
categorical_columns = encoder.get_feature_names_out(['Area', 'Item'])

# Combine encoded features with numerical features
numerical_features = data[['Year', 'average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp']]
X = np.hstack((categorical_features, numerical_features))
X_columns = list(categorical_columns) + numerical_features.columns.tolist()

# Target variable
y = data['hg/ha_yield']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train Ridge Regression model
ridge_model = Ridge(alpha=1.0)  # Adjust alpha as needed
ridge_model.fit(X_train, y_train)

# Make predictions
y_train_pred = ridge_model.predict(X_train)
y_test_pred = ridge_model.predict(X_test)

# Evaluate the model
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
r2 = r2_score(y_test, y_test_pred)

print("Training RMSE:", train_rmse)
print("Testing RMSE:", test_rmse)
print("R-squared (Test):", r2)

# Optional: Display feature coefficients
coefficients = pd.DataFrame({'Feature': X_columns, 'Coefficient': ridge_model.coef_})
print(coefficients.sort_values(by='Coefficient', ascending=False))


In [None]:
from sklearn.linear_model import LinearRegression

model_results = {
    "Model": [],
    "MSE": [],
    "MAE": [],
    "R²": [],
    "Accuracy": []
}

# Apply PCA (same as above)
pca = PCA(n_components=10)  # Adjust n_components as needed
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Train Principal Component Regression
pcr_model = LinearRegression()
pcr_model.fit(X_train_pca, y_train)

# Predictions
y_train_pred = pcr_model.predict(X_train_pca)
y_test_pred = pcr_model.predict(X_test_pca)

# Evaluate
print("Principal Component Regression Metrics:")
print(f"Training MSE: {mean_squared_error(y_train, y_train_pred):.2f}")
print(f"Testing MSE: {mean_squared_error(y_test, y_test_pred):.2f}")
print(f"Testing R²: {r2_score(y_test, y_test_pred):.2f}")

model_results["Model"].append("PCR")
model_results["MSE"].append(test_mse)  # Replace with your Ridge Regression test MSE
model_results["MAE"].append(test_mae)  # Replace with your Ridge Regression test MAE
model_results["R²"].append(test_r2)    # Replace with your Ridge Regression test R²
model_results["Accuracy"].append(accuracy)  # Replace with your Ridge Regression accuracy
results_df = pd.DataFrame(model_results)

In [None]:
#RR model 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler



# One-hot encode 'Area' and 'Item' columns
encoder = OneHotEncoder(sparse_output=False, drop='first')  # Use 'sparse_output' instead of 'sparse'
categorical_features = encoder.fit_transform(data[['Area', 'Item']])
categorical_columns = encoder.get_feature_names_out(['Area', 'Item'])

# Combine encoded features with numerical features
numerical_features = data[['Year', 'average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp']]
X = np.hstack((categorical_features, numerical_features))
#X_columns = list(categorical_columns) + numerical_features.columns.tolist()

# Target variable
y = data['hg/ha_yield']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train Ridge Regression model
ridge_model = Ridge(alpha=1.0)  # Adjust alpha as needed
ridge_model.fit(X_train, y_train)

# Make predictions
y_train_pred = ridge_model.predict(X_train)
y_test_pred = ridge_model.predict(X_test)

# Evaluate the model
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
accuracy = 100 * (1 - np.mean(np.abs((y_test - y_test_pred) / y_test)))

print("Training Metrics:")
print(f"  MSE: {train_mse:.2f}")
print(f"  MAE: {train_mae:.2f}")
print(f"  R-squared: {train_r2:.2f}")
    
print("Testing Metrics:")
print(f"  MSE: {test_mse:.2f}")
print(f"  MAE: {test_mae:.2f}")
print(f"  R-squared: {test_r2:.2f}")
print(f"  Accuracy: {accuracy:.2f}%")

y_test_pred = np.random.rand(100) * 1000  # Placeholder for predicted values

# Metrics
#mse = mean_squared_error(y_test, y_test_pred)
#mae = mean_absolute_error(y_test, y_test_pred)
#r2 = r2_score(y_test, y_test_pred)
#accuracy = 100 * (1 - np.mean(np.abs((y_test - y_test_pred) / y_test)))

# Append metrics
model_results["Model"].append("RR")
model_results["MSE"].append(test_mse)
model_results["MAE"].append(test_mae)
model_results["R²"].append(test_r2)
model_results["Accuracy"].append(accuracy)

# Append actual and predicted values
validate_lengths(y_test, y_test_pred, "RR")
comparison_data["Model"].extend(["RR"] * len(y_test))
comparison_data["Actual"].extend(y_test)
comparison_data["Predicted"].extend(y_test_pred)

# Optional: Display feature coefficients
coefficients = pd.DataFrame({'Feature': X_columns, 'Coefficient': ridge_model.coef_})
print(coefficients.sort_values(by='Coefficient', ascending=False))


In [None]:
#PCA + RF
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler



# Load and preprocess the dataset
data = pd.read_csv("yield_df.csv")  # Replace with your dataset file path
encoder = OneHotEncoder(sparse_output=False, drop='first')
categorical_features = encoder.fit_transform(data[['Area', 'Item']])
categorical_columns = encoder.get_feature_names_out(['Area', 'Item'])
numerical_features = data[['Year', 'average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp']]
X = np.hstack((categorical_features, numerical_features))
y = data['hg/ha_yield']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Apply PCA
pca = PCA(n_components=10)  # Adjust n_components as needed
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Train Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_pca, y_train)

# Predictions
y_train_pred = rf_model.predict(X_train_pca)
y_test_pred = rf_model.predict(X_test_pca)

# Evaluate
print("Random Forest + PCA Metrics:")
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
accuracy = 100 * (1 - np.mean(np.abs((y_test - y_test_pred) / y_test)))

print("Training Metrics:")
print(f"  MSE: {train_mse:.2f}")
print(f"  MAE: {train_mae:.2f}")
print(f"  R-squared: {train_r2:.2f}")
    
print("Testing Metrics:")
print(f"  MSE: {test_mse:.2f}")
print(f"  MAE: {test_mae:.2f}")
print(f"  R-squared: {test_r2:.2f}")
print(f"  Accuracy: {accuracy:.2f}%")

#print(f"Training MSE: {mean_squared_error(y_train, y_train_pred):.2f}")
#print(f"Testing MSE: {mean_squared_error(y_test, y_test_pred):.2f}")
#rint(f"Testing R²: {r2_score(y_test, y_test_pred):.2f}")
#accuracy = 100 * (1 - np.mean(np.abs((y_test - y_test_pred) / y_test)))
#print(f"  Accuracy: {accuracy:.2f}%")
y_test_pred = np.random.rand(100) * 1000  # Placeholder for predicted values

# Metrics
#mse = mean_squared_error(y_test, y_test_pred)
#mae = mean_absolute_error(y_test, y_test_pred)
#r2 = r2_score(y_test, y_test_pred)
#accuracy = 100 * (1 - np.mean(np.abs((y_test - y_test_pred) / y_test)))

# Append metrics
model_results["Model"].append("PCA+RF")
model_results["MSE"].append(test_mse)
model_results["MAE"].append(test_mae)
model_results["R²"].append(test_r2)
model_results["Accuracy"].append(accuracy)

# Append actual and predicted values
validate_lengths(y_test, y_test_pred, "PCA+RF")
comparison_data["Model"].extend(["PCA+RF"] * len(y_test))
comparison_data["Actual"].extend(y_test)
comparison_data["Predicted"].extend(y_test_pred)



In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Build the model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model
history = model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32, verbose=1)

# Evaluate the model
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=0)
y_test_pred = model.predict(X_test)

# Evaluate metrics
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
accuracy = 100 * (1 - np.mean(np.abs((y_test - y_test_pred.flatten()) / y_test)))

# Print metrics
print("\nModel Evaluation Metrics:")
print(f"Testing MSE: {test_mse:.2f}")
print(f"Testing MAE: {test_mae:.2f}")
print(f"Testing R²: {test_r2:.2f}")
print(f"Testing Accuracy: {accuracy:.2f}%")

y_test_pred = np.random.rand(100) * 1000  # Placeholder for predicted values

# Metrics
#mse = mean_squared_error(y_test, y_test_pred)
#mae = mean_absolute_error(y_test, y_test_pred)
#r2 = r2_score(y_test, y_test_pred)
#accuracy = 100 * (1 - np.mean(np.abs((y_test - y_test_pred) / y_test)))

# Append metrics
model_results["Model"].append("FFNN")
model_results["MSE"].append(test_mse)
model_results["MAE"].append(test_mae)
model_results["R²"].append(test_r2)
model_results["Accuracy"].append(accuracy)

# Append actual and predicted values
validate_lengths(y_test, y_test_pred, "Feedforward Neural Network")
comparison_data["Model"].extend(["FFNN"] * len(y_test))
comparison_data["Actual"].extend(y_test)
comparison_data["Predicted"].extend(y_test_pred)



In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

# Compute correlation matrix
correlation_matrix=yield_df.select_dtypes(include=[np.number]).corr()

# Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Feature Correlation Heatmap")
plt.show()

# Select features with high correlation to the target
correlation_threshold = 0.2  # Adjust threshold based on your dataset
correlated_features = correlation_matrix['hg/ha_yield'][abs(correlation_matrix['hg/ha_yield']) > correlation_threshold].index
print("Selected Features Based on Correlation:", correlated_features.tolist())


NameError: name 'yield_df' is not defined