# CSC 310 Final Project: Crop Yield Prediction App

Group Members: Ryan Jensen, Timothy Hourihan


## Data Preprocessing

#### Importing necessary libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import geopandas as gpd
import plotly.express as px
# from google.colab import drive # Uncomment if you are running in a Google Colab.

### Reading the Dataset

In [None]:
# --- Option A: Google Colab ---------------------------------------------
# Uncomment the two lines below if you are running in Google Colab with the
# dataset stored on your Drive. (Note: You can't run Gradio in Google Colab.)
# drive.mount('/content/drive')
# df = pd.read_csv("/content/drive/MyDrive/yield_df.csv")

# --- Option B: local run ------------------------------------------------
# Path is relative to the directory the notebook is started from.
dataset_path = "data/yield_df.csv"
df = pd.read_csv(dataset_path)

# Print column names, dtypes and non-null counts for a first sanity check.
df.info()

In [None]:
# Preview the first ten rows of the freshly loaded dataset.
df.head(n=10)

#### Removing the unnamed column

In [None]:
# Remove the "Unnamed: 0" column (presumably a row index saved into the CSV).
# Reassigning instead of using inplace=True, together with errors="ignore",
# makes this cell safe to re-run: a second execution no longer raises
# KeyError once the column has already been dropped.
df = df.drop(columns="Unnamed: 0", errors="ignore")

# Confirm the remaining columns and their dtypes.
df.info()

#### Changing column names to fit our project

In [None]:
# Map the dataset's original headers to the names used in the rest of the
# notebook, in a single rename call.
column_renames = {
    "Area": "country",
    "Item": "crop",
    "Year": "year",
}
df = df.rename(columns=column_renames)

# Show the first ten rows with the new column names.
df.head(10)

In [None]:
# Number of missing values in each column (column-wise sum of the null mask).
df.isnull().sum()

### Top 10 Countries with Highest Yield

In [None]:
# Total yield per country, summed over every row for that country.
total_yield_by_country = df.groupby(['country'])['hg/ha_yield'].sum()

# The ten countries with the largest totals, highest first.
total_yield_by_country.nlargest(10)

India and Brazil have by far the highest total yields, so it's important to note their average temperature, rainfall, and pesticide use.

In [None]:
# Rows belonging to India only.
df_india = df[df['country'] == 'India']

# A notebook cell only renders its *last* expression automatically, so the
# preview has to be displayed explicitly; as a bare expression in the middle
# of the cell its result was silently discarded.
display(df_india.head(10))

# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df_india.describe()

India has an average of 1083 mm rainfall, 48459 tonnes of pesticide, and a 26 degree celsius temperature to produce the highest yield by a large margin.

In [None]:
# Rows belonging to Brazil only. Stored under its own name: reusing
# `df_india` here overwrote the India subset with Brazil's data.
df_brazil = df[df['country'] == 'Brazil']

# A notebook cell only renders its *last* expression automatically, so the
# preview has to be displayed explicitly; as a bare expression in the middle
# of the cell its result was silently discarded.
display(df_brazil.head(10))

# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df_brazil.describe()

Brazil has an average of 1761 mm rainfall, 189736 tonnes of pesticide, and a 22.7 degree celsius temperature to produce the second highest yield.

### Histograms

In [None]:
# One histogram per numeric column. Binding the returned axes to a name keeps
# the notebook from echoing the axes array underneath the figure.
hist_axes = df.hist(figsize=(5, 10))

The majority of pesticide usage according to the histograms is actually close to none. We know that India and Brazil use a lot of pesticide and have the two highest yields, so pesticide use is an important variable here.

Most observations have an average annual rainfall below 2000 mm; the number of observations drops off noticeably above 2000 mm.

Average temperature is around 25 degrees celsius

The overall yield is also close to none in most cases, which confirms the validity of the data set since conditions have to be perfect to have a good yield to harvest/sell.

### Scatter Matrix

In [None]:
import seaborn as sns

# Grid of pairwise scatter plots between the columns of df, with each
# column's own distribution on the diagonal.
sns.pairplot(data=df)

None of the scatter plots in the scatter matrix shows an especially strong correlation between any two columns, so no column needs to be removed.

### Heat Map

In [None]:
# Correlations are computed on the int64/float64 columns only.
numeric_columns = df.select_dtypes(include=['int64', 'float64'])
cor = numeric_columns.corr()

# Annotated heat map of the pairwise correlation coefficients.
sns.heatmap(data=cor, annot=True, cmap='YlOrRd')
plt.title('Heatmap')

This heat map further shows that there isn't really a very strong correlation with any of the columns, so no deletion required.

While there isn't a strong correlation, we can see that average rainfall (`average_rain_fall_mm_per_year`) and average temperature (`avg_temp`) can affect each other. This makes sense because countries with a lot of rainfall typically have slightly cooler climates and vice versa. However, they are not dependent on each other, so we have no need to alter any of the data.

## Building Model

#### Importing necessary libraries

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor

#### Preparing the data for modeling

In [None]:
# Target column vs. everything else (the features).
target_column = 'hg/ha_yield'
y = df[target_column]
X = df.drop(columns=[target_column])

# One LabelEncoder per categorical column. The fitted encoders are kept in
# `label_encoders` so the same string -> integer mapping can be re-applied to
# new inputs at prediction time.
label_encoders = {name: LabelEncoder() for name in ('country', 'crop')}
for name, encoder in label_encoders.items():
    X[name] = encoder.fit_transform(X[name])

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

#### Training and evaluating models

In [None]:
# Candidate regressors, each paired with the name used in plots and tables.
models = [
    ('Linear Regression', LinearRegression()),
    ('Gradient Boost', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)),
    ('XGBoost', XGBRegressor(random_state=42)),
]

results = []
# One subplot (stacked vertically) per model.
fig, axs = plt.subplots(len(models), figsize=(10, 20))

# End points of the "perfect prediction" diagonal. They depend only on the
# test targets, so compute them once instead of rescanning y_test per model.
diagonal = [y_test.min(), y_test.max()]

for idx, (name, model) in enumerate(models):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # NOTE: for regressors `model.score()` returns R^2, not an accuracy. The
    # former "Accuracy" column was therefore a mislabeled duplicate of
    # R2_score (and cost a second prediction pass), so it has been removed.
    MSE = mean_squared_error(y_test, y_pred)
    R2_score = r2_score(y_test, y_pred)
    results.append((name, MSE, R2_score))

    # Predicted vs. actual values; points on the orange diagonal are exact.
    axs[idx].scatter(y_test, y_pred, s=10, color='#1f77b4')  # Blue color
    axs[idx].plot(diagonal, diagonal, color='#ff7f0e', linewidth=2)  # Orange color
    axs[idx].set_title(f'{name} Evaluation')
    axs[idx].set_xlabel('Actual Values')
    axs[idx].set_ylabel('Predicted Values')

plt.tight_layout()
plt.show()

# Displaying results in a DataFrame
df_results = pd.DataFrame(results, columns=['Model', 'MSE', 'R2_score'])
display(df_results)


#### KFold Validation

In [None]:
# Models compared in this section (the three from above plus KNN and a
# single decision tree).
models = [
    ('Linear Regression', LinearRegression()),
    ('Gradient Boost', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)),
    ('XGBoost', XGBRegressor(random_state=42)),
    ('KNN', KNeighborsRegressor(n_neighbors=5)),
    ('Decision Tree', DecisionTreeRegressor(random_state=42))
]

# A single seeded splitter shared by every model. Without random_state the
# shuffled folds changed on every run and differed between models, so the
# CV scores were neither reproducible nor directly comparable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

results = []
for name, model in models:
    # Hold-out evaluation on the train/test split created earlier.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    MSE = mean_squared_error(y_test, y_pred)
    MAE = mean_absolute_error(y_test, y_pred)
    MAPE = mean_absolute_percentage_error(y_test, y_pred)
    # For regressors `model.score()` is R^2 as well, so the separate
    # "accuracy" value that used to be computed here was a duplicate.
    R2_score = r2_score(y_test, y_pred)

    # KFold Validation on the full dataset. cross_val_score fits clones of
    # the estimator, so the model fitted above is left untouched.
    scores = cross_val_score(model, X, y, cv=kf)
    mean_score = np.mean(scores)
    print(f"{name} - Mean CV Score: {mean_score}")

    results.append((name, MSE, MAE, MAPE, R2_score, mean_score))

# The collected metrics were previously built but never shown.
df_cv_results = pd.DataFrame(
    results,
    columns=['Model', 'MSE', 'MAE', 'MAPE', 'R2_score', 'Mean_CV_score'],
)
display(df_cv_results)

## Model Deployment with Gradio

#### Defining a Predict Function

In [None]:
# Model served by the app. It is fitted here explicitly because `xgb_model`
# was referenced by predict_yield() without ever being created, which made
# every prediction fail with NameError.
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)


def predict_yield(crop_type, rainfall, country, pesticides, temperature, year=2000):
    """Predict the crop yield for one set of growing conditions.

    Args:
        crop_type: Crop name; must be one of the values the 'crop'
            LabelEncoder was fitted on.
        rainfall: Average rainfall in mm per year (number or numeric string).
        country: Country name; must be one of the values the 'country'
            LabelEncoder was fitted on.
        pesticides: Pesticide use in tonnes (number or numeric string).
        temperature: Average temperature (number or numeric string).
        year: Value used for the 'year' feature. Defaults to 2000, the
            value that was previously hard-coded.

    Returns:
        The prediction formatted as a string with two decimals and the unit,
        e.g. "12345.67 hg/ha".

    Raises:
        Exception: Any error raised while converting the inputs, encoding
            the labels or predicting is printed and then re-raised unchanged.
    """
    try:
        # Convert inputs to the correct data types
        rainfall = float(rainfall)
        pesticides = float(pesticides)
        temperature = float(temperature)

        # Apply the same LabelEncoder transformations as during model training
        encoded_crop = label_encoders['crop'].transform([crop_type])[0]
        encoded_country = label_encoders['country'].transform([country])[0]

        # Create a DataFrame for the input; the column names and their order
        # must match the feature frame the model was trained on.
        input_data = pd.DataFrame(
            [[encoded_country, encoded_crop, year, rainfall, pesticides, temperature]],
            columns=['country', 'crop', 'year', 'average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp'],
        )

        # Make a prediction (single row in, single value out)
        prediction = xgb_model.predict(input_data)[0]

        return f"{prediction:.2f} hg/ha"
    except Exception as e:
        print(f"Error in prediction: {e}")
        # Bare `raise` re-raises the active exception with its original
        # traceback intact.
        raise


In [None]:
# Smoke test: a single prediction with hand-picked inputs, passed by keyword
# so each value is clearly tied to its parameter.
test_prediction = predict_yield(
    crop_type="Wheat",
    rainfall=100,
    country="Ecuador",
    pesticides=20,
    temperature=25,
)
print(test_prediction)


#### Creating Gradio Interface

In [None]:
# Gradio was used below without ever being imported, so this cell failed with
# NameError on the first `gr.` reference.
import gradio as gr

# Unique 'country' and 'crop' values, sorted alphabetically, to offer as
# dropdown options.
unique_countries = sorted(df['country'].unique().tolist())
unique_crop_types = sorted(df['crop'].unique().tolist())

# The inputs are listed in the same order as predict_yield's parameters:
# crop_type, rainfall, country, pesticides, temperature.
iface = gr.Interface(
    fn=predict_yield,
    inputs=[
        gr.Dropdown(choices=unique_crop_types, label="Crop Type"),
        gr.Number(label="Rainfall (mm)"),
        gr.Dropdown(choices=unique_countries, label="Country"),
        gr.Number(label="Pesticides (tonnes)"),
        gr.Number(label="Average Temperature (°C)")
    ],
    outputs="text",
    title="Crop Yield Prediction",
    description="Select the parameters to predict the crop yield.",
    # Gradio expects one of the strings "never" / "auto" / "manual" here; the
    # boolean form is deprecated. "never" keeps flagging disabled, as before.
    allow_flagging="never",
)

# Start the local web server for the app.
iface.launch()
