<a href="https://colab.research.google.com/github/tulsisahu9785-cpu/EDA-/blob/main/Crop_Yield_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the 'patelris/crop-yield-prediction-dataset' dataset and keep the value
# returned by kagglehub (presumably the local directory holding the files --
# confirm against the kagglehub documentation).
patelris_crop_yield_prediction_dataset_path = kagglehub.dataset_download('patelris/crop-yield-prediction-dataset')

# Simple progress marker so the reader can see the download call returned.
print('Data source import complete.')


# Introduction to Crop Yield Prediction
Crop yield prediction is a critical application of data science and machine learning in agriculture. It involves forecasting the amount of crop production (yield) based on various environmental, climatic, and agricultural factors. Accurate predictions help farmers optimize resource allocation, governments plan food security policies, and businesses manage supply chains. In this notebook, we use a dataset from Kaggle that includes features like area, item (crop type), year, rainfall, pesticide usage, and average temperature to build predictive models.

![image.png](attachment:78f6514d-0ad3-4f1d-af4b-af948b5154be.png)![image.png](attachment:e7e5af5a-11a0-44b6-81cf-99949af66352.png)


In [None]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import mean_absolute_error,r2_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder,StandardScaler,OneHotEncoder
from sklearn.tree import DecisionTreeRegressor


In [None]:
# Build the CSV location from the directory returned by kagglehub.dataset_download
# instead of the Kaggle-only '/kaggle/input/...' mount point, so the notebook also
# runs in Colab or locally, where that mount point does not exist.
DATA_FILE = Path(patelris_crop_yield_prediction_dataset_path) / 'yield_df.csv'
df = pd.read_csv(DATA_FILE)

# Preview only the first rows rather than rendering the whole frame.
df.head()

In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# Remove the 'Unnamed: 0' column (presumably the row index written out when the CSV
# was exported -- confirm). Rebinding instead of mutating in place, together with
# errors='ignore', makes this cell safe to re-run: the original raised KeyError on
# a second run because the column was already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Print the frame summary (column dtypes, non-null counts, memory usage).
df.info()

In [None]:
def unique_val(df, columns):
    """Return the distinct values of the requested columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    columns : iterable of column labels
        Columns whose distinct values should be collected.

    Returns
    -------
    dict
        Maps each requested column label to a list of its unique values, in the
        order produced by ``Series.unique()``.

    Raises
    ------
    ValueError
        If any requested column is absent from ``df``. All columns are validated
        before any unique values are computed.
    """
    # Validation pass: fail on the first missing column before doing any work.
    for col in columns:
        if col in df.columns:
            continue
        raise ValueError(f"Column {col} not found in DataFrame")

    # Collection pass: build the mapping one column at a time.
    distinct = {}
    for col in columns:
        distinct[col] = df[col].unique().tolist()
    return distinct

In [None]:
# Show the distinct values of the 'Area', 'Item' and 'Year' columns.
unique_val(df, ['Area', 'Item', 'Year'])

In [None]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Drop exact duplicate rows. Rebinding the name (rather than inplace=True) keeps the
# step explicit and chainable; the resulting frame contents are the same.
df = df.drop_duplicates()

In [None]:
# Plot the mean yield per year. The frame holds rows for multiple areas and crop
# items, and they are not sorted by year, so drawing a line through the raw rows
# connects unrelated observations and produces an unreadable zig-zag.
# groupby sorts the years, so the line runs left to right in time order.
yearly_mean_yield = df.groupby('Year')['hg/ha_yield'].mean()

plt.figure(figsize=(12,12))
plt.plot(yearly_mean_yield.index, yearly_mean_yield.values)
plt.xlabel('Year')
plt.ylabel('Mean yield (hg/ha)')
plt.title('Mean crop yield per year')
plt.show()

In [None]:
# Pairwise correlations between the selected columns, drawn as an annotated heatmap.
heatmap_columns = ['Year', 'hg/ha_yield','average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp']
corr_ = df[heatmap_columns].corr()

plt.figure(figsize=(6, 6))
sns.heatmap(corr_, annot=True)
plt.show()

In [None]:
# Box plot of yield grouped by 'Area'. The x tick labels are rotated 90 degrees so
# the many area names do not overlap.
fig, ax = plt.subplots(figsize=(24, 12))
sns.boxplot(data=df, x='Area', y='hg/ha_yield', ax=ax)
plt.xticks(rotation=90)

fig.tight_layout()
plt.show()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Number of rows per 'Area', drawn as horizontal bars on a tall canvas.
fig, ax = plt.subplots(figsize=(24, 24))
sns.countplot(data=df, y='Area', ax=ax)
plt.show()

In [None]:
# Number of rows per 'Item', drawn as horizontal bars.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df, y='Item', ax=ax)
plt.show()

In [None]:
# Separate the predictors (X) from the target column (Y).
# `columns=` states the intent directly; the original `axis=True` only worked
# because True happens to equal 1.
X = df.drop(columns=['hg/ha_yield'])
Y = df['hg/ha_yield']

In [None]:
# Hold out 5% of the rows for testing; the fixed random_state makes the shuffled
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.05,
    shuffle=True,
    random_state=42,
)

In [None]:
# Print the shape of each split, in the order: train X, train y, test X, test y.
for split_part in (x_train, y_train, x_test, y_test):
    print(split_part.shape)

In [None]:
# One-hot encoder that drops the first category of each encoded feature.
encoder=OneHotEncoder(drop='first')
# Standardising scaler; the (misspelled) name `scaleing` is referenced by later cells.
scaleing=StandardScaler()

In [None]:
# Preview the first five training rows to check the column order.
x_train.head(5)

In [None]:
# Column positions within x_train. Judging by the argument order of prediction(),
# positions 0-1 are presumably 'Area' and 'Item' and 2-5 the numeric features --
# confirm against x_train.head().
scaled_positions = [2, 3, 4, 5]
one_hot_positions = [0, 1]

# Scale the columns at `scaled_positions`, one-hot encode those at
# `one_hot_positions`, and pass any other column through unchanged.
preprocesser = ColumnTransformer(
    transformers=[
        ('StandardScale', scaleing, scaled_positions),
        ('OHE', encoder, one_hot_positions),
    ],
    remainder='passthrough',
)

In [None]:
# Display the configured transformer.
preprocesser

In [None]:
# Fit the preprocessing on the training split only, then apply the already-fitted
# transformer to the test split so no test data is used during fitting.
dummy_x_train=preprocesser.fit_transform(x_train)
dummy_x_test=preprocesser.transform(x_test)

In [None]:
# Candidate regressors to compare. The decision tree gets a fixed random_state so
# its scores are reproducible from run to run (42 matches the seed used for the
# train/test split).
models = {
    'lr': LinearRegression(),
    'lss': Lasso(max_iter=10000),
    'Rid': Ridge(solver="lsqr"),
    'Dtr': DecisionTreeRegressor(random_state=42),
}

# Fit each model on the transformed training data and report MAE and R^2 on the
# transformed test data.
for name, md in models.items():
    md.fit(dummy_x_train, y_train)
    y_pred = md.predict(dummy_x_test)
    print(f"{name} : mae : {mean_absolute_error(y_test, y_pred)} score : {r2_score(y_test, y_pred)}")


In [None]:
# Final model: a decision tree with a fixed seed so the fitted tree (and the
# pickled artefact produced from it) is reproducible across runs.
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(dummy_x_train, y_train)


In [None]:
# Predictions of the fitted tree on the transformed test features.
pred=dtr.predict(dummy_x_test)

In [None]:
# r2_score expects (y_true, y_pred). R^2 is not symmetric, so passing the
# predictions first (as the original did) reports a different, incorrect score.
print(r2_score(y_test, pred))

In [None]:
# Show one training row as a reference for the inputs prediction() expects.
x_train.head(1)

In [None]:
def prediction(Area,Item,Year,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp):
    """Predict the yield for a single observation with the fitted tree.

    The arguments are given in the same order as the columns of ``x_train``,
    the frame the preprocessing was fitted on.

    Parameters
    ----------
    Area, Item : str
        Categorical inputs passed to the one-hot encoder.
    Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp : number
        Numeric inputs passed to the scaler.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, 1) holding the predicted value.
    """
    # Build a one-row DataFrame that reuses the training column labels. Each value
    # keeps its own dtype, whereas np.array() on a mixed str/number row coerces
    # everything to strings and hands the transformer an unnamed ndarray.
    feature = pd.DataFrame(
        [[Area, Item, Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp]],
        columns=x_train.columns,
    )
    trans_feat = preprocesser.transform(feature)
    return dtr.predict(trans_feat).reshape(1, -1)

In [None]:
# Example inputs, listed in the same order as prediction()'s parameters.
Area = 'Albania'
Item = 'Maize'
Year = 1990
average_rain_fall_mm_per_year = 1485.0
pesticides_tonnes = 121.00
avg_temp = 16.37

# Keyword arguments make the pairing of value and parameter explicit.
result = prediction(
    Area=Area,
    Item=Item,
    Year=Year,
    average_rain_fall_mm_per_year=average_rain_fall_mm_per_year,
    pesticides_tonnes=pesticides_tonnes,
    avg_temp=avg_temp,
)

In [None]:
# Display the value returned by prediction() for the example inputs.
result

In [None]:
# Persist the fitted model and the fitted preprocessing. The `with` blocks make sure
# each file is flushed and closed; the original left both handles open.
# NOTE: only load these pickles from trusted sources -- unpickling can execute
# arbitrary code.
with open('dtr.pkl', 'wb') as model_file:
    pickle.dump(dtr, model_file)

with open('preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocesser, preprocessor_file)