# Author: Wajid Umar
## Project: Prediction of grassland area in different countries around the globe
### Algorithm: Random Forest Regressor

# Global Grassland area 

### Importing libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import scipy.stats as stats

### Import the grassland dataset retrieved from the FAO stats

In [44]:
df=pd.read_csv('grassland_data.csv')
df.head()

Unnamed: 0,Domain,Area,Element,Item,Year,Unit,Value
0,Land Cover,Afghanistan,Area from MODIS,Grassland,2001,1000 ha,23533.7602
1,Land Cover,Afghanistan,Area from MODIS,Grassland,2002,1000 ha,23597.6426
2,Land Cover,Afghanistan,Area from MODIS,Grassland,2003,1000 ha,23822.6478
3,Land Cover,Afghanistan,Area from MODIS,Grassland,2004,1000 ha,24072.0168
4,Land Cover,Afghanistan,Area from MODIS,Grassland,2005,1000 ha,24356.4181


### Remove the unnecessary columns

In [45]:
df1=df.drop(["Domain", "Element", "Item", "Unit"],axis=1)
df1.head()

Unnamed: 0,Area,Year,Value
0,Afghanistan,2001,23533.7602
1,Afghanistan,2002,23597.6426
2,Afghanistan,2003,23822.6478
3,Afghanistan,2004,24072.0168
4,Afghanistan,2005,24356.4181


### Drop the rows contain zero values

In [46]:
df2=df1[~(df[df.columns[3:]] == 0).any(axis=1)]


In [47]:
df2.head()

Unnamed: 0,Area,Year,Value
0,Afghanistan,2001,23533.7602
1,Afghanistan,2002,23597.6426
2,Afghanistan,2003,23822.6478
3,Afghanistan,2004,24072.0168
4,Afghanistan,2005,24356.4181


In [48]:
df2.describe()

Unnamed: 0,Year,Value
count,4180.0,4180.0
mean,2010.52823,15628.678359
std,5.765491,46365.376249
min,2001.0,0.0215
25%,2006.0,62.6213
50%,2011.0,889.2686
75%,2016.0,5449.37985
max,2020.0,288630.1122


### Encoding the character variables

In [49]:
from sklearn.preprocessing import LabelEncoder
def Encoder(df2):
    columnsToEncode = list(df2.select_dtypes(include=['category','object']))
    le=LabelEncoder()
    for feature in columnsToEncode:
        try:
            df2[feature] = le.fit_transform(df2[feature])
        except:
            print('Error encoding '+feature)
    return df

In [55]:
df2=Encoder(df2)

In [56]:
df2.head()

Unnamed: 0,Domain,Area,Element,Item,Year,Unit,Value
0,0,0,0,0,2001,0,23533.7602
1,0,0,0,0,2002,0,23597.6426
2,0,0,0,0,2003,0,23822.6478
3,0,0,0,0,2004,0,24072.0168
4,0,0,0,0,2005,0,24356.4181


In [70]:
df3=df2.drop(["Domain", "Element", "Item", "Unit"],axis=1)
df3.tail()

Unnamed: 0,Area,Year,Value
4778,241,2016,30225.4824
4779,241,2017,29840.5651
4780,241,2018,29511.18
4781,241,2019,30022.9778
4782,241,2020,30862.4778


### Assigning the independent and dependent variables 

### Input variables

In [60]:
X=df3.iloc[:,0:2]
X.head()

Unnamed: 0,Area,Year
0,0,2001
1,0,2002
2,0,2003
3,0,2004
4,0,2005


### Output variable

In [61]:
y=df3.iloc[:,2]
y.head()

0    23533.7602
1    23597.6426
2    23822.6478
3    24072.0168
4    24356.4181
Name: Value, dtype: float64

### Fitting the model

In [62]:
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor().fit(X,y)

### Spliting data into training and testing data

In [63]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Model training

In [64]:
from sklearn.ensemble import RandomForestRegressor
model1=RandomForestRegressor().fit(X_train,y_train)

### Prediction

In [65]:
y_pred=model1.predict(X_test)
y_pred

array([0.00000000e+00, 2.19158092e+03, 0.00000000e+00, 1.82671000e-01,
       3.57658370e+01, 1.36912097e+04, 2.41581145e+03, 2.31690297e+05,
       8.74596375e+03, 9.41290637e+03, 2.41961216e+04, 2.51781148e+02,
       2.33766484e+04, 2.95314428e+03, 1.35629707e+02, 1.70975856e+04,
       0.00000000e+00, 9.12765895e+03, 1.65173819e+04, 1.10488990e+01,
       5.51813638e+02, 9.42493770e+02, 4.35207810e+01, 3.90393071e+03,
       4.05165941e+03, 1.77335607e+03, 2.88623500e+01, 3.43587004e+02,
       2.53523000e-01, 7.41151600e+00, 4.66151829e+03, 3.19217278e+04,
       0.00000000e+00, 9.84790276e+02, 2.58316584e+02, 2.49202242e+02,
       6.98628800e+00, 1.94301016e+04, 2.10112459e+03, 1.35533300e+00,
       5.59000000e-03, 2.46788848e+02, 1.01699512e+02, 3.17093610e+01,
       2.29521319e+05, 1.61852000e+00, 2.94578602e+02, 4.37186972e+04,
       0.00000000e+00, 5.73931200e+00, 8.32305867e+03, 1.35535916e+02,
       0.00000000e+00, 0.00000000e+00, 3.02536029e+03, 2.54370891e+02,
      

### Random number prediction
Below in the given array, first number is the code of a country that we got by Label encoder and the second number is depicting the year.

In [74]:
model1.predict([[9, 2030],[8, 2031],[15, 2032],[100, 2033],[150, 2034],[80, 2035],[17, 2036],[1, 2037],[20, 2038],[30, 2039],[35, 2040],[25, 2041],[22, 2042],[55, 2043],[65, 2044],[60, 2045],[70, 2046],[72, 2047],[88, 2048],[6, 2049],[12, 2050]])

array([8.56854652e+04, 1.43631300e+00, 1.79890205e+02, 8.67113335e+02,
       0.00000000e+00, 1.06511737e+02, 3.60081582e+02, 8.23871325e+02,
       2.22617536e+02, 7.66506800e+00, 2.76701702e+03, 1.77538478e+04,
       3.83211758e+03, 1.88876988e+02, 1.77335607e+03, 5.86151264e+03,
       3.77860450e+02, 5.04046426e+04, 9.43401216e+03, 2.92580000e-01,
       1.76356238e+05])

### Model score

In [66]:
model1.score(X_test, y_test)

0.9997490148382878

### Metrics to evaluate the model performance

In [67]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
print("R2 vale is:", r2_score(y_test, y_pred))
print("MSE is:", mean_squared_error(y_test, y_pred))
print("MAE is:", mean_absolute_error(y_test, y_pred))

R2 vale is: 0.9997490148382878
MSE is: 555271.8473604858
MAE is: 149.87309893521285
