In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
pro_est_df = pd.read_csv('PRODUCTION ESTIMATES.csv')
pro_est_df

Unnamed: 0,REGION,DISTRICT,YEAR,CROP,AREA (HA),YIELD (MT/HA),PRODUCTION (MT)
0,WESTERN,SHAMA AHANTA EAST,2008,MAIZE,1099.0,1.15,1263.85
1,WESTERN,WEST AHANTA,2008,MAIZE,1433.0,1.19,1705.27
2,WESTERN,MPORHOR WASSA EAST,2008,MAIZE,2034.0,1.32,2684.88
3,WESTERN,WASSA WEST,2008,MAIZE,2204.0,1.20,2644.80
4,WESTERN,EAST NZEMA,2008,MAIZE,1550.0,1.38,2139.00
...,...,...,...,...,...,...,...
8231,NORTHERN REGION,MAMPRUGU MOAGDURI,2017,CASSAVA,900.0,7.95,7155.00
8232,NORTHERN REGION,YENDI,2017,CASSAVA,4790.0,13.59,65096.10
8233,NORTHERN REGION,MION,2017,CASSAVA,3831.0,6.96,26663.76
8234,NORTHERN REGION,TATALE SANGULI,2017,CASSAVA,5192.0,13.85,71909.20


In [3]:
pro_est_df = pro_est_df.fillna(0)

In [4]:
pro_est_df = pro_est_df.replace("                           -    ", 0)

In [5]:
pro_est_df = pro_est_df.replace(" -   ", 0)

In [6]:
pro_est_df = pro_est_df.replace(" - ", 0)

In [7]:
pro_est_df['AREA (HA)'] = pd.to_numeric(pro_est_df['AREA (HA)'])

In [8]:
pro_est_df['YIELD (MT/HA)'] = pd.to_numeric(pro_est_df['YIELD (MT/HA)'])

In [9]:
pro_est_df['PRODUCTION (MT)'] = pd.to_numeric(pro_est_df['PRODUCTION (MT)'])

In [10]:
pro_est_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8236 entries, 0 to 8235
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   REGION           8236 non-null   object 
 1   DISTRICT         8236 non-null   object 
 2   YEAR             8236 non-null   int64  
 3   CROP             8236 non-null   object 
 4   AREA (HA)        8236 non-null   float64
 5   YIELD (MT/HA)    8236 non-null   float64
 6   PRODUCTION (MT)  8236 non-null   float64
dtypes: float64(3), int64(1), object(3)
memory usage: 450.5+ KB


## finding the correlation of the columns

In [11]:
corr = pro_est_df.corr()
corr

Unnamed: 0,YEAR,AREA (HA),YIELD (MT/HA),PRODUCTION (MT)
YEAR,1.0,-0.115683,0.059953,-0.013695
AREA (HA),-0.115683,1.0,0.140375,0.574871
YIELD (MT/HA),0.059953,0.140375,1.0,0.652473
PRODUCTION (MT),-0.013695,0.574871,0.652473,1.0


## Encoding the object columns; changing classes to numbers

In [12]:
pro_est_df.REGION.value_counts()

ASHANTI            1635
BRONG AHAFO        1254
EASTERN            1203
WESTERN             971
VOLTA               965
CENTRAL             756
UPPER WEST          707
UPPER EAST          637
NORTHERN REGION      78
GREATER ACCRA        30
Name: REGION, dtype: int64

In [13]:
pro_est_df = pro_est_df.replace('MAIZE ', 'MAIZE')
pro_est_df = pro_est_df.replace('RICE ', 'RICE')

In [14]:
pro_est_df.CROP.value_counts()

MAIZE        1369
CASSAVA      1184
YAM          1133
RICE         1065
PLANTAIN     1061
COCOYAM       994
COWPEA        434
GROUNDNUT     344
SORGHUM       262
SOYABEAN      208
MILLET        182
Name: CROP, dtype: int64

In [15]:
pro_est_df.YEAR.value_counts()

2017    1147
2016     923
2015     870
2013     868
2014     867
2012     856
2011     705
2010     702
2009     650
2008     648
Name: YEAR, dtype: int64

In [16]:
cleanup_nums = {"REGION": {"ASHANTI": 0, "BRONG AHAFO": 1, "EASTERN": 2, "WESTERN": 3,
                           "VOLTA": 4, "CENTRAL": 5, "UPPER WEST": 6,
                           "UPPER EAST": 7, "NORTHERN REGION": 8, "GREATER ACCRA": 9},
                "CROP": {"MAIZE": 0, "CASSAVA": 1, "YAM": 2, "RICE": 3,
                           "PLANTAIN": 4, "COCOYAM": 5, "COWPEA": 6,
                           "GROUNDNUT": 7, "SORGHUM": 8, "SOYABEAN": 9,"MILLET": 10},
                "YEAR": {2008: 0, 2009: 1, 2010: 2, 2011: 3,
                           2012: 4, 2013: 5, 2014: 6,
                           2015: 7, 2016: 8, 2017: 9}}
pro_est_df= pro_est_df.replace(cleanup_nums)

In [17]:
pro_est_df

Unnamed: 0,REGION,DISTRICT,YEAR,CROP,AREA (HA),YIELD (MT/HA),PRODUCTION (MT)
0,3,SHAMA AHANTA EAST,0,0,1099.0,1.15,1263.85
1,3,WEST AHANTA,0,0,1433.0,1.19,1705.27
2,3,MPORHOR WASSA EAST,0,0,2034.0,1.32,2684.88
3,3,WASSA WEST,0,0,2204.0,1.20,2644.80
4,3,EAST NZEMA,0,0,1550.0,1.38,2139.00
...,...,...,...,...,...,...,...
8231,8,MAMPRUGU MOAGDURI,9,1,900.0,7.95,7155.00
8232,8,YENDI,9,1,4790.0,13.59,65096.10
8233,8,MION,9,1,3831.0,6.96,26663.76
8234,8,TATALE SANGULI,9,1,5192.0,13.85,71909.20


In [18]:
pro_est_df = pro_est_df.drop('DISTRICT', axis = 1)

In [19]:
pro_est_df

Unnamed: 0,REGION,YEAR,CROP,AREA (HA),YIELD (MT/HA),PRODUCTION (MT)
0,3,0,0,1099.0,1.15,1263.85
1,3,0,0,1433.0,1.19,1705.27
2,3,0,0,2034.0,1.32,2684.88
3,3,0,0,2204.0,1.20,2644.80
4,3,0,0,1550.0,1.38,2139.00
...,...,...,...,...,...,...
8231,8,9,1,900.0,7.95,7155.00
8232,8,9,1,4790.0,13.59,65096.10
8233,8,9,1,3831.0,6.96,26663.76
8234,8,9,1,5192.0,13.85,71909.20


In [20]:
y = pro_est_df['YIELD (MT/HA)']
X = pro_est_df.drop(['YIELD (MT/HA)'], axis = 1)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 14)

In [22]:
model = RandomForestRegressor()

In [23]:
model.fit(X_train, y_train)

RandomForestRegressor()

In [24]:
model.score(X_test, y_test)

0.987185013090853

In [25]:
model.predict(X_test)

array([2.971 , 0.8509, 7.1404, ..., 1.145 , 4.3261, 1.1347])

In [26]:
y_test

7984     2.97
7047     0.85
2597     7.00
3804     3.44
3372     6.50
        ...  
2262    16.20
6749     1.40
4015     1.25
3255     4.30
2212     1.12
Name: YIELD (MT/HA), Length: 1648, dtype: float64