In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Loading the dataset for Kyoto
kyoto = pd.read_csv('data/kyoto.csv')

In [3]:
# Loading the dataset for Liestal
liestal = pd.read_csv('data/liestal.csv')

In [4]:
# Loading the dataset for Washington DC
washingtondc = pd.read_csv('data/washingtondc.csv')

In [5]:
all_places = pd.concat([washingtondc, kyoto, liestal])

In [6]:
all_places.head()

Unnamed: 0,location,lat,long,alt,year,bloom_date,bloom_doy
0,washingtondc,38.88535,-77.038628,0,1921,1921-03-20,79
1,washingtondc,38.88535,-77.038628,0,1922,1922-04-07,97
2,washingtondc,38.88535,-77.038628,0,1923,1923-04-09,99
3,washingtondc,38.88535,-77.038628,0,1924,1924-04-13,104
4,washingtondc,38.88535,-77.038628,0,1925,1925-03-27,86


In [7]:
all_places.tail()

Unnamed: 0,location,lat,long,alt,year,bloom_date,bloom_doy
123,liestal,47.4814,7.730519,350,2017,2017-03-26,85
124,liestal,47.4814,7.730519,350,2018,2018-04-08,98
125,liestal,47.4814,7.730519,350,2019,2019-03-27,86
126,liestal,47.4814,7.730519,350,2020,2020-03-17,77
127,liestal,47.4814,7.730519,350,2021,2021-03-28,87


In [8]:
all_places.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1062 entries, 0 to 127
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    1062 non-null   object 
 1   lat         1062 non-null   float64
 2   long        1062 non-null   float64
 3   alt         1062 non-null   int64  
 4   year        1062 non-null   int64  
 5   bloom_date  1062 non-null   object 
 6   bloom_doy   1062 non-null   int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 66.4+ KB


In [9]:
all_places.describe()

Unnamed: 0,lat,long,alt,year,bloom_doy
count,1062.0,1062.0,1062.0,1062.0,1062.0
mean,36.883259,100.025243,76.696798,1640.898305,103.036723
std,4.084265,70.781205,102.032042,319.594611,7.90981
min,35.011983,-77.038628,0.0,812.0,74.0
25%,35.011983,135.676114,44.0,1421.25,98.0
50%,35.011983,135.676114,44.0,1713.5,104.0
75%,35.011983,135.676114,44.0,1932.75,108.0
max,47.4814,135.676114,350.0,2021.0,124.0


In [10]:
X = all_places.drop(['bloom_date', 'year'], axis=1)

In [11]:
X = pd.get_dummies(X, columns=['location'])

In [12]:
X = X.drop('bloom_doy', axis=1)

In [13]:
Y = all_places['bloom_doy']

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=101)

In [16]:
from sklearn.ensemble import RandomForestRegressor

In [17]:
random_forest = RandomForestRegressor()

In [18]:
random_forest.fit(X_train, y_train)

RandomForestRegressor()

In [19]:
X_test

Unnamed: 0,lat,long,alt,location_kyoto,location_liestal,location_washingtondc
88,35.011983,135.676114,44,1,0,0
664,35.011983,135.676114,44,1,0,0
254,35.011983,135.676114,44,1,0,0
166,35.011983,135.676114,44,1,0,0
647,35.011983,135.676114,44,1,0,0
...,...,...,...,...,...,...
54,35.011983,135.676114,44,1,0,0
686,35.011983,135.676114,44,1,0,0
5,47.481400,7.730519,350,0,1,0
540,35.011983,135.676114,44,1,0,0


In [20]:
test_predictions = random_forest.predict(X_test)

In [21]:
test_predictions

array([104.17650571, 104.17650571, 104.17650571, 104.17650571,
       104.17650571, 104.17650571, 104.17650571, 100.60877111,
       104.17650571, 100.60877111, 104.17650571,  94.48491047,
       104.17650571, 104.17650571, 104.17650571, 104.17650571,
       104.17650571, 104.17650571,  94.48491047, 100.60877111,
       100.60877111, 104.17650571, 104.17650571, 104.17650571,
       104.17650571, 104.17650571, 104.17650571, 104.17650571,
       100.60877111, 100.60877111, 104.17650571, 104.17650571,
       104.17650571, 104.17650571, 104.17650571,  94.48491047,
       100.60877111, 104.17650571, 104.17650571, 100.60877111,
       104.17650571, 104.17650571, 104.17650571, 100.60877111,
       104.17650571, 100.60877111,  94.48491047, 104.17650571,
       104.17650571, 100.60877111, 100.60877111, 104.17650571,
       104.17650571, 104.17650571, 104.17650571, 104.17650571,
       104.17650571, 100.60877111, 104.17650571, 104.17650571,
       104.17650571, 104.17650571, 104.17650571, 104.17

In [22]:
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [23]:
MAE = mean_absolute_error(y_test, test_predictions)

In [24]:
MAE

5.559004892206691

In [25]:
MSE = mean_squared_error(y_test, test_predictions)

In [26]:
MSE

50.43977368013998

In [27]:
RMSE = np.sqrt(MSE)

In [28]:
RMSE

7.102096428530098

In [29]:
RMSE

7.102096428530098

In [30]:
next_years = [[38.0113,136.8848,45,0,1,0],[37.1348,140.7890,42,1,0,0],[46.1345,120.7890,82,0,0,1]]
random_forest.predict(next_years).round().astype(int)



array([102, 103,  99])

In [31]:
test_predictions = [round(pred) for pred in test_predictions]
avg_no_of_days = round(np.mean(test_predictions))

In [32]:
avg_no_of_days

103

In [33]:
print('The earliest predicted blossom date: ', min(test_predictions))

The earliest predicted blossom date:  94


In [34]:
print('The latest predicted blossom date: ', max(test_predictions))

The latest predicted blossom date:  104
