#!/usr/bin/env python
# Filename: random_forest_classify
"""
introduction:
authors: Huang Lingcao
email:huanglingcao@gmail.com
add time: 28 June, 2019
"""
import sys, os
from optparse import OptionParser
HOME = os.path.expanduser('~')
# Landuse_DL
codes_dir = HOME + '/codes/PycharmProjects/Landuse_DL'
sys.path.insert(0, codes_dir)
# sys.path.insert(0, os.path.join(codes_dir, 'datasets'))
sys.path.insert(0, os.path.join(codes_dir, 'planetScripts')) # for import function in planet_svm_classify.py
from planetScripts.planet_svm_classify import classify_pix_operation
from planetScripts.planet_svm_classify import get_output_name
from planetScripts.planet_svm_classify import read_whole_x_pixels
# path of DeeplabforRS
codes_dir2 = HOME + '/codes/PycharmProjects/DeeplabforRS'
sys.path.insert(0, codes_dir2)
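# Note: the two sys.path entries above assume the Landuse_DL and DeeplabforRS repositories
# are cloned under ~/codes/PycharmProjects; adjust codes_dir and codes_dir2 if they live elsewhere.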
import basic_src.basic as basic
import numpy as np
from sklearn.ensemble import RandomForestClassifier
try:
    import joblib  # save and load model (standalone joblib; scikit-learn >= 0.23 no longer bundles it)
except ImportError:
    from sklearn.externals import joblib  # save and load model (older scikit-learn)
from sklearn import model_selection
model_saved_path = "sk_rf_trained.pkl"
scaler_saved_path = "scaler_saved.pkl"
def example_rf():
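    # A self-contained random forest regression walk-through (it appears to follow the widely
    # circulated "Random Forest in Python" temperature-prediction tutorial). It expects a local
    # temps.csv with at least year/month/day/average/actual/temp_1/friend columns and is
    # independent of the image-classification workflow below.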
# Pandas is used for data manipulation
import pandas as pd
# Read in data and display first 5 rows
features = pd.read_csv('temps.csv')
# print(features.head(5))
print('The shape of our features is:', features.shape)
# print(features.describe())
# One-hot encode the data using pandas get_dummies
features = pd.get_dummies(features)
# Display the first 5 rows of the last 12 columns
# print(features.iloc[:, 5:].head(5))
# Labels are the values we want to predict
labels = np.array(features['actual'])
# Remove the labels from the features
# axis 1 refers to the columns
features = features.drop('actual', axis=1)
# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)
    # Using Scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.25,
random_state=42)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)
    ######## Further data preparation (imputing missing values, or converting temporal variables into cyclical representations) would go here if needed ########
# The baseline predictions are the historical averages
baseline_preds = test_features[:, feature_list.index('average')]
# Baseline errors, and display average baseline error
baseline_errors = abs(baseline_preds - test_labels)
print('Average baseline error: ', round(np.mean(baseline_errors), 2))
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
# Train the model on training data
rf.fit(train_features, train_labels)
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')
###### Visualizing a Single Decision Tree #########
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Pull out one tree from the forest
tree = rf.estimators_[5]
# Export the image to a dot file
export_graphviz(tree, out_file='tree.dot', feature_names=feature_list, rounded=True, precision=1)
# Use dot file to create a graph
(graph,) = pydot.graph_from_dot_file('tree.dot')
# Write graph to a png file
graph.write_png('tree.png')
# Limit depth of tree to 3 levels
rf_small = RandomForestRegressor(n_estimators=10, max_depth=3)
rf_small.fit(train_features, train_labels)
# Extract the small tree
tree_small = rf_small.estimators_[5]
# Save the tree as a png image
export_graphviz(tree_small, out_file='small_tree.dot', feature_names=feature_list, rounded=True, precision=1)
(graph,) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png')
######## Variable Importances ############
# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)
    # Print out the features and their importances
    for pair in feature_importances:
        print('Variable: {:20} Importance: {}'.format(*pair))
######## Visualizations ################
import matplotlib.pyplot as plt
# Set the style
plt.style.use('fivethirtyeight')
# list of x locations for plotting
x_values = list(range(len(importances)))
# Make a bar chart
    plt.bar(x_values, importances)
# Tick labels for x axis
plt.xticks(x_values, feature_list, rotation='vertical')
# Axis labels and title
plt.ylabel('Importance')
plt.xlabel('Variable')
plt.title('Variable Importances')
plt.show()
# Use datetime for creating date objects for plotting
import datetime
# Dates of training values
months = features[:, feature_list.index('month')]
days = features[:, feature_list.index('day')]
years = features[:, feature_list.index('year')]
# List and then convert to datetime object
dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in
zip(years, months, days)]
dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in dates]
# Dataframe with true values and dates
true_data = pd.DataFrame(data={'date': dates, 'actual': labels})
# Dates of predictions
months = test_features[:, feature_list.index('month')]
days = test_features[:, feature_list.index('day')]
years = test_features[:, feature_list.index('year')]
# Column of dates
test_dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in
zip(years, months, days)]
# Convert to datetime objects
test_dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in test_dates]
# Dataframe with predictions and dates
predictions_data = pd.DataFrame(data={'date': test_dates, 'prediction': predictions})
# Plot the actual values
plt.plot(true_data['date'], true_data['actual'], 'b-', label='actual')
# Plot the predicted values
plt.plot(predictions_data['date'], predictions_data['prediction'], 'ro', label='prediction')
    plt.xticks(rotation=60)
plt.legend()
# Graph labels
plt.xlabel('Date')
plt.ylabel('Maximum Temperature (F)')
plt.title('Actual and Predicted Values')
plt.show()
# Make the data accessible for plotting
true_data['temp_1'] = features[:, feature_list.index('temp_1')]
true_data['average'] = features[:, feature_list.index('average')]
true_data['friend'] = features[:, feature_list.index('friend')]
# Plot all the data as lines
plt.plot(true_data['date'], true_data['actual'], 'b-', label='actual', alpha=1.0)
plt.plot(true_data['date'], true_data['temp_1'], 'y-', label='temp_1', alpha=1.0)
plt.plot(true_data['date'], true_data['average'], 'k-', label='average', alpha=0.8)
plt.plot(true_data['date'], true_data['friend'], 'r-', label='friend', alpha=0.3)
# Formatting plot
plt.legend()
    plt.xticks(rotation=60)
    # Labels and title
plt.xlabel('Date')
plt.ylabel('Maximum Temperature (F)')
plt.title('Actual Max Temp and Variables')
plt.show()
pass
class classify_pix_operation_rf(classify_pix_operation):
"""perform classify operation on raster images using random forest"""
def __init__(self):
super(classify_pix_operation_rf, self).__init__()
def __del__(self):
        super(classify_pix_operation_rf, self).__del__()  # calling the parent __del__ is optional
pass
def train_rf_classifier(self, training_X, training_y):
        '''
        train a random forest classifier
        :param training_X: X array, an array of size [n_records, n_features(fields)]
        :param training_y: y array, an array of size [n_records, 1 (class)]
        :return: None; the trained model is saved to model_saved_path
        '''
if self._classifier is None:
self._classifier = RandomForestClassifier(n_estimators=25)
        else:
            basic.outputlogMessage('warning, classifier already exists, this operation will replace the old one')
            self._classifier = RandomForestClassifier(n_estimators=25)
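        # Note: the classifier created above is never fit directly below; the model that is
        # actually trained and saved is the GridSearchCV estimator (clf) built further down.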
if os.path.isfile(scaler_saved_path) and self._scaler is None:
self._scaler = joblib.load(scaler_saved_path)
result = self._scaler.transform(training_X)
X = result.tolist()
elif self._scaler is not None:
result = self._scaler.transform(training_X)
X = result.tolist()
else:
X = training_X
basic.outputlogMessage('warning, no pre-processing of data before training')
y = training_y
basic.outputlogMessage('Training data set nsample: %d, nfeature: %d' % (len(X), len(X[0])))
# sub sample and make the class 0 and 1 balanced (have the same number)
basic.outputlogMessage('Number of sample before sub-sample: %d, class 0: %d, class 1: %d'%
(len(X),len(np.where(y==0)[0]),len(np.where(y==1)[0])))
from imblearn.under_sampling import RandomUnderSampler
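        # The two lines below follow the imbalanced-learn API available when this script was
        # written (2019); newer releases drop return_indices (use rus.sample_indices_ instead)
        # and rename fit_sample to fit_resample.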
rus = RandomUnderSampler(return_indices=True)
X_rus, y_rus, id_rus = rus.fit_sample(X, y)
X = X_rus
y = y_rus
basic.outputlogMessage('Number of sample after sub-sample: %d, class 0: %d, class 1: %d'%
(len(X),len(np.where(y==0)[0]),len(np.where(y==1)[0])))
X_train = X
y_train = y
# # for test by hlc
# X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.95, random_state=0)
# X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=0)
# print('Parameters currently in use:\n')
# print(self._classifier.get_params())
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]
        # Create the search grid
search_grid = {'n_estimators': n_estimators}
# search_grid = {'n_estimators': n_estimators,
# 'max_features': max_features,
# 'max_depth': max_depth,
# 'min_samples_split': min_samples_split,
# 'min_samples_leaf': min_samples_leaf,
# 'bootstrap': bootstrap}
basic.outputlogMessage(str(search_grid))
        clf = model_selection.GridSearchCV(RandomForestClassifier(), search_grid, cv=5,
                                           scoring='f1_macro', n_jobs=-1, verbose=3)
clf.fit(X_train, y_train)
basic.outputlogMessage("Best parameters set found on development set:" + str(clf.best_params_))
basic.outputlogMessage("Grid scores on development set:\n")
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
basic.outputlogMessage("%0.3f (+/-%0.03f) for %r"
% (mean, std * 2, params))
# fit_model = self._classifier.fit(X,y)
# basic.outputlogMessage(str(fit_model))
# save the classification model
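        # GridSearchCV refits the best parameter combination on the whole training set by default
        # (refit=True), so the saved object can be loaded and used directly for prediction.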
joblib.dump(clf, model_saved_path)
pass
def main(options, args):
basic.outputlogMessage('Is_preprocessing:' + str(options.ispreprocess))
basic.outputlogMessage('Is_training:' + str(options.istraining))
classify_obj = classify_pix_operation_rf()
if options.ispreprocess:
# preprocessing
input_tif = args[0]
if os.path.isfile(scaler_saved_path) is False:
# #read whole data set for pre-processing
X, _, _ = read_whole_x_pixels(input_tif)
classify_obj.pre_processing(X)
else:
            basic.outputlogMessage('warning, the saved scaler already exists, skip pre-processing')
elif options.istraining:
# training
if options.polygon_train is None:
# read training data (make sure 'subImages', 'subLabels' is under current folder)
X, y = classify_obj.read_training_pixels_from_multi_images('subImages', 'subLabels')
else:
input_tif = args[0]
X, y = classify_obj.read_training_pixels_inside_polygons(input_tif, options.polygon_train)
if os.path.isfile(model_saved_path) is False:
classify_obj.train_rf_classifier(X, y)
else:
basic.outputlogMessage("warning, trained model already exist, skip training")
else:
# prediction
input_tif = args[0]
if options.output is not None:
output = options.output
else:
output = get_output_name(input_tif)
        basic.outputlogMessage('starting prediction on image:' + str(input_tif))
classify_obj.prediction_on_a_image(input_tif, output, model_saved_path)
pass
if __name__ == "__main__":
usage = "usage: %prog [options] input_image "
parser = OptionParser(usage=usage, version="1.0 2019-1-4")
parser.description = 'Introduction: pixel-based image classification based on random forest classifier'
parser.add_option("-p", "--ispreprocess",
action="store_true", dest="ispreprocess", default=False,
help="to indicate the script will perform pre-processing, if this set, istraining will be ignored")
parser.add_option("-t", "--istraining",
action="store_true", dest="istraining", default=False,
help="to indicate the script will perform training process")
parser.add_option("-s", "--shape_train",
action="store", dest="polygon_train",
help="the shape file containing polygons for training")
parser.add_option("-o", "--output",
action="store", dest="output",
help="the output file path")
(options, args) = parser.parse_args()
if len(sys.argv) < 2:
parser.print_help()
sys.exit(2)
basic.setlogfile('RandomForest_log.txt')
main(options, args)
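
# Example invocations (file names below are hypothetical placeholders):
#   python random_forest_classify.py -p input.tif                             # fit and save the scaler (pre-processing)
#   python random_forest_classify.py -t input.tif                             # train from the 'subImages' and 'subLabels' folders
#   python random_forest_classify.py -t -s training_polygons.shp input.tif    # train from polygons in a shapefile
#   python random_forest_classify.py -o classified.tif input.tif              # predict on an image with the saved model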