# <h3><i>1. Load persisted models from disk<br> 2. Perform prediction<br> 3. Write the resulting predictions to disk.</i></h3>

In [2]:
import os
import subprocess
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pyspark import SparkContext
import operator
import warnings
import random
import math

# Set the PYSPARK_PYTHON environment variable to the system Python 3 binary.
os.environ.update({'PYSPARK_PYTHON': '/usr/bin/python3'})

# Set the 'spark.executor.memory' Spark system property to 5g.
SparkContext.setSystemProperty(
    'spark.executor.memory',
    '5g',
)

# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [23]:
# CSV with the pre-processed, unlabeled samples that predictions are made for.
unlabeled_data = "pre_processed_data/pre_processed_unlabeled.csv"
# Alternative persisted model, currently disabled:
# inputModel = "saved_models/Random_Forest_Model_1000.pkl"
# Persisted Random Forest model that is loaded from disk with joblib.
inputModel = "saved_models/Random_Forest_Model_final.pkl"
# Destination CSV for the final predictions.
outputFile = "final_prediction_result/final_results_13.csv"

In [9]:
# Every feature column, with the label column ('birdPresent') listed last.
features = [
    'sampling_id', 'loc_id', 'month', 'time', 'timeSin', 'timeCos',
    'effort_hours', 'population_per_mile', 'housing_density', 'housing_vacant',
    'OMERNIK_L3_ECOREGION', 'Average_Temp', 'Flowing_fresh_in',
    'WetVeg_fresh_from', 'WetVeg_fresh_in',
    'flowing_brackish_from', 'flowing_brackish_in',
    'standing_brackish_from', 'standing_brackish_in',
    'wetveg_brackish_from', 'wetveg_brackish_in',
    'birdPresent',
]

# Reduced feature set used for experimentation: the sample id plus the
# eleven predictor columns.
sampleFeatures = [
    'sampling_id',
    'loc_id', 'month', 'time', 'timeSin', 'timeCos', 'effort_hours',
    'population_per_mile', 'housing_density', 'housing_vacant',
    'OMERNIK_L3_ECOREGION', 'Average_Temp',
]

# Independent variables: the reduced set without the 'sampling_id' column.
independentVariables = [
    column for column in sampleFeatures if column != 'sampling_id'
]

# Label column.
dependentVariable = 'birdPresent'

# <i>Load Unlabeled Data</i>

In [10]:
# Read the unlabeled samples. The column names are supplied through
# sampleFeatures, so the first line of the file is parsed as data
# (header=None), which is also what passing `names` alone implies.
birdDataPD = pd.read_csv(
    unlabeled_data,
    header=None,
    names=sampleFeatures,
)

In [12]:
# Preview the first rows of the freshly loaded frame.
birdDataPD.head()

Unnamed: 0,sampling_id,loc_id,month,time,timeSin,timeCos,effort_hours,population_per_mile,housing_density,housing_vacant,OMERNIK_L3_ECOREGION,Average_Temp
0,S18238876,2829621,5,0.541667,-0.258819,-0.965926,0.333,-999.0,-999.0,-999.0,-999,-999
1,S18107686,2803720,4,0.583333,-0.5,-0.866025,0.167,-999.0,-999.0,-999.0,-999,-999
2,S16945485,1018735,2,0.666667,-0.866025,-0.5,0.733,-999.0,-999.0,-999.0,76,-999
3,S17150783,1018735,2,0.333333,0.866025,-0.5,0.75,-999.0,-999.0,-999.0,76,-999
4,S17342160,825761,3,0.333333,0.866025,-0.5,0.467,1949.2,1144.513853,0.228048,-999,7


In [13]:
# Summarise column dtypes, non-null counts and memory usage of the loaded frame.
birdDataPD.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189501 entries, 0 to 189500
Data columns (total 12 columns):
sampling_id             189501 non-null object
loc_id                  189501 non-null int64
month                   189501 non-null int64
time                    189501 non-null float64
timeSin                 189501 non-null float64
timeCos                 189501 non-null float64
effort_hours            189501 non-null float64
population_per_mile     189501 non-null float64
housing_density         189501 non-null float64
housing_vacant          189501 non-null float64
OMERNIK_L3_ECOREGION    189501 non-null int64
Average_Temp            189501 non-null int64
dtypes: float64(7), int64(4), object(1)
memory usage: 17.3+ MB


# <i>Load persisted Random Forest Model</i>

In [14]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. The legacy location is tried first so that environments where this
# notebook already worked keep loading the model with the same joblib module;
# newer installs fall back to the standalone `joblib` package.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
RF_Model = joblib.load(inputModel)

# <i>Perform final prediction using Random Forest Model</i>

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Feed only the independent-variable columns to the loaded model and store
# the predicted label for each row in a new 'SAW_AGELAIUS_PHOENICEUS' column.
birdDataPD['SAW_AGELAIUS_PHOENICEUS'] = RF_Model.predict(
    birdDataPD.loc[:, independentVariables]
)

In [19]:
# Preview the first rows, now including the predicted 'SAW_AGELAIUS_PHOENICEUS' column.
birdDataPD.head()

Unnamed: 0,sampling_id,loc_id,month,time,timeSin,timeCos,effort_hours,population_per_mile,housing_density,housing_vacant,OMERNIK_L3_ECOREGION,Average_Temp,SAW_AGELAIUS_PHOENICEUS
0,S18238876,2829621,5,0.541667,-0.258819,-0.965926,0.333,-999.0,-999.0,-999.0,-999,-999,0
1,S18107686,2803720,4,0.583333,-0.5,-0.866025,0.167,-999.0,-999.0,-999.0,-999,-999,0
2,S16945485,1018735,2,0.666667,-0.866025,-0.5,0.733,-999.0,-999.0,-999.0,76,-999,0
3,S17150783,1018735,2,0.333333,0.866025,-0.5,0.75,-999.0,-999.0,-999.0,76,-999,0
4,S17342160,825761,3,0.333333,0.866025,-0.5,0.467,1949.2,1144.513853,0.228048,-999,7,0


In [21]:
# Rename the id column to the header used for the output file. An explicit
# mapping renames only the exact 'sampling_id' label, whereas the previous
# substring replacement would also have rewritten any other column whose
# name merely contained that text. Rebinding the result avoids the in-place
# mutation of the frame.
birdDataPD = birdDataPD.rename(columns={'sampling_id': 'SAMPLING_EVENT_ID'})

In [22]:
# Preview the first rows to confirm the id column is now 'SAMPLING_EVENT_ID'.
birdDataPD.head()

Unnamed: 0,SAMPLING_EVENT_ID,loc_id,month,time,timeSin,timeCos,effort_hours,population_per_mile,housing_density,housing_vacant,OMERNIK_L3_ECOREGION,Average_Temp,SAW_AGELAIUS_PHOENICEUS
0,S18238876,2829621,5,0.541667,-0.258819,-0.965926,0.333,-999.0,-999.0,-999.0,-999,-999,0
1,S18107686,2803720,4,0.583333,-0.5,-0.866025,0.167,-999.0,-999.0,-999.0,-999,-999,0
2,S16945485,1018735,2,0.666667,-0.866025,-0.5,0.733,-999.0,-999.0,-999.0,76,-999,0
3,S17150783,1018735,2,0.333333,0.866025,-0.5,0.75,-999.0,-999.0,-999.0,76,-999,0
4,S17342160,825761,3,0.333333,0.866025,-0.5,0.467,1949.2,1144.513853,0.228048,-999,7,0


# <i>Write predictions to disk</i>

In [24]:
# Only the sampling event id and the predicted label are written out.
output_columns = ['SAMPLING_EVENT_ID', 'SAW_AGELAIUS_PHOENICEUS']

# to_csv does not create missing directories, so make sure the target folder
# exists before writing. The guard skips this when outputFile is a bare file
# name, because os.makedirs('') would raise.
output_dir = os.path.dirname(outputFile)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# index=False keeps the DataFrame's row index out of the CSV.
birdDataPD[output_columns].to_csv(outputFile, index=False)