In [1]:
import os
import subprocess
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pyspark import SparkContext
import operator
import warnings
import random
import math

# Set the PYSPARK_PYTHON environment variable (the interpreter PySpark is expected
# to launch its Python workers with).
# NOTE(review): hardcoded absolute path; confirm it exists on every machine that runs this.
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'

# Set the Spark system property for executor memory to 5g.
# NOTE(review): no SparkContext is created in the cells shown here; confirm Spark is still needed.
SparkContext.setSystemProperty('spark.executor.memory', '5g')

# Hide DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
# Path (relative to the notebook's working directory) of the CSV that is loaded below.
# Alternative input kept for reference:
# inputFile = "pre_processed_data/pre_processed_sample_100000.csv"
# NOTE(review): the FileNotFoundError output stored under the load cell names
# ..._original_10.csv, so that output predates this path; re-run the notebook to refresh it.
inputFile = "pre_processed_data/pre_processed_labelled_original_13.csv"

# <i>Experimenting with features</i>

In [3]:
# Every column name available for the data set, in order.
features = [
    'sampling_id', 'loc_id', 'month', 'time', 'timeSin', 'timeCos',
    'effor_hours', 'population_per_mile', 'housing_density', 'housing_vacant',
    'OMERNIK_L3_ECOREGION', 'Average_Temp', 'Flowing_fresh_in',
    'WetVeg_fresh_from', 'WetVeg_fresh_in',
    'flowing_brackish_from', 'flowing_brackish_in',
    'standing_brackish_from', 'standing_brackish_in',
    'wetveg_brackish_from', 'wetveg_brackish_in',
    'birdPresent',
]

# Reduced column list used for the current experiments.
sampleFeatures = [
    'sampling_id', 'loc_id', 'month', 'time', 'timeSin', 'timeCos',
    'effor_hours', 'population_per_mile', 'housing_density', 'housing_vacant',
    'OMERNIK_L3_ECOREGION', 'Average_Temp',
    'birdPresent',
]

# Model inputs: the sample columns without the 'sampling_id' identifier and the target.
independentVariables = [
    'loc_id', 'month', 'time', 'timeSin', 'timeCos',
    'effor_hours', 'population_per_mile', 'housing_density', 'housing_vacant',
    'OMERNIK_L3_ECOREGION', 'Average_Temp',
]

# Model target column.
dependentVariable = 'birdPresent'

In [4]:
# Load the CSV into a DataFrame whose columns are labelled with sampleFeatures.
# NOTE(review): with names= and no header= argument pandas treats the first line of the
# file as data; confirm the CSV has no header row and exactly these columns.
birdDataPD = pd.read_csv(inputFile, names = sampleFeatures)    

FileNotFoundError: File b'pre_processed_data/pre_processed_labelled_original_10.csv' does not exist

In [None]:
# Preview the first rows of the loaded data.
birdDataPD.head()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the loaded data.
birdDataPD.info()

# <i>Generating Training set and Testing set</i>

In [None]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18; the old
# sklearn.cross_validation module was removed in 0.20. Fall back only for older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Model inputs (X) and target (y) taken from the loaded data.
X = birdDataPD[independentVariables]
y = birdDataPD[dependentVariable]

# Keep 95% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.95, random_state=0)

In [None]:
# Report the dimensions of each split: training inputs/target, then test inputs/target.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

In [None]:
# Preview the first rows of the training inputs.
X_train.head()

# <i>Grid Search for tuning hyperparameters</i>

In [None]:
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18; the old
# sklearn.grid_search module was removed in 0.20. Fall back only for older installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# perform exhaustive search over specified parameter values for an estimator.
# perform cross-validation
# returns best estimator of best fitting model
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
def grid_search_best_model(estimator, params, X, y, n_jobs = -1, n_folds = 5, score_func = None, verbose = 0):
    """Run a cross-validated grid search and return the best fitted estimator.

    Parameters
    ----------
    estimator : the estimator object handed to GridSearchCV.
    params : passed to GridSearchCV as ``param_grid``.
    X, y : data the search is fitted on.
    n_jobs : passed to GridSearchCV as ``n_jobs``.
    n_folds : passed to GridSearchCV as ``cv``.
    score_func : passed to GridSearchCV as ``scoring`` only when truthy;
        otherwise GridSearchCV's own default scoring applies.
    verbose : passed to GridSearchCV as ``verbose``.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted search object.
    """
    search_options = {"param_grid": params, "cv": n_folds, "n_jobs": n_jobs, "verbose": verbose}
    # Only override the scoring when the caller actually supplied one.
    if score_func:
        search_options["scoring"] = score_func

    search = GridSearchCV(estimator, **search_options)
    search.fit(X, y)
    return search.best_estimator_

# verbose : Controls the verbosity: the higher, the more messages.
# cv : Determines the cross-validation splitting strategy.
# n_jobs : Number of jobs to run in parallel.
# param_grid : Dictionary with parameters names (string) as keys and lists of parameter settings to try as values

# <i>Random Forest Classification</i>

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create the base Random Forest *classifier* (despite the variable name, this is
# not a regressor) with 100 trees.
# n_jobs=-1: fit and predict in parallel using all available cores.
# random_state=0: seed the forest so repeated runs give the same model,
# matching the seeded train/test split.
RF_Regressor = RandomForestClassifier(n_estimators = 100, n_jobs=-1, random_state=0)

<i><h4>Grid Search takes a long time to find the best model.</h4>
<h4>Make sure you comment out a few of the parameter values in RF_parameters to keep the search short.</h4></i>

In [None]:
# Grid of hyperparameters over which to optimize the random forest.
# Each key maps to the list of candidate values the grid search tries; the trailing
# comments list further candidates (every added value multiplies the search time).
RF_parameters = {"min_samples_split": [2],  # other candidates: 3, 4, 5
                 "n_estimators": [100],  # number of trees in the forest; other candidates: 80, 90, 120, 150, 200
                 # "sqrt" is what "auto" meant for a RandomForestClassifier (sqrt(n_features));
                 # "auto" itself is no longer accepted by recent scikit-learn releases.
                 "max_features": ["sqrt"],  # other candidates: "log2", None (None = all features)
                 "max_depth": [15]  # other candidates: 16, 17, 18, 19, 25
                }
# Score candidates by accuracy: the estimator is a classifier, and the scorer name
# 'mean_squared_error' is rejected by current scikit-learn. For 0/1 labels the MSE of
# predicted labels equals 1 - accuracy, so the ranking of candidates is unchanged.
# NOTE(review): assumes birdPresent is a binary 0/1 label; confirm.
RF_best = grid_search_best_model(RF_Regressor, RF_parameters, X_train, y_train, n_folds=5, score_func='accuracy', verbose=3)

In [None]:
# RF_best is the grid search's best_estimator_, which the search has already refitted
# on X_train / y_train, so fitting it again on the same data would only repeat the
# (expensive) training. Just bind the name used by the following cells.
RF_reg = RF_best

<h3><i>Find training and testing accuracy</i></h3>

In [None]:
# Mean accuracy of the fitted forest on the data it was trained on and on the held-out data.
RF_training_accuracy = RF_reg.score(X_train, y_train)
RF_test_accuracy = RF_reg.score(X_test, y_test)
for caption, accuracy in (
    ("Accuracy on training data: ", RF_training_accuracy),
    ("Accuracy on test data: ", RF_test_accuracy),
):
    print(caption, accuracy)
# Observation: increasing max_depth raises the training accuracy but lowers the
# test accuracy, i.e. the model overfits the training data.

In [None]:
# Predicted class label for every row of the held-out test set.
RF_y_predict = RF_reg.predict(X_test)

In [None]:
# Show the predicted class probabilities for the first ten test rows.
RF_reg.predict_proba(X_test)[:10]

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=RF_y_predict)

# <i>Writing model to disk</i>

In [None]:
# joblib is a standalone package; the copy bundled as sklearn.externals.joblib was
# removed in scikit-learn 0.23. Prefer the standalone import, fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs('saved_models', exist_ok=True)
# Persist the fitted forest.
# NOTE(review): the file name says "_10" while inputFile points at the "_13" data set; confirm.
joblib.dump(RF_reg, 'saved_models/Random_Forest_Model_10.pkl')

<h3><i>Which are the most important features?</i></h3>

In [None]:
# Pair every input column with the importance the fitted forest assigned to it.
feature_importance_dict = {
    feature_name: importance
    for feature_name, importance in zip(independentVariables, RF_reg.feature_importances_)
}
# Order the (feature, importance) pairs from most to least important.
features_sorted_by_importance = sorted(
    feature_importance_dict.items(),
    key = lambda feature_and_importance: feature_and_importance[1],
    reverse = True,
)

# Horizontal bar chart of all features (no cut-off is applied), indexed by feature name.
featuresPD = pd.DataFrame(features_sorted_by_importance, columns=['Feature', 'Importance']).set_index('Feature')
importance_axes = featuresPD.plot(kind='barh')
importance_axes.set_ylabel("Random Forest features")
importance_axes.set_xlabel("Importance of features")
importance_axes.set_title("Top features of Random Forest")

# <i>Actual vs Predicted Sightings</i>

In [None]:
# One wide figure with two stacked panels: true labels on top, predictions below.
comparison_fig = plt.figure(1)
comparison_fig.set_size_inches(18, 6)

# Top panel: actual values of the test target, re-indexed from 0 so the x axis is the row position.
actual_axes = plt.subplot(211)
actual_axes.plot(y_test.reset_index()['birdPresent'])
actual_axes.set_title("Actual Bird Sighting (small data)")

# Bottom panel: the forest's predictions for the same test rows.
predicted_axes = plt.subplot(212)
predicted_axes.plot(RF_y_predict)
predicted_axes.set_title("Predicted Sighting using Random Forest (small data)")

# Decision Tree Classification

<h3><i>It is not used in the final prediction. It is only used to draw the decision tree, which gives an idea of the structure (attributes) of the trees in the Random Forest.</i></h3>

In [None]:
from sklearn import datasets,tree
from io import StringIO
import pydotplus 
from sklearn.tree import export_graphviz

In [None]:
# Fit a single decision tree with default settings on the training split
# (fit returns the estimator itself, so the construction and fit can be chained).
d_tree = tree.DecisionTreeClassifier().fit(X_train, y_train)

In [None]:
# With out_file=None export_graphviz returns the DOT source as a string, so no
# StringIO buffer is needed. feature_names labels each split with the column name
# the tree was trained on instead of a positional X[i] placeholder.
dot_data = export_graphviz(d_tree, out_file=None, feature_names=list(X_train.columns))
# Build the graph from the DOT source and write it next to the notebook as a PDF.
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("Decision_tree_graph.pdf")