# Forest Cover Type Prediction

https://www.kaggle.com/c/forest-cover-type-prediction

## Problem

Predict the forest cover type (the predominant kind of tree cover) from strictly cartographic variables (as opposed to remotely sensed data).

## Libraries

In [1]:
%matplotlib inline
import os
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import tensorflow as tf

# Seed TensorFlow's random number generation with a fixed value so that
# repeated runs of the notebook are reproducible.
# NOTE(review): only TensorFlow is seeded in this cell; NumPy/pandas
# randomness (e.g. DataFrame.sample) is not -- confirm that is intended.
tf.set_random_seed(6)

  from ._conv import register_converters as _register_converters


## Helper Functions


In [2]:
# File names of the competition data sets. get_data() resolves each name
# against the local ./data/ directory (and against the download URL).
TEST_FILENAME = 'test.csv'
TRAIN_FILENAME = 'train.csv'
SAMPLE_SUBMISSION_FILENAME = 'sample_submission.csv'

def get_data(filename, force_download=False):
    '''Load a competition CSV from ./data/, optionally downloading it first.

    Parameters
    ----------
    filename : str
        Name of the CSV file, e.g. 'train.csv'.
    force_download : bool, default False
        When True, download '<filename>.zip' from Kaggle, store the archive
        under ./data/ and unpack the CSV before reading it. When False, the
        CSV must already exist under ./data/.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ./data/<filename>.

    Raises
    ------
    FileNotFoundError
        If the CSV is not on disk and no download was requested.
    requests.HTTPError
        If the download request comes back with an HTTP error status.
    zipfile.BadZipFile
        If the downloaded payload is not a ZIP archive (e.g. a login page).
    '''
    urlpath = 'https://www.kaggle.com/c/3936/download/' + filename + '.zip'
    data_dir = './data/'
    filepath = data_dir + filename

    if force_download:
        # the target directory may not exist on a fresh checkout
        os.makedirs(data_dir, exist_ok=True)

        # Credentials come from the environment so they never have to be
        # typed into the notebook; the old placeholders remain the fallback.
        login_info = {
            'Username': os.environ.get('KAGGLE_USERNAME', 'username'),
            'Password': os.environ.get('KAGGLE_PASSWORD', 'password'),
        }

        # POST the credentials to the URL the initial GET ended up on (not to
        # the Response object itself).
        # NOTE(review): presumably the unauthenticated GET lands on Kaggle's
        # login page -- confirm this form-login flow is still supported.
        landing = requests.get(urlpath)
        # stream=True replaces the removed `prefetch=False` keyword and keeps
        # the body from being loaded into memory all at once
        response = requests.post(landing.url, data=login_info, stream=True)
        response.raise_for_status()

        # The payload is a ZIP archive: keep it next to the CSV instead of
        # overwriting the CSV path, and write bytes, 512 KB at a time.
        zip_path = filepath + '.zip'
        with open(zip_path, 'wb') as archive_file:
            for chunk in response.iter_content(chunk_size=512 * 1024):
                if chunk:
                    archive_file.write(chunk)

        # Unpack the CSV so the read below finds it.
        # Assumes the archive stores the CSV under `filename` -- TODO confirm.
        with zipfile.ZipFile(zip_path) as archive:
            archive.extract(filename, path=data_dir)

    # load data from directory (default)
    return pd.read_csv(filepath)

In [None]:
def parse_data(header, data):
    '''Placeholder for a parsing step; not implemented yet.

    Accepts ``header`` and ``data`` but ignores both and always returns
    None. TODO: implement the parsing logic or remove this stub.
    '''
    return None

## Load Data

In [3]:
# Read each competition file from ./data/. get_data() is called without
# force_download, so all three CSVs must already be present on disk;
# a missing file raises FileNotFoundError.
test = get_data(TEST_FILENAME)
train = get_data(TRAIN_FILENAME)
sample = get_data(SAMPLE_SUBMISSION_FILENAME)

FileNotFoundError: File b'./data/sample_submission.csv' does not exist

## EDA

In [19]:
# Preview the first 5 rows of the training set.
train.head(5)

Unnamed: 0,train_id,AA3,AA4,AA5,AA6,AA7,AA14,AA15,DG1,is_female,...,GN1,GN1_OTHERS,GN2,GN2_OTHERS,GN3,GN3_OTHERS,GN4,GN4_OTHERS,GN5,GN5_OTHERS
0,0,3,32,3.0,,323011,3854,481,1975,1,...,99.0,,99,,99,,99,,99,
1,1,2,26,,8.0,268131,2441,344,1981,1,...,,,1,,2,,2,,2,
2,2,1,16,,7.0,167581,754,143,1995,1,...,1.0,,2,,2,,2,,2,
3,3,4,44,5.0,,445071,5705,604,1980,1,...,,,2,,2,,99,,99,
4,4,4,43,,6.0,436161,5645,592,1958,1,...,,,1,,1,,1,,1,


In [20]:
# Look at 5 randomly chosen rows of the training set.
# No random_state is passed, so the rows shown may differ between runs.
train.sample(5)

Unnamed: 0,train_id,AA3,AA4,AA5,AA6,AA7,AA14,AA15,DG1,is_female,...,GN1,GN1_OTHERS,GN2,GN2_OTHERS,GN3,GN3_OTHERS,GN4,GN4_OTHERS,GN5,GN5_OTHERS
16993,16993,1,16,5.0,,165061,909,178,1997,0,...,3.0,,3,,3,,3,,3,
513,513,4,41,,6.0,416191,4368,533,1976,1,...,2.0,,2,,2,,2,,2,
8310,8310,3,33,,7.0,337241,3669,457,1956,1,...,1.0,,99,,99,,99,,99,
15503,15503,1,16,4.0,,164011,905,177,1951,0,...,3.0,,3,,3,,3,,3,
9830,9830,3,33,,7.0,337121,3641,454,1958,1,...,2.0,,3,,3,,3,,3,


In [21]:
# Preview the last 5 rows of the training set.
train.tail(5)

Unnamed: 0,train_id,AA3,AA4,AA5,AA6,AA7,AA14,AA15,DG1,is_female,...,GN1,GN1_OTHERS,GN2,GN2_OTHERS,GN3,GN3_OTHERS,GN4,GN4_OTHERS,GN5,GN5_OTHERS
18250,18250,1,16,,7.0,167221,880,173,1965,0,...,2.0,,2,,2,,2,,2,
18251,18251,3,34,3.0,,343041,4100,511,1992,0,...,99.0,,99,,99,,99,,99,
18252,18252,3,34,,6.0,346191,4287,530,1980,0,...,1.0,,1,,1,,1,,1,
18253,18253,1,14,,6.0,146041,361,71,1978,0,...,1.0,,1,,1,,1,,2,
18254,18254,1,16,,6.0,166161,800,153,1984,1,...,1.0,,3,,3,,3,,3,


In [22]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
# NOTE(review): maxima such as 99 and 99999 look like sentinel codes rather
# than real measurements -- confirm against the data dictionary.
train.describe()

Unnamed: 0,train_id,AA3,AA4,AA5,AA6,AA7,AA14,AA15,DG1,is_female,...,LN1B,LN2_1,LN2_2,LN2_3,LN2_4,GN1,GN2,GN3,GN4,GN5
count,18255.0,18255.0,18255.0,5653.0,12602.0,18255.0,18255.0,18255.0,18255.0,18255.0,...,18255.0,18255.0,18255.0,18255.0,18255.0,14230.0,18255.0,18255.0,18255.0,18255.0
mean,9127.0,2.37146,28.558313,3.138864,6.793525,291360.681676,8030.5106,352.038346,1978.073185,0.537113,...,2.397316,2.14533,2.151575,2.830731,2.835881,5.590654,6.763079,7.421912,8.925883,8.817365
std,5269.908918,1.130523,9.822629,1.361434,0.769568,98126.483799,22061.608061,179.744071,14.740675,0.498634,...,1.231487,1.407641,1.407021,1.594319,1.592863,18.210791,20.537281,21.752528,24.523345,24.413247
min,0.0,1.0,11.0,1.0,6.0,111011.0,96.0,24.0,1917.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,4563.5,1.0,21.0,2.0,6.0,216071.0,949.0,178.0,1969.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,9127.0,2.0,31.0,3.0,7.0,313011.0,2902.0,354.0,1981.0,1.0,...,2.0,1.0,1.0,3.0,3.0,2.0,2.0,2.0,2.0,2.0
75%,13690.5,3.0,34.0,4.0,7.0,348181.0,4609.0,523.0,1990.0,1.0,...,4.0,3.0,3.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0
max,18254.0,4.0,44.0,5.0,8.0,448051.0,99999.0,633.0,2001.0,1.0,...,4.0,5.0,5.0,5.0,5.0,99.0,99.0,99.0,99.0,99.0


In [23]:
# Identify missing values: boolean mask that is True wherever a cell is null.
# NOTE(review): this displays the whole mask; train.isnull().sum() would give
# a compact per-column count of missing values instead.
train.isnull()

Unnamed: 0,train_id,AA3,AA4,AA5,AA6,AA7,AA14,AA15,DG1,is_female,...,GN1,GN1_OTHERS,GN2,GN2_OTHERS,GN3,GN3_OTHERS,GN4,GN4_OTHERS,GN5,GN5_OTHERS
0,False,False,False,False,True,False,False,False,False,False,...,False,True,False,True,False,True,False,True,False,True
1,False,False,False,True,False,False,False,False,False,False,...,True,True,False,True,False,True,False,True,False,True
2,False,False,False,True,False,False,False,False,False,False,...,False,True,False,True,False,True,False,True,False,True
3,False,False,False,False,True,False,False,False,False,False,...,True,True,False,True,False,True,False,True,False,True
4,False,False,False,True,False,False,False,False,False,False,...,True,True,False,True,False,True,False,True,False,True
5,False,False,False,True,False,False,False,False,False,False,...,False,True,False,True,False,True,False,True,False,True
6,False,False,False,True,False,False,False,False,False,False,...,False,True,False,True,False,True,False,True,False,True
7,False,False,False,False,True,False,False,False,False,False,...,False,True,False,True,False,True,False,True,False,True
8,False,False,False,False,True,False,False,False,False,False,...,False,True,False,True,False,True,False,True,False,True
9,False,False,False,True,False,False,False,False,False,False,...,False,True,False,True,False,True,False,True,False,True


In [24]:
# NOTE(review): len() of the null mask is its number of rows (the same as
# len(train)), not the number of missing values -- confirm the intent;
# train.isnull().sum() counts missing values per column.
len(train.isnull())

18255

## Model 