## Import libraries

In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import re
import math
import statsmodels
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display, HTML
InteractiveShell.ast_node_interactivity = "all"
# from bokeh.io import show, output_file
# from bokeh.models import ColumnDataSource
# from bokeh.palettes import Spectral6
# from bokeh.plotting import figure
# from bokeh.transform import factor_cmap
from sklearn.ensemble import RandomForestClassifier
# https://github.com/scikit-learn-contrib/boruta_py
from boruta import BorutaPy # need to install using terminal - './scripts/install_package_python3.sh boruta'
%config InlineBackend.figure_format = 'retina'
sns.set(style="white")
from sklearn.preprocessing import LabelEncoder

## Import dataset

In [100]:
data = pd.read_csv('data/train.csv',parse_dates=['DateTime']).fillna('')

## Feature Engineering

In [101]:
data = data.drop('OutcomeSubtype',axis=1)

data['ColorCount'] = data['Color'].apply(lambda x: len(x.split('/')))
data['MixOrNot'] = data['Breed'].apply(lambda x: 'Mix' in x)
data['BreedCount'] = data['Breed'].apply(lambda x: len(x.split('/')))
data['SexPrefix'] = data['SexuponOutcome'].apply(lambda x: x.split(' ')[0])
data['DateTime_dayofweek'] = data['DateTime'].dt.dayofweek
data['DateTime_dayofyear'] = data['DateTime'].dt.dayofyear
data['DateTime_days_in_month'] = data['DateTime'].dt.days_in_month


In [102]:
# data = data.drop(['AnimalID','Name','DateTime','AgeuponOutcome'], axis=1)
print data

      AnimalID        Name            DateTime      OutcomeType AnimalType  \
0      A671945     Hambone 2014-02-12 18:22:00  Return_to_owner        Dog   
1      A656520       Emily 2013-10-13 12:44:00       Euthanasia        Cat   
2      A686464      Pearce 2015-01-31 12:28:00         Adoption        Dog   
3      A683430             2014-07-11 19:09:00         Transfer        Cat   
4      A667013             2013-11-15 12:52:00         Transfer        Dog   
5      A677334        Elsa 2014-04-25 13:04:00         Transfer        Dog   
6      A699218       Jimmy 2015-03-28 13:11:00         Transfer        Cat   
7      A701489             2015-04-30 17:02:00         Transfer        Cat   
8      A671784        Lucy 2014-02-04 17:17:00         Adoption        Dog   
9      A677747             2014-05-03 07:48:00         Adoption        Dog   
10     A668402             2013-12-05 15:50:00         Transfer        Cat   
11     A666320             2013-11-04 14:48:00         Adoption 

In [95]:
x = data.iloc[:,1:].values
print "Independent variables:\n", x 
y = data.iloc[:,0].values # Taking all rows and last column.
print "Dependent variable:\n", y

Independent variables:
[['Dog' 'Neutered Male' 'Shetland Sheepdog Mix' ..., 2 43 28]
 ['Cat' 'Spayed Female' 'Domestic Shorthair Mix' ..., 6 286 31]
 ['Dog' 'Neutered Male' 'Pit Bull Mix' ..., 5 31 31]
 ..., 
 ['Dog' 'Neutered Male' 'Old English Bulldog Mix' ..., 0 68 31]
 ['Cat' 'Intact Male' 'Domestic Shorthair Mix' ..., 6 117 30]
 ['Cat' 'Intact Male' 'Domestic Shorthair Mix' ..., 3 183 31]]
Dependent variable:
['Return_to_owner' 'Euthanasia' 'Adoption' ..., 'Adoption' 'Transfer'
 'Transfer']


In [96]:
#Checking datatypes
data.dtypes

OutcomeType               object
AnimalType                object
SexuponOutcome            object
Breed                     object
Color                     object
ColorCount                 int64
MixOrNot                    bool
BreedCount                 int64
SexPrefix                 object
DateTime_dayofweek         int64
DateTime_dayofyear         int64
DateTime_days_in_month     int64
dtype: object

## Imputing

In [97]:
imputer = Imputer(missing_values = 'NaN', strategy = "most_frequent", axis = 0) #Indicate the settings for Inputer function. strategy = mean / median / most_frequent.
imputer = imputer.fit(x[:,1:3]) #Fit imputer into matrix x, on columns where there is missing data.Taking index 1 and 2. 
x[:,1:3] = imputer.transform(x[:,1:3]) # Replace missing data by mean of the columns
print "Independent variables after replacing missing data:\n", x 

ValueError: could not convert string to float: Domestic Shorthair Mix

## Scaling

In [69]:
le = LabelEncoder() 
data = le.fit_transform(data[:,3])

TypeError: unhashable type

## Feature selection

In [None]:
data.drop(['AnimalID', 'OutcomeSubtype'], axis=1, inplace=True)
y = data[['OutcomeType']]
X = data.drop(['OutcomeType'], axis=1)
X, y = X.values, y.values.ravel()

In [None]:
# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# define Boruta feature selection method
feat_selector = BorutaPy(rf,
                         n_estimators='auto',
                         verbose=2,
                         random_state=1,
                         max_iter=100)

# find all relevant features - 5 features should be selected
feat_selector.fit(X, y)

# check selected features - first 5 features are selected
feat_selector.support_

# check ranking of features
feat_selector.ranking_

# call transform() on X to filter it down to selected features
X_filtered = feat_selector.transform(X)
# print(feat_selector.support_)
# print(feat_selector.ranking_)
# print(X_filtered)

In [None]:
feature_importance_dict = {'feature_name': data.drop(['OutcomeType'], axis=1).columns.values,
                           'feature_ranking': feat_selector.ranking_,
                           'feature_selected': feat_selector.support_}

feature_importance_df = pd.DataFrame(data=feature_importance_dict)
feature_importance_df.to_csv('output/feature_importance_df.csv', index=False)
display(feature_importance_df)

In [71]:
data = pd.read_csv('/Users/Walter/Desktop/Programming/UDM - Machine Learning/Part 1 - Data Preprocessing/Data.csv')

In [72]:
print data.head()

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes


In [74]:
x = data.iloc[:,:-1].values
print x

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [77]:
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = 'NaN', strategy = "mean", axis = 0) #Indicate the settings for Inputer function. strategy = mean / median / most_frequent.
imputer = imputer.fit(x[:,1:3]) #Fit imputer into matrix x, on columns where there is missing data.Taking index 1 and 2. 
x[:,1:3] = imputer.transform(x[:,1:3])

In [79]:
le = LabelEncoder()
x[:,0] = le.fit_transform(x[:,0]) #Fit label encoder into first column