  # Python for Data Science Bootcamp
  ## K Nearest Neighbors

Import pandas and numpy packages

In [1]:
import numpy as np
import pandas as pd

Read the iris CSV file

In [2]:
iris = pd.read_csv('iris.csv')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


Explore the data

In [3]:
print iris.info()
print iris.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal_length    150 non-null float64
sepal_width     150 non-null float64
petal_length    150 non-null float64
petal_width     150 non-null float64
species         150 non-null int64
dtypes: float64(4), int64(1)
memory usage: 5.9 KB
None
       sepal_length  sepal_width  petal_length  petal_width     species
count    150.000000   150.000000    150.000000   150.000000  150.000000
mean       5.843333     3.054000      3.758667     1.198667    1.000000
std        0.828066     0.433594      1.764420     0.763161    0.819232
min        4.300000     2.000000      1.000000     0.100000    0.000000
25%        5.100000     2.800000      1.600000     0.300000    0.000000
50%        5.800000     3.000000      4.350000     1.300000    1.000000
75%        6.400000     3.300000      5.100000     1.800000    2.000000
max        7.900000     4.400000      6.900000     2.500000    2.000000


Create two variables for features and target.  Convert them into values.

In [4]:
iris_features = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].values
iris_target = iris.species.values

Scikit-learn has a function to split data into training and testing sets

In [5]:
from sklearn.cross_validation import train_test_split

Apply train_test_split to sample data

In [9]:
example_features = [[1,10],[2,20],[3,30],[4,40],[5,50]]
example_target = ['a','b','c','d','e']

f_train, f_test, t_train, t_test = train_test_split(example_features, example_target, test_size=0.20, random_state=0)

View results of train_test_split 

In [10]:
print 'Training Set'
print f_train
print t_train,'\n\n'
print 'Test Set'
print f_test
print t_test

Training Set
[[1, 10], [2, 20], [4, 40], [5, 50]]
['a', 'b', 'd', 'e'] 


Test Set
[[3, 30]]
['c']


Split the iris data into training set and test set

In [11]:
features_train, features_test, target_train, target_test = train_test_split(
        iris_features, iris_target, test_size=0.20, random_state=0)

In [12]:
print target_train.shape
print target_test.shape
print features_train.shape
print features_test.shape

(120,)
(30,)
(120, 4)
(30, 4)


Import the K Nearest Neighbors classifier from scikit-learn

In [13]:
from sklearn.neighbors import KNeighborsClassifier

Train the KNN classifier

In [14]:
model = KNeighborsClassifier(5,weights='distance').fit(features_train, target_train)

Compare predictions with actual results

In [15]:
print model.predict(features_test)
print target_test

[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0]
[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0]


Predict probabilities

In [16]:
model.predict_proba(features_test)

array([[ 0.        ,  0.        ,  1.        ],
       [ 0.        ,  1.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  1.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  1.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ],
       [ 0.        ,  0.21628456,  0.78371544],
       [ 0.        ,  1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ],
       [ 0.        ,  0.58205246,  0.41794754],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.84820868,  0.15179132],
       [ 0.        ,  1.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  1.   

Score the model

In [17]:
model.score(features_test, target_test)

1.0

## K-Folds Cross-Validation

Import the KFold function from scikit-learn

In [18]:
from sklearn.cross_validation import KFold

KFold function creates indices for separating training and test sets.  Lets create indices for a data set of 3 records with 3 folds.

In [19]:
k_fold_indices = KFold(3, n_folds=3, shuffle=False)
print k_fold_indices

sklearn.cross_validation.KFold(n=3, n_folds=3, shuffle=False, random_state=None)


Let's see what is in the k_fold_indices

In [20]:
for x,y in k_fold_indices:
    print x,y

[1 2] [0]
[0 2] [1]
[0 1] [2]


Remember how to index numpy arrays with indices?

In [21]:
sample = np.array([0,10,20,30,40,50])
indices = [4,0,2]
print sample[indices]

[40  0 20]


Let's use the indices to index values in an array.

In [22]:
data = np.array(['a','b','c'])
for x,y in k_fold_indices:
    print data[x], data[y]

['b' 'c'] ['a']
['a' 'c'] ['b']
['a' 'b'] ['c']


Function for calculating cross-validation

In [23]:
# cross validation function with average score
def cross_validate(features, target, classifier, k_fold, r_state=None) :

    # derive a set of (random) training and testing indices
    k_fold_indices = KFold(len(features), n_folds=k_fold,
                           shuffle=True, random_state=r_state)

    k_score_total = 0
    
    # for each training and testing slices run the classifier, and score the results
    for train_indices, test_indices in k_fold_indices :

        model = classifier.fit(features[train_indices],
                           target[train_indices])

        k_score = model.score(features[test_indices],
                              target[test_indices])

        k_score_total += k_score

    # return the average accuracy
    return k_score_total/k_fold

In [24]:
k_fold_indices = KFold(20, 5, shuffle=True)

for train_indices, test_indices in k_fold_indices :
    print train_indices, ' - ',test_indices


[ 0  1  2  5  6  7  8  9 10 11 12 14 15 17 18 19]  -  [ 3  4 13 16]
[ 0  1  3  4  5  6  8 10 11 12 13 14 15 16 17 18]  -  [ 2  7  9 19]
[ 0  2  3  4  5  6  7  9 10 11 13 14 15 16 18 19]  -  [ 1  8 12 17]
[ 1  2  3  4  5  6  7  8  9 10 12 13 14 16 17 19]  -  [ 0 11 15 18]
[ 0  1  2  3  4  7  8  9 11 12 13 15 16 17 18 19]  -  [ 5  6 10 14]


Run cross-validation on the iris data set

In [25]:
print cross_validate(iris_features, iris_target, 
                     KNeighborsClassifier(3), 10, 0)
print cross_validate(iris_features, iris_target, 
                     KNeighborsClassifier(4,weights='distance'), 10, 0)
print cross_validate(iris_features, iris_target, 
                     KNeighborsClassifier(5,weights='distance'), 10, 0)
print cross_validate(iris_features, iris_target, 
                     KNeighborsClassifier(6), 10, 0)
print cross_validate(iris_features, iris_target, 
                     KNeighborsClassifier(7), 10, 0)

0.96
0.96
0.96
0.96
0.953333333333


## Exercise
#### 1. Load the clean_data.csv as a DataFrame

In [26]:
# Load data into DataFrame
data = pd.read_csv('clean_data.csv')

print data.info()
print data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null bool
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
Pclass_1       891 non-null float64
Pclass_2       891 non-null float64
Pclass_3       891 non-null float64
dtypes: bool(1), float64(5), int64(5), object(4)
memory usage: 98.4+ KB
None
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name    Sex   Age  SibSp  \
0 

#### 2. Separate the data into features (limit to age, sex, pclass dummy variables) and target (Note: Make sure you convert the data to a numpy array by typing .values at end of DataFrame)

In [27]:
# Create features and target variables as numpy arrays
features = data[['Age','Sex','Pclass_1','Pclass_2','Pclass_3']].values

target = data['Survived'].values

#### 3. Build a KNN model and test the accuracy of the model using kfold cross-validation

In [28]:
cross_validate(features, target, KNeighborsClassifier(3), 10, 0)

0.75645443196004991

### Bonus: Create a for loop to test different numbers of neighbors

In [31]:
for k in range(1,120,5):
    model = KNeighborsClassifier(k, weights='distance')
    print cross_validate(features, target, model, 10, 0)

0.756529338327
0.786754057428
0.793483146067
0.799088639201
0.792347066167
0.795717852684
0.793508114856
0.793508114856
0.794631710362
0.794631710362
0.793508114856
0.791260923845
0.79013732834
0.787890137328
0.786766541823
0.784519350811
0.787890137328
0.786766541823
0.787890137328
0.787890137328
0.786766541823
0.785642946317
0.784519350811
0.786766541823


## Data Normalization

Calculate the mean and standard deviation

In [27]:
avg_age = data.Age.mean()
stdev_age = data.Age.std()

Subtract the mean and divide by the standard deviation

In [28]:
data['Age_norm'] = (data.Age - avg_age)/stdev_age

Create new features using normalized data

In [29]:
features_norm = data[['Age_norm','Sex','Pclass_1','Pclass_2','Pclass_3']].values

Test KNN on normalized data

In [30]:
for k in range(1,10):
    model = KNeighborsClassifier(k)
    print cross_validate(features_norm, target, model, 10, 0)

0.762059925094
0.797952559301
0.790149812734
0.80138576779
0.796903870162
0.801360799001
0.802496878901
0.80024968789
0.799138576779


##Random Forest

Import the Random Forest function

In [31]:
from sklearn.ensemble import RandomForestClassifier

Create an instance of a random forest classifier.  Random state is used to set random number generator for reproducible results

In [32]:
model = RandomForestClassifier(random_state=0)

Run the cross-validation function using the Random Forest algorithm

In [33]:
cross_validate(features, target, model, 10, 0)

0.80475655430711623

Investigate feature importances

In [34]:
print model.fit(features,target).feature_importances_

[ 0.4359133   0.40366222  0.05078753  0.02608483  0.08355212]


## Exercise

- Implement the random forest algorithm on the titanic data
- Review the random forest algorithm model parameters on sklearn 
- Adjust parameters to get the best model performance

In [35]:
features = data[['Age', 'Sex', 'Pclass_1','Pclass_2','Pclass_3']].values

In [36]:
model = RandomForestClassifier(n_estimators=20, max_depth=None, min_samples_split=2, 
                               min_samples_leaf=2,random_state=None)
cross_validate(features, target, model, 10)

0.80474406991260916

In [37]:
for n in [5,10,20,30,40,50,100]:
    model = RandomForestClassifier(n_estimators=n, max_depth=None, min_samples_split=2, 
                               min_samples_leaf=2)
    print cross_validate(features, target, model, 10)

0.799126092385
0.818214731586
0.818214731586
0.820449438202
0.812621722846
0.811498127341
0.812496878901


### Make a Prediction for Kaggle

Train the model using the best data available (in the case of the Titanic you'd use all the data)

In [38]:
features = data[['Age', 'Sex', 'Pclass_1','Pclass_2','Pclass_3']].values
target = data.Survived.values
model = RandomForestClassifier(random_state=0).fit(features,target)

Use the predict and predict_proba functions within the trained model to predict future events (remember to normalize data before entering it into the model - you'll have to subract the mean and divide by the standard deviation).

Create a DataFrame with the test.csv data

In [39]:
test_data = pd.read_csv('test.csv')
print test_data.head()
print test_data.info()
print test_data.describe()

   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name



Clean text and missing values

In [40]:
test_data.Sex = test_data.Sex.replace(['male','female'],[True,False])
avg_age = test_data.Age.mean()
test_data.Age = test_data.Age.fillna(avg_age)

Convert Pclass to dummies and merge to data

In [41]:
pclass = pd.get_dummies(test_data.Pclass, prefix = 'Pclass')
test_data = pd.merge(test_data,pclass,left_index=True,right_index=True)

test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 14 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null bool
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
Pclass_1       418 non-null float64
Pclass_2       418 non-null float64
Pclass_3       418 non-null float64
dtypes: bool(1), float64(5), int64(4), object(4)
memory usage: 42.9+ KB


Select features from test data and convert to numpy array

In [42]:
test_features = test_data[['Age','Sex','Pclass_1','Pclass_2','Pclass_3']].values

Create predictions

In [43]:
predictions =  model.predict(test_features)

Add Predictions as new column in DataFrame

In [44]:
test_data['Survived'] = predictions

Save as CSV (make sure you set index=False)

In [45]:
kaggle = test_data[['PassengerId','Survived']]
kaggle.to_csv('kaggle_titanic_submission.csv',index=False)