# W207 Final Project
### Kaggle Competition
[San Francisco Crime Statistics](https://www.kaggle.com/c/sf-crime)
  
###  Team Members
Chuck Bolin, Matthew Burke, Yun-Hui Fan

In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn import utils

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

### Data Loading
Load the training and test data into pandas DataFrames.

In [2]:
# Labelled training data; `Dates` is parsed into timestamps while loading.
train_all = pd.read_csv('../data/train.csv', delimiter=',', parse_dates=['Dates'])
# test_final contains data which will be submitted to kaggle after predicting categories
# Includes ID field not found in training data
# Does not include description, category or resolution fields in training data
# `Dates` is parsed here too, so both frames hold the same dtype in that column.
test_final = pd.read_csv('../data/test.csv', delimiter=',', parse_dates=['Dates'])

Create columns for Year, Month, Day and Hour from the `Dates` column (the data already includes a `DayOfWeek` column)

In [14]:
def dateAttributes(df):
    """Return a copy of `df` with calendar features derived from `Dates`.

    Adds the columns `Year`, `Month`, `Day` (day of month) and `Hour`.
    The frame passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `Dates` column holding datetimes, or strings that
        `pd.DatetimeIndex` can parse.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the four extra columns appended.
    """
    df = df.copy()
    # Build the DatetimeIndex once and reuse it: constructing it separately
    # for every attribute repeats the conversion of the whole column.
    dates = pd.DatetimeIndex(df['Dates'])
    df['Year'] = dates.year
    df['Month'] = dates.month
    df['Day'] = dates.day
    df['Hour'] = dates.hour
    return df

train_all = dateAttributes(train_all)

# Get all labels
labels_all = np.array(train_all["Category"])

# Get all data
train_data_all = np.array(train_all.drop(['Category'], axis=1))

# Define fraction of data to be used as test data
fraction = 0.33

# Split data into training and test data/labels randomly according to fraction specified
train_labels, test_labels, train_data, test_data = train_test_split(labels_all, train_data_all, test_size=fraction)

print 'Check that data has been formed correctly:\n'

print 'Training data shape: ', train_data.shape
print 'Training labels shape: ', len(train_labels)#train_labels.shape

print 'Test data shape: ', test_data.shape
print 'Test labels shape: ', len(test_labels)#test_labels.shape

print '\nTop 5 rows of training data:\n'

print train_all.head(5)

Check that data has been formed correctly:

Training data shape:  (588292, 12)
Training labels shape:  588292
Test data shape:  (289757, 12)
Test labels shape:  289757

Top 5 rows of training data:

                Dates        Category                      Descript  \
0 2015-05-13 23:53:00        WARRANTS                WARRANT ARREST   
1 2015-05-13 23:53:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
2 2015-05-13 23:33:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
3 2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   
4 2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   

   DayOfWeek PdDistrict      Resolution                    Address  \
0  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
1  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
2  Wednesday   NORTHERN  ARREST, BOOKED  VANNESS AV / GREENWICH ST   
3  Wednesday   NORTHERN            NONE   1500 Block of LOMBARD ST   
4  Wednesday       PARK 

## To Do
### Feature Engineering
* Resolution feature extraction
* Description feature extraction
* Address / X Y coordinate mapping to more specific locations than neighborhoods?
    - round to values (determine precision)
    - create XY as single string column
    - map to features
* Types of crimes from category (violent, non-violent)


### Model Selection
Test models that support `predict_proba`, so that a probability can be predicted for every category

## Model Evaluation
### RandomForest Baseline

In [None]:
# Helper function to vectorize text columns and create new data/vocabulary for training
# Maybe make it so it trains and produces predictions/accuracy as well?

# Make second function to do parameter optimization 

In [22]:
cols = (8, 9, 10, 11)
print dev_data[0,cols]

[2009 1 25 17]


In [23]:
# Columns to use in baseline classification

dev_data = train_data[0:100,]
dev_labels = train_labels[0:100]

dev_test_data = test_data[0:100,]
dev_test_labels = test_labels[0:100]

#vocab = CountVectorizer().fit(dev_test_labels).get_feature_names()

#dev_labels_vec = CountVectorizer(vocabulary=vocab).fit_transform(dev_labels).toarray()
#dev_test_labels_vec = CountVectorizer(vocabulary=vocab).fit_transform(dev_test_labels).toarray()


#dev_labels_num = [None]*len(dev_labels_vec)
#dev_test_labels_num = [None]*len(dev_test_labels_vec)

#for i in range(0,len(cats)):
#    dev_labels_num[i] = np.nonzero(dev_labels_vec[i,])[0][0]
#    dev_test_labels_num[i] = np.nonzero(dev_test_labels_vec[i,])[0]


# Super basic model using only premade numeric columns
# i.e. year, month, hour, day
cols = (8, 9, 10, 11)

rfcl = RandomForestClassifier(n_estimators=100)
print 'Begin training'
rfcl.fit(dev_data[:,cols], dev_labels)
print 'Completed training'

print 'Begin prediction'
pred_probs = rfcl.predict_proba(dev_test_data[:,cols])
pred = rfcl.predict(dev_test_data[:,cols])
#rfcl.predict_proba(dftest), index=dftest.index, columns=rfcl.classes_)

print 'Completed prediction\n'

print 'Log score: ', metrics.log_loss(pred, dev_test_labels), '\n'

print 'Predictions: ', pred[0]
print 'Prediction probabilities: ', pred_probs[0]




Begin training
Completed training
Begin prediction
Completed prediction

Log score: 

ValueError: could not convert string to float: LARCENY/THEFT

In [368]:
test_final = dateAttributes(test_final)
clf = RandomForestClassifier(n_estimators=20)
clf.fit(train_all, labels_all)

# make predictions, place in a data frame preserving indexing and columns
pred = pd.DataFrame(clf.predict_proba(test_final[:,(2, 8, 9, 10, 11)]), index=test_final.index, columns=labels_all.columns)
print 'Complete'
print pred.head(10)

ValueError: invalid literal for float(): 1800 Block of NEWCOMB AV

In [344]:
rfcl = RandomForestClassifier(n_estimators=100)
print 'Begin training'
rfcl.fit(train_data[:,cols], train_labels)
print 'Completed training'

print 'Begin prediction'
pred_probs = rfcl.predict_proba(test_data[:,cols])
pred = rfcl.predict(test_data[:,cols])
#rfcl.predict_proba(dftest), index=dftest.index, columns=rfcl.classes_)

print 'Completed prediction\n'

print 'Log score: ', metrics.log_loss(pred, test_labels), '\n'

print 'Predictions: ', pred[0]
print 'Prediction probabilities: ', pred_probs[0]


Begin training
Completed training
Begin prediction
Completed prediction

Log score:  1.48748568846 

Predictions:  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.]
Prediction probabilities:  [[ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 ..., 
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]]


In [353]:
total = 0
for i in range(0,len(pred)):
    total += sum(pred[i])
print total
print total / len(pred)

15957.0
0.0550702830303


In [367]:
pred_probs_array = np.array(pred_probs)
probs_total = 0
print pred_probs_array.shape
print pred_probs_array[0].shape
for i in range(0,39):
    for k in range(0,10):
        print 'First value: ', pred_probs[i][k][0]
        print 'Second value: ', pred_probs[i][k][1]

(39, 289757, 2)
(289757, 2)
First value:  1.0
Second value:  0.0
First value:  1.0
Second value:  0.0
First value:  1.0
Second value:  0.0
First value:  1.0
Second value:  0.0
First value:  1.0
Second value:  0.0
First value:  1.0
Second value:  0.0
First value:  1.0
Second value:  0.0
First value:  1.0
Second value:  0.0
First value:  1.0
Second value:  0.0
First value:  1.0
Second value:  0.0
First value:  0.88328397463
Second value:  0.11671602537
First value:  1.0
Second value:  0.0
First value:  0.945921294449
Second value:  0.0540787055506
First value:  1.0
Second value:  0.0
First value:  0.94611447686
Second value:  0.05388552314
First value:  0.890800326136
Second value:  0.109199673864
First value:  1.0
Second value:  0.0
First value:  0.950313304991
Second value:  0.0496866950092
First value:  0.918732017432
Second value:  0.0812679825678
First value:  0.999285714286
Second value:  0.000714285714286
First value:  1.0
Second value:  0.0
First value:  1.0
Second value:  0.0
Fi