# W207 Final Project
### Kaggle Competition
[San Francisco Crime Statistics](https://www.kaggle.com/c/sf-crime)
  
###  Team Members
Chuck Bolin, Matthew Burke, Yun-Hui Fan

In [14]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn import utils

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *
from sklearn.preprocessing import LabelEncoder

### Data Loading
Load the training and test data into pandas DataFrames.

In [2]:
# Labelled training data; 'Dates' is parsed into datetimes while reading.
train_all = pd.read_csv('../data/train.csv', delimiter=',', parse_dates=['Dates'])
# test_final contains data which will be submitted to kaggle after predicting categories
# Includes ID field not found in training data
# Does not include description, category or resolution fields in training data
# 'Dates' is parsed here as well so both frames carry the same dtype for that
# column and the date strings are not re-parsed later on.
test_final = pd.read_csv('../data/test.csv', delimiter=',', parse_dates=['Dates'])

Create `Year`, `Month`, `Day` and `Hour` columns from the `Dates` column (the `DayOfWeek` column is already provided in the data).

In [84]:
def dateAttributes(df):
    """Return a copy of ``df`` with Year, Month, Day and Hour columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Dates`` column holding datetimes, or date strings that
        ``pd.DatetimeIndex`` can parse.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the ``Year``, ``Month``, ``Day`` and ``Hour``
        columns have been added (or overwritten if already present). The
        frame passed in is not modified.
    """
    df = df.copy()
    # Convert once and reuse: building the index is the costly step when
    # 'Dates' holds strings, and every field below reads from the same index.
    dates = pd.DatetimeIndex(df['Dates'])
    df['Year'] = dates.year
    df['Month'] = dates.month
    df['Day'] = dates.day
    df['Hour'] = dates.hour
    return df

# Extract elements of dates
train_all = dateAttributes(train_all)


# Column names to binarize:
bin_cols = ['Year', 'Month', 'Hour', 'DayOfWeek', 'PdDistrict']

# Binarize columns identified above and add to dataframe
for column in bin_cols:
    dummies = pd.get_dummies(train_all[column])
    train_all[dummies.columns] = dummies

# Encode all categories into integers
encoder = LabelEncoder()
labels_all = encoder.fit_transform(train_all['Category'])

#Extract categories names from encoder for adding back into final output csv file
categories = encoder.classes_

# Get all data
train_data_all = np.array(train_all.drop(['Category', 'Dates'], axis=1))

# Define fraction of data to be used as test data
fraction = 0.33

# Split data into training and test data/labels randomly according to fraction specified
train_labels, test_labels, train_data, test_data = train_test_split(labels_all, train_data_all, test_size=fraction)


print 'Check that data has been formed correctly:\n'

print 'Training data shape: ', train_data.shape
print 'Training labels shape: ', len(train_labels)#train_labels.shape

print 'Test data shape: ', test_data.shape
print 'Test labels shape: ', len(test_labels)#test_labels.shape

print '\nTop row of training data:\n'

print train_data [0,]

Check that data has been formed correctly:

Training data shape:  (588292, 65)
Training labels shape:  588292
Test data shape:  (289757, 65)
Test labels shape:  289757

Top row of training data:

['VIOLATION OF MUNICIPAL POLICE CODE' 'Monday' 'TENDERLOIN' 'ARREST, CITED'
 'JONES ST / GOLDEN GATE AV' -122.412224164736 37.7820729312029 2008 12 8 6
 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0]


In [80]:
colNames = train_all.columns[2:]
print colNames

Index([  u'Descript',  u'DayOfWeek', u'PdDistrict', u'Resolution',
          u'Address',          u'X',          u'Y',       u'Year',
            u'Month',        u'Day',       u'Hour',          2003,
                2004,          2005,          2006,          2007,
                2008,          2009,          2010,          2011,
                2012,          2013,          2014,          2015,
                   1,             2,             3,             4,
                   5,             6,             7,             8,
                   9,            10,            11,            12,
                   0,            13,            14,            15,
                  16,            17,            18,            19,
                  20,            21,            22,            23,
           u'Friday',     u'Monday',   u'Saturday',     u'Sunday',
         u'Thursday',    u'Tuesday',  u'Wednesday',    u'BAYVIEW',
          u'CENTRAL',  u'INGLESIDE',    u'MISSION',   u'NORTHE

## To Do
### Feature Engineering
* Resolution feature extraction
* Description feature extraction
* Address / X Y coordinate mapping to more specific locations than neighborhoods?
    - round to values (determine precision)
    - create XY as single string column
    - map to features
* Types of crimes from category (violent, non-violent)


### Model Selection
Test with models that can support predict_proba to predict probability of all categories

## Model Evaluation
### Logistic Regression Baseline

In [None]:
# Helper function to vectorize text columns and create new data/vocabulary for training
# Maybe make it so it trains and produces predictions/accuracy as well?

# Make second function to do parameter optimization 

In [77]:
# Columns to use in baseline classification
cols = range(11, len(colNames))

dev_data = train_data[0:10000,]
dev_labels = train_labels[0:10000]

dev_test_data = test_data[0:10000,]
dev_test_labels = test_labels[0:10000]

lgr = LogisticRegression()
print 'Begin training'
lgr.fit(dev_data[:,cols], dev_labels)
print 'Completed training'

print 'Begin prediction'
pred_probs = lgr.predict_proba(dev_test_data[:,cols])

print 'Completed prediction\n'

print 'Log score: ', metrics.log_loss(dev_test_labels, pred_probs), '\n'

print 'Probability prediction examples: ', pred_probs[0,]

Begin training
Completed training
Begin prediction
Completed prediction

Log score:  2.63675968946 

Probability prediction examples:  [ 0.00226787  0.06127294  0.00037161  0.00049842  0.07778822  0.00438893
  0.00261177  0.02516851  0.00057556  0.00102489  0.00221792  0.00039629
  0.01265475  0.01571769  0.00079513  0.00142895  0.22935539  0.00225922
  0.00089186  0.00910377  0.1185118   0.13201473  0.00088252  0.00258155
  0.01032715  0.00430221  0.01273573  0.00558163  0.00053202  0.00286842
  0.00064252  0.01896952  0.01009124  0.08776793  0.11717216  0.02110211
  0.00312706]


In [None]:
# Add the Year, Month, Day and Hour columns to the Kaggle test frame using the
# same helper that was applied to the training frame.
test_final = dateAttributes(test_final)

In [94]:
# Column names to binarize (same list as used for the training frame):
bin_cols = ['Year', 'Month', 'Hour', 'DayOfWeek', 'PdDistrict']

# Binarize columns identified above and add to dataframe.
# Each get_dummies result is written into test_final under its own column
# labels: labels not yet in the frame are appended, existing ones overwritten.
# NOTE(review): Month values and Hour values produce overlapping integer
# labels, so the Hour dummies presumably replace the Month dummies here just
# as the training frame's printed column index shows - confirm. Any fix must
# be applied to the training cell too, since later cells select features
# purely by column position.
for column in bin_cols:
    print 'Starting ', column
    dummies = pd.get_dummies(test_final[column])
    test_final[dummies.columns] = dummies
    print 'Finished ', column

# Row identifiers from the test file, kept separately from the feature matrix.
ids = test_final['Id']

# Array of every column of test_final; later cells slice out the feature
# columns by position.
test_final_data = np.array(test_final)

Starting  Year
Finished  Year
Starting  Month
Finished  Month
Starting  Hour
Finished  Hour
Starting  DayOfWeek
Finished  DayOfWeek
Starting  PdDistrict
Finished  PdDistrict


In [97]:
print test_final_data[0,range(12,len(test_final.columns))]

[0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]


In [98]:
lgrFinal = LogisticRegression()

print 'Beginning training'
lgrFinal.fit(train_data_all[:,cols], labels_all)
print 'Completed training'

Beginning training
Completed training


In [101]:
print 'Begin prediction'
pred_probs = lgrFinal.predict_proba(test_final_data[:,range(11,len(test_final.columns))])
print 'Completed prediction'

Begin prediction
Completed training


NameError: name 'cat' is not defined

In [116]:
output = pd.DataFrame(pred_probs, columns=categories)
file_name = '../data/matthew_submission.csv'
print 'Output to file: ', file_name
output.to_csv(file_name, index=True, index_label='Id')
print 'File creation complete'

Output to file:  ../data/matthew_submission.csv
File creation complete


In [118]:
# Show (rows, columns) of the submission frame as a final size check.
print output.shape

(884262, 39)
