# Submission 01 for San Francisco Crime Classification
Yang Yang Qian

https://www.kaggle.com/c/sf-crime

In [96]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

%reload_ext autoreload
%autoreload 2
from src.features.build_features import DataFrameSelector, SFCCTransformer, print_summary, prep_submissions

import inspect

# Load Data

In [97]:
# read the zipped Kaggle CSVs straight from ../data/raw: the train set, the test set,
# and the sample submission (used further down to sanity-check the shape of our own submission)
train_pd = pd.read_csv("../data/raw/train.csv.zip", compression="zip")
test_pd = pd.read_csv("../data/raw/test.csv.zip", compression="zip")
sample_submissions = pd.read_csv("../data/raw/sampleSubmission.csv.zip", compression="zip")

In [98]:
# extract some more features using our custom transformer
# (SFCCTransformer.fit is a no-op, so transform is called without a prior fit)
sfcc = SFCCTransformer()
pipe = Pipeline([
    ("transformer", sfcc)
])
# NOTE: train_pd / test_pd are rebound to the transformed frames here, so re-running
# this cell on its own feeds already-transformed data back into the transformer;
# re-run the load cell first
train_pd = pipe.transform(train_pd)
test_pd = pipe.transform(test_pd)

# EDA

We have roughly 880k records in each of the train and test data sets (878,049 and 884,262 rows respectively). The train data set has the Category, Descript, and Resolution columns, which are missing from the test data set.

We will need to use the test data set to generate the submission to Kaggle.

TODO: add more plots and analysis, carried over from the scratch EDA work.

In [99]:
# (rows, columns) of the transformed train and test frames, one per line
print(train_pd.shape, test_pd.shape, sep = "\n")

(878049, 31)
(884262, 29)


In [100]:
# peek at the first three rows of the transformed train frame
train_pd.head(3)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,hour_delta,...,year,is_weekend,is_holiday,hour_of_day_sin,hour_of_day_cos,day_of_week_sin,day_of_week_cos,month_of_year_sin,month_of_year_cos,is_latenight
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,108263,...,2015,0,0,-0.259,0.966,0.975,-0.223,0.866,-0.5,1
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,108263,...,2015,0,0,-0.259,0.966,0.975,-0.223,0.866,-0.5,1
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,108263,...,2015,0,0,-0.259,0.966,0.975,-0.223,0.866,-0.5,1


In [101]:
# peek at the first three rows of the transformed test frame
test_pd.head(3)

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y,hour_delta,day_delta,week_delta,...,year,is_weekend,is_holiday,hour_of_day_sin,hour_of_day_cos,day_of_week_sin,day_of_week_cos,month_of_year_sin,month_of_year_cos,is_latenight
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051,108311,4512,644,...,2015,1,0,-0.259,0.966,-0.782,0.623,0.866,-0.5,1
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432,108311,4512,644,...,2015,1,0,-0.259,0.966,-0.782,0.623,0.866,-0.5,1
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212,108311,4512,644,...,2015,1,0,-0.259,0.966,-0.782,0.623,0.866,-0.5,1


# Classification

In [102]:
# TODO add feature selection

In [103]:
# Shuffle every training row (frac=1) with a fixed seed so the order is repeatable.
# No separate dev set is held out: model selection below relies on cross validation.
train_data = train_pd.sample(frac = 1, random_state = 0)
print(train_data.shape)

# The classification target is the Category of each (shuffled) row.
train_labels = train_data.loc[:, "Category"]
print(train_labels.shape)

(878049, 31)
(878049,)


In [104]:
# pipeline to prep our data and fit classifiers:
# the selector pulls the chosen columns out of the DataFrame as a Numpy array,
# which is then handed to the k-nearest-neighbours classifier
selector = DataFrameSelector(["X", "Y"])
knn = KNeighborsClassifier()

pipe = Pipeline([
    ("selector", selector),
    ("knn", knn),
])

# TODO add more classifier types and attributes, use list of dicts for alt paths
# TODO figure out how to add ensembles to this, maybe with soft voting?
# TODO SVM, knn, random forest, etc
# the grid is the cross product of feature sets and neighbour counts (2 x 3 candidates)
param_grid = {
    "selector__attribute_names": [
        ["X", "Y"],
        ["X", "Y", "is_latenight"],
        # candidate feature sets that are not searched yet:
        # ["hour_of_day_sin", "hour_of_day_cos"],
        # ["X", "Y", "hour_of_day_sin", "hour_of_day_cos"],
    ],
    "knn__n_neighbors": [3, 16, 26],
}

# An integer cv with a classifier already makes GridSearchCV use stratified folds
# (StratifiedKFold), so the folds are stratified by Category without extra work.
# The `iid` argument is intentionally not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError.
# NOTE(review): the score reported below is the classifier's default score (accuracy);
# if the leaderboard metric is log loss, consider scoring = "neg_log_loss" - TODO confirm
# TODO figure out how to add bagging to this
search = GridSearchCV(pipe, param_grid, cv = 3, return_train_score = False)

# prints follow the fit, so the fitted estimator's repr is not echoed by the cell;
# no need to assign to `_` (which would shadow IPython's last-output variable)
search.fit(train_data, train_labels)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.270):
{'knn__n_neighbors': 26, 'selector__attribute_names': ['X', 'Y']}


In [105]:
# class probabilities for every test row, from the best pipeline found by the grid search;
# only the first three rows are printed as a spot check
predsproba = search.best_estimator_.predict_proba(test_pd)
print(predsproba[:3])

[[0.         0.11538462 0.         0.         0.         0.
  0.         0.03846154 0.         0.03846154 0.         0.
  0.         0.03846154 0.         0.         0.         0.
  0.         0.19230769 0.         0.07692308 0.         0.
  0.         0.         0.03846154 0.         0.         0.
  0.         0.         0.07692308 0.         0.         0.11538462
  0.15384615 0.11538462 0.        ]
 [0.         0.15384615 0.         0.         0.         0.
  0.         0.03846154 0.         0.         0.         0.
  0.         0.         0.         0.         0.15384615 0.03846154
  0.         0.03846154 0.         0.46153846 0.         0.
  0.         0.03846154 0.         0.         0.         0.
  0.         0.         0.07692308 0.         0.         0.
  0.         0.         0.        ]
 [0.         0.03846154 0.         0.         0.19230769 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.23076923 0.
  0.        

In [106]:
# turn the predicted probabilities into the submission DataFrame;
# the train Category values supply the column headers
submissions = prep_submissions(predsproba, train_pd["Category"])

print(submissions.shape)
print(submissions.head(3))

# the submission must line up with Kaggle's sample: same number of rows...
assert sample_submissions.shape[0] == submissions.shape[0]
# ...and same number of columns
assert sample_submissions.shape[1] == submissions.shape[1]

(884262, 40)
   Id  ARSON  ASSAULT  BAD CHECKS  BRIBERY  BURGLARY  DISORDERLY CONDUCT  \
0   0    0.0     0.12         0.0      0.0      0.00                 0.0   
1   1    0.0     0.15         0.0      0.0      0.00                 0.0   
2   2    0.0     0.04         0.0      0.0      0.19                 0.0   

   DRIVING UNDER THE INFLUENCE  DRUG/NARCOTIC  DRUNKENNESS  ...  \
0                          0.0           0.04          0.0  ...   
1                          0.0           0.04          0.0  ...   
2                          0.0           0.00          0.0  ...   

   SEX OFFENSES NON FORCIBLE  STOLEN PROPERTY  SUICIDE  SUSPICIOUS OCC  TREA  \
0                        0.0              0.0      0.0            0.08   0.0   
1                        0.0              0.0      0.0            0.08   0.0   
2                        0.0              0.0      0.0            0.04   0.0   

   TRESPASS  VANDALISM  VEHICLE THEFT  WARRANTS  WEAPON LAWS  
0       0.0       0.12       

In [107]:
# save submissions to disk as a gzip-compressed CSV
# index = False leaves out the DataFrame row index (the frame already carries its own Id column)
submissions.to_csv("../data/processed/submission.csv.gz", index = False, compression = "gzip")

# Appendices

## DataFrameSelector

In [108]:
# show the source of the DataFrameSelector helper imported from src.features.build_features
lines = inspect.getsource(DataFrameSelector)
print(lines)

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """
    Bridge between Pandas and the sklearn Pipeline: picks a fixed
    list of columns out of a DataFrame and passes them on as a
    Numpy array.

    Construct it with the list of column names to keep. Nothing is
    learned from the data, so fit does no work; the selection
    happens when the pipeline calls transform.

    See Chap 2 transformation pipelines

    Example:
        train_pd = pd.read_csv("data.csv")
        num_features = ["X", "Y"]
        num_pipeline = Pipeline([
            ("selector", DataFrameSelector(num_features))
        ])
        train_prepared = num_pipeline.transform(train_pd)

    """
    def __init__(self, attribute_names):
        # list of column names that transform will select
        self.attribute_names = attribute_names

    def fit(self, X, y = None):
        # stateless: nothing is estimated from X or y
        return self

    def transform(self, X):
        # column subset first, then drop down to the underlying Numpy array
        selected = X[self.attribute_names]
        return selected.values



## SFCCTransformer

In [109]:
# show the source of the SFCCTransformer helper imported from src.features.build_features
lines = inspect.getsource(SFCCTransformer)
print(lines)

class SFCCTransformer(BaseEstimator, TransformerMixin):
    """
    Helper class for our SanFrancisco Crime Classification project.
    
    Centralizes transformation logic, and make it easier to use
    transformations with Pandas, Pipeline, and gscv. Note, meant to transform
    Pandas into Pandas.
    
    Should use in conjunction with DataFrameSelector and one hot encoders.
    
    See Chap 2 custom transformers
    
    """
    def __init__(self, holiday_calendar = USFederalHolidayCalendar(), latitude_outlier = 50):
        self.holiday_calendar = holiday_calendar
        self.latitude_outlier = latitude_outlier
        
    def fit(self, X, y = None):
        return self # no fitting
    
    def transform(self, X, y = None):
        
        def add_delta(dtt, delta):
            """
            helper funciton, given a Series of dates, 
            returns Series of delta since the mininum date
            
            see Linda's baseline code
            """
            re

## prep_submissions

In [110]:
# show the source of the prep_submissions helper imported from src.features.build_features
lines = inspect.getsource(prep_submissions)
print(lines)

def prep_submissions(predsproba, categories):
    """
    Wrap a raw array of predicted probabilities in a submission DataFrame.

    predsproba: 2-D array-like with one row per prediction and one column per category.
    categories: the category labels (duplicates allowed); their sorted distinct values
        become the column headers, in that order. Assumes the columns of predsproba
        follow that same sorted order - confirm against the classifier's classes.

    Returns a DataFrame with a leading "Id" column (0, 1, 2, ...) followed by one
    column per category, with the probabilities rounded to 2 decimal places.
    """
    labels = np.sort(pd.unique(categories))

    # one column per label, probabilities cut down to two decimals
    frame = pd.DataFrame(data = predsproba, columns = labels).round(2)

    # running row number goes in front as the Id column
    row_ids = list(range(len(predsproba)))
    frame.insert(loc = 0, column = "Id", value = row_ids)
    return frame

