# Submission Draft for San Francisco Crime Classification

https://www.kaggle.com/c/sf-crime

In [18]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV

%reload_ext autoreload
%autoreload 2
from src.features.build_features import DataFrameSelector, SFCCTransformer, print_summary, prep_submissions, prep_data

import inspect

# Load Data

In [6]:
train_pd = pd.read_csv("../data/raw/train.csv.zip", compression="zip")
test_pd = pd.read_csv("../data/raw/test.csv.zip", compression="zip")
sample_submissions = pd.read_csv("../data/raw/sampleSubmission.csv.zip", compression="zip")

In [7]:
# extract some more features using our custom transformer
sfcc = SFCCTransformer()
pipe = Pipeline([
    ("transformer", sfcc)
])
train_pd = pipe.transform(train_pd)
test_pd = pipe.transform(test_pd)

In [19]:
train_data, train_labels, test_data, test_ids = prep_data(train_pd, test_pd, rs = 42)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

#>>> X, y = make_classification(n_samples=1000, n_features=4,
#...                            n_informative=2, n_redundant=0,
#...                            random_state=0, shuffle=False)
#>>> clf = RandomForestClassifier(n_estimators=100, max_depth=2,
#...                              random_state=0)
#>>> clf.fit(X, y)
#RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#            max_depth=2, max_features='auto', max_leaf_nodes=None,
#            min_impurity_decrease=0.0, min_impurity_split=None,
#            min_samples_leaf=1, min_samples_split=2,
#            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
#            oob_score=False, random_state=0, verbose=0, warm_start=False)
#>>> print(clf.feature_importances_)
#[0.14205973 0.76664038 0.0282433  0.06305659]
#>>> print(clf.predict([[0, 0, 0, 0]]))
#[1]


In [None]:
from sklearn.ensemble import RandomForestClassifier

# pipeline to prep our data and fit classifiers
selector = DataFrameSelector(["X", "Y"])
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0, n_jobs=-1)


pipe = Pipeline([
    ("selector", selector)
    ,("clf", clf)
])

# TODO add more classifier types and attributes, use list of dicts for alt paths
# TODO figure out how to add ensembles to this, maybe with soft voting?
# TODO SVM, knn, random forest, etc
param_grid = {
    "selector__attribute_names": [
        , ["X", "Y", "is_latenight"]
#         ,["hour_of_day_sin", "hour_of_day_cos"]
#         ,["X", "Y", "hour_of_day_sin", "hour_of_day_cos"]
    ]
    ,"clf__n_estimators": [10, 50, 100]
    ,
    
}

# TODO figure out how to do stratified kfold by category
# TODO figure out how to add bagging to this
search = GridSearchCV(pipe, param_grid, iid = True, cv = 3, return_train_score = False, scoring ='neg_log_loss')

_ = search.fit(train_data, train_labels)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# pipeline to prep our data and fit classifiers
selector = DataFrameSelector(["X", "Y"])
clf = GradientBoostingClassifier(random_state=0)


pipe = Pipeline([
    ("selector", selector)
    ,("clf", clf)
])

# TODO add more classifier types and attributes, use list of dicts for alt paths
# TODO figure out how to add ensembles to this, maybe with soft voting?
# TODO SVM, knn, random forest, etc
param_grid = {
    "selector__attribute_names": [
        ["X", "Y"]
        , ["X", "Y", "is_latenight"]
#         ,["hour_of_day_sin", "hour_of_day_cos"]
#         ,["X", "Y", "hour_of_day_sin", "hour_of_day_cos"]
    ]
    #,"knn__n_neighbors": [3, 16, 26]
}

# TODO figure out how to do stratified kfold by category
# TODO figure out how to add bagging to this
search = GridSearchCV(pipe, param_grid, iid = True, cv = 3, return_train_score = False)

_ = search.fit(train_data, train_labels)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

In [None]:
from sklearn.neuralnetwork import MLPClassifier

# pipeline to prep our data and fit classifiers
selector = DataFrameSelector(["X", "Y"])
clf = MLPClassifier()


pipe = Pipeline([
    ("selector", selector)
    ,("clf", clf)
])

# TODO add more classifier types and attributes, use list of dicts for alt paths
# TODO figure out how to add ensembles to this, maybe with soft voting?
# TODO SVM, knn, random forest, etc
param_grid = {
    "selector__attribute_names": [
        ["X","Y","is_weekend", 'pdd_BAYVIEW', 'pdd_CENTRAL', 'pdd_INGLESIDE',
       'pdd_MISSION', 'pdd_NORTHERN', 'pdd_PARK', 'pdd_RICHMOND',
       'pdd_SOUTHERN', 'pdd_TARAVAL', 'pdd_TENDERLOIN', "is_late_night", "month_of_year"
#         ,["hour_of_day_sin", "hour_of_day_cos"]
#         ,["X", "Y", "hour_of_day_sin", "hour_of_day_cos"]
    ]
    #,"knn__n_neighbors": [3, 16, 26]
}

# TODO figure out how to do stratified kfold by category
# TODO figure out how to add bagging to this
search = GridSearchCV(pipe, param_grid, iid = True, cv = 3, return_train_score = False)

_ = search.fit(train_data, train_labels)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

# EDA

We have about 800k records in both train and test data sets. The train data set has the Category, Descript, and Resolution columns, which are missing from the test data set.

We will need to use the test data set to generate the submission to Kaggle.

TODO add more plots and EDA from scratch EDA

In [8]:
print(train_pd.shape)
print(test_pd.shape)

(878049, 142)
(884262, 141)


In [11]:
train_pd.head(2)

Unnamed: 0,Category,Descript,Resolution,X,Y,pdd_BAYVIEW,pdd_CENTRAL,pdd_INGLESIDE,pdd_MISSION,pdd_NORTHERN,...,day_of_month_sin,day_of_month_cos,day_of_year_sin,day_of_year_cos,week_of_year_sin,week_of_year_cos,month_of_year_sin,month_of_year_cos,quarter_of_year_sin,quarter_of_year_cos
0,WARRANTS,WARRANT ARREST,"ARREST, BOOKED",0.59,0.595,0,0,0,0,1,...,0.826,0.118,0.884,0.18,0.888,0.185,0.933,0.25,1.0,0.5
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,"ARREST, BOOKED",0.59,0.595,0,0,0,0,1,...,0.826,0.118,0.884,0.18,0.888,0.185,0.933,0.25,1.0,0.5


In [15]:
train_pd["Category"]

0                       WARRANTS
1                 OTHER OFFENSES
2                 OTHER OFFENSES
3                  LARCENY/THEFT
4                  LARCENY/THEFT
5                  LARCENY/THEFT
6                  VEHICLE THEFT
7                  VEHICLE THEFT
8                  LARCENY/THEFT
9                  LARCENY/THEFT
10                 LARCENY/THEFT
11                OTHER OFFENSES
12                     VANDALISM
13                 LARCENY/THEFT
14                  NON-CRIMINAL
15                  NON-CRIMINAL
16                       ROBBERY
17                       ASSAULT
18                OTHER OFFENSES
19                  NON-CRIMINAL
20                 LARCENY/THEFT
21                       ROBBERY
22                      WARRANTS
23                  NON-CRIMINAL
24                 LARCENY/THEFT
25                  NON-CRIMINAL
26                 LARCENY/THEFT
27                 LARCENY/THEFT
28                 LARCENY/THEFT
29                OTHER OFFENSES
          

In [11]:
test_pd.head(3)

Unnamed: 0,Id,X,Y,pdd_BAYVIEW,pdd_CENTRAL,pdd_INGLESIDE,pdd_MISSION,pdd_NORTHERN,pdd_PARK,pdd_RICHMOND,...,day_of_month_sin,day_of_month_cos,day_of_year_sin,day_of_year_cos,week_of_year_sin,week_of_year_cos,month_of_year_sin,month_of_year_cos,quarter_of_year_sin,quarter_of_year_cos
0,0,0.766,0.241,1,0,0,0,0,0,0,...,0.985,0.373,0.9,0.2,0.925,0.232,0.933,0.25,1.0,0.5
1,1,0.82,0.218,1,0,0,0,0,0,0,...,0.985,0.373,0.9,0.2,0.925,0.232,0.933,0.25,1.0,0.5
2,2,0.589,0.748,0,0,0,0,1,0,0,...,0.985,0.373,0.9,0.2,0.925,0.232,0.933,0.25,1.0,0.5


# Classification

In [102]:
# TODO add feature selection

In [12]:
# shuffles the train data
# note, we don't need a dev set since we are using cross validation
train_data = train_pd.sample(frac=1, random_state = 0)

print(train_data.shape)

# gets the train labels
train_labels = train_data["Category"]

print(train_labels.shape)

(878049, 142)
(878049,)


In [None]:
# pipeline to prep our data and fit classifiers
selector = DataFrameSelector(["X", "Y"])
km = KMeans(n_clusters = 39)

pipe = Pipeline([
    ("selector", selector)
    ,("km", km)
])

# TODO add more classifier types and attributes, use list of dicts for alt paths
# TODO figure out how to add ensembles to this, maybe with soft voting?
# TODO SVM, knn, random forest, etc
param_grid = {
    "selector__attribute_names": [
        ["X", "Y"]
        , ["X", "Y", "is_latenight"]
#         ,["hour_of_day_sin", "hour_of_day_cos"]
#         ,["X", "Y", "hour_of_day_sin", "hour_of_day_cos"]
    ]
}

# TODO figure out how to do stratified kfold by category
# TODO figure out how to add bagging to this
search = GridSearchCV(pipe, param_grid, iid = True, cv = 3, return_train_score = False)

_ = search.fit(train_data, train_labels)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

In [16]:
# makes predictions against our test data using our best classifier
predsproba = search.best_estimator_.predict_proba(test_pd)
print(predsproba[0:3])

AttributeError: 'KMeans' object has no attribute 'predict_proba'

In [106]:
# converts predicted probabilities into submission panda
submissions = prep_submissions(predsproba, train_pd.Category)

print(submissions.shape)
print(submissions.head(3))

# checks submission has the correct number of rows and columns
assert(sample_submissions.shape[0] == submissions.shape[0])
assert(sample_submissions.shape[1] == submissions.shape[1])

(884262, 40)
   Id  ARSON  ASSAULT  BAD CHECKS  BRIBERY  BURGLARY  DISORDERLY CONDUCT  \
0   0    0.0     0.12         0.0      0.0      0.00                 0.0   
1   1    0.0     0.15         0.0      0.0      0.00                 0.0   
2   2    0.0     0.04         0.0      0.0      0.19                 0.0   

   DRIVING UNDER THE INFLUENCE  DRUG/NARCOTIC  DRUNKENNESS  ...  \
0                          0.0           0.04          0.0  ...   
1                          0.0           0.04          0.0  ...   
2                          0.0           0.00          0.0  ...   

   SEX OFFENSES NON FORCIBLE  STOLEN PROPERTY  SUICIDE  SUSPICIOUS OCC  TREA  \
0                        0.0              0.0      0.0            0.08   0.0   
1                        0.0              0.0      0.0            0.08   0.0   
2                        0.0              0.0      0.0            0.04   0.0   

   TRESPASS  VANDALISM  VEHICLE THEFT  WARRANTS  WEAPON LAWS  
0       0.0       0.12       

In [107]:
# save submissions to disk
submissions.to_csv("../data/processed/submission.csv.gz", index = False, compression = "gzip")

# Appendicies

## DataFrameSelector

In [108]:
lines = inspect.getsource(DataFrameSelector)
print(lines)

class DataFrameSelector(BaseEstimator, TransformerMixin): 
    """
    Simple helper class, meant make it easier to use Pandas 
    along with sklearn Pipeline. Create and initate with a 
    list of features, then when the pipeline transform function
    is called, will return a Numpy array of the features.
    
    See Chap 2 transformation pipelines
    
    Example:
        train_pd = pd.read_csv("data.csv")
        num_features = ["X", "Y"]
        num_pipeline = Pipeline([
            ("selector", DataFrameSelector(num_features))
        ])
        train_prepared = num_pipeline.transform(train_pd)
        
    """
    def __init__(self, attribute_names): 
        self.attribute_names = attribute_names 
        
    def fit(self, X, y = None): 
        return self 
    
    def transform(self, X): 
        return X[self.attribute_names].values



## SFCCTransformer

In [109]:
lines = inspect.getsource(SFCCTransformer)
print(lines)

class SFCCTransformer(BaseEstimator, TransformerMixin):
    """
    Helper class for our SanFrancisco Crime Classification project.
    
    Centralizes transformation logic, and make it easier to use
    transformations with Pandas, Pipeline, and gscv. Note, meant to transform
    Pandas into Pandas.
    
    Should use in conjunction with DataFrameSelector and one hot encoders.
    
    See Chap 2 custom transformers
    
    """
    def __init__(self, holiday_calendar = USFederalHolidayCalendar(), latitude_outlier = 50):
        self.holiday_calendar = holiday_calendar
        self.latitude_outlier = latitude_outlier
        
    def fit(self, X, y = None):
        return self # no fitting
    
    def transform(self, X, y = None):
        
        def add_delta(dtt, delta):
            """
            helper funciton, given a Series of dates, 
            returns Series of delta since the mininum date
            
            see Linda's baseline code
            """
            re

## prep_submissions

In [110]:
lines = inspect.getsource(prep_submissions)
print(lines)

def prep_submissions(predsproba, categories):
    """
    Helper function to prepare the raw predsproba array into a panda with the correct column headers and an index
    """
    cols = np.sort(pd.unique(categories))
    submissions = pd.DataFrame(data = predsproba, columns = cols)
    
    # rounds any floats to less precision
    submissions= submissions[cols].round(2)
    
    # adds an Id column
    idx = np.arange(0, len(predsproba))
    submissions.insert(loc = 0, column = "Id", value = idx.tolist())
    return(submissions)

