In [1]:
%matplotlib inline

# General libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA

# SK-learn libraries for evaluation.
from sklearn import metrics

# true division for integers in 2.7
from __future__ import division



In [2]:
# Load the training set, discard the columns that are not modelled, and keep
# only rows whose coordinates fall inside the expected longitude/latitude range.
df = pd.read_csv('train.csv', parse_dates=[0]).drop(['Descript', 'Resolution'], axis=1)
df = df[(df.X < -121) & (df.Y < 80)]

# Store the crime category as a pandas categorical and expose its integer codes.
df['Category'] = df['Category'].astype('category')
df['Cat_codes'] = df['Category'].cat.codes

# `df` and `df_train` are two names for the same frame.
df_train = df
print(df_train.shape)


(877982, 8)


In [3]:
# Load the test set; its second column (index 1) is parsed as datetimes.
df_test = pd.read_csv(filepath_or_buffer='test.csv', parse_dates=[1])
df_test.shape

(884262, 7)

In [4]:
# Load the San Francisco schools dataset and list the columns it provides.
df_sch = pd.read_csv(filepath_or_buffer='schools.csv')
df_sch.columns

Index([u'Campus Name', u'CCSF Entity', u'Lower Grade', u'Upper Grade',
       u'Grade Range', u'Category', u'Map Label', u'Lower Age', u'Upper Age',
       u'General Type', u'CDS Code', u'Campus Address', u'Supervisor District',
       u'County FIPS', u'County Name', u'Location 1'],
      dtype='object')

In [5]:
# One row per distinct address, carrying the coordinates of its first occurrence.
df_addr = (
    df[['Address', 'X', 'Y']]
    .drop_duplicates(subset='Address')
    .reset_index(drop=True)
)
print(df_addr.shape)

(23191, 3)


### Add a True / False column flagging addresses that lie within a given distance of a school

In [6]:
# Strip X Y data from school dataset
def school_proximity(dist, df, schools_csv='schools.csv'):
    """Flag every row of ``df`` whose address lies within ``dist`` of a school.

    The nearest-school distance is computed once per distinct address found in
    ``df`` itself (using the coordinates of the first row seen for that
    address), so the function works for any frame that has ``Address``, ``X``
    and ``Y`` columns and does not depend on notebook-level state.

    Parameters
    ----------
    dist : number
        Distance threshold, in the same units as the scaled coordinates
        (approximately metres, see the scale factors below).
    df : pandas.DataFrame
        Frame with ``Address``, ``X`` (longitude) and ``Y`` (latitude)
        columns. It is modified in place: a boolean ``by_school`` column is
        added or overwritten.
    schools_csv : str, optional
        Path of the schools file. It must contain a ``Location 1`` text column
        with the latitude and longitude embedded in it.

    Returns
    -------
    None
    """
    # Latitude looks like "37.7749" and longitude like "-122.4194" inside the
    # free-text location field; take the first match of each pattern.
    schools = pd.read_csv(schools_csv)
    location = schools['Location 1'].str
    schools['Y'] = location.findall(r'\d\d\.\d+').str.get(0).astype('float64')
    schools['X'] = location.findall(r'\-\d{3}\.\d+').str.get(0).astype('float64')
    # Schools without parseable coordinates cannot be measured against.
    schools = schools[['X', 'Y']].dropna()

    addresses = df[['Address', 'X', 'Y']].drop_duplicates(subset='Address')

    if len(schools) and len(addresses):
        # Scale degrees to approximate metres (about 88 km per degree of
        # longitude and 111 km per degree of latitude around San Francisco),
        # then broadcast to an (addresses x schools) distance matrix.
        east = (addresses.X.values[:, None] - schools.X.values[None, :]) * 88000
        north = (addresses.Y.values[:, None] - schools.Y.values[None, :]) * 111000
        closest_dist = np.sqrt(east ** 2 + north ** 2).min(axis=1)
    else:
        # No schools to compare with: nothing can be "near" a school.
        closest_dist = np.full(len(addresses), np.inf)

    # Map the per-address flag back onto every row of the caller's frame.
    near_school = pd.Series(closest_dist < dist, index=addresses.Address.values)
    df['by_school'] = df['Address'].map(near_school)


### Create grouping of addresses based on volume of crime at particular address

In [7]:
def group_by_vol(num_group, df):
    """Cluster addresses by how many crimes of each category they have seen.

    The per-address category counts are always taken from the notebook-level
    ``df_train``; each category column is min-max scaled and the addresses are
    split into ``num_group`` agglomerative clusters. The cluster label of each
    row's address is written to ``df['Addr_Group_V']`` in place (addresses
    absent from ``df_train`` get NaN). Nothing is returned.
    """
    # Rows: addresses, columns: crime categories, values: number of incidents.
    counts = (
        df_train.Category
        .groupby(df_train.Address)
        .value_counts()
        .unstack(level=0)
        .fillna(value=0)
        .T
    )

    # Scale every category column to [0, 1] before clustering.
    scaled = pd.DataFrame(MinMaxScaler().fit_transform(counts.values))
    labels = AgglomerativeClustering(n_clusters=num_group).fit_predict(scaled)

    # Address -> cluster label lookup, applied to every row of the caller's frame.
    address_to_group = pd.Series(labels, index=counts.index, name='Addr_Group_V')
    df['Addr_Group_V'] = df['Address'].map(address_to_group)
    

### Create grouping of addresses based on ratio of crime type at particular address

In [8]:
def group_by_ratio(num_group, df):
    """Cluster addresses by the mix (ratio) of crime categories seen there.

    The per-address category counts are always taken from the notebook-level
    ``df_train`` and converted to each address's share per category; the
    addresses are then split into ``num_group`` agglomerative clusters. The
    cluster label of each row's address is written to ``df['Addr_Group_R']``
    in place (addresses absent from ``df_train`` get NaN). Nothing is returned.
    """
    # Rows: addresses, columns: crime categories, values: number of incidents.
    counts = (
        df_train.Category
        .groupby(df_train.Address)
        .value_counts()
        .unstack(level=0)
        .fillna(value=0)
        .T
    )

    # Turn counts into the fraction of each address's crimes per category.
    shares = counts.div(counts.sum(axis=1), axis=0)
    labels = AgglomerativeClustering(n_clusters=num_group).fit_predict(shares)

    # Address -> cluster label lookup, applied to every row of the caller's frame.
    address_to_group = pd.Series(labels, index=shares.index, name='Addr_Group_R')
    df['Addr_Group_R'] = df['Address'].map(address_to_group)

### Preprocess data to include address categorization

In [9]:
def process_data(sdist, vnum, rnum, df):
    """Add the engineered date and address features to ``df`` in place.

    Parameters
    ----------
    sdist : number
        Distance threshold passed to ``school_proximity``.
    vnum : int
        Number of address clusters passed to ``group_by_vol``.
    rnum : int
        Number of address clusters passed to ``group_by_ratio``.
    df : pandas.DataFrame
        Frame with ``Dates`` (datetime), ``Address``, ``PdDistrict``, ``X``
        and ``Y`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    # Address-level features; each helper writes its column onto df.
    school_proximity(sdist, df)
    group_by_vol(vnum, df)
    group_by_ratio(rnum, df)

    # Date related information
    stamps = df.Dates.dt
    df['YEAR'] = stamps.year
    df['MONTH'] = stamps.month
    df['DOM'] = stamps.day
    df['DOW'] = stamps.weekday
    df['HOUR'] = stamps.hour
    df['MIN'] = stamps.minute
    # True when the report time is exactly on the hour or the half hour.
    df['MIN_split'] = df.MIN.isin([0, 30])
    # True for reports at 12:xx or 18:xx. Written with isin() because
    # `HOUR == 12 | (HOUR == 18)` binds as `HOUR == (12 | ...)`, which never
    # matches hour 18.
    df['HOUR_RATIO'] = df.HOUR.isin([12, 18])

    # Address related information
    df['Block_split'] = df.Address.str.contains('Block')
    # Coordinates are centred and range-scaled with this frame's own statistics.
    # NOTE(review): train and test frames therefore use different scales - confirm
    # this is intended before enabling X_norm / Y_norm as features.
    df['X_norm'] = (df.X - df.X.mean()) / (df.X.max() - df.X.min())
    df['Y_norm'] = (df.Y - df.Y.mean()) / (df.Y.max() - df.Y.min())

    # Combined Date and Address information, e.g. "2015 NORTHERN"
    df['Year_District'] = df.YEAR.astype(str).str.cat(df.PdDistrict.astype(str), sep=' ')

    return df
    

### Assemble preprocessed data into dataframe for modelling

In [14]:
def assemble(df, features=None):
    """Build the model-ready feature matrix from a preprocessed frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already carries the engineered columns named in
        ``features``.
    features : sequence of str, optional
        Column names to include, in output order. Columns listed in the
        one-hot table below are expanded with ``pandas.get_dummies``; any other
        column (for example ``HOUR_RATIO``, ``MIN_split``, ``by_school``,
        ``Block_split``, ``X_norm``, ``Y_norm``) is copied through unchanged.
        When omitted, the notebook's current feature set is used: ``MONTH``,
        ``HOUR_RATIO``, ``MIN_split``, ``by_school``, ``Addr_Group_R``,
        ``Addr_Group_V``, ``Block_split``.

    Returns
    -------
    pandas.DataFrame
        New frame indexed like ``df`` with missing values replaced by 0.
        Its shape is printed as a side effect.

    Raises
    ------
    KeyError
        If a requested feature is not a column of ``df``.
    """
    if features is None:
        features = ('MONTH', 'HOUR_RATIO', 'MIN_split', 'by_school',
                    'Addr_Group_R', 'Addr_Group_V', 'Block_split')

    # Columns that are binarized, and the prefix given to their dummy columns.
    dummy_prefix = {
        'Year_District': 'Year_District',
        'YEAR': 'Year',
        'MONTH': 'Month',
        'DOM': 'DofM',
        'DOW': 'DofW',
        'HOUR': 'Hour',
        'PdDistrict': 'District',
        'Addr_Group_V': 'AddrV',
        'Addr_Group_R': 'AddrR',
    }

    # Only the requested features are encoded, and everything is joined in a
    # single concat instead of re-copying the growing frame once per feature.
    parts = []
    for name in features:
        if name in dummy_prefix:
            parts.append(pd.get_dummies(df[name], prefix=dummy_prefix[name]))
        else:
            parts.append(df[name])

    if parts:
        new = pd.concat(parts, axis=1)
    else:
        new = pd.DataFrame(data=None, index=df.index)

    new = new.fillna(0)

    # Single-argument form prints the same text on Python 2 and Python 3.
    print('Full dataset shape:  %s' % (new.shape,))

    return new

In [15]:
def train_test_nb(df_adj): 
    nb = BernoulliNB()
    print '\nNB Cross Val Score', cross_val_score(nb, df_adj, df.Category, scoring = 'neg_log_loss').mean()
         

In [16]:
def test_predict(df, train_features=None):
    """Fit Bernoulli naive Bayes on the training set and predict categories for ``df``.

    Parameters
    ----------
    df : feature matrix
        Assembled features to predict on; its columns must line up with the
        training features.
    train_features : feature matrix, optional
        Assembled training features matching ``df_train.Category``. Pass the
        already-built training matrix to avoid repeating the preprocessing.
        When omitted, the training features are rebuilt with
        ``assemble(process_data(40, 3, 500, df_train))`` as before.

    Returns
    -------
    numpy.ndarray
        Predicted category for every row of ``df``. The first 20 predictions
        and the shape are also printed.
    """
    if train_features is None:
        train_features = assemble(process_data(40, 3, 500, df_train))

    nb = BernoulliNB()
    nb.fit(train_features, df_train.Category)
    y_pred = nb.predict(df)

    print(y_pred[:20])
    print(y_pred.shape)
    return y_pred

### Run NB model with parameters for school distance and number of groups for address by crime vol and ratio 

In [17]:
# 40: school distance threshold, 3: volume-based address groups, 500: ratio-based address groups.
df_trainset = assemble(process_data(sdist=40, vnum=3, rnum=500, df=df_train))

Full dataset shape:  (877982, 519)


In [18]:
# Same preprocessing parameters as the training features, applied to the test frame.
df_testset = assemble(process_data(sdist=40, vnum=3, rnum=500, df=df_test))

Full dataset shape:  (884262, 519)


In [None]:
# Cross-validate the Bernoulli naive Bayes model on the assembled training features.
train_test_nb(df_adj=df_trainset)

In [21]:
# Fit on the training data and print a sample of the predictions for the test features.
test_predict(df=df_testset)

Full dataset shape:  (877982, 519)
['MISSING PERSON' 'OTHER OFFENSES' 'BURGLARY' 'ASSAULT' 'ASSAULT'
 'OTHER OFFENSES' 'BURGLARY' 'VEHICLE THEFT' 'ASSAULT' 'LARCENY/THEFT'
 'VEHICLE THEFT' 'ASSAULT' 'ASSAULT' 'LARCENY/THEFT' 'LARCENY/THEFT'
 'OTHER OFFENSES' 'OTHER OFFENSES' 'MISSING PERSON' 'LARCENY/THEFT'
 'VEHICLE THEFT']
(884262L,)


### Oversample the classes and compare against the unsampled baseline

In [None]:
def ros(df_adj):
    
    X_train, X_dev, y_train, y_dev = train_test_split(df_adj, df.Category, test_size = 0.5, stratify = df.Category, random_state = 42)

    dict = df.Category.value_counts()
    dict = np.log2(dict)
    dict = dict / dict.sum()
    dict = (dict*2347000)
    dict = dict.astype(int)
    dict = dict.to_dict()

    from imblearn.over_sampling import RandomOverSampler
    ros = RandomOverSampler(ratio = dict, random_state = None)
    X_resampled, y_resampled = ros.fit_sample(X_train, y_train)
    
    nb1 = MultinomialNB()
    nb1.fit(X_resampled, y_resampled)
    y_prednb = nb1.predict(X_dev)
    print 'Oversampled NB Score\n', metrics.classification_report(y_dev, y_prednb)
    print 'Oversampled confusion matrix', metrics.confusion_matrix(y_dev, y_prednb)
    
    
    nb2 = MultinomialNB()
    nb2.fit(X_train, y_train)
    y_prednb2 = nb2.predict(X_dev)
    print 'Standard NB Score\n', metrics.classification_report(y_dev, y_prednb2)
    


In [None]:
# `pro_df` is not defined anywhere in this notebook; use the training feature
# matrix that was assembled from the processed training frame instead.
ros(df_trainset)

In [None]:
# Random forest baseline on the assembled training features (`pro_df` is not
# defined in this notebook, so the existing training matrix is used). The seed
# makes the reported score reproducible between runs.
rf = RandomForestClassifier(random_state=42)
rf_score = cross_val_score(rf, df_trainset, df.Category, scoring='neg_log_loss').mean()
print('\nRF Cross Val Score %s' % rf_score)