In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [7]:
# Load the training and test sets from CSV files in the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [8]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [9]:
# Preview the first rows of the test frame (it carries an Id column and no Category).
test_df.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [10]:
# Column dtypes and non-null counts for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
Dates         878049 non-null object
Category      878049 non-null object
Descript      878049 non-null object
DayOfWeek     878049 non-null object
PdDistrict    878049 non-null object
Resolution    878049 non-null object
Address       878049 non-null object
X             878049 non-null float64
Y             878049 non-null float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB


In [11]:
# Column dtypes and non-null counts for the test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884262 entries, 0 to 884261
Data columns (total 7 columns):
Id            884262 non-null int64
Dates         884262 non-null object
DayOfWeek     884262 non-null object
PdDistrict    884262 non-null object
Address       884262 non-null object
X             884262 non-null float64
Y             884262 non-null float64
dtypes: float64(2), int64(1), object(4)
memory usage: 47.2+ MB


In [29]:
import csv
from datetime import datetime
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB

def parse_time(x):
    """Split a ``YYYY-MM-DD HH:MM:SS`` timestamp string into calendar parts.

    Returns the tuple ``(hour, day, month, year)``. Only the hour is used as
    the time-of-day component; minutes are not included.
    """
    stamp = datetime.strptime(x, "%Y-%m-%d %H:%M:%S")
    return stamp.hour, stamp.day, stamp.month, stamp.year

def get_season(x):
    """One-hot encode the season that a month number falls in.

    Parameters
    ----------
    x : int
        Month number, 1-12 as produced by ``datetime.month``. ``0`` is still
        accepted as a winter month so that existing callers keep working.

    Returns
    -------
    tuple of int
        ``(summer, fall, winter, spring)`` flags. Exactly one flag is 1 for
        any month in 1-12; all four are 0 for any other value.
    """
    summer = int(x in (5, 6, 7))
    fall = int(x in (8, 9, 10))
    # December is month 12. The previous list [11, 0, 1] omitted it, so every
    # December record ended up with no season flag set at all.
    winter = int(x in (11, 12, 0, 1))
    spring = int(x in (2, 3, 4))
    return summer, fall, winter, spring

# Build the training design matrix: keep DayOfWeek / PdDistrict and add the
# hour, the month and four one-hot season flags derived from the timestamp.
train = pd.read_csv('train.csv')
y = train['Category'].values
dates = [parse_time(stamp) for stamp in train['Dates'].values]
seasons = [get_season(parsed[2]) for parsed in dates]
train = train.drop(['Address','Category','Dates','Descript','X','Y','Resolution'],axis=1)
# parse_time yields (hour, day, month, year); only hour and month are kept.
for column, position in (('Hour', 0), ('Month', 2)):
    train[column] = pd.Series([parsed[position] for parsed in dates])
# get_season yields flags in (summer, fall, winter, spring) order.
for position, column in enumerate(('Summer', 'Fall', 'Winter', 'Spring')):
    train[column] = pd.Series([flags[position] for flags in seasons])

# Integer-encode the two remaining string columns. Codes follow the order in
# which np.unique returns the distinct values of each column.
days = {name: code for code, name in enumerate(np.unique(train.DayOfWeek.values))}
PdDis = {name: code for code, name in enumerate(np.unique(train.PdDistrict.values))}

# Nested {column: {value: code}} mapping; it is reused for the test frame.
categories_map = {'DayOfWeek': days, 'PdDistrict': PdDis}

train = train.replace(categories_map)

# Fit a Gaussian naive Bayes classifier on the encoded training features.
# The Bernoulli variant is kept below as a commented-out alternative.
#model = BernoulliNB()
model = GaussianNB()
model.fit(train,y)

# Rebuild the same feature columns for the test set, then score it.
test = pd.read_csv('test.csv')
idx = test['Id'].values
dates = [parse_time(stamp) for stamp in test['Dates'].values]
seasons = [get_season(parsed[2]) for parsed in dates]
test = test.drop(['Id','Dates','Address','X','Y'],axis=1)
# parse_time yields (hour, day, month, year); only hour and month are kept.
for column, position in (('Hour', 0), ('Month', 2)):
    test[column] = pd.Series([parsed[position] for parsed in dates])
# get_season yields flags in (summer, fall, winter, spring) order.
for position, column in enumerate(('Summer', 'Fall', 'Winter', 'Spring')):
    test[column] = pd.Series([flags[position] for flags in seasons])

# Apply the integer codes that were derived from the training frame.
test = test.replace(categories_map)

predicted = np.array(model.predict_proba(test))
# Header row: the Id column followed by one probability column per class.
labels = ['Id'] + list(model.classes_)
# NOTE(review): 'wb' is the Python 2 convention for csv files; under Python 3
# csv.writer needs a text-mode handle - confirm the target interpreter.
# NOTE(review): the file name says "bernoulinb" but the fitted model above is
# GaussianNB.
with open('bernoulinb.csv', 'wb') as outf:
  fo = csv.writer(outf, lineterminator='\n')
  fo.writerow(labels)
  fo.writerows([row_id] + list(probs) for row_id, probs in zip(idx, predicted))

In [27]:
# Sanity check: show how categories_map rewrites DayOfWeek / PdDistrict into integer codes.
train_df.replace(categories_map).head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,6,4,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,6,4,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,6,4,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,6,4,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,6,5,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [68]:
from sklearn.cross_validation import StratifiedShuffleSplit
# NOTE: range(100) yields 100 distinct labels, so every "class" has exactly one
# member. A stratified split needs at least two members per class, which is
# why this call raises the ValueError shown below.
aa = StratifiedShuffleSplit(pd.Series(np.array(range(100))), train_size=0.8)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of labels for any class cannot be less than 2.

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
import matplotlib.pylab as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.metrics import make_scorer
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from matplotlib.colors import LogNorm
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from copy import deepcopy

trainDF = pd.read_csv("train.csv")

# Standardise the coordinates; the fitted scaler is reused for the test set.
xy_scaler = preprocessing.StandardScaler().fit(trainDF[["X", "Y"]])
trainDF[["X", "Y"]] = xy_scaler.transform(trainDF[["X", "Y"]])
# Keep only rows whose standardised Y is less than 100 units from zero, then
# renumber the index from 0 so later positional lookups line up.
trainDF = trainDF[trainDF["Y"].abs() < 100]
trainDF.index = range(len(trainDF))

def parse_time(x):
    """Split a ``YYYY-MM-DD HH:MM:SS`` timestamp string into calendar parts.

    Returns the tuple ``(hour, day, month, year)``. Only the hour is used as
    the time-of-day component; minutes are not included.
    """
    stamp = datetime.strptime(x, "%Y-%m-%d %H:%M:%S")
    return stamp.hour, stamp.day, stamp.month, stamp.year

def get_season(x):
    """One-hot encode the season that a month number falls in.

    Parameters
    ----------
    x : int
        Month number, 1-12 as produced by ``datetime.month``. ``0`` is still
        accepted as a winter month so that existing callers keep working.

    Returns
    -------
    tuple of int
        ``(summer, fall, winter, spring)`` flags. Exactly one flag is 1 for
        any month in 1-12; all four are 0 for any other value.
    """
    summer = int(x in (5, 6, 7))
    fall = int(x in (8, 9, 10))
    # December is month 12. The previous list [11, 0, 1] omitted it, so every
    # December record ended up with no season flag set at all.
    winter = int(x in (11, 12, 0, 1))
    spring = int(x in (2, 3, 4))
    return summer, fall, winter, spring


def parse_data(df,logodds,logoddsPA):
    """Build the model feature matrix (and labels, when present) from a raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Dates``, ``DayOfWeek``, ``PdDistrict`` and ``Address``.
        ``Descript``, ``Resolution``, ``Category`` and ``Id`` are excluded from
        the features when present; any other column passes through unchanged.
    logodds : mapping
        Address -> per-category log-odds. Each value is presumably a pandas
        Series, so that the row-wise lookup expands into one column per
        category - verify against the caller.
    logoddsPA : mapping
        Address -> log-odds value of the address itself.

    Returns
    -------
    (features, labels)
        ``features`` is the engineered DataFrame; ``labels`` is
        ``df["Category"]`` cast to a categorical Series, or ``None`` when the
        frame has no ``Category`` column.

    Notes
    -----
    An address missing from ``logodds`` / ``logoddsPA`` propagates the lookup
    error of the mapping (``KeyError`` for a dict).
    """
    excluded = ("Descript", "Resolution", "Category", "Id")
    feature_list = [col for col in df.columns.tolist() if col not in excluded]
    # Work on an explicit copy: columns are added below, and writing into a
    # column slice of the caller's frame triggers SettingWithCopy behaviour.
    cleanData = df[feature_list].copy()
    cleanData.index = range(len(df))
    print("Creating address features")
    address_features = cleanData["Address"].apply(lambda x: logodds[x])
    address_features.columns = ["logodds"+str(x) for x in range(len(address_features.columns))]
    print("Parsing dates")
    cleanData["Time"], cleanData["Day"], cleanData["Month"], cleanData["Year"] = zip(*cleanData["Dates"].apply(parse_time))
    print("Creating one-hot variables")
    dummy_ranks_PD = pd.get_dummies(cleanData['PdDistrict'], prefix='PD')
    dummy_ranks_DAY = pd.get_dummies(cleanData["DayOfWeek"], prefix='DAY')
    cleanData["IsInterection"] = cleanData["Address"].apply(lambda x: 1 if "/" in x else 0)
    cleanData["logoddsPA"] = cleanData["Address"].apply(lambda x: logoddsPA[x])
    print("droping processed columns")
    cleanData = cleanData.drop(["PdDistrict", "DayOfWeek", "Address", "Dates"], axis=1)
    print("joining one-hot features")
    # The old `.ix[:, :]` slices selected every row and column, so joining the
    # frames directly is equivalent and does not need the removed indexer.
    features = cleanData.join(dummy_ranks_PD).join(dummy_ranks_DAY).join(address_features)
    print("creating new features")
    # keep=False marks every member of a duplicated group, which is what the
    # previous OR of the keep='first' and keep='last' masks computed.
    features["IsDup"] = features.duplicated(keep=False).astype(int)
    # Hour 0 counts as awake together with hours 8 through 23.
    features["Awake"] = features["Time"].apply(lambda x: 1 if (x==0 or (x>=8 and x<=23)) else 0)
    features["Summer"], features["Fall"], features["Winter"], features["Spring"] = zip(*features["Month"].apply(get_season))
    labels = df["Category"].astype('category') if "Category" in df.columns else None
    return features, labels


# Per-address log-odds tables used as features:
#   logoddsPA[addr] - log-odds of a training record occurring at this address
#   logodds[addr]   - per-category log-odds at this address, falling back to
#                     the global category log-odds when the address has too
#                     few records of that category
addresses = sorted(trainDF["Address"].unique())
categories = sorted(trainDF["Category"].unique())
C_counts = trainDF.groupby(["Category"]).size()
A_C_counts = trainDF.groupby(["Address", "Category"]).size()
A_counts = trainDF.groupby(["Address"]).size()
logodds = {}
logoddsPA = {}
MIN_CAT_COUNTS = 2
n_train = float(len(trainDF))
# Position of each category in the sorted list. This assumes the grouped
# C_counts index is in the same sorted order, as the original code did.
category_position = {cat: pos for pos, cat in enumerate(categories)}
default_logodds = np.log(C_counts / n_train) - np.log(1.0 - C_counts / n_train)
for addr in addresses:
    addr_total = A_counts[addr]
    PA = addr_total / n_train
    logoddsPA[addr] = np.log(PA) - np.log(1. - PA)
    addr_logodds = default_logodds.copy()
    # Hoisted: one MultiIndex lookup per address instead of one per access.
    addr_cat_counts = A_C_counts[addr]
    for cat in addr_cat_counts.keys():
        cat_count = addr_cat_counts[cat]
        # Skip rare categories, and the case where every record at the address
        # is this category (its log-odds would be infinite).
        if MIN_CAT_COUNTS < cat_count < addr_total:
            PA = cat_count / float(addr_total)
            # Explicit positional write: the Series is still indexed by
            # category name here, and an integer key on a string index relies
            # on a deprecated positional fallback in pandas.
            addr_logodds.iloc[category_position[cat]] = np.log(PA) - np.log(1.0 - PA)
    addr_logodds.index = range(len(categories))
    logodds[addr] = addr_logodds


# Engineer the training features and pull out the Category labels.
features, labels=parse_data(trainDF,logodds,logoddsPA)


# Show the resulting column names and how many features were produced.
print(features.columns.tolist())
print(len(features.columns))


# Standardise every feature column in place; the fitted scaler is applied to
# the test features further down.
collist = features.columns.tolist()
scaler = preprocessing.StandardScaler().fit(features)
features[collist] = scaler.transform(features)


# Fit a 60-component PCA and print the share of variance each one explains.
new_PCA = PCA(n_components=60).fit(features)
print(new_PCA.explained_variance_ratio_)


# Stratified shuffle split with half of the rows used for training. This uses
# the old sklearn.cross_validation API: the labels are passed to the
# constructor and the splitter object is iterated directly.
# NOTE(review): no random_state is given, so the split presumably differs
# between runs - confirm.
sss = StratifiedShuffleSplit(labels, train_size=0.5)
# Every iteration rebinds the same four names, so only the split produced by
# the final iteration is kept.
for train_index, test_index in sss:
    features_train,features_test=features.iloc[train_index],features.iloc[test_index]
    labels_train,labels_test=labels[train_index],labels[test_index]
# Renumber all indexes from 0 so the split frames and label series line up positionally.
features_test.index=range(len(features_test))
features_train.index=range(len(features_train))
labels_train.index=range(len(labels_train))
labels_test.index=range(len(labels_test))
features.index=range(len(features))
labels.index=range(len(labels))


# Candidate classifiers; exactly one is left active. The commented-out lines
# are earlier experiments kept for reference.
#model = LogisticRegression()
#model = tree.DecisionTreeClassifier()
#model = svm.LinearSVC()
#model = GaussianNB()
model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(70, 50), random_state=1)
#model = RandomForestClassifier(n_estimators=20)
#model.fit(features_train,labels_train)

#print("all", log_loss(labels, model.predict_proba(features.values)))
#print("train", log_loss(labels_train, model.predict_proba(features_train.values)))
#print("test", log_loss(labels_test, model.predict_proba(features_test.values)))


# Final fit on the full training set. The log-loss printed below is therefore
# an in-sample figure, not a held-out estimate.
model.fit(features,labels)

# `.values` replaces DataFrame.as_matrix(), which newer pandas no longer
# provides; both return the same ndarray.
print("all", log_loss(labels, model.predict_proba(features.values)))
#print("train", log_loss(labels_train, model.predict_proba(features_train.values)))
#print("test", log_loss(labels_test, model.predict_proba(features_test.values)))


# Load the test set and put its coordinates on the scale fitted on training data.
testDF = pd.read_csv("test.csv")
testDF[["X", "Y"]] = xy_scaler.transform(testDF[["X", "Y"]])
# Test rows cannot be dropped, so any standardised coordinate more than 5
# units from zero is replaced with 0 instead.
for coord in ("X", "Y"):
    testDF[coord] = testDF[coord].apply(lambda v: 0 if abs(v) > 5 else v)


# Extend the address tables so that every test address can be looked up.
new_addresses = sorted(testDF["Address"].unique())
new_A_counts = testDF.groupby("Address").size()
train_address_set = set(addresses)
test_address_set = set(new_addresses)
only_new = test_address_set - train_address_set
only_old = train_address_set - test_address_set
in_both = test_address_set & train_address_set
n_total = float(len(testDF) + len(trainDF))
# Addresses seen only in the test set get the global category log-odds and an
# address log-odds based on their test-set frequency.
for addr in only_new:
    PA = new_A_counts[addr] / n_total
    logoddsPA[addr] = np.log(PA) - np.log(1. - PA)
    logodds[addr] = deepcopy(default_logodds)
    logodds[addr].index = range(len(categories))
# Addresses present in both sets get their address log-odds recomputed from
# the combined counts.
for addr in in_both:
    PA = (A_counts[addr] + new_A_counts[addr]) / n_total
    logoddsPA[addr] = np.log(PA) - np.log(1. - PA)


# Engineer the test-set features; the labels slot is discarded.
features_sub, _=parse_data(testDF,logodds,logoddsPA)


collist=features_sub.columns.tolist()
print(collist)


# Apply the scaler that was fitted on the training features.
# NOTE(review): assumes the test columns match the training columns in name
# and order - compare the list printed above with the training one.
features_sub[collist]=scaler.transform(features_sub[collist])


# One probability column per class. The columns are labelled from
# model.classes_, which is the order predict_proba emits, rather than being
# re-derived from the labels. `.values` replaces the removed
# DataFrame.as_matrix().
predDF = pd.DataFrame(model.predict_proba(features_sub.values), columns=model.classes_)


predDF.head()

import gzip
# NOTE(review): the file name says "RandomForest" but the active model above
# is an MLPClassifier.
with gzip.GzipFile('RandomForest.csv.gz',mode='w',compresslevel=9) as gzfile:
    predDF.to_csv(gzfile,index_label="Id",na_rep="0")
    print("==================all done=================")



In [12]:
# One row of predicted class probabilities, pasted as a comma-separated string.
s = "0.00103022009209,0.111883082285,1.25946088572e-06,1.16907015337e-05,0.00136699987112,0.000273281859421,0.00305399420692,0.0138110271206,0.000805834418678,1.42812670958e-05,1.6410150784e-05,1.47344289862e-05,0.000327844842543,0.00119250356415,3.96055909231e-06,0.000845312148886,0.0932998902208,0.000375985835547,6.00366437823e-05,0.00182354223107,0.0675508264392,0.405891363457,2.58756800091e-07,7.60265343619e-05,0.00252678283343,0.0958082832646,0.000101119266011,8.10263770623e-06,0.00131649046171,5.43334921318e-06,0.000677470281865,6.46706140843e-06,0.0289890678787,1.31130085414e-09,0.000292700758947,0.0467238117879,0.0655303606934,0.0480546619422,0.00622887937512"
# Parse and order from most to least likely to see how the mass is spread.
l = [float(i) for i in s.split(',')]
l.sort(reverse=True)
# The call form prints the same text under Python 2 and also runs on Python 3.
print(l)

[0.405891363457, 0.111883082285, 0.0958082832646, 0.0932998902208, 0.0675508264392, 0.0655303606934, 0.0480546619422, 0.0467238117879, 0.0289890678787, 0.0138110271206, 0.00622887937512, 0.00305399420692, 0.00252678283343, 0.00182354223107, 0.00136699987112, 0.00131649046171, 0.00119250356415, 0.00103022009209, 0.000845312148886, 0.000805834418678, 0.000677470281865, 0.000375985835547, 0.000327844842543, 0.000292700758947, 0.000273281859421, 0.000101119266011, 7.60265343619e-05, 6.00366437823e-05, 1.6410150784e-05, 1.47344289862e-05, 1.42812670958e-05, 1.16907015337e-05, 8.10263770623e-06, 6.46706140843e-06, 5.43334921318e-06, 3.96055909231e-06, 1.25946088572e-06, 2.58756800091e-07, 1.31130085414e-09]
