In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB              #朴素贝叶斯的库

In [7]:
# Load the Kaggle San Francisco crime train/test CSVs; the 'Dates' column is
# parsed into datetimes so that the .dt accessor can be used on it later.
train, test = (
    pd.read_csv(csv_path, parse_dates=['Dates'])
    for csv_path in (
        'Kaggle旧金山犯罪类型分类/train.csv',
        'Kaggle旧金山犯罪类型分类/test.csv',
    )
)

In [6]:
# Display the raw training frame as loaded from train.csv.
train

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541
...,...,...,...,...,...,...,...,...,...
878044,2003-01-06 00:15:00,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056
878045,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948
878046,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.403390,37.780266
878047,2003-01-06 00:01:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607


In [8]:
# Display the raw test frame as loaded from test.csv.
test

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
...,...,...,...,...,...,...,...
884257,884257,2003-01-01 00:01:00,Wednesday,MISSION,2600 Block of BRYANT ST,-122.408983,37.751987
884258,884258,2003-01-01 00:01:00,Wednesday,NORTHERN,1900 Block of WASHINGTON ST,-122.425342,37.792681
884259,884259,2003-01-01 00:01:00,Wednesday,INGLESIDE,5500 Block of MISSION ST,-122.445418,37.712075
884260,884260,2003-01-01 00:01:00,Wednesday,BAYVIEW,1500 Block of HUDSON AV,-122.387394,37.739479


In [None]:
'''
Dates: 日期（含时间戳）
Category: 犯罪类型，比如 Larceny/盗窃罪 等.
Descript: 对于犯罪更详细的描述
DayOfWeek: 星期几
PdDistrict: 所属警区
Resolution: 处理结果，比如说『逮捕』『逃了』
Address: 发生街区位置
X and Y: GPS坐标
train.csv中的数据时间跨度约为12年（2003年至2015年），包含约88万条记录。另外，从上面的表格可以看出，这部分数据大部分都是『类别』型特征，比如犯罪类型、星期几。
'''

In [18]:
# Feature engineering

def one_hot_features(frame):
    """One-hot encode the hour, weekday and police-district columns of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must provide a datetime ``Dates`` column plus ``DayOfWeek`` and
        ``PdDistrict`` columns.

    Returns
    -------
    pandas.DataFrame
        Indicator columns concatenated side by side in the order
        hour-of-day, weekday, district, sharing ``frame``'s index.
    """
    hour_dummies = pd.get_dummies(frame.Dates.dt.hour)
    day_dummies = pd.get_dummies(frame.DayOfWeek)
    district_dummies = pd.get_dummies(frame.PdDistrict)
    return pd.concat([hour_dummies, day_dummies, district_dummies], axis=1)


# Encode the crime category (the prediction target) as integer labels.
leCrime = LabelEncoder()
crime = leCrime.fit_transform(train.Category)

# Training matrix: indicator features plus the encoded target in 'crime'.
trainData = one_hot_features(train)
featureColumns = trainData.columns      # captured before 'crime' is appended
trainData['crime'] = crime

# Test matrix: same encoding, then aligned to the training feature columns so
# both matrices always share the same columns in the same order. A category
# that is absent from the test file becomes an all-zero column; a category
# that only appears in the test file is dropped.
testData = one_hot_features(test).reindex(columns=featureColumns, fill_value=0)

In [19]:
# Display the one-hot encoded training matrix (last column is the encoded target 'crime').
trainData

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN,crime
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,37
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,21
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,21
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,16
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878044,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,25
878045,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,16
878046,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,16
878047,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,35


In [20]:
# Display the one-hot encoded test matrix.
testData

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884257,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
884258,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
884259,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
884260,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [32]:
# Train on the weekday and police-district indicator columns only; the
# hour-of-day indicators present in trainData are not part of this feature list.
features=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION',  
 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']  

# Hold out 20% of the labelled rows for evaluation; fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    trainData[features], trainData["crime"], train_size=0.8, random_state=42)

# Bernoulli naive Bayes fits the binary indicator features.
NB = BernoulliNB()
NB.fit(x_train, y_train)

y_pred = NB.predict(x_test)                  # predicted class label per hold-out row
propa = NB.predict_proba(x_test)             # per-class probabilities, columns ordered as NB.classes_
predicted = np.array(propa)

# Pass the classifier's own class order explicitly. Without `labels`, log_loss
# infers the classes from y_test alone, so a rare crime category missing from
# the hold-out split would make the call fail or misread the probability columns.
logLoss = log_loss(y_test, predicted, labels=NB.classes_)   # naive Bayes log loss

In [25]:
# Display the predicted (label-encoded) crime classes for the hold-out rows.
y_pred

array([ 7, 16, 16, ..., 16, 21,  7])

In [30]:
# Display the per-class probability matrix returned by predict_proba for the hold-out rows.
predicted

array([[8.27871590e-04, 1.11855398e-01, 8.97585280e-05, ...,
        1.19879617e-02, 8.89551376e-02, 9.38387583e-03],
       [1.19201028e-03, 7.66502712e-02, 1.92156944e-04, ...,
        8.57769498e-02, 3.93916696e-02, 7.01307492e-03],
       [1.09445118e-03, 8.13536312e-02, 3.09571669e-04, ...,
        2.83047774e-02, 4.98885102e-02, 6.78084468e-03],
       ...,
       [2.40034964e-03, 6.98515364e-02, 8.39730390e-04, ...,
        9.27840880e-02, 2.12205187e-02, 7.18884097e-03],
       [2.45787895e-03, 1.07970059e-01, 4.36636005e-04, ...,
        1.22475469e-01, 2.90691736e-02, 1.46635991e-02],
       [8.27871590e-04, 1.11855398e-01, 8.97585280e-05, ...,
        1.19879617e-02, 8.89551376e-02, 9.38387583e-03]])

In [33]:
# Report the hold-out log loss of the naive Bayes model, to six decimal places.
print("朴素贝叶斯的log损失为:%.6f" % (logLoss,))


朴素贝叶斯的log损失为:2.613689
