In [1]:
import pandas as pd
import numpy as np

# Load the training and test CSV files from the data directory
dir = './sf-crime/'
df_train = pd.read_csv(f'{dir}train.csv')
df_test = pd.read_csv(f'{dir}test.csv')
df_train.head(n=5)

# Pick the candidate feature columns and the target column
features = ['Dates', 'DayOfWeek', 'PdDistrict', 'X', 'Y']
train_X = df_train.loc[:, features]
train_Y = df_train.loc[:, 'Category']

In [2]:
# Swap the 'NONE' placeholder string in Resolution for the number 0
df_train['Resolution'] = df_train['Resolution'].replace(to_replace='NONE', value=0)

In [3]:
# Display the column labels of the training frame
df_train.columns

Index(['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
       'Resolution', 'Address', 'X', 'Y'],
      dtype='object')

In [4]:
# Extract the hour of day from the Dates column
import datetime  # not used by this cell any more; kept in case other cells rely on it

# Timestamps look like: 2015-05-13 19:52:00
# Guard on the column so that re-running this cell after 'Dates' has been
# dropped is a no-op instead of a KeyError.
if 'Dates' in df_train.columns:
    # Vectorised parse + .dt accessor instead of per-row strptime/strftime.
    parsed_dates = pd.to_datetime(df_train['Dates'], format='%Y-%m-%d %H:%M:%S')
    df_train['Dates_Hours'] = parsed_dates.dt.hour.astype('int64')
    # The raw timestamp is not needed once the hour has been extracted.
    df_train = df_train.drop(columns=['Dates'])

In [5]:
# Preview the first rows of the training frame after the hour extraction
df_train.head()

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Dates_Hours
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,23
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,23
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,23
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,0,1500 Block of LOMBARD ST,-122.426995,37.800873,23
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,0,100 Block of BRODERICK ST,-122.438738,37.771541,23


In [6]:
# One-hot encode the categorical features
features_dummy = ['DayOfWeek', 'PdDistrict', 'Dates_Hours']
# Cast the hour to str so that get_dummies encodes it together with the other
# string columns. astype on the whole selection is used because (per the
# original note) assigning the column on the slice produced warnings.
# The intermediate frame has its own name so that df_test, which holds the
# rows loaded from test.csv, is not overwritten with training data.
train_categorical = df_train[features_dummy].astype({"Dates_Hours": str})
train_X = pd.get_dummies(train_categorical)

In [7]:
from sklearn.tree import plot_tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split

# Split into a training set and a hold-out set
x_train, x_test, y_train, y_test = train_test_split(train_X, train_Y, test_size=0.25, random_state=4)


# Build the model; random_state makes the fitted tree reproducible between
# runs (scikit-learn permutes the features at each split, which matters when
# several splits are equally good).
clf = DecisionTreeClassifier(max_depth=5, random_state=4)

# Train the model (fit returns the estimator itself)
clf = clf.fit(x_train, y_train)

In [20]:
%time
depth_list = [6,7,8,9,10,11,12]
depth_acc = []
for depth in depth_list:
    clf = DecisionTreeClassifier(max_depth=depth, criterion='entropy')
    # 訓練模型
    clf = clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    print('depth: ', depth)
    print("Acuuracy: ", round(acc,2))
    print('----------')
    depth_acc.append(acc)

Wall time: 0 ns
depth:  6
Acuuracy:  0.22
----------
depth:  7
Acuuracy:  0.22
----------
depth:  8
Acuuracy:  0.22
----------
depth:  9
Acuuracy:  0.22
----------
depth:  10
Acuuracy:  0.22
----------
depth:  11
Acuuracy:  0.22
----------
depth:  12
Acuuracy:  0.22
----------


In [8]:
# Draw the classifier that has already been fitted; calling fit again here
# would only retrain the same model on the same data before plotting.
plot_tree(clf)

[Text(299.2173913043478, 338.79999999999995, 'X[16] <= 0.5\nentropy = 0.903\nsamples = 658536\nvalue = [1149, 57650, 312, 212, 27545, 3248, 1699, 40438, 3202\n857, 190, 388, 7981, 12545, 111, 1771, 131367, 1439\n890, 19479, 69228, 94782, 17, 5595, 2318, 17263, 1460\n7510, 3296, 115, 3390, 369, 23476, 5, 5440, 33492\n40290, 31607, 6410]'),
 Text(169.82608695652175, 277.2, 'X[7] <= 0.5\nentropy = 0.901\nsamples = 596967\nvalue = [1098, 51880, 296, 200, 26439, 2623, 1626, 27125, 2858\n789, 184, 359, 7570, 11680, 103, 1619, 123907, 1304\n745, 18844, 63623, 84401, 16, 4922, 2220, 15613, 1455\n6966, 3048, 111, 3137, 350, 21595, 5, 4571, 32290\n39514, 26075, 5806]'),
 Text(86.26086956521739, 215.59999999999997, 'X[9] <= 0.5\nentropy = 0.897\nsamples = 529930\nvalue = [795, 44442, 271, 159, 23500, 2460, 1490, 23807, 2671\n718, 174, 302, 6980, 11014, 84, 1383, 116342, 1214\n709, 15068, 59052, 71649, 14, 4868, 1684, 13562, 1259\n5720, 2759, 90, 2834, 327, 18625, 2, 4104, 28256\n34115, 22833, 459

In [9]:
from sklearn import datasets, metrics

# Predict on the hold-out set and report the accuracy
y_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Acuuracy: ", acc)

Acuuracy:  0.22082974584648746


In [10]:
import graphviz

# Export the fitted tree as DOT source and render it to a file named "decision_tree"
dot_data = export_graphviz(decision_tree=clf, out_file=None)
graph = graphviz.Source(source=dot_data)
graph.render(filename="decision_tree")

'decision_tree.pdf'

In [11]:
# Per-class precision/recall report, with generic 'class N' labels
# (same shape as: target_names = ['class 0', 'class 1', 'class 2'])
import warnings
warnings.filterwarnings('ignore')
target_names = [f'class {idx}' for idx in range(39)]
report = metrics.classification_report(y_test, y_pred, target_names=target_names)
print(report)

              precision    recall  f1-score   support

     class 0       0.00      0.00      0.00       364
     class 1       0.18      0.02      0.03     19226
     class 2       0.00      0.00      0.00        94
     class 3       0.00      0.00      0.00        77
     class 4       0.00      0.00      0.00      9210
     class 5       0.14      0.01      0.02      1072
     class 6       0.00      0.00      0.00       569
     class 7       0.23      0.31      0.26     13533
     class 8       0.00      0.00      0.00      1078
     class 9       0.00      0.00      0.00       309
    class 10       0.00      0.00      0.00        66
    class 11       0.00      0.00      0.00       103
    class 12       0.00      0.00      0.00      2628
    class 13       0.00      0.00      0.00      4134
    class 14       0.00      0.00      0.00        35
    class 15       0.00      0.00      0.00       570
    class 16       0.25      0.72      0.37     43533
    class 17       0.00    

In [26]:
from sklearn.ensemble import RandomForestClassifier
%time

rfc = RandomForestClassifier(max_depth=10, n_estimators=500)
rfc.fit(x_train, y_train)
y_pred_rfc = rfc.predict(x_test)
print(f'Accuracy Score: {metrics.accuracy_score(y_pred_rfc, y_test)}')

Wall time: 0 ns
Accuracy Score: 0.2218182977773526
