In [None]:
'''
입력
- Dates, 월, 일, 시간, 요일
- PdDistrict 
- 위도, 경도

출력
- Category

'''

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold #자동으로 K번 포개주는 것
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import os


# seed 값 설정
seed = 0
np.random.seed(seed)
tf.random.set_seed(3)

# CSV 값 받기

In [87]:
df = pd.read_csv("./train.csv")
df["Dates"] = df["Dates"].astype('datetime64[ns]')
df.head()
df2 = df[:]

# 결측치 체크

In [54]:
df.isnull().any().sum()

0

In [85]:
df["year"] = df["Dates"].dt.year 
df["month"] = df["Dates"].dt.month 
df["day"] = df["Dates"].dt.day
df["hour"] = df["Dates"].dt.hour
df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,year,month,day,hour
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,13,23
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,13,23
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,2015,5,13,23
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,2015,5,13,23
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,2015,5,13,23


# 문자를 숫자로

In [57]:
cObject = ["Category", "DayOfWeek","PdDistrict"]
print(df["Category"].unique())
def changeTonum(cObject):
    for objectCol in cObject:
        sList = df[objectCol].unique()
        df[objectCol]=df[objectCol].replace(sList.tolist(),np.arange(sList.shape[0]).tolist())

changeTonum(cObject)
df.head()

['WARRANTS' 'OTHER OFFENSES' 'LARCENY/THEFT' 'VEHICLE THEFT' 'VANDALISM'
 'NON-CRIMINAL' 'ROBBERY' 'ASSAULT' 'WEAPON LAWS' 'BURGLARY'
 'SUSPICIOUS OCC' 'DRUNKENNESS' 'FORGERY/COUNTERFEITING' 'DRUG/NARCOTIC'
 'STOLEN PROPERTY' 'SECONDARY CODES' 'TRESPASS' 'MISSING PERSON' 'FRAUD'
 'KIDNAPPING' 'RUNAWAY' 'DRIVING UNDER THE INFLUENCE'
 'SEX OFFENSES FORCIBLE' 'PROSTITUTION' 'DISORDERLY CONDUCT' 'ARSON'
 'FAMILY OFFENSES' 'LIQUOR LAWS' 'BRIBERY' 'EMBEZZLEMENT' 'SUICIDE'
 'LOITERING' 'SEX OFFENSES NON FORCIBLE' 'EXTORTION' 'GAMBLING'
 'BAD CHECKS' 'TREA' 'RECOVERED VEHICLE' 'PORNOGRAPHY/OBSCENE MAT']


Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,year,month,day,hour
0,2015-05-13 23:53:00,0,WARRANT ARREST,0,0,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,13,23
1,2015-05-13 23:53:00,1,TRAFFIC VIOLATION ARREST,0,0,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,13,23
2,2015-05-13 23:33:00,1,TRAFFIC VIOLATION ARREST,0,0,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,2015,5,13,23
3,2015-05-13 23:30:00,2,GRAND THEFT FROM LOCKED AUTO,0,0,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,2015,5,13,23
4,2015-05-13 23:30:00,2,GRAND THEFT FROM LOCKED AUTO,0,1,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,2015,5,13,23


# 정규화

In [7]:
colList = ['DayOfWeek','PdDistrict','X','Y','year','month','day','hour']

def normalize(colList):
    norList = (0.0,0.0)
    for col in colList:
        print(df[col].max())
        norList +=  (df[col].min(),df[col].max())
        df[col] = (df[col]-df[col].min())/(df[col].max()-df[col].min())
    return norList
norList = normalize(colList)
df.head()
norList

6
9
-120.5
90.0
2015
12
31
23


(0.0,
 0.0,
 0,
 6,
 0,
 9,
 -122.51364209999998,
 -120.5,
 37.70787902,
 90.0,
 2003,
 2015,
 1,
 12,
 1,
 31,
 0,
 23)

# 딥러닝

In [65]:
X = df[colList]
print(X.shape)

Y = np_utils.to_categorical(df['Category'].tolist())
print(Y.shape)

(878049, 8)
(878049, 39)


In [66]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)
print(X_train.shape)
print(Y_train.shape)

# 딥러닝 구조를 결정합니다(모델을 설정하고 실행하는 부분입니다).
model = Sequential()
XSize = X_train.shape[1]
YSize =  Y_train.shape[1]
print(XSize)
print(YSize)
# 입력데이터 17개의 값을 받아 은닉층 30개 노드로 보낸다
model.add(Dense(YSize*4, input_dim=XSize, activation='relu'))
model.add(Dense(YSize*2, activation='relu'))
model.add(Dense(YSize, activation='softmax'))

(614634, 8)
(614634, 39)
8
39


In [67]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [68]:
model.fit(X_train, Y_train,epochs=3, batch_size=100)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x22c80370d08>

In [103]:
predictList = model.predict(X_test)
predict = np.argmax(predictList)==np.argmax(Y_test)
print(f'정답률 : {(np.sum(predict)/X_test.shape[0])*100}')

정답률 : 0.0


# Random Forest

In [105]:
# Random Forest Model
# n_estimators : 5000 -> 100으로 수정했습니다
model = RandomForestRegressor(n_estimators=3,
                              n_jobs=-1,
                              random_state=0)


model.fit(X_train, Y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=3, n_jobs=-1, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [106]:
predict = np.sum((np.max(model.predict(X_test))==Y_test))
model.predict(X_test)[0]

array([0.        , 0.44444444, 0.        , 0.        , 0.        ,
       0.22222222, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.33333333, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        ])

In [101]:
grouped = df2['Dates'].groupby(df2['Category'])
grouped.count()

Category
ARSON                            1513
ASSAULT                         76876
BAD CHECKS                        406
BRIBERY                           289
BURGLARY                        36755
DISORDERLY CONDUCT               4320
DRIVING UNDER THE INFLUENCE      2268
DRUG/NARCOTIC                   53971
DRUNKENNESS                      4280
EMBEZZLEMENT                     1166
EXTORTION                         256
FAMILY OFFENSES                   491
FORGERY/COUNTERFEITING          10609
FRAUD                           16679
GAMBLING                          146
KIDNAPPING                       2341
LARCENY/THEFT                  174900
LIQUOR LAWS                      1903
LOITERING                        1225
MISSING PERSON                  25989
NON-CRIMINAL                    92304
OTHER OFFENSES                 126182
PORNOGRAPHY/OBSCENE MAT            22
PROSTITUTION                     7484
RECOVERED VEHICLE                3138
ROBBERY                         23000
RUN