# Ready for data analysis

In [1]:
# Import the libraries and move the working directory to the dataset folder.
# The later pd.read_csv calls use bare file names ('train.csv', 'test.csv'),
# so they only resolve after this chdir has run.
import pandas as pd
import numpy as np
import os
os.chdir("../../data/sf-crime")  # relative path: resolved against the directory the kernel is currently in

In [2]:
# Read the train and test splits from the current working directory.
read_options = {'encoding': 'utf-8'}
train = pd.read_csv('train.csv', **read_options)
test = pd.read_csv('test.csv', **read_options)

# Report (rows, columns) of both frames, one per line.
print(f"Shape of train data : {train.shape}",
      f"Shape of test data : {test.shape}",
      sep="\n")

Shape of train data : (878049, 9)
Shape of test data : (884262, 7)


In [3]:
# Preview the first five rows of the train data.
# 'Category' is the class label, i.e. the target to predict.
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [4]:
# Preview the first five rows of the test data.
# The test data has no class label ('Category'),
# so the class of each test row must be predicted by a model fitted on the train data.
test.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [5]:
# List the distinct crime categories (the class labels) and count them.
category_labels = train['Category']
print(category_labels.unique(), end="\n\n")
print(f"The number of multi-class labels: {category_labels.nunique()}")

['WARRANTS' 'OTHER OFFENSES' 'LARCENY/THEFT' 'VEHICLE THEFT' 'VANDALISM'
 'NON-CRIMINAL' 'ROBBERY' 'ASSAULT' 'WEAPON LAWS' 'BURGLARY'
 'SUSPICIOUS OCC' 'DRUNKENNESS' 'FORGERY/COUNTERFEITING' 'DRUG/NARCOTIC'
 'STOLEN PROPERTY' 'SECONDARY CODES' 'TRESPASS' 'MISSING PERSON' 'FRAUD'
 'KIDNAPPING' 'RUNAWAY' 'DRIVING UNDER THE INFLUENCE'
 'SEX OFFENSES FORCIBLE' 'PROSTITUTION' 'DISORDERLY CONDUCT' 'ARSON'
 'FAMILY OFFENSES' 'LIQUOR LAWS' 'BRIBERY' 'EMBEZZLEMENT' 'SUICIDE'
 'LOITERING' 'SEX OFFENSES NON FORCIBLE' 'EXTORTION' 'GAMBLING'
 'BAD CHECKS' 'TREA' 'RECOVERED VEHICLE' 'PORNOGRAPHY/OBSCENE MAT']

The number of multi-class labels: 39


---

## .Data fields

- Dates - timestamp of the crime incident(범죄 발생 시간)
- Category - category of the crime incident (only in train.csv). This is the target variable you are going to predict.(학습 데이터에만 존재. 범죄의 종류. 이게 결국 맞추고자 하는 label(class))
- Descript - detailed description of the crime incident (only in train.csv)(학습 데이터에만 존재. 해당 범죄의 자세한 설명)
- DayOfWeek - the day of the week(범죄 발생한 요일)
- PdDistrict - name of the Police Department District(경찰 부서가 속한 구역(지역))
- Resolution - how the crime incident was resolved (only in train.csv)(해당 범죄의 범인이 어떻게 잡혔는지)
- Address - the approximate street address of the crime incident(범죄사건이 발생한 대략적인 주소)
- X - Longitude(경도)
- Y - Latitude(위도)

---

## .Explanation about multi-class labels that I didn't know.
   - LARCENY/THEFT = 절도/도둑질
   - OTHER OFFENSES = 기타 범죄
   - NON-CRIMINAL = 비범죄 (범죄가 아닌 사건)
   - ASSAULT = 폭행죄
   - DRUG/NARCOTIC = 불법 약물/ 마약
   - VEHICLE THEFT = 차량 절도
   - VANDALISM = 공공 기물 파손
   - WARRANTS = 지명수배
   - BURGLARY = 빈집털이
   - MISSING PERSON = 행방 불명
   - ROBBERY = 강도
   - FRAUD = 사기

---

# Explore train data

In [6]:
# Show the first five rows of train again before exploring it.
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [7]:
# Which crime category occurs most often in San Francisco?
# Count the rows of every category and order them from most to least frequent.
category_count = (
    train
    .groupby('Category')
    .agg({'Dates': 'count'})
    .rename(columns={'Dates': 'Counts'})
    .sort_values(by='Counts', ascending=False)
)
category_count.head()

Unnamed: 0_level_0,Counts
Category,Unnamed: 1_level_1
LARCENY/THEFT,174900
OTHER OFFENSES,126182
NON-CRIMINAL,92304
ASSAULT,76876
DRUG/NARCOTIC,53971


In [8]:
# Visualize the category counts as an interactive bar chart (plotly, via DataFrame.iplot).
# NOTE(review): `py` is not referenced anywhere in the visible cells.
import chart_studio.plotly as py
import cufflinks as cf
cf.go_offline(connected=True)

category_count.iplot(kind='bar', theme='white')
# The classes are imbalanced, so use the F1-score as a metric when modeling later on.

In [9]:
# Show the first five rows of train.
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [10]:
# On which day of the week do most crimes occur in San Francisco?
# Count the rows of every weekday and order them from most to least frequent.
day_count = (
    train
    .groupby('DayOfWeek')
    .agg({'Dates': 'count'})
    .rename(columns={'Dates': 'Count'})
    .sort_values(by='Count', ascending=False)
)
day_count.head()

Unnamed: 0_level_0,Count
DayOfWeek,Unnamed: 1_level_1
Friday,133734
Wednesday,129211
Saturday,126810
Thursday,125038
Tuesday,124965


In [11]:
# Visualize the per-day counts as a bar chart.
day_count.iplot(kind='bar', theme='white')

# The counts differ only slightly between the days of the week.
# So when encoding this variable, don't just assign ordinal numbers as weights.
# Pay attention to how the encoded values are scaled!

In [12]:
# Which police district has the highest number of crimes in San Francisco?
# Count the rows of every district and order them from most to least frequent.
district_count = (
    train
    .groupby('PdDistrict')
    .agg({'Category': 'count'})
    .rename(columns={'Category': 'Counts'})
    .sort_values(by='Counts', ascending=False)
)
district_count.head()

Unnamed: 0_level_0,Counts
PdDistrict,Unnamed: 1_level_1
SOUTHERN,157182
MISSION,119908
NORTHERN,105296
BAYVIEW,89431
CENTRAL,85460


In [13]:
# Visualize the per-district counts as a bar chart.
district_count.iplot(kind='bar', theme='white')

In [14]:
# Show the first five rows of train.
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [15]:
# Plot crime locations on a map of San Francisco.
import folium
from folium.plugins import MarkerCluster

# Center the map on the mean coordinates of all incidents.
lat = train['Y'].mean()
long = train['X'].mean()

m = folium.Map(location=[lat, long], zoom_start=12)
marker_cluster = MarkerCluster().add_to(m)

# Only the first N_MARKERS rows of the raw data are drawn.
N_MARKERS = 5000
shown = train.iloc[:N_MARKERS]

# Walk the four needed columns in parallel instead of doing scalar .loc lookups
# per row; separate loop names keep `lat`/`long` pointing at the map center.
for point_lat, point_long, district, address in zip(
        shown['Y'], shown['X'], shown['PdDistrict'], shown['Address']):
    folium.CircleMarker(location=[point_lat, point_long],
                        tooltip=district + '-' + address,
                        radius=4, color='red').add_to(marker_cluster)
m

In [16]:
# Extract the hour of day from 'Dates' into a new string column 'Time'.
# e.g. '2015-05-13 23:53:00' -> '2015-05-13 23' -> '23'
date_and_hour = train['Dates'].str.split(':').str[0]
train['Time'] = date_and_hour.str.split(' ').str[1]
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Time
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,23
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,23
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,23
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,23
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,23


In [17]:
import plotly.graph_objects as go

# At which hour do crimes occur most often?
# value_counts orders the hours from most to least frequent.
time_crime = train['Time'].value_counts(ascending=False)

# One bar per hour: hour label on x, number of incidents on y.
fig = go.Figure(data=[go.Bar(x=time_crime.index, y=time_crime.values)])

fig.update_layout(
    title=dict(
        text='<b>Time when crime occured</b>',
        x=0.5,
        y=0.9,
        font=dict(size=30, color='red'),
    ),
    xaxis=dict(title='Time(Hour)', showticklabels=True, dtick=1),
    yaxis=dict(title='Count of crime'),
)

fig.show()

# preprocessing

In [18]:
# Show the first five rows of train, now including the 'Time' column.
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Time
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,23
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,23
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,23
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,23
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,23


In [19]:
# Look at which crime types are most frequent on each day of the week.
day_crime = (
    train
    .groupby(['DayOfWeek', 'Category'])
    .agg({'Category': 'count'})
    .rename(columns={'Category': 'Counts'})
    .reset_index()
)

# Top-10 categories per day, keyed by the day name. Every group is sorted
# right here, so day_crime itself does not need to be sorted beforehand
# (the original sort_values call discarded its result anyway).
subset_df = {
    day: rows.sort_values(by='Counts', ascending=False).reset_index(drop=True).head(10)
    for day, rows in day_crime.groupby('DayOfWeek')
}
subset_df

# The crime ranking is similar for every day of the week.

{'Friday':   DayOfWeek        Category  Counts
 0    Friday   LARCENY/THEFT   27104
 1    Friday  OTHER OFFENSES   18588
 2    Friday    NON-CRIMINAL   13984
 3    Friday         ASSAULT   11160
 4    Friday   VEHICLE THEFT    8613
 5    Friday   DRUG/NARCOTIC    7420
 6    Friday       VANDALISM    7092
 7    Friday        BURGLARY    6327
 8    Friday        WARRANTS    5926
 9    Friday  SUSPICIOUS OCC    4924,
 'Monday':   DayOfWeek        Category  Counts
 0    Monday   LARCENY/THEFT   23570
 1    Monday  OTHER OFFENSES   17787
 2    Monday    NON-CRIMINAL   12855
 3    Monday         ASSAULT   10560
 4    Monday   DRUG/NARCOTIC    7823
 5    Monday   VEHICLE THEFT    7412
 6    Monday       VANDALISM    5946
 7    Monday        WARRANTS    5811
 8    Monday        BURGLARY    5262
 9    Monday  SUSPICIOUS OCC    4447,
 'Saturday':   DayOfWeek        Category  Counts
 0  Saturday   LARCENY/THEFT   27217
 1  Saturday  OTHER OFFENSES   17129
 2  Saturday    NON-CRIMINAL   14007
 3  

In [20]:
# Look at which crimes happened most often in each hour of the day.
time_crime = (
    train
    .groupby(['Time', 'Category'])
    .agg({'Category': 'count'})
    .rename(columns={'Category': 'Counts'})
    .reset_index()
)

# Top-10 categories per hour, keyed by the hour string. Every group is sorted
# here; the former whole-frame sort was assigned to a misspelled name
# (`time_cirme`) that nothing read, so it has been removed.
subset_df = {
    hour: rows.sort_values(by='Counts', ascending=False).reset_index(drop=True).head(10)
    for hour, rows in time_crime.groupby('Time')
}

In [21]:
# Display the per-hour top-10 tables held in subset_df.
subset_df

{'00':   Time        Category  Counts
 0   00  OTHER OFFENSES    7375
 1   00   LARCENY/THEFT    7019
 2   00    NON-CRIMINAL    4305
 3   00         ASSAULT    4291
 4   00       VANDALISM    2681
 5   00           FRAUD    2297
 6   00   VEHICLE THEFT    2035
 7   00  SUSPICIOUS OCC    2033
 8   00   DRUG/NARCOTIC    1703
 9   00        WARRANTS    1680,
 '01':   Time        Category  Counts
 0   01   LARCENY/THEFT    4304
 1   01         ASSAULT    3789
 2   01  OTHER OFFENSES    3630
 3   01    NON-CRIMINAL    2292
 4   01       VANDALISM    1832
 5   01   VEHICLE THEFT    1392
 6   01         ROBBERY    1274
 7   01        WARRANTS    1205
 8   01   DRUG/NARCOTIC    1043
 9   01  SUSPICIOUS OCC     844,
 '02':   Time        Category  Counts
 0   02         ASSAULT    3454
 1   02  OTHER OFFENSES    3057
 2   02   LARCENY/THEFT    2957
 3   02       VANDALISM    1819
 4   02    NON-CRIMINAL    1789
 5   02         ROBBERY    1367
 6   02   VEHICLE THEFT    1116
 7   02        WARRA

In [22]:
# Look at which crimes occurred in each police district.
dist_crime = (
    train
    .groupby(['PdDistrict', 'Category'])
    .agg({'Category': 'count'})
    .rename(columns={'Category': 'Counts'})
    .reset_index()
)

# Top-10 categories per district, keyed by the district name.
subset_df = {
    dist: rows.sort_values(by='Counts', ascending=False).reset_index(drop=True).head(10)
    for dist, rows in dist_crime.groupby('PdDistrict')
}
subset_df

{'BAYVIEW':   PdDistrict        Category  Counts
 0    BAYVIEW  OTHER OFFENSES   17053
 1    BAYVIEW   LARCENY/THEFT   10119
 2    BAYVIEW         ASSAULT    9857
 3    BAYVIEW   VEHICLE THEFT    7219
 4    BAYVIEW    NON-CRIMINAL    6099
 5    BAYVIEW       VANDALISM    5356
 6    BAYVIEW  MISSING PERSON    5038
 7    BAYVIEW   DRUG/NARCOTIC    4498
 8    BAYVIEW        WARRANTS    4322
 9    BAYVIEW        BURGLARY    3930,
 'CENTRAL':   PdDistrict        Category  Counts
 0    CENTRAL   LARCENY/THEFT   25060
 1    CENTRAL    NON-CRIMINAL   10940
 2    CENTRAL  OTHER OFFENSES    8901
 3    CENTRAL         ASSAULT    6977
 4    CENTRAL        BURGLARY    4519
 5    CENTRAL       VANDALISM    4469
 6    CENTRAL   VEHICLE THEFT    4210
 7    CENTRAL  SUSPICIOUS OCC    2842
 8    CENTRAL        WARRANTS    2777
 9    CENTRAL           FRAUD    2344,
 'INGLESIDE':   PdDistrict        Category  Counts
 0  INGLESIDE  OTHER OFFENSES   13203
 1  INGLESIDE   LARCENY/THEFT   10236
 2  INGLESIDE

In [23]:
# By hour - which crimes? -> similar distribution -> label-encode, then StandardScaler
# x By district - which crimes? -> try one-hot encoding (also check whether applying StandardScaler can improve performance)
# By day of week - which crimes? -> all similar -> label-encode, then StandardScaler

# Models: KNN, MLPClassifier
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Time
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,23
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,23
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,23
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,23
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,23


In [3]:
# Define the preprocessing / encoding function.
from sklearn.preprocessing import LabelEncoder

def preprocessing(df, encoders=None):
    """Build the integer-coded modelling frame from a raw train or test frame.

    Steps:
      1. Replace 'Dates' by the hour of day (int) and rename it to 'Time'.
      2. Drop the columns that are not used as features: 'Id', 'Address',
         'X', 'Y' when the frame has an 'Id' column (the test frame),
         otherwise 'Descript', 'Resolution', 'Address', 'X', 'Y'.
      3. Label-encode 'DayOfWeek', 'PdDistrict' and, when present, 'Category'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with a 'Dates' column of strings such as
        '2015-05-13 23:53:00'. The frame is NOT modified; all work is done on
        a copy, so the cell calling this function can be re-run safely.
    encoders : dict, optional
        Maps a column name to a fitted LabelEncoder. A column found in the
        dict is transformed with that encoder; a column not found is fitted
        with a new encoder, which is then stored in the dict. Passing the
        same dict for the train and the test frame therefore gives both
        frames identical integer codes. When omitted, every column gets a
        freshly fitted encoder (the previous behaviour).

    Returns
    -------
    pandas.DataFrame
        New frame with the columns 'Time', ['Category',] 'DayOfWeek',
        'PdDistrict'.

    Raises
    ------
    ValueError
        From LabelEncoder.transform when a reused encoder meets a label it
        was not fitted on.
    """
    out = df.copy()

    # An earlier exploration cell may already have added a 'Time' column;
    # renaming 'Dates' on top of it would leave two columns named 'Time'.
    if 'Time' in out.columns:
        out = out.drop(columns='Time')

    # '2015-05-13 23:53:00' -> '2015-05-13 23' -> '23' -> 23
    hour = out['Dates'].str.split(':').str[0].str.split(' ').str[1]
    out['Dates'] = hour.astype(int)
    out = out.rename(columns={'Dates': 'Time'})

    # The presence of 'Id' decides which set of unused columns is dropped.
    if 'Id' in out.columns:
        unused = ['Id', 'Address', 'X', 'Y']
    else:
        unused = ['Descript', 'Resolution', 'Address', 'X', 'Y']
    out = out.drop(columns=unused)

    if encoders is None:
        encoders = {}

    features = ['DayOfWeek', 'PdDistrict']
    if 'Category' in out.columns:
        features.insert(0, 'Category')

    for feature in features:
        if feature in encoders:
            # Reuse the mapping learned earlier so the codes stay consistent.
            out[feature] = encoders[feature].transform(out[feature])
        else:
            encoder = LabelEncoder()
            out[feature] = encoder.fit_transform(out[feature])
            encoders[feature] = encoder

    return out

In [4]:
# Run the same preprocessing on both splits and report the resulting shapes.
train_d, test_d = preprocessing(train), preprocessing(test)
print(f" A shape of train_d {train_d.shape}",
      f" A shape of test_d {test_d.shape}",
      sep="\n")

 A shape of train_d (878049, 4)
 A shape of test_d (884262, 3)


In [5]:
# Print (rows, columns) of the preprocessed train frame.
print(train_d.shape)

(878049, 4)


In [6]:
# Show the first five rows of the preprocessed (integer-coded) train frame.
train_d.head()

Unnamed: 0,Time,Category,DayOfWeek,PdDistrict
0,23,37,6,4
1,23,21,6,4
2,23,21,6,4
3,23,16,6,4
4,23,16,6,5


In [7]:
# Feature columns = every column of train_d except the target 'Category'.
features = list(train_d.columns)
del features[features.index('Category')]
print(features)

['Time', 'DayOfWeek', 'PdDistrict']


In [37]:
# metric for multi-class classification
from sklearn.metrics import multilabel_confusion_matrix

def get_score(test_y, pred_y):
    """Print the per-class confusion matrices of a multi-class prediction.

    test_y and pred_y are passed unchanged, in this order, to
    multilabel_confusion_matrix; the result is printed after a header line.
    Nothing is returned.
    """
    print("다중 클래스 Confusion Matrix :\n",
          multilabel_confusion_matrix(test_y, pred_y))

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the preprocessed train data for validation (fixed seed).
split = train_test_split(train_d[features], train_d['Category'],
                         test_size=0.2, random_state=42)
train_x, test_x, train_y, test_y = split

# Fit a seeded 100-tree random forest and score it on the hold-out part.
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=100)
rf_clf.fit(train_x, train_y)
pred_y = rf_clf.predict(test_x)
acc = accuracy_score(test_y, pred_y)
print(f"정확도 {acc :.4f}")

정확도 0.2250


In [51]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import RobustScaler
features = ['Time', 'DayOfWeek', 'PdDistrict']

# Same 80/20 split and seed as for the random forest.
split = train_test_split(train_d[features], train_d['Category'],
                         test_size=0.2, random_state=42)
train_x, test_x, train_y, test_y = split

# Learn the scaling on the training part only, then apply it to both parts.
scaler = RobustScaler().fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

# NOTE: despite its name, dt_clf holds a k-nearest-neighbours classifier here.
dt_clf = KNeighborsClassifier().fit(train_x, train_y)
pred_y = dt_clf.predict(test_x)

accuracy_score(test_y, pred_y)

0.1409486931268151

In [27]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

features = ['Time', 'DayOfWeek', 'PdDistrict']

# Same 80/20 split and seed as for the other models.
train_x, test_x, train_y, test_y = train_test_split(train_d[features], train_d['Category'],
                                                   test_size=0.2, random_state=42)

# Seed the network like the split and the random forest: an unseeded
# MLPClassifier gives a different accuracy on every run.
# NOTE: despite its name, dt_clf holds a multi-layer perceptron here.
dt_clf = MLPClassifier(random_state=42)
dt_clf.fit(train_x, train_y)
pred_y = dt_clf.predict(test_x)
accuracy_score(test_y, pred_y)



0.2204885826547463