# Ready for data analysis

In [1]:
# Load the libraries used throughout the notebook (standard library first,
# then third-party packages).
import os

import numpy as np
import pandas as pd

# Switch the working directory to the dataset folder so that the CSV files
# can be opened by bare filename in the following cells.
DATA_DIR = "../../data/sf-crime"
os.chdir(DATA_DIR)

In [2]:
# Read the train and test CSV files (UTF-8) from the current working directory,
# in that order, and bind them to `train` and `test`.
train, test = (
    pd.read_csv(csv_name, encoding='utf-8')
    for csv_name in ('train.csv', 'test.csv')
)

# Report (rows, columns) of each frame.
print(f"Shape of train data : {train.shape}")
print(f"Shape of test data : {test.shape}")

Shape of train data : (878049, 9)
Shape of test data : (884262, 7)


In [3]:
# Preview the first five rows of the training data.
# The `Category` column is the class label (the prediction target).
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [4]:
# Preview the first five rows of the test data.
# The test data does not contain the class label,
# so the class of each test row must be predicted from the training data.
test.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [5]:
# Show the distinct class labels in `Category`, followed by a blank line
# and the number of distinct labels.
category_labels = train['Category']
print(category_labels.unique(), end="\n\n")
print(f"The number of multi-class labels: {category_labels.nunique()}")

['WARRANTS' 'OTHER OFFENSES' 'LARCENY/THEFT' 'VEHICLE THEFT' 'VANDALISM'
 'NON-CRIMINAL' 'ROBBERY' 'ASSAULT' 'WEAPON LAWS' 'BURGLARY'
 'SUSPICIOUS OCC' 'DRUNKENNESS' 'FORGERY/COUNTERFEITING' 'DRUG/NARCOTIC'
 'STOLEN PROPERTY' 'SECONDARY CODES' 'TRESPASS' 'MISSING PERSON' 'FRAUD'
 'KIDNAPPING' 'RUNAWAY' 'DRIVING UNDER THE INFLUENCE'
 'SEX OFFENSES FORCIBLE' 'PROSTITUTION' 'DISORDERLY CONDUCT' 'ARSON'
 'FAMILY OFFENSES' 'LIQUOR LAWS' 'BRIBERY' 'EMBEZZLEMENT' 'SUICIDE'
 'LOITERING' 'SEX OFFENSES NON FORCIBLE' 'EXTORTION' 'GAMBLING'
 'BAD CHECKS' 'TREA' 'RECOVERED VEHICLE' 'PORNOGRAPHY/OBSCENE MAT']

The number of multi-class labels: 39


---

## Data fields

- Dates - timestamp of the crime incident(범죄 발생 시간)
- Category - category of the crime incident (only in train.csv). This is the target variable you are going to predict.(학습 데이터에만 존재. 범죄의 종류. 이게 결국 맞추고자 하는 label(class))
- Descript - detailed description of the crime incident (only in train.csv)(학습 데이터에만 존재. 해당 범죄의 자세한 설명)
- DayOfWeek - the day of the week(범죄 발생한 요일)
- PdDistrict - name of the Police Department District(경찰 부서가 속한 구역(지역))
- Resolution - how the crime incident was resolved (only in train.csv)(학습 데이터에만 존재. 해당 범죄 사건이 어떻게 처리(해결)되었는지)
- Address - the approximate street address of the crime incident(범죄사건이 발생한 대략적인 주소)
- X - Longitude(경도)
- Y - Latitude(위도)

---

## Explanation of the multi-class labels that I didn't know
   - LARCENY/THEFT = 절도/도둑질
   - OTHER OFFENSES = 기타 범죄
   - NON-CRIMINAL = 비범죄 (범죄에 해당하지 않는 사건)
   - ASSAULT = 폭행죄
   - DRUG/NARCOTIC = 불법 약물/ 마약
   - VEHICLE THEFT = 차량 절도
   - VANDALISM = 공공 기물 파손
   - WARRANTS = 지명수배
   - BURGLARY = 빈집털이
   - MISSING PERSON = 행방 불명
   - ROBBERY = 강도
   - FRAUD = 사기

---

# Explore train data

In [6]:
# Re-display the first five training rows as a reference for the exploration below.
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [7]:
# Which crime category has the highest occurrence in San Francisco?
# Count the non-null `Dates` entries per `Category`, expose them as a
# `Counts` column, and order the categories from most to least frequent.
category_count = (
    train
    .groupby('Category')
    .agg({'Dates': 'count'})
    .rename(columns={'Dates': 'Counts'})
    .sort_values(by='Counts', ascending=False)
)
category_count.head()

Unnamed: 0_level_0,Counts
Category,Unnamed: 1_level_1
LARCENY/THEFT,174900
OTHER OFFENSES,126182
NON-CRIMINAL,92304
ASSAULT,76876
DRUG/NARCOTIC,53971


In [9]:
# Visualize the per-category counts computed above as a bar chart,
# using plotly through the cufflinks `iplot` accessor.
import chart_studio.plotly as py
import cufflinks as cf
cf.go_offline(connected=True)  # presumably enables offline (in-notebook) plotting - confirm against cufflinks docs

category_count.iplot(kind='bar', theme='white')
# Note for modeling: because of the class imbalance, use an F1-score metric for evaluation.

In [10]:
# Re-display the first five training rows before looking at the day-of-week counts.
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [13]:
# On which day of the week do crimes occur most often in San Francisco?
# Count the non-null `Dates` entries per `DayOfWeek`, expose them as a
# `Count` column, and order the days from most to least frequent.
day_count = (
    train
    .groupby('DayOfWeek')
    .agg({'Dates': 'count'})
    .rename(columns={'Dates': 'Count'})
    .sort_values(by='Count', ascending=False)
)
day_count.head()

Unnamed: 0_level_0,Count
DayOfWeek,Unnamed: 1_level_1
Friday,133734
Wednesday,129211
Saturday,126810
Thursday,125038
Tuesday,124965


In [14]:
# Visualize the per-day counts computed above as a bar chart.
day_count.iplot(kind='bar', theme='white')

# The counts differ only slightly between the days of the week.
# So when turning this variable into a weight, don't just assign an ordinal number;
# pay attention to how the resulting weights are scaled.

In [None]:
# TODO: start by looking at how many incidents occurred in each PdDistrict.