# 3.2.1 Feature Engineering

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import keras
from keras.layers import Dense,Dropout
from keras.models import Sequential
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

## 加载数据

In [2]:
train_data=pd.read_csv('train.csv.zip')
train_data.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [3]:
test_data=pd.read_csv('test.csv.zip')
test_data.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


## 清洗数据
### 填补无效数据

In [4]:
train_data.drop_duplicates(inplace=True)
train_data.replace({'X': -120.5, 'Y': 90.0}, np.NaN, inplace=True)
test_data.replace({'X': -120.5, 'Y': 90.0}, np.NaN, inplace=True)

imp = SimpleImputer(strategy='mean')

for district in train_data['PdDistrict'].unique():
    train_data.loc[train_data['PdDistrict'] == district, ['X', 'Y']] = imp.fit_transform(
        train_data.loc[train_data['PdDistrict'] == district, ['X', 'Y']])
    test_data.loc[test_data['PdDistrict'] == district, ['X', 'Y']] = imp.transform(
        test_data.loc[test_data['PdDistrict'] == district, ['X', 'Y']])

### 删除无关特征

In [5]:
train_data.drop(['Descript','Resolution'],inplace=True,axis=1)
train_data.drop('Address',inplace=True,axis=1)
test_data.drop('Address',inplace=True,axis=1)
test_data.drop('Id',inplace=True,axis=1)

### 对 PdDistrict 特征独热编码

In [6]:
dummies_train=pd.get_dummies(train_data['PdDistrict'])
dummies_test=pd.get_dummies(test_data['PdDistrict'])

train_data=pd.concat([train_data,dummies_train],axis=1)
train_data.drop('PdDistrict',inplace=True,axis=1)

test_data=pd.concat([test_data,dummies_test],axis=1)
test_data.drop('PdDistrict',inplace=True,axis=1)

### 替换星期

In [7]:
train_data['DayOfWeek'].replace(to_replace=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'],value=[i for i in range(0,7)],inplace=True)
test_data['DayOfWeek'].replace(to_replace=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'],value=[i for i in range(0,7)],inplace=True)

### 替换日期与时分

In [8]:
ls1=list(train_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][:4:]
train_data['Year']=ls1 

ls1=list(train_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][5:7:]
train_data['Month']=ls1 

ls1=list(train_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][8:10:]
train_data['Day']=ls1 

ls1=list(train_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][11:-6:]
train_data['Hours']=ls1

ls1=list(train_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][-5:-3:]
train_data['Minutes']=ls1

train_data.drop('Dates',axis=1,inplace=True)

In [9]:
ls1=list(test_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][:4:]
test_data['Year']=ls1 

ls1=list(test_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][5:7:]
test_data['Month']=ls1 

ls1=list(test_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][8:10:]
test_data['Day']=ls1 

ls1=list(test_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][11:-6:]
test_data['Hours']=ls1

ls1=list(test_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][-5:-3:]
test_data['Minutes']=ls1

test_data.drop('Dates',axis=1,inplace=True)

### 对分类结果进行独热编码

In [10]:
y=pd.get_dummies(train_data['Category'])
train_data.drop('Category',inplace=True,axis=1)
y.head()

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
train_data.head()

Unnamed: 0,DayOfWeek,X,Y,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN,Year,Month,Day,Hours,Minutes
0,2,-122.425892,37.774599,0,0,0,0,1,0,0,0,0,0,2015,5,13,23,53
1,2,-122.425892,37.774599,0,0,0,0,1,0,0,0,0,0,2015,5,13,23,53
2,2,-122.424363,37.800414,0,0,0,0,1,0,0,0,0,0,2015,5,13,23,33
3,2,-122.426995,37.800873,0,0,0,0,1,0,0,0,0,0,2015,5,13,23,30
4,2,-122.438738,37.771541,0,0,0,0,0,1,0,0,0,0,2015,5,13,23,30


In [12]:
test_data.head()

Unnamed: 0,DayOfWeek,X,Y,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN,Year,Month,Day,Hours,Minutes
0,6,-122.399588,37.735051,1,0,0,0,0,0,0,0,0,0,2015,5,10,23,59
1,6,-122.391523,37.732432,1,0,0,0,0,0,0,0,0,0,2015,5,10,23,51
2,6,-122.426002,37.792212,0,0,0,0,1,0,0,0,0,0,2015,5,10,23,50
3,6,-122.437394,37.721412,0,0,1,0,0,0,0,0,0,0,2015,5,10,23,45
4,6,-122.437394,37.721412,0,0,1,0,0,0,0,0,0,0,2015,5,10,23,45


### 最终训练集与分类

In [13]:
X=train_data.astype(float)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(788153, 18) (788153, 39)
(87573, 18) (87573, 39)


## 构建网络
### 结构

In [14]:
model=Sequential()
model.add(Dense(128,input_shape=(X.shape[1],)))
model.add(Dense(128,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(39,activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               2432      
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_6 (Dense)              (None, 39)                2

### 参数与训练结果

In [15]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [16]:
train=model.fit(X_train,y_train, 
         batch_size=32,
         epochs=20,
         verbose=2,
         validation_data=(X_test,y_test))

Epoch 1/20
24630/24630 - 38s - loss: 2.6556 - accuracy: 0.2157 - val_loss: 2.6149 - val_accuracy: 0.2232
Epoch 2/20
24630/24630 - 37s - loss: 2.5970 - accuracy: 0.2288 - val_loss: 2.6254 - val_accuracy: 0.2134
Epoch 3/20
24630/24630 - 36s - loss: 2.5891 - accuracy: 0.2306 - val_loss: 2.5979 - val_accuracy: 0.2255
Epoch 4/20
24630/24630 - 36s - loss: 2.5803 - accuracy: 0.2344 - val_loss: 2.5657 - val_accuracy: 0.2325
Epoch 5/20
24630/24630 - 37s - loss: 2.5752 - accuracy: 0.2355 - val_loss: 2.5645 - val_accuracy: 0.2376
Epoch 6/20
24630/24630 - 37s - loss: 2.5646 - accuracy: 0.2392 - val_loss: 2.5571 - val_accuracy: 0.2385
Epoch 7/20
24630/24630 - 37s - loss: 2.5686 - accuracy: 0.2366 - val_loss: 2.5783 - val_accuracy: 0.2322
Epoch 8/20
24630/24630 - 37s - loss: 2.5883 - accuracy: 0.2295 - val_loss: 2.5928 - val_accuracy: 0.2288
Epoch 9/20
24630/24630 - 37s - loss: 2.5911 - accuracy: 0.2306 - val_loss: 2.5897 - val_accuracy: 0.2306
Epoch 10/20
24630/24630 - 37s - loss: 2.5886 - accuracy

## 预测

In [17]:
test=test_data.astype(float)
pred=model.predict(test)
pred

array([[3.91135551e-03, 1.23311199e-01, 7.86129312e-05, ...,
        2.75240112e-02, 7.27792382e-02, 2.51158308e-02],
       [4.44176095e-03, 1.21520147e-01, 2.37505636e-04, ...,
        3.25555243e-02, 6.78321645e-02, 2.34932303e-02],
       [1.44436827e-03, 9.84335914e-02, 3.85405350e-04, ...,
        3.04364841e-02, 6.35737702e-02, 9.95170791e-03],
       ...,
       [1.13601377e-03, 7.83879086e-02, 1.14152860e-03, ...,
        1.22562118e-01, 1.11570973e-02, 4.80509596e-03],
       [1.14574388e-03, 7.77007341e-02, 7.91312661e-04, ...,
        1.40558496e-01, 1.06269922e-02, 4.80582472e-03],
       [1.04876026e-03, 7.74630085e-02, 1.29782781e-03, ...,
        1.06226034e-01, 1.15413424e-02, 4.47238842e-03]], dtype=float32)

In [18]:
m = np.max(pred, axis=1).reshape(-1, 1)
predicted = np.array((pred == m), dtype='int32')
predicted

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
sample=pd.read_csv('sampleSubmission.csv.zip')
col_names=list(sample.columns)
col_names.remove('Id')

In [20]:
submission = pd.DataFrame()
for i, entry in enumerate(col_names):
    submission[entry] = predicted[:,i]
submission.to_csv('submission_original.csv', index=False)
submission.head()

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
