# 3.2.1 Feature Engineering

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import keras
from keras.layers import Dense,Dropout
from keras.models import Sequential
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from nni.algorithms.feature_engineering.gradient_selector import FeatureGradientSelector

## 加载数据

In [2]:
train_data=pd.read_csv('train.csv.zip')
train_data.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [3]:
test_data=pd.read_csv('test.csv.zip')
test_data.head()

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


## 清洗数据
### 填补无效数据

In [4]:
train_data.drop_duplicates(inplace=True)
train_data.replace({'X': -120.5, 'Y': 90.0}, np.NaN, inplace=True)
test_data.replace({'X': -120.5, 'Y': 90.0}, np.NaN, inplace=True)

imp = SimpleImputer(strategy='mean')

for district in train_data['PdDistrict'].unique():
    train_data.loc[train_data['PdDistrict'] == district, ['X', 'Y']] = imp.fit_transform(
        train_data.loc[train_data['PdDistrict'] == district, ['X', 'Y']])
    test_data.loc[test_data['PdDistrict'] == district, ['X', 'Y']] = imp.transform(
        test_data.loc[test_data['PdDistrict'] == district, ['X', 'Y']])

### 删除无关特征

In [5]:
train_data.drop(['Descript','Resolution'],inplace=True,axis=1)
train_data.drop('Address',inplace=True,axis=1)
test_data.drop('Address',inplace=True,axis=1)
test_data.drop('Id',inplace=True,axis=1)

### 对 PdDistrict 特征独热编码

In [6]:
dummies_train=pd.get_dummies(train_data['PdDistrict'])
dummies_test=pd.get_dummies(test_data['PdDistrict'])

train_data=pd.concat([train_data,dummies_train],axis=1)
train_data.drop('PdDistrict',inplace=True,axis=1)

test_data=pd.concat([test_data,dummies_test],axis=1)
test_data.drop('PdDistrict',inplace=True,axis=1)

### 替换星期

In [7]:
train_data['DayOfWeek'].replace(to_replace=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'],value=[i for i in range(0,7)],inplace=True)
test_data['DayOfWeek'].replace(to_replace=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'],value=[i for i in range(0,7)],inplace=True)

### 替换日期与时分

In [8]:
ls1=list(train_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][:4:]
train_data['Year']=ls1 

ls1=list(train_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][5:7:]
train_data['Month']=ls1 

ls1=list(train_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][8:10:]
train_data['Day']=ls1 

ls1=list(train_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][11:-6:]
train_data['Hours']=ls1

ls1=list(train_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][-5:-3:]
train_data['Minutes']=ls1

train_data.drop('Dates',axis=1,inplace=True)

In [9]:
ls1=list(test_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][:4:]
test_data['Year']=ls1 

ls1=list(test_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][5:7:]
test_data['Month']=ls1 

ls1=list(test_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][8:10:]
test_data['Day']=ls1 

ls1=list(test_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][11:-6:]
test_data['Hours']=ls1

ls1=list(test_data['Dates'])
for i in range(len(ls1)):
    ls1[i]=ls1[i][-5:-3:]
test_data['Minutes']=ls1

test_data.drop('Dates',axis=1,inplace=True)

### 对分类结果进行独热编码

In [10]:
y_origin=pd.get_dummies(train_data['Category'])
train_data.drop('Category',inplace=True,axis=1)
y_origin.head()

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
train_data.head()

Unnamed: 0,DayOfWeek,X,Y,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN,Year,Month,Day,Hours,Minutes
0,2,-122.425892,37.774599,0,0,0,0,1,0,0,0,0,0,2015,5,13,23,53
1,2,-122.425892,37.774599,0,0,0,0,1,0,0,0,0,0,2015,5,13,23,53
2,2,-122.424363,37.800414,0,0,0,0,1,0,0,0,0,0,2015,5,13,23,33
3,2,-122.426995,37.800873,0,0,0,0,1,0,0,0,0,0,2015,5,13,23,30
4,2,-122.438738,37.771541,0,0,0,0,0,1,0,0,0,0,2015,5,13,23,30


In [12]:
test_data.head()

Unnamed: 0,DayOfWeek,X,Y,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN,Year,Month,Day,Hours,Minutes
0,6,-122.399588,37.735051,1,0,0,0,0,0,0,0,0,0,2015,5,10,23,59
1,6,-122.391523,37.732432,1,0,0,0,0,0,0,0,0,0,2015,5,10,23,51
2,6,-122.426002,37.792212,0,0,0,0,1,0,0,0,0,0,2015,5,10,23,50
3,6,-122.437394,37.721412,0,0,1,0,0,0,0,0,0,0,2015,5,10,23,45
4,6,-122.437394,37.721412,0,0,1,0,0,0,0,0,0,0,2015,5,10,23,45


## 特征工程
### 对分类结果编码

In [13]:
X_origin=train_data.astype(float)
y_select=y_origin.to_numpy()*np.matrix([range(1,len(y_origin.columns)+1)]).T
y_select

matrix([[38],
        [22],
        [22],
        ...,
        [17],
        [36],
        [13]])

### 使用NNI进行特征工程

In [14]:
X_train,X_test,y_train,y_test=train_test_split(X_origin.copy(),y_select,test_size=0.1,random_state=42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(788153, 18) (788153, 1)
(87573, 18) (87573, 1)


In [15]:
fgs = FeatureGradientSelector(n_features=14,classification=True,batch_size=731225)
fgs.fit(X_train, y_train)
res=fgs.get_selected_features()
res

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 11, 14, 16, 17],
      dtype=int64)

### 最终训练集与分类

In [16]:
X=X_origin.iloc[:,list(res)]
y=y_origin
X.head()

Unnamed: 0,DayOfWeek,X,Y,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,TARAVAL,Month,Hours,Minutes
0,2.0,-122.425892,37.774599,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,23.0,53.0
1,2.0,-122.425892,37.774599,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,23.0,53.0
2,2.0,-122.424363,37.800414,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,23.0,33.0
3,2.0,-122.426995,37.800873,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,23.0,30.0
4,2.0,-122.438738,37.771541,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,23.0,30.0


In [17]:
y.head()

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(788153, 14) (788153, 39)
(87573, 14) (87573, 39)


## 构建网络
### 结构

In [19]:
model=Sequential()
model.add(Dense(128,input_shape=(X.shape[1],)))
model.add(Dense(128,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(39,activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               1920      
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_6 (Dense)              (None, 39)                2

### 参数与训练结果

In [20]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [21]:
train=model.fit(X_train,y_train, 
         batch_size=32,
         epochs=20,
         verbose=2,
         validation_data=(X_test,y_test))

Epoch 1/20
24630/24630 - 34s - loss: 2.5795 - accuracy: 0.2332 - val_loss: 2.5569 - val_accuracy: 0.2387
Epoch 2/20
24630/24630 - 33s - loss: 2.5345 - accuracy: 0.2417 - val_loss: 2.5344 - val_accuracy: 0.2380
Epoch 3/20
24630/24630 - 32s - loss: 2.5277 - accuracy: 0.2424 - val_loss: 2.5181 - val_accuracy: 0.2424
Epoch 4/20
24630/24630 - 34s - loss: 2.5303 - accuracy: 0.2400 - val_loss: 2.5215 - val_accuracy: 0.2408
Epoch 5/20
24630/24630 - 32s - loss: 2.5266 - accuracy: 0.2412 - val_loss: 2.5147 - val_accuracy: 0.2407
Epoch 6/20
24630/24630 - 40s - loss: 2.5274 - accuracy: 0.2403 - val_loss: 2.5150 - val_accuracy: 0.2405
Epoch 7/20
24630/24630 - 44s - loss: 2.5328 - accuracy: 0.2381 - val_loss: 2.5457 - val_accuracy: 0.2302
Epoch 8/20
24630/24630 - 46s - loss: 2.5327 - accuracy: 0.2385 - val_loss: 2.5230 - val_accuracy: 0.2375
Epoch 9/20
24630/24630 - 31s - loss: 2.5294 - accuracy: 0.2387 - val_loss: 2.5217 - val_accuracy: 0.2380
Epoch 10/20
24630/24630 - 29s - loss: 2.5279 - accuracy

## 预测

In [22]:
test=test_data.iloc[:,list(res)].astype(float)
pred=model.predict(test)
pred

array([[3.3072690e-03, 1.2992021e-01, 5.8769706e-06, ..., 3.9530918e-02,
        7.2954178e-02, 3.9255667e-02],
       [3.0926205e-03, 1.2886511e-01, 5.2753740e-06, ..., 3.8371887e-02,
        7.1838446e-02, 3.8308471e-02],
       [1.8100927e-03, 9.3128420e-02, 3.4170800e-05, ..., 3.1303577e-02,
        7.2571389e-02, 1.6426617e-02],
       ...,
       [1.3080098e-03, 7.0769452e-02, 1.8809896e-03, ..., 9.6306406e-02,
        6.0297442e-03, 3.9162422e-03],
       [1.0293981e-03, 7.1546793e-02, 1.7135743e-03, ..., 8.7514736e-02,
        6.3634319e-03, 3.6236104e-03],
       [1.1742589e-03, 6.8106867e-02, 2.1473588e-03, ..., 9.1838092e-02,
        5.8941874e-03, 3.8117815e-03]], dtype=float32)

In [23]:
m = np.max(pred, axis=1).reshape(-1, 1)
predicted = np.array((pred == m), dtype='int32')
predicted

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [24]:
sample=pd.read_csv('sampleSubmission.csv.zip')
col_names=list(sample.columns)
col_names.remove('Id')

In [25]:
submission = pd.DataFrame()
for i, entry in enumerate(col_names):
    submission[entry] = predicted[:,i]
submission.to_csv('submission_selected.csv', index=False)
submission.head()

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
