In [1]:
from nltk.corpus import reuters
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.layers import Dense, Dropout
from keras.models import Sequential

Using TensorFlow backend.


In [2]:
# Partition the Reuters file ids by their 'train' / 'test' prefix.
# fileids() is queried once and the result reused for both partitions.
all_fids = reuters.fileids()
fids_train = [fid for fid in all_fids if fid.startswith('train')]
fids_test = [fid for fid in all_fids if fid.startswith('test')]

# Features: TF-IDF weights. The vocabulary and IDF statistics are learned from
# the training documents only; test documents are projected onto that vocabulary.
# fit_transform reads every training document once instead of twice
# (a separate fit followed by transform would load the corpus two times).
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(reuters.raw(fid) for fid in fids_train)
X_test = tfidf.transform(reuters.raw(fid) for fid in fids_test)

# Targets: one binary indicator column per category seen in the training set.
# The set of categories is likewise learned from the training documents only.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(reuters.categories(fid) for fid in fids_train)
y_test = mlb.transform(reuters.categories(fid) for fid in fids_test)

In [3]:
# Feed-forward multi-label classifier: two 256-unit ReLU hidden layers, each
# followed by 50% dropout, then one sigmoid output unit per category.
model = Sequential()
model.add(Dense(256, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
# Sigmoid rather than softmax: every category gets its own independent score.
model.add(Dense(len(mlb.classes_), activation='sigmoid'))
# Binary cross-entropy scores each category as a separate yes/no decision.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               6728704   
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 90)                23130     
Total params: 6,817,626
Trainable params: 6,817,626
Non-trainable params: 0
_________________________________________________________________
None


In [4]:
# Training hyper-parameters, named so they are easy to locate and tune.
BATCH_SIZE = 32
N_EPOCHS = 10
VALIDATION_FRACTION = 0.1  # share of the training rows held out for validation

model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=N_EPOCHS,
    validation_split=VALIDATION_FRACTION,
    shuffle=True,
)

# Raw model outputs (one score per category) for every test document.
y_test_pred = model.predict(X_test, verbose=1)

Train on 6992 samples, validate on 777 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [5]:
# How to read one row of the report, e.g. a `gold` row with precision 0.95 and recall 0.70:
# - precision: 95% of the articles predicted as gold are actually about gold
# - recall: 70% of the articles that are actually about gold were predicted as gold
# A category counts as predicted when its score exceeds 0.5.
y_test_labels = y_test_pred > 0.5
report = classification_report(y_test, y_test_labels, target_names=mlb.classes_)
print(report)

                 precision    recall  f1-score   support

            acq       0.96      0.97      0.97       719
           alum       1.00      0.13      0.23        23
         barley       1.00      0.43      0.60        14
            bop       0.94      0.50      0.65        30
        carcass       1.00      0.11      0.20        18
     castor-oil       0.00      0.00      0.00         1
          cocoa       1.00      0.72      0.84        18
        coconut       0.00      0.00      0.00         2
    coconut-oil       0.00      0.00      0.00         3
         coffee       0.96      0.93      0.95        28
         copper       0.91      0.56      0.69        18
     copra-cake       0.00      0.00      0.00         1
           corn       0.83      0.79      0.81        56
         cotton       1.00      0.10      0.18        20
     cotton-oil       0.00      0.00      0.00         2
            cpi       0.86      0.43      0.57        28
            cpu       0.00    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
