## TensorFlow with word embeddings to classify the [Ten Thousand German News Articles Dataset](https://github.com/tblock/10kGNAD)
This Notebook contains the code to reproduce the results in my thesis.

Run all cells consecutively.

Since the code is based on the [TF-Hub text classification tutorial](https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub), the original copyright notice is included.


##### Copyright 2018 The TensorFlow Hub Authors.

Licensed under the Apache License, Version 2.0 (the "License");

In [0]:
# Copyright 2018 The TensorFlow Hub Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

### Environment Setup

In [0]:
# Load the dataset and generate the training subsets.
# Remove leftovers from a previous run so this cell can be re-executed.
!rm -rf 10kGNAD lowshot
# Turn off git's "detached HEAD" advice message
# (presumably shown because v1.1 is a tag rather than a branch -- TODO confirm).
!git config --global advice.detachedHead false
# Quietly clone revision v1.1 of the dataset repository; the echo only runs if the clone succeeded.
!git clone -q --branch v1.1 https://github.com/tblock/10kGNAD.git && echo "downloaded dataset"
# Create the lowshot/ directory and copy train.csv into the working directory
# (presumably the generator script reads ./train.csv and writes into lowshot/ -- verify against the script).
!mkdir lowshot
!cp 10kGNAD/train.csv .
# Run the subset generator with its stdout discarded; the echo only runs on success.
!python 10kGNAD/code/generate_lowshot_sets.py > /dev/null && echo "generated train subsets"

downloaded dataset
generated train subsets


In [0]:
!pip install --quiet tensorflow>=1.7 tensorflow-hub



In [0]:
import os
import glob
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

os.environ["TFHUB_CACHE_DIR"] = '/tmp/tfhub'  # directory TF-Hub uses to cache downloaded modules
tf.logging.set_verbosity(tf.logging.ERROR)  # Reduce logging output

# Reset the default graph *before* seeding: tf.set_random_seed stores the seed on
# the current default graph, so resetting afterwards would throw the seed away.
tf.reset_default_graph()
tf.set_random_seed(42)
np.random.seed(42)  # set seeds for reproducible results
# NOTE(review): tf.estimator presumably builds its own graph for training, so the
# graph-level seed above may not reach the Estimator -- confirm whether a
# RunConfig(tf_random_seed=...) is needed for fully reproducible runs.

### Train Models

In [0]:
# Class labels of the dataset; a label's position in this list is its integer id.
class_names = [
    'Etat', 'Inland', 'International', 'Kultur', 'Panorama',
    'Sport', 'Web', 'Wirtschaft', 'Wissenschaft',
]


def label_to_int(label_list):
  """Translate class names into their integer ids.

  Args:
    label_list: iterable of label strings, each expected to occur in `class_names`.

  Returns:
    A list holding, for every entry of `label_list` (in order), its index in
    `class_names`.

  Raises:
    ValueError: if an entry is not contained in `class_names`.
  """
  return [class_names.index(label) for label in label_list]

In [0]:
# load test set and create input function
# Read the test split: no header row, ';' as separator, "'" as quote character.
# Column names are supplied here and an integer label column is appended.
test_df = pd.read_csv(
    '10kGNAD/test.csv',
    sep=';',
    quotechar="'",
    header=None,
    names=['label', 'text'],
).assign(label_int=lambda frame: label_to_int(frame['label']))

# Input function over the test frame with `label_int` as target; rows are not shuffled.
predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=test_df,
    y=test_df["label_int"],
    shuffle=False,
)

In [0]:
def train_and_evaluate_with_module(hub_module, train_input_fn, train_module=True, steps=1000):
    """Train a DNN classifier on a TF-Hub text embedding and score it on the test set.

    Args:
        hub_module: TF-Hub module spec (e.g. a module URL) used to embed the "text" feature.
        train_input_fn: input function that supplies the training data.
        train_module: whether the embedding module's weights are trained as well.
        steps: number of training steps (defaults to the previously hard-coded 1000).

    Returns:
        The "accuracy" entry of the evaluation result on `predict_test_input_fn`.
    """
    import tempfile

    embedded_text_feature_column = hub.text_embedding_column(
        key="text", module_spec=hub_module, trainable=train_module)

    # Keep checkpoints in a directory that is deleted when this call finishes;
    # otherwise every trained model stays behind on disk and fills it up.
    with tempfile.TemporaryDirectory() as model_dir:
        estimator = tf.estimator.DNNClassifier(
            hidden_units=[500, 100],
            feature_columns=[embedded_text_feature_column],
            n_classes=len(class_names),  # one output per entry in class_names
            model_dir=model_dir,
            optimizer=tf.train.ProximalAdagradOptimizer(learning_rate=0.035,
                                                        l1_regularization_strength=0.001,
                                                        l2_regularization_strength=0.001)
        )

        estimator.train(input_fn=train_input_fn, steps=steps)
        # Evaluation uses the module-level test input function defined earlier in the notebook.
        test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
        test_set_accuracy = test_eval_result["accuracy"]

    return test_set_accuracy

In [0]:
filenames = sorted(glob.glob("lowshot/*.csv"))

for filename in filenames:
  
    # load testset and create input function
    train_df = pd.read_csv(filename, header=None, sep=';', quotechar="'", names=['label', 'text'])   # columns names if no header
    train_df = train_df.assign(label_int = label_to_int(train_df['label']))
    train_input_fn = tf.estimator.inputs.pandas_input_fn(train_df, train_df["label_int"], num_epochs=None, shuffle=True)
    
    acc = train_and_evaluate_with_module("https://tfhub.dev/google/nnlm-de-dim128-with-normalization/1", train_input_fn, True)
    
    print(filename,"%.2f" % float((100 - acc*100)), sep="\t")
    
    !rm -rf /tmp/tmp* # remove all tmp/tmp* folders since the old trained modells fill up the disk space

lowshot/lowshot_0_0.01_0.csv	33.17
lowshot/lowshot_0_0.01_1.csv	28.40
lowshot/lowshot_0_0.01_2.csv	33.07
lowshot/lowshot_0_0.01_3.csv	29.67
lowshot/lowshot_0_0.01_4.csv	29.47
lowshot/lowshot_0_0.01_5.csv	29.09
lowshot/lowshot_0_0.01_6.csv	29.77
lowshot/lowshot_0_0.01_7.csv	25.88
lowshot/lowshot_0_0.01_8.csv	27.92
lowshot/lowshot_0_0.01_9.csv	28.99
lowshot/lowshot_1_0.02_0.csv	25.29
lowshot/lowshot_1_0.02_1.csv	25.00
lowshot/lowshot_1_0.02_2.csv	24.03
lowshot/lowshot_1_0.02_3.csv	21.50
lowshot/lowshot_1_0.02_4.csv	23.54
lowshot/lowshot_1_0.02_5.csv	23.93
lowshot/lowshot_1_0.02_6.csv	27.14
lowshot/lowshot_1_0.02_7.csv	23.54
lowshot/lowshot_1_0.02_8.csv	24.61
lowshot/lowshot_1_0.02_9.csv	23.44
lowshot/lowshot_2_0.05_0.csv	21.30
lowshot/lowshot_2_0.05_1.csv	21.60
lowshot/lowshot_2_0.05_2.csv	19.65
lowshot/lowshot_2_0.05_3.csv	20.82
lowshot/lowshot_2_0.05_4.csv	20.04
lowshot/lowshot_2_0.05_5.csv	19.46
lowshot/lowshot_2_0.05_6.csv	21.60
lowshot/lowshot_2_0.05_7.csv	21.11
lowshot/lowshot_2_0.