In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [17]:
import argparse
import os
import sys
import zipfile
import csv

In [4]:
import tensorflow as tf
import numpy as np

In [5]:
import xml.etree.ElementTree as ET

In [6]:
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


In [7]:
def convert_to(path, name, directory):
    """Read XML dataset"""
    with zipfile.ZipFile(path, mode='r') as z:
        with z.open([filename for filename in z.namelist() if filename[-4:] == '.xml'][0]) as f:
            tree = ET.parse(f)
            root = tree.getroot()
            
            filename = os.path.join(directory, name + '.tfrecords')
            print('Writing', filename)
            writer = tf.python_io.TFRecordWriter(filename)
            
            for Question in root:
                QID = int(Question.get('QID'))
                Qtext = Question.find('Qtext').text

                for QApair in Question.iter('QApair'): 
                    QAID = int(QApair.get('QAID'))
                    QArel = QApair.get('QArel')
                    QAconf = float(QApair.get('QAconf'))
                    QAquestion = QApair.find('QAquestion').text
                    QAanswer = QApair.find('QAanswer').text

                    csv.
                    example = tf.train.Example(features=tf.train.Features(feature={
                        'QID': _int64_feature(QID),
                        'Qtext': _bytes_feature(tf.compat.as_bytes(Qtext)),
                        'QAID': _int64_feature(QAID),
                        'QArel': _bytes_feature(tf.compat.as_bytes(QArel)),
                        'QAconf': _float_feature(QAconf),
                        'QAquestion': _bytes_feature(tf.compat.as_bytes(QAquestion)),
                        'QAanswer': _bytes_feature(tf.compat.as_bytes(QAanswer))}))

                    writer.write(example.SerializeToString())

            writer.close()


In [8]:
directory = 'temp'
name = 'train'

if not os.path.exists(os.path.join(directory, name + '.tfrecords')):
    convert_to('D:\PhD\SemEval\TRAIN\SemEval2016-Task3-CQA-MD-train.xml.zip', name, directory)

In [9]:
MAX_DOCUMENT_LENGTH = 10
EMBEDDING_SIZE = 50
n_words = 0
MAX_LABEL = 15
WORDS_FEATURE = 'QAquestion'  # Name of the input words feature.

In [10]:
def estimator_spec_for_softmax_classification(
    logits, labels, mode):
    """Returns EstimatorSpec instance for softmax classification."""
    predicted_classes = tf.argmax(logits, 1)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                'class': predicted_classes,
                'prob': tf.nn.softmax(logits)
            })

    onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
    loss = tf.losses.softmax_cross_entropy(
      onehot_labels=onehot_labels, logits=logits)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    eval_metric_ops = {
      'accuracy': tf.metrics.accuracy(
          labels=labels, predictions=predicted_classes)
    }
    return tf.estimator.EstimatorSpec(
      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)


In [11]:
def bag_of_words_model(features, labels, mode):
    """A bag-of-words model. Note it disregards the word order in the text."""
    bow_column = tf.feature_column.categorical_column_with_identity(
      WORDS_FEATURE, num_buckets=n_words)
    bow_embedding_column = tf.feature_column.embedding_column(
      bow_column, dimension=EMBEDDING_SIZE)
    bow = tf.feature_column.input_layer(
      features,
      feature_columns=[bow_embedding_column])
    logits = tf.layers.dense(bow, MAX_LABEL, activation=None)

    return estimator_spec_for_softmax_classification(
      logits=logits, labels=labels, mode=mode)

In [12]:
def rnn_model(features, labels, mode):
    """RNN model to predict from sequence of words to a class."""
    # Convert indexes of words into embeddings.
    # This creates embeddings matrix of [n_words, EMBEDDING_SIZE] and then
    # maps word indexes of the sequence into [batch_size, sequence_length,
    # EMBEDDING_SIZE].
    word_vectors = tf.contrib.layers.embed_sequence(
      features[WORDS_FEATURE], vocab_size=n_words, embed_dim=EMBEDDING_SIZE)

    # Split into list of embedding per word, while removing doc length dim.
    # word_list results to be a list of tensors [batch_size, EMBEDDING_SIZE].
    word_list = tf.unstack(word_vectors, axis=1)

    # Create a Gated Recurrent Unit cell with hidden size of EMBEDDING_SIZE.
    cell = tf.contrib.rnn.GRUCell(EMBEDDING_SIZE)

    # Create an unrolled Recurrent Neural Networks to length of
    # MAX_DOCUMENT_LENGTH and passes word_list as inputs for each unit.
    _, encoding = tf.contrib.rnn.static_rnn(cell, word_list, dtype=tf.float32)

    # Given encoding of RNN, take encoding of last step (e.g hidden size of the
    # neural network of last step) and pass it as features for softmax
    # classification over output classes.
    logits = tf.layers.dense(encoding, MAX_LABEL, activation=None)
    return estimator_spec_for_softmax_classification(
      logits=logits, labels=labels, mode=mode)

In [13]:
def _parse_function(example_proto):
    features = {}
    parsed_features = tf.parse_single_example(example_proto, features)
    return parsed_features["QID"], tf.cast(parsed_features["Qtext"], tf.string), parsed_features["QAID"], parsed_features["QArel"], parsed_features["QAconf"], parsed_features["QAquestion"], parsed_features["QAanswer"]

In [14]:
def dataset_input_fn():
    filenames = ["temp\train.tfrecords"]
    dataset = tf.contrib.data.TFRecordDataset(filenames)

    # Use `tf.parse_single_example()` to extract data from a `tf.Example`
    # protocol buffer, and perform any additional per-record preprocessing.
    def parser(record):
        keys_to_features = {
            "QID": tf.FixedLenFeature((), tf.int64),
            "Qtext": tf.FixedLenFeature((), tf.string),
            "QAID": tf.FixedLenFeature((), tf.int64),
            "QArel": tf.FixedLenFeature((), tf.string),
            "QAconf": tf.FixedLenFeature((), tf.float32),
            "QAquestion": tf.FixedLenFeature((), tf.string),
            "QAanswer": tf.FixedLenFeature((), tf.string)
        }
        parsed = tf.parse_single_example(record, keys_to_features)

        # Perform additional preprocessing on the parsed data.
        #image = tf.decode_jpeg(parsed["image_data"])
        #image = tf.reshape(image, [299, 299, 1])
        #label = tf.cast(parsed["label"], tf.int32)
        
        QID = parsed["QID"]
        Qtext = parsed["Qtext"]
        QAID = parsed["QAID"]
        QArel = parsed["QArel"]
        QAconf = parsed["QAconf"]
        QAquestion = parsed["QAquestion"]
        QAanswer = parsed["QAanswer"]
        
        

        return {
            "QID": QID,
            "Qtext": Qtext,
            "QAID": QAID,
            "QArel": QArel,
            "QAquestion": QAquestion,
            "QAanswer": QAanswer}, QAconf

    # Use `Dataset.map()` to build a pair of a feature dictionary and a label
    # tensor for each example.
    dataset = dataset.map(parser)
    
    #dataset = dataset.shuffle(buffer_size=10000)
    #dataset = dataset.batch(32)
    #dataset = dataset.repeat(num_epochs)
    iterator = dataset.make_one_shot_iterator()

    # `features` is a dictionary in which each value is a batch of values for
    # that feature; `labels` is a batch of labels.
    features, labels = iterator.get_next()
    
    
    return features, labels

In [16]:
feature_columns = [
      tf.feature_column.categorical_column_with_vocabulary_file('QAquestion', shape=[1])]
regressor = tf.estimator.DNNRegressor(
  feature_columns=feature_columns, hidden_units=[10, 10])
regressor.train(input_fn=dataset_input_fn, steps=100)

TypeError: categorical_column_with_vocabulary_file() got an unexpected keyword argument 'shape'

In [None]:
tf.contrib.learn.datasets.base.load_csv_without_header()

In [1]:
import tensorflow as tf

In [None]:
tf.