In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [None]:
import argparse
import sys
import os

import csv
import zipfile
import xml.etree.ElementTree as ET

from sklearn import metrics
import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
def convert_to_csv(path, name, directory):
    """Flatten a zipped XML question/answer dataset into a CSV file.

    The first archive member whose name ends in ``.xml`` is parsed. For every
    child element of the XML root (a question) and every ``QApair`` element
    below it, one row is written to ``<directory>/<name>.csv``:

        QID, Qtext, QAID, QArel, QAconf, QAquestion, QAanswer

    No header row is written.

    Args:
        path: Path to the ``.zip`` archive containing the XML file.
        name: Base name (without extension) of the CSV file to create.
        directory: Output directory; created if it does not exist yet.

    Raises:
        IndexError: If the archive contains no ``.xml`` member.
    """
    with zipfile.ZipFile(path, mode='r') as z:
        xml_members = [member for member in z.namelist() if member.endswith('.xml')]
        if not xml_members:
            raise IndexError('no .xml member found in archive: %s' % path)

        with z.open(xml_members[0]) as f:
            # The tree is fully loaded here, so the archive can be closed
            # before the CSV is written.
            root = ET.parse(f).getroot()

    # Make a fresh run work even when the output directory is missing.
    # An empty string means "current directory" and needs no creation.
    if directory:
        os.makedirs(directory, exist_ok=True)
    filename = os.path.join(directory, name + '.csv')

    # newline='' lets the csv module control line endings itself; without it
    # Windows text mode turns the writer's '\r\n' into '\r\r\n', which readers
    # see as an empty row after every record.
    with open(filename, 'wt', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')

        for Question in root:
            QID = int(Question.get('QID'))
            Qtext = Question.find('Qtext').text

            for QApair in Question.iter('QApair'):
                QAID = int(QApair.get('QAID'))
                QArel = QApair.get('QArel')
                QAconf = float(QApair.get('QAconf'))
                QAquestion = QApair.find('QAquestion').text
                QAanswer = QApair.find('QAanswer').text

                writer.writerow([QID, Qtext, QAID, QArel, QAconf, QAquestion, QAanswer])

                    

In [None]:
# Convert the zipped training XML into temp/train.csv.
# The path is a raw string so the Windows backslashes stay literal: sequences
# such as '\P', '\S' and '\T' are invalid escapes in a normal string literal.
# NOTE(review): this is a machine-specific absolute path; adjust it locally.
os.makedirs('temp', exist_ok=True)  # the output directory must exist before the CSV is written
convert_to_csv(r'D:\PhD\SemEval\TRAIN\SemEval2016-Task3-CQA-MD-train.xml.zip', 'train', 'temp')

In [None]:
# --- Model / preprocessing configuration ---

# Name of the input words feature.
WORDS_FEATURE = 'words'

# Document length handed to the vocabulary processor.
MAX_DOCUMENT_LENGTH = 10

# Width of the word embeddings (the RNN model also uses it as GRU size).
EMBEDDING_SIZE = 50

# Number of output classes: one-hot depth and size of the logits layer.
MAX_LABEL = 15

# Vocabulary size; placeholder until the vocabulary has been fitted.
n_words = 0

In [None]:
def estimator_spec_for_softmax_classification(
    logits, labels, mode):
    """Build the EstimatorSpec of a softmax classifier for the given mode.

    Args:
        logits: Unnormalised class scores; argmax is taken along axis 1.
        labels: Class labels; one-hot encoded over MAX_LABEL classes for the
            loss. Not used in PREDICT mode.
        mode: One of the tf.estimator.ModeKeys values.

    Returns:
        A tf.estimator.EstimatorSpec holding
        - PREDICT: predictions 'class' (argmax) and 'prob' (softmax),
        - TRAIN: the cross-entropy loss and an Adam training op,
        - otherwise: the loss and an 'accuracy' evaluation metric.
    """
    mode_keys = tf.estimator.ModeKeys
    class_ids = tf.argmax(logits, 1)

    # Prediction needs neither labels nor a loss.
    if mode == mode_keys.PREDICT:
        outputs = {
            'class': class_ids,
            'prob': tf.nn.softmax(logits)
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=outputs)

    # Training and evaluation share the softmax cross-entropy loss.
    targets = tf.one_hot(labels, MAX_LABEL, 1, 0)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=targets, logits=logits)

    if mode == mode_keys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
        step = tf.train.get_global_step()
        train_op = optimizer.minimize(loss, global_step=step)
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    # Remaining case: report accuracy alongside the loss.
    accuracy = tf.metrics.accuracy(labels=labels, predictions=class_ids)
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops={'accuracy': accuracy})


In [None]:
def bag_of_words_model(features, labels, mode):
    """Model function for a bag-of-words classifier (word order is ignored).

    The WORDS_FEATURE entry of `features` is treated as categorical word ids
    in [0, n_words), embedded with EMBEDDING_SIZE dimensions through the
    feature-column input layer, and projected to MAX_LABEL logits by a
    linear dense layer.

    Args:
        features: Feature dict containing the WORDS_FEATURE key.
        labels: Labels forwarded to the softmax EstimatorSpec helper.
        mode: One of the tf.estimator.ModeKeys values.

    Returns:
        The EstimatorSpec produced by
        estimator_spec_for_softmax_classification.
    """
    fc = tf.feature_column

    word_ids = fc.categorical_column_with_identity(
        WORDS_FEATURE, num_buckets=n_words)
    embedded_words = fc.embedding_column(word_ids, dimension=EMBEDDING_SIZE)
    bag = fc.input_layer(features, feature_columns=[embedded_words])

    # Linear layer (no activation) producing one score per class.
    class_scores = tf.layers.dense(bag, MAX_LABEL, activation=None)

    return estimator_spec_for_softmax_classification(
        logits=class_scores, labels=labels, mode=mode)

In [None]:
def rnn_model(features, labels, mode):
    """Model function for a GRU classifier over a sequence of word ids.

    Args:
        features: Feature dict containing the WORDS_FEATURE key with word
            indexes.
        labels: Labels forwarded to the softmax EstimatorSpec helper.
        mode: One of the tf.estimator.ModeKeys values.

    Returns:
        The EstimatorSpec produced by
        estimator_spec_for_softmax_classification.
    """
    # Look up an embedding for every word index: an [n_words, EMBEDDING_SIZE]
    # matrix is created and the ids are mapped to
    # [batch_size, sequence_length, EMBEDDING_SIZE].
    word_ids = features[WORDS_FEATURE]
    embedded = tf.contrib.layers.embed_sequence(
        word_ids, vocab_size=n_words, embed_dim=EMBEDDING_SIZE)

    # One [batch_size, EMBEDDING_SIZE] tensor per position in the document
    # (the document-length axis is unstacked away).
    per_step_inputs = tf.unstack(embedded, axis=1)

    # GRU cell whose hidden size equals EMBEDDING_SIZE, unrolled statically
    # over the per-position inputs.
    gru = tf.contrib.rnn.GRUCell(EMBEDDING_SIZE)
    _, final_state = tf.contrib.rnn.static_rnn(
        gru, per_step_inputs, dtype=tf.float32)

    # The state after the last step is the document encoding; a linear layer
    # turns it into one score per output class.
    class_scores = tf.layers.dense(final_state, MAX_LABEL, activation=None)

    return estimator_spec_for_softmax_classification(
        logits=class_scores, labels=labels, mode=mode)

In [None]:
# Load the CSV written by convert_to_csv (no header row).
# Row layout: QID, Qtext, QAID, QArel, QAconf, QAquestion, QAanswer.
# `str` replaces the deprecated `np.str` alias (removed in NumPy 1.24); the
# alias pointed at the builtin, so the resulting dtype is unchanged.
# NOTE(review): target_column=4 is QAconf (a float confidence), while the
# model functions one-hot encode labels over MAX_LABEL classes -- confirm
# this is the intended target.
training_set = tf.contrib.learn.datasets.base.load_csv_without_header(
    filename='temp/train.csv',
    features_dtype=str,
    target_dtype=np.float32,
    target_column=4)

# Column 1 of the feature matrix is Qtext (the question text).
x_train = pd.Series(training_set.data[:,1])
y_train = pd.Series(training_set.target)

In [None]:
# Fit the vocabulary on the training texts and map every text to a sequence
# of word ids governed by MAX_DOCUMENT_LENGTH.
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
    max_document_length=MAX_DOCUMENT_LENGTH)

# fit_transform yields one id sequence per document; collect them into a
# single array that replaces the raw text series.
x_transform_train = vocab_processor.fit_transform(x_train)
x_train = np.array([word_ids for word_ids in x_transform_train])

# The fitted vocabulary size feeds the embedding layers of the models.
n_words = len(vocab_processor.vocabulary_)
print('Total words: %d' % (n_words,))