# Text Classification
In this notebook we perform text classification on an unlabeled testing dataset
containing almost 50K documents, using Deep Learning approaches.

In more detail, in this notebook we transform the texts into word embedding vectors and then feed them
to a CNN. The embedding vectors are produced using the Keras Embedding layer.

In [None]:
from keras.preprocessing.text import one_hot
from keras_preprocessing.sequence import pad_sequences
import keras as K
import pandas as pd
import time
import tensorflow as tf
# Ask TensorFlow to log the device placement of its operations.
tf.debugging.set_log_device_placement(True)

In [0]:
# Local (non-Colab) locations of the input CSV files and of the generated predictions.
train_path = 'files/data/train.csv'
test_path = 'files/data/test_without_labels.csv'
predicitions_path = 'files/data/predictions.csv'

## Colab Variables

In [None]:
import nltk

# NLTK resources used by the stop-word removal, POS tagging and lemmatization steps.
for resource in ('stopwords', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

# Only switch to the Google Drive paths when actually running on Colab;
# elsewhere the previously defined local paths stay in effect instead of the
# cell failing on the missing `google.colab` package.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; keeping the local data paths.")
else:
    drive.mount('/content/drive')

    colab_dir = '/content/drive/My Drive/Colab Notebooks/BDA/text_classification'
    train_path = colab_dir + '/train.csv'
    test_path = colab_dir + '/test_without_labels.csv'
    predicitions_path = colab_dir + '/predictions.csv'

## Loading Training Set

In [None]:
from sklearn.preprocessing import OneHotEncoder

train = pd.read_csv(train_path)

# Each document is the title repeated five times followed by the content, so
# title words carry more weight. Missing titles/contents are replaced with
# empty strings; otherwise the concatenation yields NaN (a float) for that row
# and the text preprocessing would receive a non-string document.
X = (train['Title'].fillna('') + " ") * 5 + train['Content'].fillna('')
X = X.values.tolist()
y = train['Label']

# One-hot encode the labels as a dense array. The `sparse` argument was renamed
# to `sparse_output` in scikit-learn 1.2 and removed in 1.4, so try the new
# name first and fall back to the old one on older installations.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)
enc_y = enc.fit_transform(y.values.reshape((-1, 1)))
print("No Documents: " + str(len(X)))

## Pre-process using Lemmatization

Applying lemmatization using part-of-speech (POS) tags. We use POS tags in order to enable lemmatization
not only of nouns but also of all other parts of speech. We also remove stopwords, punctuation and non-alphabetic characters.

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
from gensim.utils import simple_preprocess

# Single WordNet lemmatizer instance shared by the stop-word and document preprocessing code.
lmtzr = WordNetLemmatizer()

def nltk2wn_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; any other tag yields None.
    """
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            # Look the constant up only on a match, as the original branches did.
            return getattr(wordnet, attr)
    return None

# Stop words: scikit-learn's and NLTK's English lists plus corpus-specific
# high-frequency words that carry little class information.
my_stopwords = ENGLISH_STOP_WORDS.union(stopwords.words('english'))\
    .union(['include', 'way', 'work', 'look', 'add', 'time', 'year', 'month',
            'day', 'help', 'think', 'tell', 'new', 'said', 'say', 'need',
            'come', 'good', 'set', 'want', 'people', 'use', 'week', 'know'])

# Lemmatized version of the stop words, compared against lemmatized tokens.
# The words are tagged in sorted order: the tagger is fed the whole sequence at
# once, and iterating the set directly would present the words in an order that
# changes between interpreter runs, making the resulting tags (and therefore
# the lemmas) non-reproducible.
my_stopwords_lemma = set()
for word, nltk_tag in nltk.pos_tag(sorted(my_stopwords)):
    tag = nltk2wn_tag(nltk_tag)
    if tag is not None:
        my_stopwords_lemma.add(lmtzr.lemmatize(word, tag))
    else:
        # No WordNet equivalent for this tag: keep the word unchanged.
        my_stopwords_lemma.add(word)
        

def documents_preprocess(documents):
    """Tokenize, lemmatize and stop-word-filter every document.

    Each document is tokenized with gensim's simple_preprocess (deacc=True) and
    POS-tagged. Tokens whose tag maps to a WordNet POS are lemmatized and kept
    unless the lemma is in my_stopwords_lemma; the remaining tokens are kept
    as-is unless they are in my_stopwords.

    Returns a list with one list of kept tokens per input document and prints
    the elapsed time.
    """
    started_at = time.time()
    processed = []
    for text in documents:
        tagged_tokens = nltk.pos_tag(simple_preprocess(text, deacc=True))
        kept = []
        for token, pos in tagged_tokens:
            wn_pos = nltk2wn_tag(pos)
            if wn_pos is None:
                # No lemmatization possible: filter the raw token.
                candidate, blocked = token, my_stopwords
            else:
                candidate, blocked = lmtzr.lemmatize(token, wn_pos), my_stopwords_lemma
            if candidate not in blocked:
                kept.append(candidate)
        processed.append(kept)

    print("Text Preprocessing took: " + str(time.time() - started_at))
    return processed

In [10]:
# Lemmatize and stop-word-filter the training documents (one token list per document).
clean_X = documents_preprocess(X)

Text Preprocessing took: 200.06799244880676


## Embedding Configuration
Computing the size of the vocabulary and the max length of the documents. Then, pad all the documents
so that they all reach the max length.

In [None]:
# Longest document (in tokens) and the set of distinct tokens in the training corpus.
sentence_max_words = max(len(tokens) for tokens in clean_X)
words_set = {token for tokens in clean_X for token in tokens}

vocab_length = len(words_set)
print(f"Dictionary size: {vocab_length}\nMax words per sentence: {sentence_max_words}")

# Hash every token to an integer index, then right-pad each document with
# zeros up to the length of the longest one.
embedded_doc = [one_hot(" ".join(tokens), vocab_length) for tokens in clean_X]
padded_doc = pad_sequences(embedded_doc, maxlen=sentence_max_words, padding='post')
padded_doc.shape

## E+CNN Architecture
The model consists of the following layers:
- An Embedding Layer, which will transform the input documents into embedding vectors
- Two 1D Convolution layers followed by max pooling layers
- Two Dense layers and an output layer

In [None]:
# Dimensionality of each learned word-embedding vector.
embedding_dim = 80
# Early-stopping callback: monitors validation accuracy (higher is better) with a
# patience of 3 epochs. It only has an effect when fit() is given validation data.
es = K.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=3)


def create_model(summary=False):
    """Build and compile a fresh Embedding + CNN classifier.

    The network is an Embedding layer, two 1D convolutions (the first followed
    by max pooling, the second by global max pooling), dropout, two dense ReLU
    layers and a 4-way softmax output. It is compiled with Adam and categorical
    cross-entropy, tracking accuracy.

    summary: when true, print the Keras model summary before returning.
    Returns the compiled model.
    """
    layers = K.layers
    stack = [
        layers.Embedding(vocab_length + 1, embedding_dim, input_length=sentence_max_words),
        layers.Conv1D(64, 16, padding='valid', activation='relu', strides=2, name="cov1"),
        layers.MaxPool1D(),
        layers.Conv1D(64, 16, padding='same', activation='relu', name="cov2"),
        layers.GlobalMaxPooling1D(),
        layers.Dropout(0.2),
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dense(4, activation='softmax'),
    ]

    model = K.models.Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(loss='categorical_crossentropy',
                  optimizer=K.optimizers.Adam(),
                  metrics=['accuracy'])
    if summary:
        model.summary()
    return model

# Build one model with summary=True so the layer summary is printed.
create_model(True)

## Model Evaluation
Perform evaluation using k-Fold Cross Validation. In each iteration the model is constructed
 from scratch.

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

starting_tm = time.time()

# Running sums of the per-fold scores; averaged over the k folds afterwards.
precision = 0
recall = 0
f1 = 0
accuracy = 0

k = 5
epochs = 5
skf = StratifiedKFold(n_splits=k)
for train_index, test_index in skf.split(padded_doc, y):

    X_train, X_test = padded_doc[train_index], padded_doc[test_index]
    y_train, y_test = enc_y[train_index], enc_y[test_index]

    # A fresh model per fold so no weights carry over between folds.
    model = create_model()
    model.fit(X_train, y_train, epochs=epochs, batch_size=256)
    predictions = model.predict(X_test)

    # predict() returns softmax probabilities while y_test is one-hot encoded;
    # scikit-learn's classification metrics reject that mix of target types,
    # so both are reduced to class indices first.
    predicted_classes = predictions.argmax(axis=1)
    true_classes = y_test.argmax(axis=1)

    # Note: for single-label multiclass data, micro-averaged precision, recall
    # and F1 all coincide with accuracy.
    precision += metrics.precision_score(true_classes, predicted_classes, average='micro')
    recall += metrics.recall_score(true_classes, predicted_classes, average='micro')
    f1 += metrics.f1_score(true_classes, predicted_classes, average='micro')
    accuracy += metrics.accuracy_score(true_classes, predicted_classes)

    print()

# compute the average of each value
precision_score = precision/k
recall_score = recall/k
f1_score = f1/k
accuracy_score = accuracy/k

print("Precision: " + str(precision_score)
      + "\nRecall: " + str(recall_score)
      + "\nF1-Measure: " + str(f1_score)
      + "\nAccuracy: " + str(accuracy_score)
      + "\nExecution time: " + str(time.time() - starting_tm))



## Prediction
Load the testing dataset, pre-process it and then predict it. In the end store the results as CSV.

In [None]:
test = pd.read_csv(test_path)

# Same document construction as for the training set: title repeated five
# times followed by the content. Missing values become empty strings so that
# every document is a string.
X_test = (test['Title'].fillna('') + " ") * 5 + test['Content'].fillna('')

# The test file is named "test_without_labels", so a 'Label' column is
# presumably absent; only build the one-hot labels when it actually exists
# instead of failing with a KeyError.
test_y = pd.get_dummies(test['Label']).values if 'Label' in test.columns else None

test_clean_X = documents_preprocess(X_test)

# Encode and pad exactly like the training documents (same hash space and length).
test_embedded_doc = [one_hot(" ".join(doc), vocab_length) for doc in test_clean_X]
test_padded_doc = pad_sequences(test_embedded_doc, sentence_max_words, padding='post')
print(test_padded_doc.shape)

# One-hot vector -> class name. Keys are tuples because lists are unhashable
# and cannot be used as dictionary keys.
labels_dict = {(0, 1, 0, 0): "Entertainment", (1, 0, 0, 0): 'Business', (0, 0, 1, 0): 'Health', (0, 0, 0, 1): 'Technology'}

In [0]:
from sklearn.model_selection import train_test_split

# The early-stopping callback monitors `val_accuracy`, which only exists when
# validation data is supplied. Hold out a stratified 10% of the training set
# for that purpose (stratified so every class is represented).
X_fit, X_val, y_fit, y_val = train_test_split(
    padded_doc, enc_y, test_size=0.1, stratify=y, random_state=0)

model = create_model(True)
# Train on the one-hot encoded labels: the model is compiled with
# categorical cross-entropy and a 4-unit softmax output.
model.fit(X_fit, y_fit, epochs=10, batch_size=64,
          validation_data=(X_val, y_val), callbacks=[es])

# predict() returns one row of class probabilities per document; take the most
# probable class and map its column index back to the original label through
# the fitted encoder, whose categories are in the same order as enc_y's columns.
probabilities = model.predict(test_padded_doc)
class_names = enc.categories_[0]
predictions = [class_names[class_index] for class_index in probabilities.argmax(axis=1)]

# The ids must come from the test set so they line up with the predictions
# (assumes the test CSV has an 'Id' column like the training CSV).
predictions_df = pd.DataFrame(data={'Id': list(test['Id']), 'Predicted': predictions})
predictions_df.to_csv(predicitions_path, index=False)