# [Implement Text Classification](https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/)

In [None]:
!pip install texthero

In [2]:
# text wrapping in colab
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

# Importing Libraries

In [3]:
import texthero as hero
import pandas as pd
import numpy as np
import xgboost
import textblob
import string as s
import random

from sklearn.model_selection import train_test_split
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

In [4]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Texthero 
* [Github](https://github.com/jbesomi/texthero)
* [Docs](https://texthero.org/)

## loading data

In [5]:
# import data
file_path = r"https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv"
df = pd.read_csv(file_path)
display(df.head())

Unnamed: 0,text,topic
0,Claxton hunting first major medal\n\nBritish h...,athletics
1,O'Sullivan could run in Worlds\n\nSonia O'Sull...,athletics
2,Greene sets sights on world title\n\nMaurice G...,athletics
3,IAAF launches fight against drugs\n\nThe IAAF ...,athletics
4,"Dibaba breaks 5,000m world record\n\nEthiopia'...",athletics


In [6]:
# make a copy of original dataframe
raw_df = df.copy()

## get distinct labels

In [7]:
# check values of labels
df['topic'].value_counts()

football     265
rugby        147
cricket      124
athletics    101
tennis       100
Name: topic, dtype: int64

In [8]:
# first row before prep
df['text'][0]

'Claxton hunting first major medal\n\nBritish hurdler Sarah Claxton is confident she can win her first major medal at next month\'s European Indoor Championships in Madrid.\n\nThe 25-year-old has already smashed the British record over 60m hurdles twice this season, setting a new mark of 7.96 seconds to win the AAAs title. "I am quite confident," said Claxton. "But I take each race as it comes. "As long as I keep up my training but not do too much I think there is a chance of a medal." Claxton has won the national 60m hurdles title for the past three years but has struggled to translate her domestic success to the international stage. Now, the Scotland-born athlete owns the equal fifth-fastest time in the world this year. And at last week\'s Birmingham Grand Prix, Claxton left European medal favourite Russian Irina Shevchenko trailing in sixth spot.\n\nFor the first time, Claxton has only been preparing for a campaign over the hurdles - which could explain her leap in form. In previous

In [9]:
# clean pipeline - https://texthero.org/docs/api/texthero.preprocessing.clean.html#texthero.preprocessing.clean
# first row after perp
hero.clean(df['text'])[0]

'claxton hunting first major medal british hurdler sarah claxton confident win first major medal next month european indoor championships madrid year old already smashed british record 60m hurdles twice season setting new mark seconds win aaas title quite confident said claxton take race comes long keep training much think chance medal claxton national 60m hurdles title past three years struggled translate domestic success international stage scotland born athlete owns equal fifth fastest time world year last week birmingham grand prix claxton left european medal favourite russian irina shevchenko trailing sixth spot first time claxton preparing campaign hurdles could explain leap form previous seasons year old also contested long jump since moving colchester london focused attentions claxton see new training regime pays dividends european indoors take place march'

## preprocessing

In [10]:
df['text'] = hero.clean(df['text'])
df.head()

Unnamed: 0,text,topic
0,claxton hunting first major medal british hurd...,athletics
1,sullivan could run worlds sonia sullivan indic...,athletics
2,greene sets sights world title maurice greene ...,athletics
3,iaaf launches fight drugs iaaf athletics world...,athletics
4,dibaba breaks 000m world record ethiopia tirun...,athletics


In [11]:
df2 = df.sample(frac=1) # shuffle dataset
df2.head()

Unnamed: 0,text,topic
269,chelsea denied james heroics brave defensive d...,football
349,hearts oak cotonsport hearts oak set ghanaian ...,football
51,holmes starts gb events kelly holmes start ser...,athletics
549,england coach faces rap row england coach andy...,rugby
703,officials respond court row australian tennis ...,tennis


## Stratified sampling

In [12]:
# first split data and labels
X = df2.pop('text')
display(X[:10])
y = df2.pop('topic')
print(y[:10])

269    chelsea denied james heroics brave defensive d...
349    hearts oak cotonsport hearts oak set ghanaian ...
51     holmes starts gb events kelly holmes start ser...
549    england coach faces rap row england coach andy...
703    officials respond court row australian tennis ...
248    parry relishes anfield challenge bbc sport ref...
113    england slump defeat fourth one day internatio...
222    ecb reveals county one day revamp england wale...
62     holmes urged compete worlds jolanda ceplak urg...
27     campbell extend sprint career darren campbell ...
Name: text, dtype: object

269     football
349     football
51     athletics
549        rugby
703       tennis
248     football
113      cricket
222      cricket
62     athletics
27     athletics
Name: topic, dtype: object


In [13]:
# use stratified sampling
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.4, random_state=42, stratify=y)

print(f"X_train.shape - {X_train.shape}")
print(f"X_test.shape - {X_test.shape}")
print(f"y_train.shape - {y_train.shape}")
print(f"y_test.shape - {y_test.shape}")

X_train.shape - (442,)
X_test.shape - (295,)
y_train.shape - (442,)
y_test.shape - (295,)


In [14]:
y_train.value_counts()

football     159
rugby         88
cricket       74
athletics     61
tennis        60
Name: topic, dtype: int64

## label encoding

In [15]:
# label encoding
label_encoder = preprocessing.LabelEncoder()

y_train = label_encoder.fit_transform(y_train)
print(f"y_train - {y_train}")

y_test = label_encoder.fit_transform(y_test)
print(f"y_test - {y_test}")

y_train - [0 1 0 2 4 3 1 3 2 2 3 4 0 2 2 0 2 4 2 0 2 2 0 3 2 2 1 2 4 2 0 2 2 2 2 2 2
 2 1 4 2 3 3 3 3 1 0 2 2 4 3 4 4 4 2 1 3 1 4 2 1 2 2 2 3 4 3 2 4 2 0 0 1 1
 2 1 3 4 4 4 0 4 2 3 1 1 3 4 2 2 2 2 2 1 0 2 1 3 0 3 0 0 3 2 3 1 2 3 2 2 2
 1 2 4 1 3 3 4 3 2 4 2 2 2 1 3 2 0 1 1 0 2 1 2 3 2 2 2 1 1 4 2 1 2 2 3 4 3
 3 1 0 4 3 2 0 2 2 2 2 0 1 2 1 1 3 2 3 0 4 4 2 2 3 2 0 0 3 2 1 1 1 2 2 2 1
 2 3 3 3 0 2 3 2 3 1 1 2 2 0 0 1 2 1 3 3 2 2 0 0 3 2 3 2 4 0 4 2 3 2 2 2 2
 2 3 2 2 3 4 1 1 4 4 2 2 4 3 2 2 0 4 0 0 1 2 0 2 3 4 1 2 4 1 0 4 4 1 2 2 4
 4 3 0 1 0 4 4 2 2 3 2 2 3 2 1 3 1 1 3 2 0 4 0 3 1 2 3 4 2 3 1 4 0 2 2 3 2
 3 0 0 2 4 4 2 3 0 0 3 3 1 4 4 2 2 1 3 2 2 0 3 0 3 1 0 2 2 1 0 3 2 1 3 2 4
 3 0 0 2 3 4 3 0 3 2 1 0 2 3 1 0 2 1 2 3 3 2 2 3 0 4 3 2 2 4 1 2 0 2 1 4 0
 2 4 1 3 2 3 2 2 2 1 1 4 1 2 0 4 2 2 3 3 3 2 3 1 4 2 2 2 2 1 1 1 4 3 2 2 4
 3 2 1 4 3 2 2 4 2 0 2 3 3 2 1 2 2 2 0 1 1 3 0 2 3 2 2 2 0 0 3 1 1 2 1]
y_test - [2 0 3 3 2 2 1 2 1 0 2 2 0 3 3 3 4 1 4 1 3 3 3 0 2 2 2 4 2 2 3 4 3 1 4 0 2
 1 4 4 1 

In [16]:
# check label encoding
print(f"Classes - {label_encoder.classes_}")
print(f"Transformations - {label_encoder.transform(label_encoder.classes_)}")

Classes - ['athletics' 'cricket' 'football' 'rugby' 'tennis']
Transformations - [0 1 2 3 4]


Now, data is pretty much ready.

# Feature Engineering
The next step is the feature engineering step. In this step, raw text data will be transformed into feature vectors and new features will be created using the existing dataset.

## Count Vectors as features
Count Vector is a matrix notation of the dataset in which every row represents a document from the corpus, every column represents a term from the corpus, and every cell represents the frequency count of a particular term in a particular document.

In [17]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(X_train)

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(X_train)
# print(f"xtrain_count - {xtrain_count[0]}")

xvalid_count =  count_vect.transform(X_test)
# print(f"xvalid_count - {xvalid_count[0]}")

In [18]:
xtrain_count.shape  # (train examples, unique words)

(442, 10333)

In [19]:
list(count_vect.vocabulary_.items())[:10]

[('pittman', 6857),
 ('drops', 2856),
 ('coach', 1782),
 ('uk', 9631),
 ('base', 882),
 ('australia', 750),
 ('world', 10205),
 ('400m', 67),
 ('hurdle', 4528),
 ('champion', 1585)]

In [20]:
print(xtrain_count[0])  # (sentence index, word mapping)  word_count

  (0, 67)	2
  (0, 281)	1
  (0, 293)	1
  (0, 341)	1
  (0, 416)	1
  (0, 427)	1
  (0, 622)	1
  (0, 625)	1
  (0, 704)	1
  (0, 705)	2
  (0, 750)	2
  (0, 796)	2
  (0, 882)	1
  (0, 919)	2
  (0, 976)	1
  (0, 996)	1
  (0, 1036)	1
  (0, 1289)	1
  (0, 1339)	1
  (0, 1585)	2
  (0, 1624)	1
  (0, 1679)	3
  (0, 1726)	1
  (0, 1782)	4
  (0, 1843)	1
  :	:
  (0, 9000)	1
  (0, 9028)	1
  (0, 9135)	1
  (0, 9292)	1
  (0, 9300)	1
  (0, 9301)	1
  (0, 9394)	1
  (0, 9409)	2
  (0, 9473)	1
  (0, 9492)	1
  (0, 9494)	1
  (0, 9495)	1
  (0, 9631)	1
  (0, 9693)	1
  (0, 9695)	1
  (0, 9783)	1
  (0, 9993)	1
  (0, 10044)	1
  (0, 10066)	3
  (0, 10154)	1
  (0, 10197)	1
  (0, 10199)	3
  (0, 10205)	3
  (0, 10265)	1
  (0, 10272)	1


## TF-IDF Vectors as features
TF-IDF score represents the relative importance of a term in the document and the entire corpus. TF-IDF score is composed by two terms: the first computes the normalized Term Frequency (TF), the second term is the Inverse Document Frequency (IDF), computed as the logarithm of the number of the documents in the corpus divided by the number of documents where the specific term appears.

$TF(t) = \frac{\text{Number of times term t appears in a document}}{\text{Total number of terms in the document}}$

$IDF(t) = log_e \frac{\text{Total number of documents}}{\text{Number of documents with term t in it}}$

TF-IDF Vectors can be generated at different levels of input tokens (words, characters, n-grams)

### **Word Level TF-IDF**
Matrix representing tf-idf scores of every term in different documents

In [21]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', 
                             token_pattern=r'\w{1,}', 
                             max_features=5000)
tfidf_vect.fit(X_train)
xtrain_tfidf =  tfidf_vect.transform(X_train)
xvalid_tfidf =  tfidf_vect.transform(X_test)

In [22]:
xtrain_tfidf.shape  # (train examples, embedding size)

(442, 5000)

In [23]:
print(xtrain_tfidf[0])  # (sentence index, word mapping)  tf-idf value

  (0, 4974)	0.03743202251156108
  (0, 4969)	0.01808022692555903
  (0, 4939)	0.06116323801159181
  (0, 4937)	0.12954003075667303
  (0, 4936)	0.04132066398469017
  (0, 4912)	0.04676823074817047
  (0, 4862)	0.06728507548703201
  (0, 4851)	0.05109220774524617
  (0, 4825)	0.044247324218171624
  (0, 4742)	0.04607997650571246
  (0, 4716)	0.05653977450872648
  (0, 4715)	0.04370032024520131
  (0, 4698)	0.050075797166826905
  (0, 4630)	0.03505236625104993
  (0, 4629)	0.05221579751165078
  (0, 4620)	0.04268390966678205
  (0, 4584)	0.04865848062998934
  (0, 4578)	0.031634777226329944
  (0, 4538)	0.04914788700868163
  (0, 4537)	0.0422098654272356
  (0, 4531)	0.02591549539651008
  (0, 4463)	0.04132066398469017
  (0, 4419)	0.03505236625104993
  (0, 4406)	0.03234599001250806
  (0, 4357)	0.05109220774524617
  :	:
  (0, 899)	0.026752335492753024
  (0, 880)	0.0959518221119433
  (0, 846)	0.0390266647263748
  (0, 826)	0.1252679985259103
  (0, 801)	0.05653977450872648
  (0, 781)	0.06544042449842064
  (0, 64

### **N-gram Level TF-IDF**
N-grams are the combination of N terms together. This Matrix representing tf-idf scores of N-grams

In [24]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', 
                                   token_pattern=r'\w{1,}', 
                                   ngram_range=(2,4), # check for 2, 3, 4 grams
                                   max_features=5000)
tfidf_vect_ngram.fit(X_train)
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(X_train)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(X_test)

In [25]:
xtrain_tfidf_ngram

<442x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 18055 stored elements in Compressed Sparse Row format>

### **Character Level TF-IDF**
Matrix representing tf-idf scores of character level n-grams in the corpus

In [26]:
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', 
                                         ngram_range=(2,4), 
                                         max_features=5000)
tfidf_vect_ngram_chars.fit(X_train)
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_train) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_test) 

In [27]:
xtrain_tfidf_ngram_chars

<442x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 594844 stored elements in Compressed Sparse Row format>

## Word Embeddings
A word embedding is a form of representing words and documents using a dense vector representation. The position of a word within the vector space is learned from text and is based on the words that surround the word when it is used. Word embeddings can be trained using the input corpus itself or can be generated using pre-trained word embeddings such as Glove, FastText, and Word2Vec. Any one of them can be downloaded and used as transfer learning. I'll be using GloVe.


In [28]:
# download GloVe Embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2020-08-19 12:55:17--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-08-19 12:55:17--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-08-19 12:55:18--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2020

In [29]:
# unzip Embedding file
import zipfile
with zipfile.ZipFile(r"./glove.6B.zip", 'r') as f:
    f.extractall(r"./GloVe")

Following snnipet shows how to use pre-trained word embeddings in the model. There are four essential steps:

1. Loading the pretrained word embeddings
2. Creating a tokenizer object
3. Transforming text documents to sequence of tokens and pad them
4. Create a mapping of token and their respective embeddings

### Loading the pretrained word embeddings

In [30]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open('./GloVe/glove.6B.300d.txt')):
    values = line.split()
    embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

In [31]:
embeddings_index["the"].shape

(300,)

### Create a tokenizer 

In [32]:
# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(X_train)
word_index = token.word_index

In [33]:
print(word_index["makes"])

799


### Transforming text documents to sequence of tokens and pad them

In [34]:
# check min length of text in train Dataset
X_train.str.len().min()

439

In [35]:
# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(X_train), maxlen=200)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(X_test), maxlen=200)

In [36]:
train_seq_x.shape

(442, 200)

### Create a mapping of token

In [37]:
# create token-embedding mapping
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [38]:
embedding_matrix.shape

(10334, 300)

## Text / NLP based features
A number of extra text based features can also be created which sometimes are helpful for improving text classification models. Some examples are:

1. Word Count of the documents – total number of words in the documents
2. Character Count of the documents – total number of characters in the documents
3. Average Word Density of the documents – average length of the words used in the documents
4. Puncutation Count in the Complete Essay – total number of punctuation marks in the documents
5. Upper Case Count in the Complete Essay – total number of upper count words in the documents
6. Title Word Count in the Complete Essay – total number of proper case (title) words in the documents
7. Frequency distribution of Part of Speech Tags:
	* Noun Count
	* Verb Count
	* Adjective Count
	* Adverb Count
	* Pronoun Count


### Textual Features

In [39]:
train_df = pd.DataFrame({
    "text" : X_train,
    "label": y_train
})
train_df.head()

Unnamed: 0,text,label
96,pittman drops coach uk base australia world 40...,0
171,england attempt create history first test port...,1
73,pavey focuses indoor success jo pavey miss jan...,0
353,benitez delight crucial win liverpool manager ...,2
666,hewitt overcomes wobble sydney lleyton hewitt ...,4


In [40]:
train_df['char_count'] = train_df['text'].apply(len)

train_df['word_count'] = train_df['text'].apply(lambda x: len(x.split()))

train_df['word_density'] = train_df['char_count'] / (train_df['word_count']+1)

In [41]:
train_df.head()

Unnamed: 0,text,label,char_count,word_count,word_density
96,pittman drops coach uk base australia world 40...,0,1451,201,7.183168
171,england attempt create history first test port...,1,2080,312,6.645367
73,pavey focuses indoor success jo pavey miss jan...,0,561,81,6.841463
353,benitez delight crucial win liverpool manager ...,2,1269,190,6.643979
666,hewitt overcomes wobble sydney lleyton hewitt ...,4,1076,163,6.560976


### POS Features

In [42]:
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

In [43]:
train_df['noun_count'] = train_df['text'].apply(lambda x: check_pos_tag(x, 'noun'))
train_df['verb_count'] = train_df['text'].apply(lambda x: check_pos_tag(x, 'verb'))
train_df['adj_count'] = train_df['text'].apply(lambda x: check_pos_tag(x, 'adj'))
train_df['adv_count'] = train_df['text'].apply(lambda x: check_pos_tag(x, 'adv'))
train_df['pron_count'] = train_df['text'].apply(lambda x: check_pos_tag(x, 'pron'))

In [44]:
train_df.head()

Unnamed: 0,text,label,char_count,word_count,word_density,noun_count,verb_count,adj_count,adv_count,pron_count
96,pittman drops coach uk base australia world 40...,0,1451,201,7.183168,89,47,41,13,0
171,england attempt create history first test port...,1,2080,312,6.645367,160,61,47,22,0
73,pavey focuses indoor success jo pavey miss jan...,0,561,81,6.841463,43,15,16,2,0
353,benitez delight crucial win liverpool manager ...,2,1269,190,6.643979,80,46,43,11,3
666,hewitt overcomes wobble sydney lleyton hewitt ...,4,1076,163,6.560976,63,34,40,13,0


## Topic Modelling as Features
Topic Modelling is a technique to identify the groups of words (called a topic) from a collection of documents that contains best information in the collection. I have used Latent Dirichlet Allocation for generating Topic Modelling Features. LDA is an iterative model which starts from a fixed number of topics. Each topic is represented as a distribution over words, and each document is then represented as a distribution over topics. Although the tokens themselves are meaningless, the probability distributions over words provided by the topics provide a sense of the different ideas contained in the documents. One can read more about topic modelling [here](https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/).

In [45]:
# train a LDA Model
lda_model = decomposition.LatentDirichletAllocation(n_components=20, 
                                                    learning_method='online', 
                                                    max_iter=20)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_ 
vocab = count_vect.get_feature_names()

In [46]:
# view the topic models
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

In [47]:
print(topic_word)
print(f"topic_summaries - {topic_summaries}")

[[ 0.0535939   0.05152525  0.0515803  ...  0.05116905  0.05152836
   0.05178971]
 [17.82846309  0.05142333  0.93115276 ...  0.05169041  0.0515823
   0.05159669]
 [ 0.05149415  0.05157861  0.05152254 ...  0.0515337   0.05156218
   0.05158628]
 ...
 [ 0.05199928  0.0516737   0.051455   ...  0.05139539  0.05143658
   0.05159432]
 [ 1.99244952  0.05187039  0.05133226 ...  0.05188035  0.0517084
   0.05163511]
 [ 0.05161962  0.05163108  0.05152399 ...  0.05171195  0.05172995
   0.05134608]]
topic_summaries - ['win great team country first last cross pavey indoor england', 'england mark first european best strauss jones indoor south one', 'high breaks knowledge moggi crease andrew weigh endorse slipped england', 'said football england would association mutu chelsea sponsorship fai rooney', 'one four said first cup final leg two ntini strauss', 'liverpool win would play said parry first final one back', 'cricket sri day lanka tour zimbabwe played one also jayasuriya', 'cup year said would davi

# Model Building
The final step in the text classification framework is to train a classifier using the features created in the previous step. There are many different choices of machine learning models which can be used to train a final model. We will implement following different classifiers for this purpose:

In [48]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

## Naive Bayes Classifier

Naive Bayes is a classification technique based on Bayes’ Theorem with an assumption of independence among predictors. A Naive Bayes classifier assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature

### NB on Count Vectors (One hot)

In [49]:
# create NB Classifier
NB_count_vec = naive_bayes.MultinomialNB()

# train NB
NB_count_vec.fit(xtrain_count, y_train)

# get predictions
preds = NB_count_vec.predict(xvalid_count)

# accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("NB, Count Vectors: ", accuracy)
print(metrics.classification_report(y_test, preds))

NB, Count Vectors:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        50
           2       1.00      1.00      1.00       106
           3       1.00      1.00      1.00        59
           4       1.00      1.00      1.00        40

    accuracy                           1.00       295
   macro avg       1.00      1.00      1.00       295
weighted avg       1.00      1.00      1.00       295



In [50]:
# select some random examples from test set and check their labels
random.seed(47)  # for reproducible results
examples_index = random.choices(range(X_test.shape[0]), k=5)  # of shuffled data
examples_index_orig = X_test.iloc[examples_index].reset_index()["index"] # of original data
# print(examples_index, examples_index_orig)

temp_df = raw_df.iloc[X_test.iloc[examples_index].reset_index()["index"]].copy()
temp_df["Predicted"] = label_encoder.inverse_transform(preds[examples_index])

for idx, description, actual_lbl, predicted_lbl in temp_df.itertuples():
    print(f"\nDescription:\n", description.replace('\n', ' '))
    print(f"Actual Label - {actual_lbl}\t\tPredicted - {predicted_lbl}")


Description:
 Benitez deflects blame from Dudek  Liverpool manager Rafael Benitez has refused to point the finger of blame at goalkeeper Jerzy Dudek after Portsmouth claimed a draw at Anfield.  Dudek fumbled a cross before Lomana LuaLua headed home an injury-time equaliser, levelling after Steven Gerrard put Liverpool ahead. Benitez said: "It was difficult for Jerzy. It was an unlucky moment. "He was expecting a cross from Matthew Taylor and it ended up like a shot, so I don't blame him for what happened." Benitez admitted it was a costly loss of two points by Liverpool, who followed up their derby defeat against Everton with a disappointing draw. He said: "We had many opportunities but didn't score and, in the end, a 1-0 lead was not enough. "If you don't have any chances you have to think of other things, but when you are creating so many chances as we are there is nothing you can say to the players. It was a pity. "We lost two points, but we have one more point in the table. Now we

### NB on TF-IDF


In [51]:
# create NB Classifier
NB_tf_idf = naive_bayes.MultinomialNB()

# train NB
NB_tf_idf.fit(xtrain_tfidf, y_train)

# get predictions
preds = NB_tf_idf.predict(xvalid_tfidf)

# accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("NB, WordLevel TF-IDF: ", accuracy)
print(metrics.classification_report(y_test, preds))

NB, WordLevel TF-IDF:  0.9559322033898305
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        40
           1       1.00      1.00      1.00        50
           2       0.89      1.00      0.94       106
           3       1.00      0.88      0.94        59
           4       1.00      0.93      0.96        40

    accuracy                           0.96       295
   macro avg       0.98      0.95      0.96       295
weighted avg       0.96      0.96      0.96       295



In [52]:
# select some random examples from test set and check their labels
random.seed(47)  # for reproducible results
examples_index = random.choices(range(X_test.shape[0]), k=15)  # of shuffled data
examples_index_orig = X_test.iloc[examples_index].reset_index()["index"] # of original data
# print(examples_index, examples_index_orig)

temp_df = raw_df.iloc[X_test.iloc[examples_index].reset_index()["index"]].copy()
temp_df["Predicted"] = label_encoder.inverse_transform(preds[examples_index])

for idx, description, actual_lbl, predicted_lbl in temp_df.itertuples():
    # print(f"\nDescription:\n", description.replace('\n', ' '))
    print(f"Actual Label - {actual_lbl}\t\tPredicted - {predicted_lbl}")

Actual Label - football		Predicted - football
Actual Label - football		Predicted - football
Actual Label - football		Predicted - football
Actual Label - tennis		Predicted - tennis
Actual Label - athletics		Predicted - athletics
Actual Label - athletics		Predicted - athletics
Actual Label - athletics		Predicted - football
Actual Label - athletics		Predicted - athletics
Actual Label - cricket		Predicted - cricket
Actual Label - rugby		Predicted - football
Actual Label - cricket		Predicted - cricket
Actual Label - football		Predicted - football
Actual Label - football		Predicted - football
Actual Label - rugby		Predicted - rugby
Actual Label - football		Predicted - football


### NB on Ngram Level TF IDF Vectors

In [53]:
# create NB Classifier
NB_ngram_tf_idf = naive_bayes.MultinomialNB()

# train NB
NB_ngram_tf_idf.fit(xtrain_tfidf_ngram, y_train)

# get predictions
preds = NB_ngram_tf_idf.predict(xvalid_tfidf_ngram)

# accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("NB, N-Gram TF-IDF Vectors: ", accuracy)
print(metrics.classification_report(y_test, preds))

NB, N-Gram TF-IDF Vectors:  0.9559322033898305
              precision    recall  f1-score   support

           0       1.00      0.88      0.93        40
           1       1.00      0.98      0.99        50
           2       0.89      1.00      0.94       106
           3       1.00      0.92      0.96        59
           4       1.00      0.95      0.97        40

    accuracy                           0.96       295
   macro avg       0.98      0.94      0.96       295
weighted avg       0.96      0.96      0.96       295



In [54]:
# select some random examples from test set and check their labels
random.seed(47)  # for reproducible results
examples_index = random.choices(range(X_test.shape[0]), k=15)  # of shuffled data
examples_index_orig = X_test.iloc[examples_index].reset_index()["index"] # of original data
# print(examples_index, examples_index_orig)

temp_df = raw_df.iloc[X_test.iloc[examples_index].reset_index()["index"]].copy()
temp_df["Predicted"] = label_encoder.inverse_transform(preds[examples_index])

for idx, description, actual_lbl, predicted_lbl in temp_df.itertuples():
    # print(f"\nDescription:\n", description.replace('\n', ' '))
    print(f"Actual Label - {actual_lbl}\t\tPredicted - {predicted_lbl}")

Actual Label - football		Predicted - football
Actual Label - football		Predicted - football
Actual Label - football		Predicted - football
Actual Label - tennis		Predicted - tennis
Actual Label - athletics		Predicted - athletics
Actual Label - athletics		Predicted - athletics
Actual Label - athletics		Predicted - football
Actual Label - athletics		Predicted - athletics
Actual Label - cricket		Predicted - cricket
Actual Label - rugby		Predicted - rugby
Actual Label - cricket		Predicted - cricket
Actual Label - football		Predicted - football
Actual Label - football		Predicted - football
Actual Label - rugby		Predicted - rugby
Actual Label - football		Predicted - football


### NB on Character Level TF IDF Vectors

In [55]:
# create NB Classifier
NB_chars_tf_idf = naive_bayes.MultinomialNB()

# train NB
NB_chars_tf_idf.fit(xtrain_tfidf_ngram_chars, y_train)

# get predictions
preds = NB_chars_tf_idf.predict(xvalid_tfidf_ngram_chars)

# accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("NB, CharLevel TF-IDF Vectors: ", accuracy)
print(metrics.classification_report(y_test, preds))

NB, CharLevel TF-IDF Vectors:  0.7152542372881356
              precision    recall  f1-score   support

           0       1.00      0.68      0.81        40
           1       1.00      0.74      0.85        50
           2       0.56      1.00      0.72       106
           3       0.97      0.63      0.76        59
           4       1.00      0.10      0.18        40

    accuracy                           0.72       295
   macro avg       0.91      0.63      0.66       295
weighted avg       0.84      0.72      0.69       295



In [56]:
# select some random examples from test set and check their labels
random.seed(47)  # for reproducible results
examples_index = random.choices(range(X_test.shape[0]), k=15)  # of shuffled data
examples_index_orig = X_test.iloc[examples_index].reset_index()["index"] # of original data
# print(examples_index, examples_index_orig)

temp_df = raw_df.iloc[X_test.iloc[examples_index].reset_index()["index"]].copy()
temp_df["Predicted"] = label_encoder.inverse_transform(preds[examples_index])

for idx, description, actual_lbl, predicted_lbl in temp_df.itertuples():
    # print(f"\nDescription:\n", description.replace('\n', ' '))
    print(f"Actual Label - {actual_lbl}\t\tPredicted - {predicted_lbl}")

Actual Label - football		Predicted - football
Actual Label - football		Predicted - football
Actual Label - football		Predicted - football
Actual Label - tennis		Predicted - football
Actual Label - athletics		Predicted - athletics
Actual Label - athletics		Predicted - football
Actual Label - athletics		Predicted - football
Actual Label - athletics		Predicted - football
Actual Label - cricket		Predicted - cricket
Actual Label - rugby		Predicted - football
Actual Label - cricket		Predicted - cricket
Actual Label - football		Predicted - football
Actual Label - football		Predicted - football
Actual Label - rugby		Predicted - football
Actual Label - football		Predicted - football


## Linear Classifier
Linear Classifier (Logistic Regression) measures the relationship between the categorical dependent variable and one or more independent variables by estimating probabilities using a logistic/sigmoid function.

### LR on Count Vectors

In [57]:
# create LR Classifier
LR_count_vec = linear_model.LogisticRegression()

# train LR
LR_count_vec.fit(xtrain_count, y_train)

# get predictions
preds = LR_count_vec.predict(xvalid_count)

# accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("LR, Count Vectors: ", accuracy)
print(metrics.classification_report(y_test, preds))

LR, Count Vectors:  0.9864406779661017
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        40
           1       1.00      1.00      1.00        50
           2       0.96      1.00      0.98       106
           3       1.00      0.97      0.98        59
           4       1.00      0.97      0.99        40

    accuracy                           0.99       295
   macro avg       0.99      0.98      0.99       295
weighted avg       0.99      0.99      0.99       295



### LR on TF-IDF

In [58]:
# create LR Classifier
LR_tf_idf = linear_model.LogisticRegression()

# train LR
LR_tf_idf.fit(xtrain_tfidf, y_train)

# get predictions
preds = LR_tf_idf.predict(xvalid_tfidf)

# accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("LR, Word Level TF-IDF: ", accuracy)
print(metrics.classification_report(y_test, preds))

LR, Word Level TF-IDF:  0.9796610169491525
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        40
           1       1.00      1.00      1.00        50
           2       0.95      1.00      0.97       106
           3       1.00      0.93      0.96        59
           4       1.00      0.97      0.99        40

    accuracy                           0.98       295
   macro avg       0.99      0.98      0.98       295
weighted avg       0.98      0.98      0.98       295



### LR on Ngram Level TF IDF Vectors

In [59]:
# create LR Classifier
LR_ngram_tf_idf = linear_model.LogisticRegression()

# train LR
LR_ngram_tf_idf.fit(xtrain_tfidf_ngram, y_train)

# get predictions
preds = LR_ngram_tf_idf.predict(xvalid_tfidf_ngram)

# accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("LR, N-gram TF-IDF: ", accuracy)
print(metrics.classification_report(y_test, preds))

LR, N-gram TF-IDF:  0.9423728813559322
              precision    recall  f1-score   support

           0       1.00      0.85      0.92        40
           1       1.00      0.96      0.98        50
           2       0.86      1.00      0.93       106
           3       1.00      0.90      0.95        59
           4       1.00      0.93      0.96        40

    accuracy                           0.94       295
   macro avg       0.97      0.93      0.95       295
weighted avg       0.95      0.94      0.94       295



### LR on Character Level TF IDF Vectors

In [60]:
# create LR Classifier
LR_chars_tf_idf = linear_model.LogisticRegression()

# train LR
LR_chars_tf_idf.fit(xtrain_tfidf_ngram_chars, y_train)

# get predictions
preds = LR_chars_tf_idf.predict(xvalid_tfidf_ngram_chars)

# accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("LR, Char Level TF-IDF: ", accuracy)
print(metrics.classification_report(y_test, preds))

LR, Char Level TF-IDF:  0.9661016949152542
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        40
           1       1.00      1.00      1.00        50
           2       0.91      1.00      0.95       106
           3       1.00      0.86      0.93        59
           4       1.00      0.97      0.99        40

    accuracy                           0.97       295
   macro avg       0.98      0.96      0.97       295
weighted avg       0.97      0.97      0.97       295



## Support Vector Machines (SVM)

In [61]:
# create SVM Classifier
SVM_ngram_tf_idf = svm.SVC()

# train SVM
SVM_ngram_tf_idf.fit(xtrain_tfidf_ngram, y_train)

# get predictions
preds = SVM_ngram_tf_idf.predict(xvalid_tfidf_ngram)

# accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("SVM, N-gram TF-IDF: ", accuracy)
print(metrics.classification_report(y_test, preds))

SVM, N-gram TF-IDF:  0.9254237288135593
              precision    recall  f1-score   support

           0       1.00      0.85      0.92        40
           1       1.00      0.94      0.97        50
           2       0.83      1.00      0.91       106
           3       1.00      0.85      0.92        59
           4       1.00      0.90      0.95        40

    accuracy                           0.93       295
   macro avg       0.97      0.91      0.93       295
weighted avg       0.94      0.93      0.93       295



## Random Forest

In [62]:
# create RF Classifier
RF_ngram_tf_idf = ensemble.RandomForestClassifier()

# train RF
RF_ngram_tf_idf.fit(xtrain_tfidf_ngram, y_train)

# get predictions
preds = RF_ngram_tf_idf.predict(xvalid_tfidf_ngram)

# accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("RF, N-gram TF-IDF: ", accuracy)
print(metrics.classification_report(y_test, preds))

RF, N-gram TF-IDF:  0.9050847457627119
              precision    recall  f1-score   support

           0       1.00      0.78      0.87        40
           1       1.00      0.92      0.96        50
           2       0.79      1.00      0.88       106
           3       1.00      0.83      0.91        59
           4       1.00      0.88      0.93        40

    accuracy                           0.91       295
   macro avg       0.96      0.88      0.91       295
weighted avg       0.92      0.91      0.91       295



## XG Boost

In [63]:
# create XG Boost classifier
xgb_count_vec = xgboost.XGBClassifier()

# train XGB
xgb_count_vec.fit(xtrain_count, y_train)

# get predictions
preds = xgb_count_vec.predict(xvalid_count)

# accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("RF, N-gram TF-IDF: ", accuracy)
print(metrics.classification_report(y_test, preds))

RF, N-gram TF-IDF:  0.9627118644067797
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        40
           1       0.96      0.92      0.94        50
           2       0.94      0.98      0.96       106
           3       0.96      0.93      0.95        59
           4       1.00      1.00      1.00        40

    accuracy                           0.96       295
   macro avg       0.97      0.96      0.97       295
weighted avg       0.96      0.96      0.96       295



In [64]:
# create XG Boost classifier
xgb_count_vec = xgboost.XGBClassifier()

# train XGB
xgb_count_vec.fit(xtrain_tfidf, y_train)

# get predictions
preds = xgb_count_vec.predict(xvalid_tfidf)

# accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("RF, N-gram TF-IDF: ", accuracy)
print(metrics.classification_report(y_test, preds))

RF, N-gram TF-IDF:  0.9661016949152542
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        40
           1       0.96      0.96      0.96        50
           2       0.95      0.98      0.97       106
           3       0.96      0.92      0.94        59
           4       0.98      1.00      0.99        40

    accuracy                           0.97       295
   macro avg       0.97      0.97      0.97       295
weighted avg       0.97      0.97      0.97       295



## Shallow Neural Networks

In [65]:
def get_model(input_shape):
    input_layer = layers.Input((input_shape, ), sparse=True)
    
    # create hidden layer
    hidden_layer = layers.Dense(100, activation="relu")(input_layer)
    
    # create output layer
    output_layer = layers.Dense(5, activation="softmax")(hidden_layer)

    model = models.Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy')
    return model 

In [66]:
# create model and get its summary
model = get_model(xtrain_tfidf_ngram.shape[1])
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 5000)]            0         
_________________________________________________________________
dense (Dense)                (None, 100)               500100    
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 505       
Total params: 500,605
Trainable params: 500,605
Non-trainable params: 0
_________________________________________________________________


In [67]:
history = model.fit(xtrain_tfidf_ngram, y_train)



In [68]:
# get predictions
preds = model.predict(xvalid_tfidf_ngram)
preds = preds.argmax(axis=-1)
print(preds)

[2 0 2 3 2 2 1 2 1 0 2 2 2 3 2 3 4 1 4 1 2 3 3 0 2 2 2 2 2 2 2 4 3 1 4 2 2
 1 4 2 1 2 1 2 0 1 2 2 3 2 2 0 2 4 1 2 2 2 2 3 2 2 2 4 2 2 1 0 3 2 4 2 1 1
 0 2 2 1 2 1 2 1 3 2 1 2 3 2 1 1 2 2 3 2 2 2 2 4 2 2 0 4 2 2 2 3 2 2 3 3 1
 3 1 2 2 0 2 1 3 1 2 2 2 2 3 2 2 0 0 2 0 2 2 2 2 1 2 4 0 1 2 1 2 2 1 4 2 2
 2 2 2 0 3 2 0 0 2 2 4 4 1 2 4 1 0 2 1 2 1 2 2 2 0 0 2 2 2 2 4 1 2 0 1 0 0
 2 2 2 2 1 2 2 2 1 0 2 2 2 2 2 2 4 1 2 3 2 1 2 2 3 2 2 2 1 3 1 1 2 4 3 1 1
 1 2 2 2 2 3 3 4 1 3 2 2 2 2 3 0 2 2 2 1 2 2 2 2 4 4 2 2 2 2 3 1 2 2 2 1 4
 2 0 2 2 0 3 2 2 3 4 4 2 4 2 2 4 1 1 2 2 2 1 2 2 3 2 0 2 2 2 3 2 0 3 4 1]


In [69]:
# get accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("Shallow NN, N-gram TF-IDF: ", accuracy)
print(metrics.classification_report(y_test, preds))

Shallow NN, N-gram TF-IDF:  0.8271186440677966
              precision    recall  f1-score   support

           0       1.00      0.70      0.82        40
           1       0.98      1.00      0.99        50
           2       0.68      1.00      0.81       106
           3       1.00      0.56      0.72        59
           4       1.00      0.68      0.81        40

    accuracy                           0.83       295
   macro avg       0.93      0.79      0.83       295
weighted avg       0.88      0.83      0.82       295



## Deep Neural Network

### 1D CNN 

In [70]:
def create_cnn(inp_shape):
    # Add an Input Layer
    input_layer = layers.Input((inp_shape, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(input_dim=len(word_index) + 1, 
                                       output_dim=300, 
                                       weights=[embedding_matrix], 
                                       trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(5, activation="softmax")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy')
    
    return model

In [71]:
# model summary
cnn_1d = create_cnn(train_seq_x.shape[1])
cnn_1d.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 300)          3100200   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 200, 300)          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 198, 100)          90100     
_________________________________________________________________
global_max_pooling1d (Global (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout (Dropout)            (None, 50)               

In [72]:
# training
history = cnn_1d.fit(train_seq_x, y_train, epochs=1)



In [73]:
# get predictions
preds = cnn_1d.predict(valid_seq_x)
preds = preds.argmax(axis=-1)
print(preds)

[2 3 2 3 2 2 1 2 3 3 2 2 3 3 3 3 2 1 2 1 3 3 3 3 2 2 2 4 2 2 3 2 3 3 4 3 2
 1 2 4 1 3 3 3 3 3 2 2 3 3 2 3 2 4 3 3 3 2 2 3 3 3 2 2 3 2 1 3 3 2 3 3 3 3
 3 2 2 3 2 3 2 1 3 2 1 2 3 3 1 3 3 3 3 2 3 2 2 4 2 3 3 4 2 2 2 3 3 2 3 3 3
 3 1 2 2 2 2 1 3 3 2 2 2 3 2 2 2 3 3 3 3 2 2 2 3 3 2 4 3 1 2 3 2 2 1 4 2 2
 2 3 3 3 3 3 3 3 3 2 3 4 1 3 4 1 3 2 3 2 3 2 3 2 3 3 2 2 2 2 4 3 3 3 3 2 3
 4 2 2 2 3 2 3 2 1 3 2 2 2 2 2 2 2 3 2 2 3 1 2 3 3 3 2 2 3 3 3 1 2 4 3 3 1
 3 3 2 2 2 3 3 3 3 3 2 2 2 4 3 3 3 2 2 3 2 3 2 3 3 4 2 3 2 2 3 3 2 3 2 1 2
 3 3 2 2 3 3 3 2 3 2 2 2 2 2 2 4 3 1 2 2 2 1 2 2 3 2 3 2 2 2 3 2 3 3 4 1]


In [74]:
# get accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("Deep 1D CNN, Sequenced Text: ", accuracy)
print(metrics.classification_report(y_test, preds))

Deep 1D CNN, Sequenced Text:  0.6677966101694915
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       1.00      0.46      0.63        50
           2       0.83      0.97      0.90       106
           3       0.41      0.92      0.57        59
           4       1.00      0.42      0.60        40

    accuracy                           0.67       295
   macro avg       0.65      0.55      0.54       295
weighted avg       0.69      0.67      0.62       295




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



### RNN - LSTM

In [75]:
def create_rnn_lstm(inp_shape):
    # Add an Input Layer
    input_layer = layers.Input((inp_shape, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(input_dim=len(word_index) + 1, 
                                       output_dim=300, 
                                       weights=[embedding_matrix], 
                                       trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.Bidirectional(layers.LSTM(100))((embedding_layer))

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(5, activation="softmax")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy')
    
    return model

In [76]:
# model summary
rnn_lstm = create_rnn_lstm(train_seq_x.shape[1])
rnn_lstm.summary()

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 300)          3100200   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 300)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               320800    
_________________________________________________________________
dense_4 (Dense)              (None, 50)                10050     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 5)                

In [77]:
# training
history = rnn_lstm.fit(train_seq_x, y_train, epochs=1)



In [78]:
# get predictions
preds = rnn_lstm.predict(valid_seq_x)
preds = preds.argmax(axis=-1)
print(preds)

[2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 3 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2
 1 2 2 3 2 2 3 2 1 2 2 2 2 2 2 2 2 2 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 1 2
 2 2 2 1 2 2 2 1 3 2 1 2 2 0 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 1
 2 1 2 2 2 2 1 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 3 2 2 2 3 1 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 4 1 2 2 1 2 2 1 2 2 2 3 2 2 2 2 2 2 2 2 2 3 0 3 0 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1
 2 2 2 2 2 2 2 2 1 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2
 2 2 2 2 2 3 2 2 3 2 2 2 2 2 2 2 1 1 2 2 2 1 2 2 3 2 2 2 2 2 2 2 2 3 2 2]


In [79]:
# get accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("Deep RNN-LSTM, Sequenced Text: ", accuracy)
print(metrics.classification_report(y_test, preds))

Deep RNN-LSTM, Sequenced Text:  0.5220338983050847
              precision    recall  f1-score   support

           0       1.00      0.07      0.14        40
           1       1.00      0.50      0.67        50
           2       0.43      1.00      0.61       106
           3       0.86      0.32      0.47        59
           4       1.00      0.03      0.05        40

    accuracy                           0.52       295
   macro avg       0.86      0.38      0.39       295
weighted avg       0.77      0.52      0.45       295



### RNN - GRU

In [80]:
def create_rnn_GRU(inp_shape):
    # Add an Input Layer
    input_layer = layers.Input((inp_shape, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(input_dim=len(word_index) + 1, 
                                       output_dim=300, 
                                       weights=[embedding_matrix], 
                                       trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the GRU Layer
    GRU_layer = layers.Bidirectional(layers.GRU(100))((embedding_layer))

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(GRU_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(5, activation="softmax")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='sparse_categorical_crossentropy')
    
    return model

In [81]:
# model summary
rnn_gru = create_rnn_lstm(train_seq_x.shape[1])
rnn_gru.summary()

Model: "functional_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 200, 300)          3100200   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 200, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               320800    
_________________________________________________________________
dense_6 (Dense)              (None, 50)                10050     
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 5)                

In [82]:
# training
history = rnn_gru.fit(train_seq_x, y_train, epochs=1)



In [83]:
# get predictions
preds = rnn_gru.predict(valid_seq_x)
preds = preds.argmax(axis=-1)
print(preds)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 3 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2
 1 2 2 1 2 2 1 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2
 2 2 2 1 2 2 2 1 1 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1
 2 1 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 1 2 2 2 4 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 2 2 1 2 1 2 1 2 2 2 2 2 2 2 2 2 2 4 1 2 2
 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1
 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2
 2 2 2 2 2 1 2 2 1 2 2 2 2 2 2 2 1 1 2 2 2 1 2 2 3 2 2 2 2 1 2 2 2 1 2 1]


In [84]:
# get accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("Deep RNN-LSTM, Sequenced Text: ", accuracy)
print(metrics.classification_report(y_test, preds))

Deep RNN-LSTM, Sequenced Text:  0.4711864406779661
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.69      0.62      0.65        50
           2       0.43      0.99      0.60       106
           3       1.00      0.03      0.07        59
           4       0.50      0.03      0.05        40

    accuracy                           0.47       295
   macro avg       0.52      0.33      0.27       295
weighted avg       0.54      0.47      0.34       295




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



### RCNN

In [85]:
def create_rcnn(inp_shape):
    # Add an Input Layer
    input_layer = layers.Input((inp_shape, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(input_dim=len(word_index) + 1, 
                                       output_dim=300, 
                                       weights=[embedding_matrix], 
                                       trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    
    # Add the recurrent layer
    rnn_layer = layers.Bidirectional(layers.GRU(50, return_sequences=True))(embedding_layer)
    
    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

In [86]:
# model summary
rcnn = create_rcnn(train_seq_x.shape[1])
rcnn.summary()

Model: "functional_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 200, 300)          3100200   
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 200, 300)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 198, 100)          90100     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_3 (Dropout)          (None, 50)               

In [87]:
# training
history = rcnn.fit(train_seq_x, y_train, epochs=1)



In [88]:
# get predictions
preds = rcnn.predict(valid_seq_x)
preds = preds.argmax(axis=-1)
print(preds)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [89]:
# get accuracy
accuracy = metrics.accuracy_score(y_test, preds)
print("Deep RNN-LSTM, Sequenced Text: ", accuracy)
print(metrics.classification_report(y_test, preds))

Deep RNN-LSTM, Sequenced Text:  0.13559322033898305
              precision    recall  f1-score   support

           0       0.14      1.00      0.24        40
           1       0.00      0.00      0.00        50
           2       0.00      0.00      0.00       106
           3       0.00      0.00      0.00        59
           4       0.00      0.00      0.00        40

    accuracy                           0.14       295
   macro avg       0.03      0.20      0.05       295
weighted avg       0.02      0.14      0.03       295




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

