# 1. Dataset Preparation

In [124]:
import pandas as pd
import nltk
from nltk.corpus import reuters
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

In [125]:
import xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

In [129]:
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords

In [126]:
#nltk.download()

In [174]:
documents = reuters.fileids()
print(str(len(documents)) + " documents")
train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
print(str(len(train_docs))+ ' total train docs')
test_docs = list(filter(lambda doc: doc.startswith("test"),documents))
print(str(len(test_docs)) + ' total test docs')
categories = reuters.categories()
print(str(len(categories)) + " total categories")
train_y = [reuters.categories(i)[0] for i in train_docs]
test_y = [reuters.categories(i)[0] for i in test_docs]

10788 documents
7769 total train docs
3019 total test docs
90 total categories


In [181]:
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y_coded = encoder.fit_transform(train_y)
test_y_coded = encoder.fit_transform(test_y)

In [131]:
trainDF = pd.DataFrame()

In [132]:
train_n_paragraph, train_texts = [],[]

for i in train_docs:
    train_texts.append(''.join(reuters.raw(i).split('\n')))
    train_n_paragraph.append(reuters.raw(i).count('\n'))
    

In [133]:
trainDF['texts']=train_texts
trainDF['n_paragraph']=train_n_paragraph

In [134]:
test_n_paragraph, test_texts = [],[]

for i in test_docs:
    test_texts.append(''.join(reuters.raw(i).split('\n')))
    test_n_paragraph.append(reuters.raw(i).count('\n'))
    

In [135]:
testDF = pd.DataFrame()
testDF['texts']=test_texts
testDF['n_paragraph']=test_n_paragraph

In [136]:
dataset=pd.concat([trainDF,testDF])

In [199]:
dataset.reset_index(inplace=True)

In [200]:
cachedStopWords = stopwords.words("english")

In [201]:
dataset['texts'][1]

"COMPUTER TERMINAL SYSTEMS &lt;CPML> COMPLETES SALE  Computer Terminal Systems Inc said  it has completed the sale of 200,000 shares of its common  stock, and warrants to acquire an additional one mln shares, to  &lt;Sedio N.V.> of Lugano, Switzerland for 50,000 dlrs.      The company said the warrants are exercisable for five  years at a purchase price of .125 dlrs per share.      Computer Terminal said Sedio also has the right to buy  additional shares and increase its total holdings up to 40 pct  of the Computer Terminal's outstanding common stock under  certain circumstances involving change of control at the  company.      The company said if the conditions occur the warrants would  be exercisable at a price equal to 75 pct of its common stock's  market price at the time, not to exceed 1.50 dlrs per share.      Computer Terminal also said it sold the technolgy rights to  its Dot Matrix impact technology, including any future  improvements, to &lt;Woodco Inc> of Houston, Tex. for 2

### Remove stop word

In [205]:
dataset['texts_r_sw']=dataset['texts'].apply(lambda x: [ word  for word in x.split() if word not in cachedStopWords])

In [217]:
dataset['sw_percentage']=dataset['texts_r_sw'].apply(len)/dataset['texts'].apply(lambda x: len(x.split()))

### Word Stemming

In [222]:
stemming=PorterStemmer()

In [230]:
dataset['texts_sw_st']=dataset['texts_r_sw'].apply(lambda x: [stemming.stem(i) for i in x])

### Word tokenization

In [233]:
dataset['texts_sw_st'][:10]

0    [bahia, cocoa, review, shower, continu, throug...
1    [comput, termin, system, &lt;cpml>, complet, s...
2    [n.z., trade, bank, deposit, growth, rise, sli...
3    [nation, amus, again, up, viacom, &lt;via>, bi...
4    [roger, &lt;rog>, see, 1st, qtr, net, UP, sign...
5    [island, telephon, share, split, approv, &lt;i...
6    [u.k., grow, impati, with, japan, -, thatcher,...
7    [questech, inc, &lt;qtec>, year, net, shr, los...
8    [canada, oil, export, rise, 20, pct, IN, 1986,...
9    [coffee,, sugar, and, cocoa, exchang, name, ch...
Name: texts_sw_st, dtype: object

In [240]:
dataset['texts_tk']=dataset['texts_sw_st'].apply(lambda x: word_tokenize(' '.join(x))) 

# 2. Feature Engeering 

#2.1 Count Vectors as features
#2.2 TF-IDF Vectors as features
    Word level
    N-Gram level
    Character level
#2.3 Word Embeddings as features
#2.4 Text / NLP based features
#2.5 Topic Models as features

## 2.1 Count Vectors as features

In [137]:
#create a count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern = r'\w{1,}')
count_vect.fit(dataset['texts'])
#transform the training and testing data using count vectorizer object
xtrain_count = count_vect.transform(trainDF['texts'])
xtest_count = count_vect.transform(testDF['texts'])

In [242]:
len(count_vect.get_feature_names())

31029

In [244]:
len(trainDF['texts'])

7769

In [255]:
dataset['texts_tk']=dataset['texts_tk'].apply(' '.join)

In [256]:
count_vect_tk = CountVectorizer(analyzer='word', token_pattern = r'\w{1,}')
count_vect_tk.fit(dataset['texts_tk'])
xtrain_count_tk = count_vect_tk.transform(dataset['texts_tk'][:7768])
xtest_count_tk = count_vect_tk.transform(dataset['texts_tk'][7769:])

In [258]:
len(count_vect_tk.get_feature_names())

28633

In [260]:
xtrain_count_tk.toarray().shape

(7768, 28633)

## 2.2TF-IDF Vectors as features

#Word Level TF-IDF
#N-gram Level TF-IDF
#Character Level TF-IDF

In [138]:
# Word level TF-IDF
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(dataset['texts'])
xtrain_tfidf = tfidf_vect.transform(trainDF['texts'])
xtest_tfidf = tfidf_vect.transform(testDF['texts'])

In [264]:
# Word level TF-IDF
tfidf_vect_tk = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=30000)
tfidf_vect_tk.fit(dataset['texts_tk'])
xtrain_tfidf_tk = tfidf_vect.transform(dataset['texts_tk'][:7768])
xtest_tfidf_tk = tfidf_vect.transform(dataset['texts_tk'][7769:])

In [267]:
#tfidf_vect_tk.get_feature_names()

In [139]:
# N-gram level TF-IDF
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(dataset['texts'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(trainDF['texts'])
xtest_tfidf_ngram =tfidf_vect_ngram.transform(testDF['texts'])

In [272]:
tfidf_vect_ngram_tk = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3))
tfidf_vect_ngram_tk.fit(dataset['texts_tk'])
xtrain_tfidf_ngram_tk = tfidf_vect_ngram.transform(dataset['texts_tk'][:7768])
xtest_tfidf_ngram_tk =tfidf_vect_ngram.transform(dataset['texts_tk'][7769:])

In [273]:
len(tfidf_vect_ngram_tk.get_feature_names())

1169949

In [140]:
# Character level TF-IDF
tfidf_vect_char = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}',ngram_range=(2,3),max_features=5000)
tfidf_vect_char.fit(dataset['texts'])
xtrain_tfidf_char = tfidf_vect_char.transform(trainDF['texts'])
xtest_tfidf_char = tfidf_vect_char.transform(testDF['texts'])

In [141]:
type(xtest_tfidf)

scipy.sparse.csr.csr_matrix

## 2.3 Word Embedding

In [142]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open('/Users/Hans/Documents/Wei/Springboard_bootcamp/project1/wiki-news-300d-1M.vec')):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')
# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(dataset['texts'])
word_index = token.word_index

In [153]:
# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(trainDF['texts']), maxlen=70)
test_seq_x = sequence.pad_sequences(token.texts_to_sequences(testDF['texts']), maxlen=70)

In [166]:
train_seq_x.

array([[ 1271,    72,   427, ...,    16,   100,   312],
       [11873,   792,   941, ...,  8612,     5,  5219],
       [   44,    15,     4, ...,     4,   103,    45],
       ...,
       [    0,     0,     0, ...,   144,    90,   338],
       [   48,    17, 18090, ...,     9,    66,     8],
       [    0,     0,     0, ...,   263,    54,     8]], dtype=int32)

In [155]:
# create token-embedding mapping
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [156]:
type(embedding_matrix)

numpy.ndarray

## 2.4 Text/NLP based features

#Noun Count
#Verb Count
#Abjective Count
#Adverb Count
#Pronoun Count

In [157]:
dataset['char_count']=dataset['texts'].apply(len)

In [158]:
dataset['word_count']=dataset['texts'].apply(lambda x: len(x.split()))

In [159]:
dataset['word_density']=dataset['char_count']/(dataset['word_count']+1)

In [160]:
dataset['punctuation_count']=dataset['texts'].apply(lambda x: len(''.join(_ for _ in x if _ in string.punctuation)))

In [161]:
dataset['title_word_count']=dataset['texts'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))

In [162]:
dataset['upper_case_word_count']=dataset['texts'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [163]:
#nltk.download()

In [164]:
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}
# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

dataset['noun_count'] = dataset['texts'].apply(lambda x: check_pos_tag(x, 'noun'))
dataset['verb_count'] = dataset['texts'].apply(lambda x: check_pos_tag(x, 'verb'))
dataset['adj_count'] = dataset['texts'].apply(lambda x: check_pos_tag(x, 'adj'))
dataset['adv_count'] = dataset['texts'].apply(lambda x: check_pos_tag(x, 'adv'))
dataset['pron_count'] = dataset['texts'].apply(lambda x: check_pos_tag(x, 'pron'))

## 2.5 Topic Models as features

In [170]:
# train a LDA Model
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
X_topics = lda_model.fit_transform(xtrain_tfidf)
topic_word = lda_model.components_ 
vocab = count_vect.get_feature_names()

In [171]:
# view the topic models
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

# 3 Model Building

In [172]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, test_y)

## 3.1 Naive Bayes

In [173]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xtest_count)
print("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xtest_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
print("NB, N-Gram Vectors: ", accuracy) 

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_char, train_y, xtest_tfidf_char)
print("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.2494203378602186
NB, WordLevel TF-IDF:  0.24113945014905597
NB, N-Gram Vectors:  0.23782709506459093
NB, CharLevel Vectors:  0.23716462404769792


## 3.2 Linear Classifier

In [None]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xtest_count)
print("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xtest_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
print("LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_char, train_y, xtest_tfidf_char)
print("LR, CharLevel Vectors: ", accuracy)

## 3.3 SVM 

In [89]:
# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
print("SVM, N-Gram Vectors: ", accuracy)



SVM, N-Gram Vectors:  0.35872805564756544


In [90]:
accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xtest_tfidf)
print("SVM, N-Gram Vectors: ", accuracy)

SVM, N-Gram Vectors:  0.35872805564756544


## 3.4 Bagging Model

In [91]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xtest_count)
print("RF, Count Vectors: ", accuracy) 

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xtest_tfidf)
print("RF, WordLevel TF-IDF: ", accuracy)



RF, Count Vectors:  0.7459423650215303




RF, WordLevel TF-IDF:  0.7671414375621066


## 3.5 Boosting Model

In [92]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xtest_count.tocsc())
print("Xgb, Count Vectors: ", accuracy)

# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xtest_tfidf.tocsc())
print("Xgb, WordLevel TF-IDF: ", accuracy)


Xgb, Count Vectors:  0.8691619741636304
Xgb, WordLevel TF-IDF:  0.8678370321298443


NameError: name 'xtrain_tfidf_chars' is not defined

In [93]:
# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_char.tocsc(), train_y, xtest_tfidf_char.tocsc())
print("Xgb, CharLevel Vectors: ", accuracy)

Xgb, CharLevel Vectors:  0.8618747929778072


In [96]:
xtrain_tfidf_ngram.shape

(7769, 5000)

array([ 6,  0, 46, ..., 21, 21, 21])

In [168]:

def create_model_architecture(input_size):
    # create input layer 
    input_layer = layers.Input((input_size, ), sparse=True)
    
    # create hidden layer
    hidden_layer = layers.Dense(100, activation="relu")(input_layer)
    
    # create output layer
    output_layer = layers.Dense(1, activation="sigmoid")(hidden_layer)

    classifier = models.Model(inputs = input_layer, outputs = output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return classifier 

classifier = create_model_architecture(xtrain_tfidf_ngram.shape[1])
accuracy = train_model(classifier, xtrain_tfidf, train_y, xtrain_tfidf, is_neural_net=True)


Epoch 1/1


ValueError: setting an array element with a sequence.

In [110]:
def create_cnn():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_cnn()
accuracy = train_model(classifier, train_seq_x, train_y, test_seq_x, is_neural_net=True)


Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/1


In [111]:
print("CNN, Word Embeddings",  accuracy)

CNN, Word Embeddings 0.23815833057303742


In [119]:
def create_rnn_lstm():
 # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.LSTM(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_rnn_lstm()
accuracy = train_model(classifier, train_seq_x, train_y, test_seq_x, is_neural_net=True)
print("RNN-LSTM, Word Embeddings",  accuracy)

Epoch 1/1
RNN-LSTM, Word Embeddings 0.23815833057303742


In [120]:
def create_rnn_gru():
# Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the GRU Layer
    lstm_layer = layers.GRU(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_rnn_gru()
accuracy = train_model(classifier, train_seq_x, train_y, test_seq_x, is_neural_net=True)
print("RNN-GRU, Word Embeddings",  accuracy)





Epoch 1/1
RNN-GRU, Word Embeddings 0.23815833057303742


In [None]:
def create_bidirectional_rnn():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.Bidirectional(layers.GRU(100))(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_bidirectional_rnn()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("RNN-Bidirectional, Word Embeddings",  accuracy)

In [121]:
def create_rcnn():
# Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)
    
    # Add the recurrent layer
    rnn_layer = layers.Bidirectional(layers.GRU(50, return_sequences=True))(embedding_layer)
    
    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_rcnn()
accuracy = train_model(classifier, train_seq_x, train_y, test_seq_x, is_neural_net=True)
print("CNN, Word Embeddings",  accuracy)

Epoch 1/1
CNN, Word Embeddings 0.23815833057303742
