In [2]:
# Import the libraries

import os
import sys
from IPython.display import HTML, display

import numpy as np
import pandas as pd
import tensorflow as tf
from math import ceil
from scipy.spatial.distance import cosine

import matplotlib.pyplot as plt
import seaborn as sns

import collections
import random
import time
import string
import re

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Embedding, Dense, Convolution1D, MaxPooling1D, GlobalMaxPooling1D, Flatten, Dropout, LSTM, Bidirectional

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shane\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shane\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shane\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data Preprocessing

In [3]:
# Progress bar
def progress(value, max=100):
    """Build an HTML ``<progress>`` element for display in the notebook.

    Args:
        value: Current progress, rendered as the ``value`` attribute and as
            the fallback text inside the element.
        max: Value that represents completion, rendered as the ``max``
            attribute. (The name shadows the builtin; it is kept so existing
            keyword callers continue to work.)

    Returns:
        IPython.display.HTML wrapping the progress-bar markup.
    """
    # The attribute list used to contain a stray comma after max='...', which
    # is not valid HTML attribute syntax; it has been removed.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(value=value, max=max))

# Save a word2vec dictionary.
def save_word2vec(filename, vectors=None):
    """Write a word -> vector mapping to a text file, one word per line.

    Each line holds the word followed by its vector components, separated by
    single spaces.

    Args:
        filename: File name, joined onto the hard-coded project directory.
        vectors: Mapping of word to an iterable of numbers. When omitted, the
            notebook-level ``word2vec`` dictionary is written.
    """
    if vectors is None:
        vectors = word2vec
    # NOTE(review): absolute, machine-specific directory; this definition is
    # also replaced by a later cell that redefines save_word2vec.
    base_dir = '/Users/haoxing/Documents/Work/Teaching/Machine learning for Finance/Codes/NLP/NLP_app'
    # 'w' instead of 'a': re-running the cell must replace the file rather
    # than append a second copy of every word.
    with open(os.path.join(base_dir, filename), 'w', encoding='utf-8') as f:
        for word, vec in vectors.items():
            # str() is applied per component so the line contains plain
            # numbers regardless of how a list repr renders NumPy scalars
            # (NumPy 2 renders them as ``np.float32(...)``).
            f.write(word + ' ' + ' '.join(str(x) for x in vec) + '\n')

# Load a word2vec dictionary.
def load_word2vec(filename):
    """Read a text file of ``word v1 v2 ...`` lines into a dictionary.

    Blank lines and lines whose components are not numeric are skipped.

    Args:
        filename: File name, joined onto the hard-coded project directory.

    Returns:
        dict mapping each word to a float32 NumPy array of its components.
    """
    # NOTE(review): absolute, machine-specific directory; this definition is
    # also replaced by a later cell that redefines load_word2vec.
    base_dir = '/Users/haoxing/Documents/Work/Teaching/Machine learning for Finance/Codes/NLP/NLP_app'
    word2vec = {}
    with open(os.path.join(base_dir, filename), encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to parse.
                continue
            try:
                vec = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Malformed line (non-numeric component): skip it, as before,
                # but without hiding unrelated errors behind a bare except.
                continue
            word2vec[values[0]] = vec
    return word2vec

# read the repo in PATH and append the texts in a list
def get_data(PATH):
    """Read every file in ``PATH`` and split it into a fund name and a summary.

    A file is expected to look like ``<fund name><head_breaker><summary>``.
    When the marker is absent, the whole text is used as the summary and the
    fund name is the empty string.

    Args:
        PATH: Directory containing one text file per fund.

    Returns:
        (fund_names, texts): two lists of equal length, in ``os.listdir``
        order (which Python does not guarantee to be sorted).
    """
    list_dir = os.listdir(PATH)
    texts = []
    fund_names = []
    out = display(progress(0, len(list_dir)-1), display_id=True)
    for ii, filename in enumerate(list_dir):
        with open(os.path.join(PATH, filename), 'r', encoding="utf8") as f:
            txt = f.read()
        # Explicit length check instead of a bare except around the indexing,
        # so unrelated errors are no longer swallowed.
        txt_split = txt.split('<head_breaker>')
        if len(txt_split) > 1:
            fund_name = txt_split[0].strip()
            summary = txt_split[1].strip()
        else:
            summary = txt
            fund_name = ''
        texts.append(summary)
        fund_names.append(fund_name)
        out.update(progress(ii, len(list_dir)-1))
    return fund_names, texts

In [4]:
SUMMARY_PATH = 'MutualFundSummary'
SUMMARY_LABELS_PATH = 'MutualFundLabels.csv'

In [5]:
fund_names, summaries = get_data(SUMMARY_PATH)

In [6]:
df_label = pd.read_csv(SUMMARY_LABELS_PATH)
df_label.head()

Unnamed: 0,id,fund_name,Performance fee?,Ivestment Strategy,Leverage?,Portfolio composition,Concentration
0,0000051931-18-000151,American Funds College 2018 Fund,,Balanced Fund (Low Risk),Yes,Investment grade securities,Diversified
1,0000051931-18-000151,American Funds College 2021 Fund,,Balanced Fund (Low Risk),Yes,Investment grade securities,Diversified
2,0000051931-18-000151,American Funds College 2024 Fund,,Balanced Fund (Low Risk),Yes,Investment grade securities,Diversified
3,0000051931-18-000151,American Funds College 2027 Fund,,Balanced Fund (Low Risk),Yes,Investment grade securities,Diversified
4,0000051931-18-000151,American Funds College 2030 Fund,,Balanced Fund (Low Risk),Yes,Investment grade securities,Diversified


In [7]:
# test dataset: funds that have a summary file but no row in the label table
labelled_funds = set(df_label['fund_name'])
# Positions come from enumerate() rather than list.index(): index() returns the
# first match only, which misaligns names and summaries whenever a fund name
# or a summary text occurs more than once (and costs a scan per element).
test_index = [i for i, fund in enumerate(fund_names) if fund not in labelled_funds]
test_funds = [fund_names[i] for i in test_index]

test_summaries = [summaries[i] for i in test_index]

In [8]:
len(test_funds)

79

In [9]:
len(test_summaries)

79

In [10]:
df_label['Ivestment Strategy'].value_counts()

Equity Long Only (Low Risk)          248
Fixed Income Long Only (Low Risk)    130
Balanced Fund (Low Risk)              84
Long Short Funds (High Risk)           4
Commodities Fund (Low Risk)            1
Name: Ivestment Strategy, dtype: int64

In [11]:
# The four high-risk funds and the single commodities fund are left out of the
# first model: those classes have too few examples to learn from.
rare_strategies = ['Long Short Funds (High Risk)', 'Commodities Fund (Low Risk)']
remove_fund = df_label.loc[df_label['Ivestment Strategy'].isin(rare_strategies), 'fund_name']
print(remove_fund)
# enumerate() gives each matching position; list.index() would report the
# first occurrence repeatedly if a fund name appeared twice.
removed_names = set(remove_fund)
remove_index = [i for i, fund in enumerate(fund_names) if fund in removed_names]
remove_index

296                   Columbia Commodity Strategy Fund
301               Columbia Mortgage Opportunities Fund
423             Anchor Tactical Equity Strategies Fund
424          Anchor Tactical Municipal Strategies Fund
425    Dreyfus Alternative Diversifier Strategies Fund
Name: fund_name, dtype: object


[84, 85, 102, 112, 148]

In [12]:
# The second model keeps the four high-risk funds and drops only the single
# commodities fund.
remove_fund1 = df_label.loc[df_label['Ivestment Strategy']=='Commodities Fund (Low Risk)','fund_name']
print(remove_fund1)
# enumerate() gives each matching position; list.index() would report the
# first occurrence repeatedly if a fund name appeared twice.
removed_names1 = set(remove_fund1)
remove_index1 = [i for i, fund in enumerate(fund_names) if fund in removed_names1]
remove_index1

296    Columbia Commodity Strategy Fund
Name: fund_name, dtype: object


[102]

In [13]:
# Names and summaries are selected through the same list of positions so the
# two lists stay aligned. The previous summaries.index(summary) lookup returned
# the first matching text, which breaks alignment when two funds share a summary.
keep_positions = [i for i, fund in enumerate(fund_names) if fund not in set(remove_fund)]
keep_funds = [fund_names[i] for i in keep_positions]
keep_fund_summaries = [summaries[i] for i in keep_positions]
keep_positions1 = [i for i, fund in enumerate(fund_names) if fund not in set(remove_fund1)]
keep_funds1 = [fund_names[i] for i in keep_positions1]
keep_fund_summaries1 = [summaries[i] for i in keep_positions1]

In [14]:
print(len(keep_funds),len(keep_fund_summaries))
print(len(keep_funds1),len(keep_fund_summaries1))

540 540
544 544


In [15]:
# One row per kept fund: its name next to its summary text.
df_summary = pd.DataFrame(
    list(zip(keep_funds, keep_fund_summaries)),
    columns=['fund_name', 'summary'],
)
df_summary1 = pd.DataFrame(
    list(zip(keep_funds1, keep_fund_summaries1)),
    columns=['fund_name', 'summary'],
)

In [16]:
# Attach the labels to each summary by fund name; dropna() then discards every
# row that has a missing value in any column (e.g. funds without a label row).
df_merge = pd.merge(df_summary, df_label, how='left', on='fund_name').dropna()
df_merge1 = pd.merge(df_summary1, df_label, how='left', on='fund_name').dropna()

In [17]:
df_merge['Ivestment Strategy'].value_counts()

Equity Long Only (Low Risk)          247
Fixed Income Long Only (Low Risk)    130
Balanced Fund (Low Risk)              84
Name: Ivestment Strategy, dtype: int64

In [18]:
df_merge1['Ivestment Strategy'].value_counts()

Equity Long Only (Low Risk)          247
Fixed Income Long Only (Low Risk)    130
Balanced Fund (Low Risk)              84
Long Short Funds (High Risk)           4
Name: Ivestment Strategy, dtype: int64

In [19]:
df_merge.tail()

Unnamed: 0,fund_name,summary,id,Performance fee?,Ivestment Strategy,Leverage?,Portfolio composition,Concentration
533,Variable Portfolio - Wells Fargo Short Duratio...,SUMMARY OF CTIVPSM – WELLS FARGO SHORT DURATIO...,0001193125-18-139001,,Fixed Income Long Only (Low Risk),Yes,Investment grade securities,Diversified
536,Western Asset Intermediate Maturity California...,Investment objective\n\nThe fund seeks to prov...,0001193125-18-091654,,Fixed Income Long Only (Low Risk),Yes,Investment grade securities,Concentrated by issuer / sector / jurisdiction
537,Western Asset Intermediate Maturity New York M...,Investment objective\n\nThe fund seeks to prov...,0001193125-18-091654,,Fixed Income Long Only (Low Risk),Yes,Investment grade securities,Concentrated by issuer / sector / jurisdiction
538,Western Asset Massachusetts Municipals Fund,Investment objective\n\nThe fund seeks to prov...,0001193125-18-091654,,Fixed Income Long Only (Low Risk),Yes,Investment grade securities,Concentrated by issuer / sector / jurisdiction
539,World Growth Fund,INVESTMENT OBJECTIVE\nThe USAA World Growth Fu...,0001683863-18-000339,Some performance Fees,Equity Long Only (Low Risk),No,Sub-investment grade securities or emerging ma...,Diversified


In [20]:
# 70/30 train/validation split of the three-class data set (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    df_merge['summary'],
    df_merge['Ivestment Strategy'],
    test_size=0.3,
    random_state=42,
)

In [21]:
# 70/30 train/validation split of the four-class data set (its own fixed seed).
X_train1, X_valid1, y_train1, y_valid1 = train_test_split(
    df_merge1['summary'],
    df_merge1['Ivestment Strategy'],
    test_size=0.3,
    random_state=22,
)

In [22]:
len(X_train)

322

In [23]:
len(X_valid)

139

In [24]:
len(X_train1)

325

In [25]:
# Stop-word set: NLTK English stop words, punctuation, quote/bracket tokens and
# a few truncated word forms. tokenizer() reads this global at call time, and a
# later cell reassigns stop_words, which changes tokenizer()'s output from then on.
stop_words = set(
    stopwords.words("english")
    + list(string.punctuation)
    + ['``', "''"]
    + ["]", "[", "*"]
    + ['doe', 'ha', 'wa']
)

def tokenizer(txt):
    """Lower-case and tokenize a text, dropping stop words and letter-free tokens.

    Args:
        txt (str): text to be tokenized

    Returns:
        list: the tokens that survive filtering, in their original order.
    """
    normalised = txt.lower().replace('\t', ' ').replace('\n', ' ')
    kept = []
    for token in word_tokenize(normalised):
        if token in stop_words:
            continue
        # Drop tokens that are empty once everything except ASCII letters
        # and spaces has been removed (numbers, symbols, ...).
        if re.sub("[^A-Za-z ]+", '', token) == '':
            continue
        kept.append(token)
    return kept
    
# Flat list of every training token, in document order. A plain list is used
# instead of np.concatenate: concatenating lists of strings builds a fixed-width
# unicode array sized by the longest token and raises on an empty input.
train_text_words = [token for summary in X_train for token in tokenizer(summary)]
    

## skip-gram model

In [26]:
# training parameters for skip-gram model
batch_size = 120
num_epochs = 2
# word2vec parameters
embedding_size = 50 # size of embedding vector
max_vocabulary_size = 5000 #  number of different words in vocabulary
min_occurrence = 10 # words must appear at least 10 times
skip_window = 3 # how many words to consider left and right
num_skips = 4 # how many times to reuse an input to generate a label

In [27]:
# Build the vocabulary: an UNK placeholder (its count is filled in later)
# followed by the most frequent training words.
count = [('UNK', -1)]
# most_common() is ordered by decreasing frequency, so keeping the entries with
# at least min_occurrence occurrences trims exactly the rare tail. Unlike the
# previous backward pop-loop, this can never remove the UNK placeholder (whose
# count of -1 is below the threshold) when every word is rare.
count.extend(
    (word, occurrences)
    for word, occurrences in collections.Counter(train_text_words).most_common(max_vocabulary_size - 1)
    if occurrences >= min_occurrence
)

In [28]:

# Give every vocabulary word a unique id, in `count` order (UNK gets id 0).
word2id = {word: idx for idx, (word, _) in enumerate(count)}
id2word = {idx: word for word, idx in word2id.items()}
vocab_size = len(id2word)

# Translate the training words to ids; anything outside the vocabulary maps to 0.
data = [word2id.get(word, 0) for word in train_text_words]
# Every id 0 in `data` counts as an unknown word.
unk_count = data.count(0)
count[0] = ('UNK', unk_count) # update the UNK value of count

In [29]:
def to_one_hot(data_point_index, vocab_size):
    """Return a length-``vocab_size`` float vector with a 1 at ``data_point_index``.

    Args:
        data_point_index: position to set to 1.
        vocab_size (int): length of the returned vector.

    Returns:
        numpy.ndarray of zeros with the selected position set to 1.
    """
    encoding = np.zeros(vocab_size)
    encoding[data_point_index] = 1
    return encoding

def batch_generator(batch_size, num_skips, skip_window, vocab_size):
    """Generate one-hot (centre word, context word) batches for skip-gram training.

    Word ids are read from the module-level ``data`` list. Every batch holds
    ``batch_size // num_skips`` consecutive centre words, each paired with
    ``num_skips`` context words sampled without replacement from the
    ``skip_window`` positions on either side.

    Args:
        batch_size (int): size of batch; must be a multiple of ``num_skips``
        num_skips (int): how many times to reuse an input to generate a label;
            must not exceed ``2 * skip_window``
        skip_window (int): how many words to consider left and right
        vocab_size (int): size of the word2id/id2word
    Yields:
        batch_one_hot: one one-hot row of length ``vocab_size`` per centre word
        labels_one_hot: the matching one-hot rows for the sampled context words
    """    
    data_idx = 0
    while True:
        assert num_skips <= 2 * skip_window
        assert batch_size % num_skips == 0
        labels = np.ndarray(shape=(batch_size), dtype=np.int32)
        batch = np.ndarray(shape=(batch_size), dtype=np.int32)
        # Window width: the centre word plus skip_window words on each side.
        span = skip_window * 2 + 1
        buffer = collections.deque(maxlen=span) # sliding window over the word ids
        if data_idx + span > len(data):
            data_idx = 0
            # Fewer than `span` ids remain at the start of a batch: the
            # generator ends here (no further batches are produced).
            break
        buffer.extend(data[data_idx:data_idx + span])
        data_idx += span
        for i in range(batch_size // num_skips):  
            # Window positions of the context words (everything but the centre)
            context_words = [w for w in range(span) if w != skip_window]
            # sample num_skips number of words
            words_to_use = random.sample(context_words, num_skips)
            for j, context_word in enumerate(words_to_use):
                batch[i * num_skips + j] = buffer[skip_window]
                labels[i * num_skips + j] = buffer[context_word]
            if data_idx == len(data):
                # End of the data reached mid-batch: restart the window at the beginning.
                buffer.extend(data[0:span])
                data_idx = span
            else:
                # Slide the window one word to the right (maxlen drops the oldest id).
                buffer.append(data[data_idx])
                data_idx += 1
        
        # Step back by `span` (modulo the data length) before the next batch
        # refills the window from data_idx.
        data_idx = (data_idx + len(data) - span) % len(data)

        # translate word index to one-hot
        batch_one_hot = np.array([to_one_hot(b, vocab_size) for b in batch])
        labels_one_hot = np.array([to_one_hot(l, vocab_size) for l in labels])
        
        yield batch_one_hot, labels_one_hot # output one batch

In [30]:
# build the skip-gram model
def word2vec_model():
    """Build the skip-gram network and the encoder that shares its first layer.

    The network maps a one-hot word (length ``vocab_size``) through a linear
    layer of ``embedding_size`` units to a softmax over the vocabulary.

    Returns:
        (encoder, autoencoder): ``encoder`` outputs the linear embedding layer;
        ``autoencoder`` is the full compiled model to train.
    """
    input_w = Input(shape = (vocab_size,))
    encoded = Dense(embedding_size, activation='linear')(input_w)
    decoded = Dense(vocab_size, activation='softmax')(encoded)
    autoencoder = Model(input_w, decoded)
    encoder = Model(input_w, encoded)
    # The targets are one-hot rows and the output is a softmax distribution,
    # i.e. a single-label multi-class problem, so categorical cross-entropy is
    # the matching loss (binary cross-entropy scores each unit independently).
    autoencoder.compile(optimizer='adam', loss='categorical_crossentropy')
    return encoder, autoencoder

encoder, autoencoder = word2vec_model()

In [None]:
autoencoder.fit(x = batch_generator(batch_size, num_skips, skip_window, vocab_size), steps_per_epoch=ceil(len(data) / batch_size), epochs=num_epochs)


Epoch 1/2

In [None]:
def save_word2vec(filename, vectors=None):
    """Write a word -> vector mapping to a text file, one word per line.

    Each line holds the word followed by its vector components, separated by
    single spaces.

    Args:
        filename: Path of the file to write (relative to the working directory).
        vectors: Mapping of word to an iterable of numbers. When omitted, the
            notebook-level ``word2vec`` dictionary is written.
    """
    if vectors is None:
        vectors = word2vec
    # 'w' instead of 'a': re-running the cell must replace the file rather
    # than append a second copy of every word.
    with open(os.path.join('', filename), 'w', encoding='utf-8') as f:
        for word, vec in vectors.items():
            # str() is applied per component so the line contains plain
            # numbers regardless of how a list repr renders NumPy scalars
            # (NumPy 2 renders them as ``np.float32(...)``).
            f.write(word + ' ' + ' '.join(str(x) for x in vec) + '\n')

# Load a word2vec dictionary.
def load_word2vec(filename):
    """Read a text file of ``word v1 v2 ...`` lines into a dictionary.

    Blank lines and lines whose components are not numeric are skipped.

    Args:
        filename: Path of the file to read (relative to the working directory).

    Returns:
        dict mapping each word to a float32 NumPy array of its components.
    """
    word2vec = {}
    with open(os.path.join('', filename), encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to parse.
                continue
            try:
                vec = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Malformed line (non-numeric component): skip it, as before,
                # but without hiding unrelated errors behind a bare except.
                continue
            word2vec[values[0]] = vec
    return word2vec

In [None]:
# use the encoder to vectorize
def vecotrize(word):
    """Return the embedding that the trained encoder produces for ``word``.

    The word is looked up in ``word2id``, one-hot encoded and passed through
    ``encoder`` as a batch of one.
    """
    one_hot_batch = np.array([to_one_hot(word2id[word], vocab_size)])
    embeddings = encoder.predict(one_hot_batch)
    return embeddings[0]

# Embed every vocabulary word, then write the dictionary to disk.
word2vec = {w : vecotrize(w) for w in word2id}
save_word2vec('train_word2vec')

In [None]:
def lemma_tokenizer(text):
    """Tokenize ``text`` and lemmatize every token with WordNet.

    Apostrophes are replaced by spaces before tokenizing.

    Returns:
        list of lemmatized tokens, in order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(text.replace("'", " ")):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Extended stop-word set (adds the typographic apostrophe, '--' and the empty
# string). This reassigns the global that tokenizer() reads at call time.
stop_words = set(
    stopwords.words("english")
    + list(string.punctuation)
    + ['``', "''", "’"]
    + ["]", "[", "*"]
    + ['doe', 'ha', 'wa']
    + ['--']
    + ['']
)

In [None]:
max_features = 18
# TfidfVectorizer documents stop_words as 'english', a list, or None; recent
# scikit-learn releases validate the parameter and reject a set, so the set is
# converted to a (sorted, hence deterministic) list.
tfidf_stop_words = sorted(stop_words)
tfidf = TfidfVectorizer(input='content', tokenizer=lemma_tokenizer, stop_words=tfidf_stop_words, max_features=max_features)
tfidf1 = TfidfVectorizer(input='content', tokenizer=lemma_tokenizer, stop_words=tfidf_stop_words, max_features=max_features)

# Fit one vectorizer per training sample and create the training features.
tfidf_train = tfidf.fit_transform(X_train) # three-class training set
tfidf_train1 = tfidf1.fit_transform(X_train1) # four-class training set

In [None]:
# get_feature_names() was removed from scikit-learn in favour of
# get_feature_names_out(); use the new accessor when it exists and fall back to
# the old one on older installations. list() keeps key_words a plain list.
key_words = list(getattr(tfidf, 'get_feature_names_out', None)() if hasattr(tfidf, 'get_feature_names_out') else tfidf.get_feature_names()) # Common keywords
print(key_words)
key_words1 = list(getattr(tfidf1, 'get_feature_names_out', None)() if hasattr(tfidf1, 'get_feature_names_out') else tfidf1.get_feature_names()) # Common keywords
print(key_words1)

In [None]:
def get_n_closer(word, n, word2vec):
    """Return the ``n`` words whose vectors are closest to ``word`` (cosine distance).

    The query word itself is part of the candidates, so it normally comes
    first (distance 0).

    Args:
        word: query word; must be a key of ``word2vec``.
        n (int): number of neighbours to return.
        word2vec (dict): mapping of word to vector.

    Returns:
        list of at most ``n`` words, ordered from closest to farthest. Fewer
        than ``n`` are returned when the dictionary is smaller than ``n``.

    Raises:
        KeyError: if ``word`` is not in ``word2vec``.
    """
    vect = word2vec[word]
    dist_dict = {k: cosine(v, vect) for k, v in word2vec.items()}
    # A single stable sort replaces n rounds of min() + del: same order
    # (ties keep dictionary order), O(N log N) instead of O(n * N), and no
    # ValueError when n exceeds the vocabulary size.
    return sorted(dist_dict, key=dist_dict.get)[:max(n, 0)]

##knowledge base
def create_knowledge_base(num_neighbors, word2vec, key_words):
    """Collect the key words together with their nearest neighbours in ``word2vec``.

    Args:
        num_neighbors (int): neighbours requested per key word.
        word2vec (dict): mapping of word to vector.
        key_words: iterable of seed words.

    Returns:
        set containing every key word plus the neighbours found for it. A key
        word without an embedding is still added, but contributes no neighbours.
    """
    knowledge_base = set()
    out = display(progress(0, len(key_words)-1), display_id=True)
    for ii, key_word in enumerate(key_words):
        knowledge_base.add(key_word)
        neighbors = []
        try:
            neighbors = get_n_closer(key_word, num_neighbors, word2vec)
        except KeyError:
            # Only a missing embedding is expected here; any other error now
            # surfaces instead of being reported as "not in word2vec".
            print(key_word + ' not in word2vec')

        knowledge_base.update(neighbors)

        out.update(progress(ii, len(key_words)-1))
    return knowledge_base
            

In [None]:
knowledge_base = create_knowledge_base(5, word2vec, key_words)
knowledge_base1 = create_knowledge_base(5, word2vec, key_words1)
print(knowledge_base)
print(knowledge_base1)

In [None]:
# Takes a summary, the knowledge base and some hyper parameters and returns the "num_sent" sentences
# of the summary that are closest to the knowledge base in terms of cosine distance.
def extract_sentence_distance(summary, knowledge, n_closer, n_reject, num_sent):
    """Select the sentences of ``summary`` that lie closest to the knowledge base.

    Each sentence is represented by the mean (barycentre) of the ``word2vec``
    vectors of its tokens; its score is the mean of the ``n_closer`` smallest
    cosine distances between that barycentre and the knowledge-base words.

    Args:
        summary (str): text to extract sentences from.
        knowledge: iterable of knowledge-base words.
        n_closer (int): number of smallest distances averaged into the score.
        n_reject (int): sentences with at most this many embedded tokens get
            the fixed score 1 instead of a distance-based score.
        num_sent (int): number of sentences to return.

    Returns:
        str: the ``num_sent`` lowest-scoring sentences joined by spaces, or an
        empty string when the summary contains no sentence.
    """
    # Split the summary into sentences.
    sentences = sent_tokenize(summary)
    if not sentences:
        # zip(*sorted(...)) below cannot be unpacked for an empty summary.
        return ''

    # Loop-invariant: vectors of the knowledge-base words. Words without an
    # embedding are skipped (the knowledge base can contain key words that are
    # absent from word2vec, which previously raised KeyError here).
    knowledge_vectors = [word2vec[key_word] for key_word in knowledge if key_word in word2vec]

    sentence_scores = []
    for sentence in sentences:
        # Barycentre of the tokens that have an embedding.
        sentence_barycentre = np.zeros(embedding_size)
        effective_len = 0
        for token in tokenizer(sentence):
            if token in word2vec:
                sentence_barycentre += np.array(word2vec[token])
                effective_len += 1

        # Sentences with too few embedded words (or nothing to compare against)
        # receive the fixed score 1.
        # NOTE(review): cosine distance can exceed 1, so such sentences still
        # outrank sentences that point away from the knowledge base - confirm
        # whether a larger sentinel was intended.
        if effective_len <= n_reject or not knowledge_vectors:
            sentence_scores.append(1)
            continue

        sentence_barycentre = sentence_barycentre / effective_len
        # Distances barycentre -> knowledge words, smallest first.
        barycentre_distance = sorted(cosine(sentence_barycentre, vector) for vector in knowledge_vectors)
        # Score = average of the "n_closer" smallest distances.
        sentence_scores.append(np.mean(barycentre_distance[:n_closer]))

    # Keep the "num_sent" sentences with the smallest score.
    sentence_scores, sentences = zip(*sorted(zip(sentence_scores, sentences)))
    top_sentences = sentences[:num_sent]
    return ' '.join(top_sentences)



## Q4 Measure Distance

In [None]:
# Prepare the train, validation and test data frames. The un-suffixed frames
# belong to the three-class model, the "1" frames to the four-class model.
X_train_df = pd.DataFrame(X_train)
X_valid_df = pd.DataFrame(X_valid)
X_train_df1 = pd.DataFrame(X_train1)
X_valid_df1 = pd.DataFrame(X_valid1)
# Both test frames start from the same unlabelled summaries.
X_test_df = pd.DataFrame({'summary':test_summaries})
X_test_df1 = pd.DataFrame({'summary':test_summaries})

In [None]:
X_test_df

In [None]:
# For every train/validation frame, keep the five sentences of each summary that
# are closest (embedding distance) to the knowledge base of the matching model.
for frame, knowledge in (
    (X_train_df, knowledge_base),
    (X_valid_df, knowledge_base),
    (X_train_df1, knowledge_base1),
    (X_valid_df1, knowledge_base1),
):
    frame['sentences_distance'] = frame['summary'].apply(
        lambda text: extract_sentence_distance(text, knowledge, n_closer=10, n_reject=5, num_sent=5)
    )

In [None]:
# Same extraction for the unlabelled test summaries, once per knowledge base.
for frame, knowledge in ((X_test_df, knowledge_base), (X_test_df1, knowledge_base1)):
    frame['sentences_distance'] = frame['summary'].apply(
        lambda text: extract_sentence_distance(text, knowledge, n_closer=10, n_reject=5, num_sent=5)
    )

In [None]:
def extract_sentence_match(summary, knowledge, num_sent):
    """Select the sentences of ``summary`` sharing the most words with the knowledge base.

    A sentence's score is the number of distinct tokens it has in common with
    ``knowledge``.

    Args:
        summary (str): text to extract sentences from.
        knowledge: collection of knowledge-base words.
        num_sent (int): number of sentences to return.

    Returns:
        str: the ``num_sent`` highest-scoring sentences (all of them when the
        summary has fewer), joined by spaces in ascending score order; an
        empty string when there is no sentence or ``num_sent`` is not positive.
    """
    sentences = sent_tokenize(summary)
    if not sentences or num_sent <= 0:
        return ''

    # (score, sentence) pairs sorted ascending; ties are ordered by sentence text.
    scored = sorted(
        (len(set(tokenizer(sentence)).intersection(knowledge)), sentence)
        for sentence in sentences
    )
    # The previous slice, sentences[len(sentences)-num_sent-1:], returned
    # num_sent + 1 sentences, and only the single last one when the summary had
    # exactly num_sent sentences (the start index became -1).
    top_sentences = [sentence for _score, sentence in scored[-num_sent:]]
    return ' '.join(top_sentences)
# Keyword-overlap extraction for every frame, using the knowledge base of the
# matching model.
for frame, knowledge in (
    (X_train_df, knowledge_base),
    (X_valid_df, knowledge_base),
    (X_train_df1, knowledge_base1),
    (X_valid_df1, knowledge_base1),
    (X_test_df, knowledge_base),
    (X_test_df1, knowledge_base1),
):
    frame['sentences_match'] = frame['summary'].apply(
        lambda text: extract_sentence_match(text, knowledge, num_sent=5)
    )

In [None]:
# Model inputs: the extracted sentences, re-tokenized and joined back into
# space-separated strings (three-class frames first, then four-class frames).
train_X = [' '.join(tokenizer(txt)) for txt in X_train_df['sentences_match'].values]
valid_X = [' '.join(tokenizer(txt)) for txt in X_valid_df['sentences_match'].values]
test_X = [' '.join(tokenizer(txt)) for txt in X_test_df['sentences_match'].values]

train_X1 = [' '.join(tokenizer(txt)) for txt in X_train_df1['sentences_match'].values]
valid_X1 = [' '.join(tokenizer(txt)) for txt in X_valid_df1['sentences_match'].values]
test_X1 = [' '.join(tokenizer(txt)) for txt in X_test_df1['sentences_match'].values]

In [None]:
# produce train_y and valid_y
from sklearn.preprocessing import LabelEncoder

# One LabelEncoder per label set, so each fitted mapping stays available.
# The encoders are deliberately not called `encoder`: that name holds the Keras
# embedding model used by vecotrize(), and rebinding it would break a re-run of
# that cell. to_categorical is taken from tf.keras (already imported) because
# keras.utils.np_utils no longer exists in current Keras releases.
label_encoder = LabelEncoder()

encoded_train_y = label_encoder.fit_transform(y_train)
label_train_y = tf.keras.utils.to_categorical(encoded_train_y, num_classes=3)

encoded_valid_y = label_encoder.transform(y_valid)
label_valid_y = tf.keras.utils.to_categorical(encoded_valid_y, num_classes=3)

label_encoder1 = LabelEncoder()

encoded_train_y1 = label_encoder1.fit_transform(y_train1)
label_train_y1 = tf.keras.utils.to_categorical(encoded_train_y1, num_classes=4)

encoded_valid_y1 = label_encoder1.transform(y_valid1)
label_valid_y1 = tf.keras.utils.to_categorical(encoded_valid_y1, num_classes=4)

In [None]:
label_train_y

In [None]:
label_train_y1

In [None]:
num_words = 2500 # Size of the vocabulary used. we only consider the 2500 most common words. The other words are removed from the texts.
maxlen = 150 # Number of word considered for each document. we cut or lengthen the texts to have texts of 150 words.
word_dimension = 50 # dimension of our word vectors.

In [None]:
# Three-class pipeline: fit the tokenizer on the training text only, turn every
# split into index sequences, then pad/truncate (at the end) to `maxlen`.
keras_tokenizer = Tokenizer(num_words=num_words)
keras_tokenizer.fit_on_texts(train_X)
word_index = keras_tokenizer.word_index
sequences_train = keras_tokenizer.texts_to_sequences(train_X)
sequences_valid = keras_tokenizer.texts_to_sequences(valid_X)
sequences_test = keras_tokenizer.texts_to_sequences(test_X)
feature_train = pad_sequences(sequences_train, maxlen=maxlen, dtype=float, padding='post', truncating='post')
feature_valid = pad_sequences(sequences_valid, maxlen=maxlen, dtype=float, padding='post', truncating='post')
feature_test = pad_sequences(sequences_test, maxlen=maxlen, dtype=float, padding='post', truncating='post')

# Four-class pipeline: same steps with its own tokenizer.
keras_tokenizer1 = Tokenizer(num_words=num_words)
keras_tokenizer1.fit_on_texts(train_X1)
word_index1 = keras_tokenizer1.word_index
sequences_train1 = keras_tokenizer1.texts_to_sequences(train_X1)
sequences_valid1 = keras_tokenizer1.texts_to_sequences(valid_X1)
sequences_test1 = keras_tokenizer1.texts_to_sequences(test_X1)
feature_train1 = pad_sequences(sequences_train1, maxlen=maxlen, dtype=float, padding='post', truncating='post')
feature_valid1 = pad_sequences(sequences_valid1, maxlen=maxlen, dtype=float, padding='post', truncating='post')
feature_test1 = pad_sequences(sequences_test1, maxlen=maxlen, dtype=float, padding='post', truncating='post')

In [None]:
# Embedding matrices: row i holds the word2vec vector of the word with tokenizer
# index i; rows of words without a vector (and row 0) stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, word_dimension))
for word, row in word_index.items():
    vector = word2vec.get(word)
    if vector is None:
        continue
    embedding_matrix[row] = vector

embedding_matrix1 = np.zeros((len(word_index1) + 1, word_dimension))
for word, row in word_index1.items():
    vector = word2vec.get(word)
    if vector is None:
        continue
    embedding_matrix1[row] = vector


In [None]:
len(feature_train)

In [None]:
len(label_train_y)

In [None]:
len(feature_train1)

In [None]:
len(label_train_y1)

## Q5 training

CNN

In [None]:
def create_CNN_model():
    """Build and compile the three-class CNN classifier.

    Returns:
        A compiled Sequential model: frozen pre-trained embedding, two
        convolution/max-pooling stages, a dense layer with dropout and a
        three-way softmax output.
    """
    layers = [
        # Maps each word index to its (frozen) pre-trained embedding vector.
        Embedding(len(word_index) + 1, word_dimension, weights=[embedding_matrix], input_length = maxlen, trainable=False),
        Convolution1D(64, 5, activation = 'relu'),
        MaxPooling1D(pool_size = 5),
        Convolution1D(32, 5, activation = 'relu'),
        MaxPooling1D(pool_size = 5),
        Flatten(),
        Dense(units = 128 , activation = 'relu'),
        Dropout(0.5),
        Dense(units = 3, activation = 'softmax'),
    ]
    CNN = Sequential(layers)
    CNN.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return CNN
    
CNN_model = create_CNN_model()
print(feature_train)
CNN_history = CNN_model.fit(feature_train, label_train_y, epochs=800, batch_size=100)

In [None]:
def create_CNN_model1():
    """Build and compile the four-class CNN classifier.

    Returns:
        A compiled Sequential model: frozen pre-trained embedding, two
        convolution/max-pooling stages, a dense layer with dropout and a
        four-way softmax output.
    """
    layers = [
        # Maps each word index to its (frozen) pre-trained embedding vector.
        Embedding(len(word_index1) + 1, word_dimension, weights=[embedding_matrix1], input_length = maxlen, trainable=False),
        Convolution1D(64, 5, activation = 'relu'),
        MaxPooling1D(pool_size = 5),
        Convolution1D(32, 5, activation = 'relu'),
        MaxPooling1D(pool_size = 5),
        Flatten(),
        Dense(units = 128 , activation = 'relu'),
        Dropout(0.5),
        Dense(units = 4, activation = 'softmax'),
    ]
    CNN = Sequential(layers)
    CNN.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return CNN
    
CNN_model1 = create_CNN_model1()
# Show the features this model is trained on (the four-class set); the previous
# line printed the three-class feature_train by copy-paste.
print(feature_train1)
CNN_history1 = CNN_model1.fit(feature_train1, label_train_y1, epochs=800, batch_size=100)

In [None]:
y_valid_CNN = CNN_model.predict(feature_valid)
y_valid_CNN1 = CNN_model1.predict(feature_valid1)

In [None]:
len(y_valid_CNN)

In [None]:
# Convert the predicted class probabilities to one-hot rows.
# argmax marks exactly one class per row; the previous
# np.where(row == row.max()) marked every tied class, which produced rows with
# several 1s, and needed a Python loop.
valid_y_CNN = np.zeros_like(y_valid_CNN)
valid_y_CNN[np.arange(len(y_valid_CNN)), y_valid_CNN.argmax(axis=1)] = 1
valid_y_CNN1 = np.zeros_like(y_valid_CNN1)
valid_y_CNN1[np.arange(len(y_valid_CNN1)), y_valid_CNN1.argmax(axis=1)] = 1

In [None]:
# Training accuracy per epoch, three-class CNN.
fig, ax = plt.subplots()
ax.plot(CNN_history.history['accuracy'])
ax.set_title('CNN Model accuracy with class=3')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['Train'], loc='upper left')
plt.show()

In [None]:
# Training loss per epoch, three-class CNN.
fig, ax = plt.subplots()
ax.plot(CNN_history.history['loss'])
ax.set_title('CNN Model loss with class=3')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Loss'], loc='upper left')
plt.show()

In [None]:
# Training accuracy per epoch, four-class CNN.
fig, ax = plt.subplots()
ax.plot(CNN_history1.history['accuracy'])
ax.set_title('CNN Model accuracy with class=4')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['Train'], loc='upper left')
plt.show()

In [None]:
# Training loss per epoch, four-class CNN.
fig, ax = plt.subplots()
ax.plot(CNN_history1.history['loss'])
ax.set_title('CNN Model loss with class=4')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Loss'], loc='upper left')
plt.show()

In [None]:
# Accuracy and per-class report use the hard (one-hot) predictions.
print(accuracy_score(label_valid_y,valid_y_CNN))

print(classification_report(label_valid_y,valid_y_CNN))

# ROC AUC is a ranking metric: it is computed from the predicted class
# probabilities (y_valid_CNN) rather than the thresholded one-hot rows, which
# carry no ranking information.
print("auc score: ",roc_auc_score(label_valid_y,y_valid_CNN))

In [None]:
print(accuracy_score(label_valid_y1,valid_y_CNN1))

print(classification_report(label_valid_y1,valid_y_CNN1))

RNN

In [None]:
def create_RNN_model():
    """Build and compile the three-class bidirectional-LSTM classifier.

    Returns:
        A compiled Sequential model: frozen pre-trained embedding, a
        bidirectional LSTM, a dense ReLU layer and a three-way softmax output.
    """
    layers = [
        Embedding(len(word_index) + 1, word_dimension, weights=[embedding_matrix], input_length = maxlen, trainable=False),
        Bidirectional(LSTM(word_dimension)),
        Dense(word_dimension, activation='relu'),
        Dense(3, activation='softmax'),
    ]
    RNN = Sequential(layers)
    RNN.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
    return RNN

In [None]:
RNN_model = create_RNN_model()
RNN_history = RNN_model.fit(feature_train, label_train_y, epochs=800, batch_size=64)

In [None]:
# Training accuracy per epoch, three-class RNN (only the training curve is recorded).
fig, ax = plt.subplots()
ax.plot(RNN_history.history['accuracy'])
ax.set_title(' RNN Model accuracy with class=3')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['Train'], loc='upper left')
plt.show()

In [None]:
# Training loss per epoch, three-class RNN.
fig, ax = plt.subplots()
ax.plot(RNN_history.history['loss'])
ax.set_title('RNN Model loss with class=3')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Loss'], loc='upper left')
plt.show()

In [None]:
# validation with the RNN
y_valid_RNN = RNN_model.predict(feature_valid)
# One-hot of the most probable class. argmax marks exactly one class per row,
# whereas np.where(row == row.max()) marked every tied class.
valid_y_RNN = np.zeros_like(y_valid_RNN)
valid_y_RNN[np.arange(len(y_valid_RNN)), y_valid_RNN.argmax(axis=1)] = 1

# print acc and report
print(accuracy_score(label_valid_y,valid_y_RNN))

print(classification_report(label_valid_y,valid_y_RNN))

In [None]:
def create_RNN_model1():
    """Build and compile the four-class bidirectional-LSTM classifier.

    Returns:
        A compiled Sequential model: frozen pre-trained embedding, a
        bidirectional LSTM, a dense ReLU layer and a four-way softmax output.
    """
    layers = [
        Embedding(len(word_index1) + 1, word_dimension, weights=[embedding_matrix1], input_length = maxlen, trainable=False),
        Bidirectional(LSTM(word_dimension)),
        Dense(word_dimension, activation='relu'),
        Dense(4, activation='softmax'),
    ]
    RNN = Sequential(layers)
    RNN.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
    return RNN

In [None]:
RNN_model1 = create_RNN_model1()
RNN_history1 = RNN_model1.fit(feature_train1, label_train_y1, epochs=800, batch_size=64)

In [None]:
# Training accuracy per epoch, four-class RNN (only the training curve is recorded).
fig, ax = plt.subplots()
ax.plot(RNN_history1.history['accuracy'])
ax.set_title(' RNN Model accuracy with class=4')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['Train'], loc='upper left')
plt.show()

In [None]:
# Training loss per epoch, four-class RNN.
fig, ax = plt.subplots()
ax.plot(RNN_history1.history['loss'])
ax.set_title('RNN Model loss with class=4')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Loss'], loc='upper left')
plt.show()

In [None]:
# validation with the RNN
y_valid_RNN1 = RNN_model1.predict(feature_valid1)
# One-hot of the most probable class. argmax marks exactly one class per row,
# whereas np.where(row == row.max()) marked every tied class.
valid_y_RNN1 = np.zeros_like(y_valid_RNN1)
valid_y_RNN1[np.arange(len(y_valid_RNN1)), y_valid_RNN1.argmax(axis=1)] = 1

# print acc and report
print(accuracy_score(label_valid_y1,valid_y_RNN1))

print(classification_report(label_valid_y1,valid_y_RNN1))

## Prediction

In [None]:
# Predict the unlabelled funds with both three-class models and convert the
# probabilities to one-hot rows. argmax marks exactly one class per row,
# whereas np.where(row == row.max()) marked every tied class.
y_pred_CNN = CNN_model.predict(feature_test)
pred_y_CNN = np.zeros_like(y_pred_CNN)
pred_y_CNN[np.arange(len(y_pred_CNN)), y_pred_CNN.argmax(axis=1)] = 1

y_pred_RNN = RNN_model.predict(feature_test)
pred_y_RNN = np.zeros_like(y_pred_RNN)
pred_y_RNN[np.arange(len(y_pred_RNN)), y_pred_RNN.argmax(axis=1)] = 1
    


In [None]:
# Class index -> strategy name, the same mapping the if/elif chain used to apply.
class_names = [
    'Balanced Fund (Low Risk)',
    'Equity Long Only (Low Risk)',
    'Fixed Income Long Only (Low Risk)',
]
# The label Series is built directly from the argmax of each one-hot row. This
# avoids comparing tuples of arrays with == and avoids writing strings into a
# float Series of zeros.
cnn_pred_y = pd.Series([class_names[k] for k in np.argmax(pred_y_CNN, axis=1)])


cnn_predict = pd.DataFrame({'fund name':test_funds,'CNN prediction':cnn_pred_y})
print(cnn_predict)

In [None]:
# Class index -> strategy name, the same mapping the if/elif chain used to apply.
class_names = [
    'Balanced Fund (Low Risk)',
    'Equity Long Only (Low Risk)',
    'Fixed Income Long Only (Low Risk)',
]
# The label Series is built directly from the argmax of each one-hot row. This
# avoids comparing tuples of arrays with == and avoids writing strings into a
# float Series of zeros.
rnn_pred_y = pd.Series([class_names[k] for k in np.argmax(pred_y_RNN, axis=1)])


rnn_predict = pd.DataFrame({'fund name':test_funds,'RNN prediction':rnn_pred_y})
print(rnn_predict)