In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
from os import listdir
import nltk
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Concatenate, TimeDistributed, Dense
from tensorflow.keras.layers import Embedding, GRU
import gensim
import gensim.downloader as model_api
import sklearn.feature_extraction.text as text
from sklearn.decomposition import PCA
from tensorflow import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Input
from keras.callbacks import EarlyStopping
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import random

# 1. Sentiment analysis

Using the [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), we want to do a regression model that predict the ratings are on a 1-10 scale. You have an example train and test set in the `dataset` folder.

### 1.1 Regression Model

Use a feedforward neural network and NLP techniques we've seen up to now to train the best model you can on this dataset

### 1.2 RNN model

Train a RNN to do the sentiment analysis regression. The RNN should consist simply of an embedding layer (to make word IDs into word vectors) a recurrent blocks (GRU or LSTM) feeding into an output layer.

In [11]:
pos_train_files = listdir("data/train/pos")
neg_train_files = listdir("data/train/neg")

pos_test_files = listdir("data/test/pos")
neg_test_files = listdir("data/test/neg")



In [16]:
def get_reviews(target, rev, files): 
    x = []
    x_line = []

    for file in files:
        with open (f"data/{target}/{rev}/{file}", encoding="utf8") as opened_file:
            rating = file.split("_")[1].split(".")[0]

            for line in opened_file:
                x_line = []
                x_line.append(line)
                x_line.append(rating)
                x.append(x_line)
                
    return x

In [17]:
train_pos = pd.DataFrame(columns=["review", "rating"], data=get_reviews("train", "pos", pos_train_files))
train_neg = pd.DataFrame(columns=["review", "rating"], data=get_reviews("train", "neg", neg_train_files))

In [18]:
test_pos = pd.DataFrame(columns=["review", "rating"], data=get_reviews("test", "pos", pos_test_files))
test_neg = pd.DataFrame(columns=["review", "rating"], data=get_reviews("test", "neg", neg_test_files))

In [19]:
train_df = pd.concat([train_pos, train_neg], ignore_index=True)
test_df = pd.concat([test_pos, test_neg], ignore_index=True)

## 1.1 Regression Model
Use a feedforward neural network and NLP techniques we've seen up to now to train the best model you can on this dataset

In [20]:
sw = stopwords.words("english")
pca = PCA(n_components=1000)

In [21]:
df = train_df.sample(n=1000, random_state=42)
df = df.reset_index(drop=True)
df.rating = df.rating.astype("float")

In [22]:
df.review = df.review.apply(lambda t: " ".join([t for t in t.replace("<br />", "").lower().split(" ") if not t in sw]))

In [23]:
tf = text.TfidfVectorizer()
X = tf.fit_transform(df['review'])
X = X.toarray()

In [24]:
X = pca.fit_transform(X)

In [25]:
df["rev_tfidf"] = [x for x in X]

In [26]:
df.head()

Unnamed: 0,review,rating,rev_tfidf
0,panic streets richard widmark plays u.s. navy ...,8.0,"[-0.06693332477562777, 0.026501666720696388, 0..."
1,ask first one really better one. look sarah m....,1.0,"[0.054588756233047195, 0.007289225635799655, 0..."
2,big fan faerie tale theatre i've seen one best...,10.0,"[0.04245640578172975, -0.055399678198844185, 0..."
3,finished reading book dillinger. movie horribl...,1.0,"[0.03061162246109369, -0.045177533976975846, -..."
4,greg davis bryan daly take crazed statements t...,2.0,"[-0.07803153617788054, 0.0048380035415081105, ..."


In [27]:
loss_stopper = EarlyStopping(monitor="loss", patience=1)

In [28]:
model = Sequential()

model.add(Input(shape=X.shape[-1]))
model.add(Dropout(0.2))

model.add(Dense(50))
model.add(Dropout(0.2))

model.add(Dense(50))
model.add(Dropout(0.2))

model.add(Dense(1))

model.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])

In [29]:
from numpy.random import seed
seed(42)
from tensorflow.random import set_seed
set_seed(42)
model.fit(x=X, y=df.rating, batch_size=1, epochs=25, callbacks=[loss_stopper]);

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25


In [30]:
df_test = test_df.sample(n=1000, random_state=42)
df_test = df_test.reset_index(drop=True)
df_test.rating = df_test.rating.astype("float")

In [31]:
tf = text.TfidfVectorizer()
Xt = tf.fit_transform(df_test['review'])
Xt = Xt.toarray()

In [33]:
Xt = pca.fit_transform(Xt)

In [34]:
preds = model.predict(Xt)

In [35]:
preds = preds.flatten()

In [36]:
for i in range(len(preds)):
    preds[i] = round(preds[i])

In [37]:
accuracy_score(preds, df_test.rating.values)

0.085

## 1.2 RNN model
Train a RNN to do the sentiment analysis regression. The RNN should consist simply of an embedding layer (to make word IDs into word vectors) a recurrent blocks (GRU or LSTM) feeding into an output layer.

In [38]:
def get_tag(token):
    
    tags = []
    
    for tag in nltk.pos_tag(token):
        tags.append(tag[1])
    
    return tags

In [39]:
df = train_df.sample(n=1000, random_state=42)
df = df.reset_index(drop=True)
df.rating = df.rating.astype("float")

In [40]:
df["rev_token"] = df["review"].apply(lambda x: nltk.word_tokenize(x))
# df["rev_tag"] = df["rev_token"].apply(lambda x: get_tag(x))


In [41]:
df.head()

Unnamed: 0,review,rating,rev_token
0,In Panic In The Streets Richard Widmark plays ...,8.0,"[In, Panic, In, The, Streets, Richard, Widmark..."
1,If you ask me the first one was really better ...,1.0,"[If, you, ask, me, the, first, one, was, reall..."
2,I am a big fan a Faerie Tale Theatre and I've ...,10.0,"[I, am, a, big, fan, a, Faerie, Tale, Theatre,..."
3,I just finished reading a book about Dillinger...,1.0,"[I, just, finished, reading, a, book, about, D..."
4,Greg Davis and Bryan Daly take some crazed sta...,2.0,"[Greg, Davis, and, Bryan, Daly, take, some, cr..."


In [42]:
def make_lexicon(token_seqs, min_freq=1):
    token_counts = {}
    for seq in token_seqs:
        for token in seq:
            if token in token_counts:
                token_counts[token] += 1
            else:
                token_counts[token] = 1

    lexicon = [token for token, count in token_counts.items() if count >= min_freq]

    lexicon = {token:idx + 2 for idx,token in enumerate(lexicon)}
    lexicon[u'<UNK>'] = 1 
    lexicon_size = len(lexicon)

    return lexicon

rev_lexicon = make_lexicon(df['rev_token'])
# tag_lexicon = make_lexicon(df['rev_tag'])

In [43]:
def get_lexicon_lookup(lexicon):

    lexicon_lookup = {idx: lexicon_item for lexicon_item, idx in lexicon.items()}
    return lexicon_lookup

def tokens_to_idxs(token_seqs, lexicon):
    idx_seqs = [[lexicon[token] if token in lexicon else lexicon['<UNK>'] for token in token_seq] for token_seq in token_seqs]
    return idx_seqs

df['Sentence_Idxs'] = tokens_to_idxs(df['rev_token'], rev_lexicon)
# df['Tag_Idxs'] = tokens_to_idxs(df['rev_tag'], tag_lexicon)

# tags_lexicon_lookup = get_lexicon_lookup(tag_lexicon)

In [44]:
def pad_idx_seqs(idx_seqs, max_seq_len):
    padded_idxs = pad_sequences(sequences=idx_seqs, maxlen=max_seq_len)
    return padded_idxs

max_seq_len = max([len(idx_seq) for idx_seq in df['Sentence_Idxs']])

train_padded_words = pad_idx_seqs(df['Sentence_Idxs'], max_seq_len + 1)
# train_padded_tags = pad_idx_seqs(df['Tag_Idxs'], max_seq_len + 1)

In [45]:
def create_model(seq_input_len, n_input_nodes, n_embedding_nodes, n_hidden_nodes, stateful=False, batch_size=20):
    
    input_layer = Input(shape=(None,))
    
    #Layer 2
    embedding_layer = Embedding(input_dim=n_input_nodes,
                                output_dim=n_embedding_nodes,
                                mask_zero=True)(input_layer) 
    
    # Layer 3
    gru_layer = GRU(units=n_hidden_nodes)(embedding_layer)

    #Layer 4
    output_layer = Dense(units=1)(gru_layer)

    model = Model(inputs=[input_layer], outputs=output_layer)
    model.compile(loss="mean_squared_error", optimizer='adam')
    
    return model

In [46]:
model = create_model(seq_input_len=train_padded_words.shape[-1] - 1,
                     n_input_nodes=len(rev_lexicon) + 1,
                     n_embedding_nodes=300,
                     n_hidden_nodes=500)

In [47]:
model.fit(x=train_padded_words[:,1:], y=df.rating, batch_size=20, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2040ba094f0>

In [48]:
test_df = pd.concat([test_pos, test_neg], ignore_index=True)

In [49]:
test_df = test_df.sample(n=1000, random_state=42)
test_df = test_df.reset_index(drop=True)
test_df["rev_token"] = test_df["review"].apply(lambda x: nltk.word_tokenize(x))
# test_df["rev_tag"] = test_df["rev_token"].apply(lambda x: get_tag(x))

In [50]:
test_rev_lexicon = make_lexicon(test_df['rev_token'])
# test_tag_lexicon = make_lexicon(test_df['rev_tag'])

# test_tags_lexicon_lookup = get_lexicon_lookup(test_tag_lexicon)

test_df['Sentence_Idxs'] = tokens_to_idxs(test_df['rev_token'], test_rev_lexicon)
# test_df['Tag_Idxs'] = tokens_to_idxs(test_df['rev_tag'], test_tag_lexicon)

In [51]:
max_seq_len = max([len(idx_seq) for idx_seq in test_df['Sentence_Idxs']])

test_padded_words = pad_idx_seqs(test_df['Sentence_Idxs'], max_seq_len + 1)

In [52]:
preds = model.predict(test_padded_words[:,1:])

In [53]:
preds = preds.flatten()
for i in range(len(preds)):
    preds[i] = round(preds[i])

In [54]:
accuracy_score(preds, test_df.rating)

0.0

# 2. (evil) XOR Problem

Train an LSTM to solve the XOR problem: that is, given a sequence of bits, determine its parity. The LSTM should consume the sequence, one bit at a time, and then output the correct answer at the sequence’s end. Test the two approaches below:

### 2.1 

Generate a dataset of random <=100,000 binary strings of equal length <= 50. Train the LSTM; what is the maximum length you can train up to with precisison?


In [60]:
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Dense, Input, LSTM
from tensorflow.keras.models import Sequential
import numpy as np
import random

In [55]:
SEQ_LEN = 50
COUNT = 100000

In [56]:
bin_pair = lambda x: [x, not(x)]
training = np.array([[bin_pair(random.choice([0, 1])) for _ in range(SEQ_LEN)] for _ in range(COUNT)])
target = np.array([[bin_pair(x) for x in np.cumsum(example[:,0]) % 2] for example in training])

In [61]:
model = Sequential()

model.add(Input(shape=(SEQ_LEN, 2), dtype='float32'))
model.add(LSTM(1, return_sequences=True))
model.add(Dense(2, activation='softmax'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [62]:
model.fit(training, target, epochs=10, batch_size=128);

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [63]:
predictions = model.predict(training)
i = random.randint(0, COUNT)
chance = predictions[i,-1,0]
print('randomly selected sequence:', training[i,:,0])
print('prediction:', int(chance > 0.5))
print('confidence: {:0.2f}%'.format((chance if chance > 0.5 else 1 - chance) * 100))
print('actual:', np.sum(training[i,:,0]) % 2)

randomly selected sequence: [0 1 0 0 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 1 0 1 0 1 0 1 1 1 1 0 0 1
 1 0 1 0 0 0 1 1 1 0 1 1 1]
prediction: 0
confidence: 99.22%
actual: 0


### 2.2

Generate a dataset of random <=200,000 binary strings, where the length of each string is independently and randomly chosen between 1 and 50. Train the LSTM. Does it succeed? What explains the difference?


In [64]:
SEQ_LEN = 50
COUNT = 200000

In [65]:
bin_pair = lambda x: [x, not(x)]
training = np.array([[bin_pair(random.choice([0, 1])) for _ in range(SEQ_LEN)] for _ in range(COUNT)])
target = np.array([[bin_pair(x) for x in np.cumsum(example[:,0]) % 2] for example in training])

In [66]:
model = Sequential()

model.add(Input(shape=(SEQ_LEN, 2), dtype='float32'))
model.add(LSTM(1, return_sequences=True))
model.add(Dense(2, activation='softmax'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [67]:
model.fit(training, target, epochs=10, batch_size=128);

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [68]:
predictions = model.predict(training)
i = random.randint(0, COUNT)
chance = predictions[i,-1,0]
print('randomly selected sequence:', training[i,:,0])
print('prediction:', int(chance > 0.5))
print('confidence: {:0.2f}%'.format((chance if chance > 0.5 else 1 - chance) * 100))
print('actual:', np.sum(training[i,:,0]) % 2)

randomly selected sequence: [0 0 0 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 1 1 0 0 0 1 1 0 0 0 0 1 1 1 0 0 0
 1 1 0 1 0 0 0 0 1 1 1 0 0]
prediction: 0
confidence: 100.00%
actual: 0
