# Assignment 3, due May 4, 10pm


## In this assignment you will classify a collection of Yelp reviews as positive or negative.

## You will be using 1D CNNs (word n-grams) for document classification

## The models for this assignment are taken from 
### https://realpython.com/python-keras-text-classification/

## You can use this site to learn more about the models used in this assignment

In [None]:
import random
import pandas as pd
import numpy as np
import math

In [None]:
# Load the labelled Yelp sentences into a two-column DataFrame.
# 'yelp_labelled.txt' is looked up relative to the working directory, so keep
# it in the same directory as this assignment file.  The file is
# tab-separated; because the column names are supplied here, its first line
# is read as data rather than as a header.

df = pd.read_csv(
    'yelp_labelled.txt',
    sep='\t',
    names=['sentence', 'label'],
)
print(df.iloc[0])

In [None]:
# Keep only the rows that actually carry a label; a missing label means the
# sentence is marked neither positive nor negative.

df = df.dropna(subset=['label'])
df.head()

In [None]:
# Sanity check: the cleaned data set used in the experiments
# should contain exactly 1000 data points.

df.shape[0]

In [None]:
from sklearn.model_selection import train_test_split

# Raw inputs (review text) and targets (labels) as arrays.
sentences = df['sentence'].values
y = df['label'].values

# Hold out 25% of the data for testing; the fixed random_state makes the
# split reproducible from run to run.
(sentences_train, sentences_test,
 y_train, y_test) = train_test_split(
    sentences,
    y,
    random_state=1000,
    test_size=0.25,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: learn the vocabulary from the training sentences
# only, then count word occurrences in both splits using that vocabulary.

vectorizer = CountVectorizer().fit(sentences_train)

X_train, X_test = map(vectorizer.transform, (sentences_train, sentences_test))

X_train

In [None]:
# Assignment 3. Question 1.
# Look at the output above and explain the size of the matrix
print("Your answer is here")

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression on the bag-of-words counts, scored by its
# mean accuracy on the held-out test split.
classifier = LogisticRegression().fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

In [None]:
from keras.models import Sequential
from keras import layers

input_dim = X_train.shape[1]  # Number of features, i.e. the column count of X_train


In [None]:
# function for plotting the results

import matplotlib.pyplot as plt
# Apply matplotlib's 'ggplot' style sheet to figures created after this call.
plt.style.use('ggplot')

def plot_history(history):
    """Plot training vs. validation accuracy and loss, one panel each.

    Parameters
    ----------
    history : object
        Anything exposing a ``history`` dict of per-epoch metric lists, such
        as the value returned by ``model.fit``.  The dict must contain
        ``'loss'`` and ``'val_loss'`` plus an accuracy pair named either
        ``'accuracy'``/``'val_accuracy'`` or ``'acc'``/``'val_acc'``.

    Raises
    ------
    KeyError
        If a required series is missing from ``history.history``.
    """
    metrics = history.history

    # Keras reports a metric under the exact name given to ``compile``:
    # ``metrics=['accuracy']`` yields 'accuracy'/'val_accuracy' on
    # Keras >= 2.3, whereas older releases shortened it to 'acc'/'val_acc'.
    # Accept both so the plot works regardless of the installed version.
    acc_key = 'accuracy' if 'accuracy' in metrics else 'acc'
    acc = metrics[acc_key]
    val_acc = metrics['val_' + acc_key]
    loss = metrics['loss']
    val_loss = metrics['val_loss']
    x = range(1, len(acc) + 1)  # epochs are numbered starting at 1

    plt.figure(figsize=(12, 5))

    # Left panel: accuracy curves.
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()

    # Right panel: loss curves.
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

In [None]:
from keras.preprocessing.text import Tokenizer

# Turn every sentence into a list of integer word indices.  The word index
# is built from the training sentences only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

X_train, X_test = map(tokenizer.texts_to_sequences,
                      (sentences_train, sentences_test))

# Index 0 is reserved and never assigned to a word, hence the extra slot.
vocab_size = 1 + len(tokenizer.word_index)

# Show one training sentence followed by its encoded form.
print(sentences_train[2], X_train[2], sep='\n')

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Bring every encoded sentence to a fixed length of `maxlen` indices;
# padding='post' places the fill values after the sentence's own tokens.
maxlen = 100

X_train, X_test = (
    pad_sequences(encoded, maxlen=maxlen, padding='post')
    for encoded in (X_train, X_test)
)


In [None]:
from keras.models import Sequential
from keras import layers

In [None]:
embedding_dim = 100  # second argument of the Embedding layer below (size of each learned vector)

# Assignment 3. Question 2.
# Read about the Embedding Keras layer (not the same as the word embeddings!!)
# Experiment with different embedding parameters. 
# report the results

# Assignment 3. Question 3.
# Read about the Keras CNN. It has several parameters, including the kernel_size (the convolution 
# window size).
# Experiment with different CNN parameters. 
# report the results

# Assignment 3. Question 4.
# Experiment with other hyperparameters: maxlen of sentences, vocabulary size, number of filters for the 
# convolutional network.
# report the results

# 1D-CNN text classifier: embed the word indices, slide 512 convolution
# filters over windows of 4 consecutive positions, keep each filter's
# maximum over the sentence, then classify with a small dense head whose
# sigmoid output pairs with the binary cross-entropy loss.
model = Sequential([
    layers.Embedding(input_dim=vocab_size,
                     output_dim=embedding_dim,
                     input_length=maxlen),
    layers.Conv1D(filters=512, kernel_size=4, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(units=10, activation='relu'),
    layers.Dense(units=1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 10 epochs in mini-batches of 10.  The test split is passed as
# validation data, so the validation curves track test-set performance.
history = model.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=10,
    validation_data=(X_test, y_test),
    verbose=False,
)

# Report the final accuracy on each split, then plot the learning curves.
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print(f"Training Accuracy: {accuracy:.4f}")
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print(f"Testing Accuracy:  {accuracy:.4f}")

plot_history(history)

In [None]:
# Assignment 3. Optional question.
# Use the CNN model from this assignment to classify the movie reviews from assignment 2.