**Classification**

Now that the fake reviews have been generated using the re-trained gpt2 model, its time to prepare the datasets for final use within the classification model; this is done through standardizing the data and then tokenisation. 

In [1]:
import os
import numpy as np
from tensorflow import keras
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
os.environ["TFHUB_CACHE_DIR"] = '/tmp/tfhub'

#Load datasets
fake_reviews = pd.read_csv('reviews_generated.csv',usecols=['text'])
real_reviews = pd.read_csv('bigreviews.csv',usecols=['text'])

#Add new column indicating real or fake
#real = 1 / fake = 0
real_reviews['real'] = 1
fake_reviews['real'] = 0


*Rate of 33% of reviews suspected to be fake, so dataset will be made with this concept in mind*

Generated = 7252 

Real = 14508

Total = 21756

In [60]:
#Select the last 21756 reviews
real_reviews = real_reviews.tail(14504)

In [61]:
#Join databases
full_reviews = pd.concat([real_reviews, fake_reviews], ignore_index=True)
full_reviews.to_csv('full_reviews.csv', index=False)

In [62]:
print(full_reviews)

                                                    text  real
0      My MacBook Pro retina was failing do the stupi...     1
1      My boyfriend and I found this place doing a lo...     1
2      Hubby and I decided to try.  Never been to Ger...     1
3      Ok so this is really Aneu! I really don't know...     1
4      Finally got to try Smee's recently.  I like th...     1
...                                                  ...   ...
21751  we were looking for a place to eat and we foun...     0
21752  second time here.  the food is good, but the s...     0
21753  tucked on 76, it's a great place to go to for ...     0
21754  these hand grenades are the best! \n\nthe staf...     0
21755  this is totallly a great place to go for a cas...     0

[21756 rows x 2 columns]


*Clean the dataset*

In [3]:
full_reviews = pd.read_csv('full_reviews.csv')

In [6]:
#Standardization and spell check
import itertools
import re
from autocorrect import Speller
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split


def correct_text(text):
    #One letter in a word should not be present more than twice in continuation
    text_correction = ''.join(''.join(s)[:3] for _, s in itertools.groupby(text))
    #Apply autocorrection to the corrected text
    spell = Speller(lang='en')
    ans = spell(text_correction)
    return ans


def standardize_text(text):
    #Remove unicode characters
    text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
    #Turn to lower case
    text = text.lower()
    #Remove numbers
    text = re.sub(r'\d+', '', text)
    #Remove punctuation
    text = re.sub("[^-9A-Za-z ]", "" , text)
    #Remove double spaces
    text = re.sub('\s{2,}', ' ', text)
    return text


def remove_spacing(text):
    #Remove all spaces and replace them with a single space
    text = re.sub(r'\s+', ' ', text)
    
    #Remove spaces before and after punctuation
    text = re.sub(r'\s+([.,!?])', r'\1', text)
    text = re.sub(r'([.,!?])\s+', r'\1', text)
    
    return text.strip()

#Implement lemmatization, group words by root stem but keep the different tenses 
lemmatizer = WordNetLemmatizer()

def lemm_text(text):
    ans = lemmatizer.lemmatize(text)
    return ans

In [64]:
full_reviews['text'] = full_reviews['text'].apply(correct_text)
full_reviews['text'] = full_reviews['text'].apply(standardize_text)
full_reviews['text'] = full_reviews['text'].apply(lemm_text)
full_reviews['text'] = full_reviews['text'].apply(remove_spacing)

full_reviews = shuffle(full_reviews)

full_reviews.to_csv('full_reviews_cleaned.csv', index=False)

In [3]:
full_reviews = pd.read_csv('full_reviews_cleaned.csv')

In [15]:
#Prepare dataset for model use 
#Get the max token from data to be used in model
max_length = 0
for row in full_reviews['text']:
    if len(row.split(" ")) > max_length:
        max_length = len(row.split(" "))

print(max_length)

871


In [4]:
mini_review_batch = full_reviews[0:5000]

In [7]:
#Create datasets of predictors and labels
predictors_mini = mini_review_batch['text'].values 
labels_mini = mini_review_batch['real'].values

#Split the data into test / train 70%/30%
x_train, x_test, y_train, y_test = train_test_split(predictors_mini, labels_mini, test_size=0.3, shuffle=False)

In [8]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(3500,)
(3500,)
(1500,)
(1500,)


In [5]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()

#Reshape the outputs for use in models
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

#Apply the one hot encode
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [13]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(3500,)
(3500,)
(1500,)
(1500,)


*Create Embeddings*

In [11]:
# Load Pretrained Word2Vec
import tensorflow_hub as hub
embed = hub.load('Embed')

In [12]:
def get_word2vec_enc(reviews):
    #store the encoded reviews
    encoded_reviews = []
    #Iterate through the list of reviews
    for review in reviews:
        #split review
        tokens = review.split(" ")
        word2vec_embedding = embed(tokens)
        encoded_reviews.append(word2vec_embedding)
    return encoded_reviews

def get_padded_encoded_reviews(encoded_reviews):
    #Pad the reviews
    padded_reviews_encoding = []
    
    for enc_review in encoded_reviews:
        #Calculate the number of zeros to pad the review with
        zero_padding_cnt = max_length - enc_review.shape[0]
        pad = np.zeros((1, 500))
        #Iterate over the number of zeros to pad
        for i in range(zero_padding_cnt):
            #Connect the pad array and the encoded review along the 0th axis
            enc_review = np.concatenate((pad, enc_review), axis=0)
        padded_reviews_encoding.append(enc_review)
    return padded_reviews_encoding


def label_encode(label):
    # Encode the label as a one-hot encoding
    encoded_label = np.where(label == 1, [0, 1], [1, 0])
    return encoded_label

In [22]:
#Encode the reviews
x_train_padded_encoded = get_padded_encoded_reviews(get_word2vec_enc(x_train))
x_test_padded_encoded = get_padded_encoded_reviews(get_word2vec_enc(x_test))

In [23]:
#Encode the labels
encoded_train_label = [label_encode(label) for label in y_train]
encoded_test_label = [label_encode(label) for label in y_test]

In [26]:
#Turn values into numpy arrays so can be used in model
train_data = np.array(x_train_padded_encoded)
train_label = np.array(encoded_train_label)

test_data = np.array(x_test_padded_encoded)
test_label = np.array(encoded_test_label)
#Save locally
np.save('train_data', train_data)
np.save('train_label', train_label)
np.save('test_data', test_data)
np.save('test_label', test_label)

In [28]:
train_data_mini = train_data[0:1000]
train_label_mini = train_label[0:1000]

In [30]:
print(train_data_mini.shape)
print(train_label_mini.shape)

(1000, 871, 500)
(1000, 2)


In [27]:
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.layers import Bidirectional

In [29]:
# Define the model's architecture
def build_model():
  model = tf.keras.models.Sequential()
  model.add(tf.keras.layers.Conv1D(128, (9),activation='relu'))
  model.add(tf.keras.layers.Dropout(0.5))
  model.add(tf.keras.layers.MaxPooling1D((2)))
  model.add(tf.keras.layers.Dropout(0.5))
  model.add(tf.keras.layers.Conv1D(128, (7),activation='relu'))
  model.add(tf.keras.layers.Dropout(0.5))
  model.add(tf.keras.layers.MaxPooling1D((2)))
  model.add(tf.keras.layers.Dropout(0.5))
  model.add(tf.keras.layers.Conv1D(128, (5),activation='relu'))
  model.add(tf.keras.layers.Flatten())
  model.add(tf.keras.layers.Dense(2, activation='sigmoid'))
  return model

model = build_model()

# Compile the model
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [31]:
train_data = train_data.reshape(train_data.shape[0], 871, 500)
history = model.fit(
    train_data, 
    train_label, 
    epochs = 10,
    batch_size = 32,
    validation_split=0.3,
) 

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.