<a href="https://colab.research.google.com/github/workhardzy/K6312/blob/main/testCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive; the dataset and checkpoint paths used below live under this mount point.
from google.colab import drive

GDRIVE_MOUNT_POINT = "/content/gdrive"
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import time

from keras import layers
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from keras.callbacks import ModelCheckpoint


import matplotlib.pyplot as plt

In [None]:
# Read the IMDB review CSV from the mounted Drive folder into a DataFrame.
path = '/content/gdrive/My Drive/covid_mining/aclImdb/IMDB Dataset.csv' # dataset downloaded from Kaggle
df = pd.read_csv(path,encoding='UTF-8')
# Rename the columns. This assignment only succeeds when the CSV has exactly two columns
# (presumably review text followed by sentiment -- TODO confirm against the file).
df.columns = ['Text','Label']

def recode_label(label):
    """Map a sentiment label to a binary target.

    Args:
        label: raw label value; only the exact string 'positive' counts as positive.

    Returns:
        1 when ``label == 'positive'``, otherwise 0 (including for any other
        string, None or NaN).
    """
    return 1 if label == 'positive' else 0

# Encode the sentiment column as integers via recode_label: 1 for 'positive', 0 for anything else.
df['Label'] = df['Label'].apply(recode_label)

# Keep only the two columns used for modelling.
df = df[['Text','Label']]

# X: review texts as plain Python strings; missing texts become ''.
X = df['Text'].fillna('').astype(str).tolist()

# y: integer targets. recode_label never yields a missing value, so no fillna is needed here
# (filling with '' would only risk mixing strings into the numeric labels).
y = df['Label'].tolist()

In [None]:
RANDOM_STATE = 42

# Split train & test: half of the samples are held out.
text_train, text_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=RANDOM_STATE
)

# Fit the tokenizer on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_train)
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

# Transform each text into a sequence of integer word indices.
X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (text_train, text_test)
)

maxlen = max(map(len, X_train))  # longest text in train set

# Add padding (at the end of each sequence) to ensure all vectors have same dimensionality.
X_train, X_test = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (X_train, X_test)
)

In [None]:
# Convert both label lists to NumPy arrays before they are passed to model.fit / model.evaluate.
y_train, y_test = (np.array(labels) for labels in (y_train, y_test))

In [None]:
# Define CNN architecture:
# Embedding -> Conv1D -> global max pooling -> Dense(10, relu) -> Dense(1, sigmoid)

embedding_dim = 100

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(filters=128, kernel_size=7, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print() only appends a stray "None".
model.summary()

# Save the model whenever the monitored quantity improves. The deprecated `period`
# argument is intentionally omitted: checking once per epoch is already the default.
# NOTE(review): monitor='loss' selects the checkpoint on *training* loss, which mostly
# just keeps the latest epoch -- confirm whether 'val_loss' was intended.
checkpoint = ModelCheckpoint("/content/gdrive/My Drive/covid_mining/best_model.hdf5", monitor='loss', verbose=1,
    save_best_only=True, mode='auto')

# Fit model (the test split doubles as validation data here)
history = model.fit(X_train, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10,
                    callbacks=[checkpoint])

# Report final accuracy on both splits.
loss, accuracy = model.evaluate(X_train, y_train, verbose=True)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2493, 100)         9068200   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2487, 128)         89728     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1290      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 9,159,229
Trainable params: 9,159,229
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5
Epoch 00001: loss improved from inf to 0.33338, saving model to /content/gdrive/My Drive/covid_

KeyboardInterrupt: ignored

In [None]:
# Apply matplotlib's 'ggplot' style sheet to the figures drawn below.
plt.style.use('ggplot')

def plot_history(history):
    """Draw training vs. validation curves for accuracy and loss side by side.

    Args:
        history: object whose ``history`` attribute is a dict holding the
            per-epoch sequences 'accuracy', 'val_accuracy', 'loss' and
            'val_loss' (e.g. the value returned by ``model.fit``).
    """
    recorded = history.history
    # Look up every series before any drawing starts.
    panels = [
        (metric, recorded[metric], recorded['val_' + metric])
        for metric in ('accuracy', 'loss')
    ]
    # Epochs are numbered from 1; the count comes from the accuracy series.
    epochs = range(1, len(panels[0][1]) + 1)

    plt.figure(figsize=(12, 5))
    for position, (metric, train_values, val_values) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_values, 'b', label=f'Training {metric}')
        plt.plot(epochs, val_values, 'r', label=f'Validation {metric}')
        plt.title(f'Training and validation {metric}')
        plt.legend()

# Render figures inline in the notebook output, then plot the curves recorded by model.fit.
%matplotlib inline
plot_history(history)

In [None]:
%%time

# Encode one sample sentence exactly like the training data: token indices, then post-padding to maxlen.
X_sample = ['this is a sample text']
X_sample = tokenizer.texts_to_sequences(X_sample)
X_sample = pad_sequences(X_sample, padding='post', maxlen=maxlen)

# predict_classes / predict_proba are no longer available on recent Keras models.
# predict() returns the sigmoid probabilities; thresholding at 0.5 gives the class.
proba_sample = model.predict(X_sample)
y_sample = (proba_sample > 0.5).astype('int32').flatten().tolist()

print('Prediction: ',y_sample)
print(proba_sample.flatten().tolist())

In [None]:
# Encode a one-word sample exactly like the training data: token indices, then post-padding to maxlen.
X_sample = ['Amazing']
X_sample = tokenizer.texts_to_sequences(X_sample)
X_sample = pad_sequences(X_sample, padding='post', maxlen=maxlen)

# predict_classes / predict_proba are no longer available on recent Keras models.
# predict() returns the sigmoid probabilities; thresholding at 0.5 gives the class.
proba_sample = model.predict(X_sample)
y_sample = (proba_sample > 0.5).astype('int32').flatten().tolist()

print('Prediction: ',y_sample)
print(proba_sample.flatten().tolist())

In [None]:
RANDOM_STATE = 84

# Split train & test again with a different seed.
# NOTE(review): `model` was fitted on the RANDOM_STATE = 42 training split, so this
# differently-seeded test split overlaps data the model has already seen; metrics
# computed on it will be optimistic. Confirm that this re-split is intended.
text_train, text_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=RANDOM_STATE)


# Reuse the already-fitted tokenizer so word indices match the trained embedding.
X_train = tokenizer.texts_to_sequences(text_train)
X_test = tokenizer.texts_to_sequences(text_test)

y_train = np.array(y_train)
y_test = np.array(y_test)

# vocab_size and maxlen are deliberately NOT recomputed here: the tokenizer is unchanged,
# and the model was built with input_length=maxlen from the original split. Recomputing
# maxlen from this split could produce inputs whose length no longer matches the model.

# Add padding to ensure all vectors have the dimensionality the model was built with
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [None]:
def pred_class(text):
    """Predict the binary class of a single raw text.

    The text is converted with the notebook-level ``tokenizer``, post-padded to
    ``maxlen`` and scored by ``model``.

    Args:
        text: one raw input string.

    Returns:
        int: 1 if the predicted probability is greater than 0.5, otherwise 0.
    """
    sequences = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequences, padding='post', maxlen=maxlen)

    # predict() returns an array even for a single input; pull the scalar out explicitly
    # instead of relying on int() to convert a multi-dimensional array.
    # verbose=0 avoids one progress bar per call when this runs in a loop.
    probability = float(model.predict(padded, verbose=0).ravel()[0])
    return int(probability > 0.5)


In [None]:
from tqdm.notebook import tqdm
# X_test already holds the tokenised, padded form of text_test, so score it in one batched
# predict call instead of invoking the model once per text; the 0.5 threshold matches pred_class.
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel().tolist()

In [None]:
from sklearn.metrics import classification_report
# Text report comparing the true test labels with the predictions.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Load the saved model checkpoint from disk

In [None]:
from keras.models import load_model

# Restore the checkpoint file written by the ModelCheckpoint callback during training.
BEST_MODEL_PATH = '/content/gdrive/My Drive/covid_mining/best_model.hdf5'
model_load = load_model(BEST_MODEL_PATH)

In [None]:
# Summarise the model restored from disk (`model_load`) rather than the in-memory training
# `model`, which does not exist when only the loading cells have been run.
model_load.summary()