In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
import zipfile

file_path = "/content/drive/MyDrive/Deep_Learning/PyTorch_Vol4/Model_NLP/trainingandtestdata.zip"
zip_file = zipfile.ZipFile(file_path,mode = "r")
zip_file.extractall("./")
zip_file.close()

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
try:
    %tensorflow_version 2.x
except:
    pass

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

# Data Preprocessing

In [118]:
cols = ["sentiment","id","date","query","user","text"]

train_data = pd.read_csv(
    "/content/training.1600000.processed.noemoticon.csv",
    header = None,
    names = cols,
    engine = "python",
    encoding = "latin1"
)

test_data = pd.read_csv(
    "/content/testdata.manual.2009.06.14.csv",
    header = None,
    names = cols,
    engine = "python",
    encoding = "latin1"
)

In [127]:
train_4 = train_data[train_data["sentiment"] == 4]
train_1 = train_data[train_data["sentiment"] == 0]

train_4 = train_4.iloc[:train_4.shape[0]//1000,:]
train_1 = train_1.iloc[:train_1.shape[0]//1000,:]

data = train_4.append(train_1)
data = data.reset_index(drop = True)
data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,4,1467822272,Mon Apr 06 22:22:45 PDT 2009,NO_QUERY,ersle,I LOVE @Health4UandPets u guys r the best!!
1,4,1467822273,Mon Apr 06 22:22:45 PDT 2009,NO_QUERY,becca210,im meeting up with one of my besties tonight! ...
2,4,1467822283,Mon Apr 06 22:22:46 PDT 2009,NO_QUERY,Wingman29,"@DaRealSunisaKim Thanks for the Twitter add, S..."
3,4,1467822287,Mon Apr 06 22:22:46 PDT 2009,NO_QUERY,katarinka,Being sick can be really cheap when it hurts t...
4,4,1467822293,Mon Apr 06 22:22:46 PDT 2009,NO_QUERY,_EmilyYoung,@LovesBrooklyn2 he has that effect on everyone


In [62]:
data.sentiment.value_counts()

4    800
0    800
Name: sentiment, dtype: int64

In [63]:
data.drop(["id","date","query","user"],
            axis = 1,
            inplace = True)

In [64]:
data.head(1)

Unnamed: 0,sentiment,text
800000,4,I LOVE @Health4UandPets u guys r the best!!


In [65]:
def clean(text):
    text = BeautifulSoup(text,"lxml").get_text()
    text = re.sub(r"@[A-Za-z0-9]+", " ",text)
    text = re.sub(r"https?:://[A-Za-z0-9./]+"," ",text)
    text = re.sub(r"[^a-zA-Z.!?']",' ',text)
    text = re.sub(r" +"," ",text)
    return text

In [66]:
data_clean = [clean(text) for text in data.text]
data_clean[0]

'I LOVE u guys r the best!! '

In [67]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

set(data_labels)

{0, 1}

In [68]:
len(data_clean),len(data_labels)

(1600, 1600)

In [82]:
# Tokenizer 
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(data_clean)
data_inputs = tokenizer.texts_to_sequences(data_clean)
data_inputs[0],len(data_inputs)

([1, 40, 65, 225, 204, 3, 168], 1600)

In [84]:
# Padding
MAX_LEN = max([len(sentence) for sentence in data_inputs])

# Add extra
MAX_LEN +=10
data_pad = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,maxlen = MAX_LEN,padding = "post")
data_pad[0]

array([  1,  40,  65, 225, 204,   3, 168,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0], dtype=int32)

In [276]:
# Spliting into training/testing set
test_idx = np.random.randint(0,len(data_pad) // 2 ,int(len(data_pad)*0.20))

# We add half of data length because we select 0 and 1 value equal pair , len(data_pad) // 2 -- > 800
test_idx = np.concatenate((test_idx,test_idx+len(data_pad) // 2))

# Select Max Number I want to see . Because This value not is greater than len(data_pad)
print("Max Number Index : ", max(test_idx))

Max Number Index :  1599


In [278]:
# Test Data Length
print("Test Data Length : " ,len(test_idx))
print("Train Data Length : ", len(data_pad) - len(test_idx))

Test Data Length :  640
Train Data Length :  960


In [218]:
test_inputs = data_pad[test_idx]
test_labels = data_labels[test_idx]

train_inputs = np.delete(data_pad,test_idx,axis = 0)
train_labels = np.delete(data_labels,test_idx)

In [221]:
# Model

class DCNN(tf.keras.Model):

    def __init__(self,
                 vocab_size,
                 emb_dim = 128,
                 nb_filters = 50,
                 FFN_units = 512,
                 nb_classes = 2,
                 dropout_rate = 0.1,
                 training = False,
                 name = "dcnn"):
        super(DCNN,self).__init__(name = name)

        self.embedding = tf.keras.layers.Embedding(vocab_size,
                                                   emb_dim)
        
        self.bigram = tf.keras.layers.Conv1D(filters=nb_filters,
                                             kernel_size = 2,
                                             padding = "valid",
                                             activation = "relu")
        self.pool_1 = tf.keras.layers.GlobalMaxPool1D()

        self.trigram = tf.keras.layers.Conv1D(filters=nb_filters,
                                             kernel_size = 3,
                                             padding = "valid",
                                             activation = "relu")
        self.pool_2 = tf.keras.layers.GlobalMaxPool1D() 

        self.fourgram = tf.keras.layers.Conv1D(filters=nb_filters,
                                             kernel_size = 4,
                                             padding = "valid",
                                             activation = "relu")
        self.pool_3 = tf.keras.layers.GlobalMaxPool1D() 

        self.dense_1 = tf.keras.layers.Dense(units = FFN_units , activation = "relu")
        self.dropout = tf.keras.layers.Dropout(rate = dropout_rate)

        if nb_classes == 2:
            self.last_dense = tf.keras.layers.Dense(units = 1,activation = "sigmoid")
        else:
            self.last_dense = tf.keras.layers.Dense(units = nb_classes,
                                                    activation = "softmax")
            
    
    def call(self,inputs,training):

        x = self.embedding(inputs)

        x_1 = self.bigram(x)
        x_1 = self.pool_1(x_1)

        x_2 = self.trigram(x)
        x_2 = self.pool_2(x_2)

        x_3 = self.fourgram(x)
        x_3 = self.pool_3(x_3)

        merged = tf.concat([x_1,x_2,x_3],axis = -1) # (batch_size,3*nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged,training)
        output = self.last_dense(merged)

        return output       

In [295]:
# Config

VOCAB_SIZE = len(set(tokenizer.word_counts)) + 1
print("vocab_size : " , VOCAB_SIZE)

EMD_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = len(set(train_labels))

DROPOUT_RATE = 0.2
BATCH_SIZE = 32
NB_EPOCHS = 5

vocab_size :  4284


In [296]:
# Training
Dcnn = DCNN(vocab_size = VOCAB_SIZE,
            emb_dim = EMD_DIM,
            nb_filters = NB_FILTERS,
            FFN_units = FFN_UNITS,
            nb_classes = NB_CLASSES,
            dropout_rate = DROPOUT_RATE,
            training = False,
            name = "dcnn")

In [303]:
if NB_CLASSES == 2:
    Dcnn.compile(loss = "binary_crossentropy",
                 metrics = [tf.keras.metrics.Accuracy()],
                 optimizer = tf.keras.optimizers.Adam(lr = 1e-4))
else:
    Dcnn.compile(loss = "sparse_categorical_crossentropy",
                 metrics = [tf.keras.metrics.SparseCategoricalAccuracy()],
                 optimizer = tf.keras.optimizers.Adam(lr = 1e-4))

  super(Adam, self).__init__(name, **kwargs)


In [298]:
# Her bir kelimenin karşılığı 200 vector denk geliyor !!!!
x = Dcnn.embedding(train_inputs)[:0]

bigram = Dcnn.bigram(x)
bigram_p1 = Dcnn.pool_1(bigram)

trigram = Dcnn.trigram(x)
trigram_p2 = Dcnn.pool_2(trigram)

fourgram = Dcnn.fourgram(x)
fourgram_p3 = Dcnn.pool_3(fourgram)

concat = tf.concat([bigram_p1,trigram_p2,fourgram_p3],axis = -1)

dense = Dcnn.dense_1(concat)
last_dense = Dcnn.last_dense(Dcnn.dropout(dense))


print("X Shape : ",x.shape)
print()
print("bigram Shape : ",bigram.shape)
print("bigram_p1 Shape : ",bigram_p1.shape)
print()
print("trigram Shape : ",trigram.shape)
print("trigram_p2 Shape : ",trigram_p2.shape)
print()
print("fourgram Shape : ",fourgram.shape)
print("fourgram_p3 Shape : ",fourgram_p3.shape)
print()

# Her bir vectorun karşılık gelen uzunluğundan max bulup devam ediyor !!
print("Concat Shape : ", concat.shape)
print()
print("Dense Shape : ", dense.shape)
print()
print("Last Dense Shape : ", last_dense.shape)

X Shape :  (0, 41, 200)

bigram Shape :  (0, 40, 100)
bigram_p1 Shape :  (0, 100)

trigram Shape :  (0, 39, 100)
trigram_p2 Shape :  (0, 100)

fourgram Shape :  (0, 38, 100)
fourgram_p3 Shape :  (0, 100)

Concat Shape :  (0, 300)

Dense Shape :  (0, 256)

Last Dense Shape :  (0, 1)


In [299]:
checkpoint_path = "./"
ckpt = tf.train.Checkpoint(Dcnn = Dcnn)
ckpt_manager= tf.train.CheckpointManager(ckpt,checkpoint_path,max_to_keep = 1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!")

In [304]:
Dcnn.fit(train_inputs,
         train_labels,
         batch_size = BATCH_SIZE,
         epochs = NB_EPOCHS
         )
ckpt_manager.save()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


'./ckpt-1'

In [307]:
# Evaluation 

results = Dcnn.evaluate(test_inputs,test_labels,batch_size = BATCH_SIZE)
print(results)

[0.6832962036132812, 0.0]


In [332]:
def predict(text):
    seq = tokenizer.texts_to_sequences([text])
    pad = tf.keras.preprocessing.sequence.pad_sequences(seq,maxlen=MAX_LEN,padding = "post")

    return Dcnn(pad,training = False).numpy()[0][0]

predict("You are so fucking bad this job")
predict("You are so funny")

0.517354