# Final Combined Model

**Description:** Polarity + Stress + Sarcasm

## Mount Shared Google Drive

In [None]:
# Mount Google Drive at /content/drive; the dataset and checkpoint paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Token sequence length. The same name is assigned again (same value) in the
# tokenization cell and is read as a global by create_baseline_model.
max_length = 50

#### 5.1 Load Polarity Model

In [None]:
# Install the notebook's dependencies into the runtime (--quiet trims pip output).
# gensim and tensorflow-text are pinned; the remaining packages are not.
# TODO(review): pin the unpinned packages so a fresh runtime reproduces the same environment.
!pip install gensim==3.8.3 --quiet
!pip install tensorflow-datasets --quiet
!pip install -U tensorflow-text==2.8.2 --quiet
!pip install pydot --quiet
!pip install transformers --quiet

[K     |████████████████████████████████| 24.2 MB 98.4 MB/s 
[K     |████████████████████████████████| 4.9 MB 15.3 MB/s 
[K     |████████████████████████████████| 4.7 MB 15.0 MB/s 
[K     |████████████████████████████████| 101 kB 12.3 MB/s 
[K     |████████████████████████████████| 596 kB 70.2 MB/s 
[K     |████████████████████████████████| 6.6 MB 63.1 MB/s 
[?25h

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
import tensorflow_text as tf_text


import sklearn as sk
import os
import nltk
from nltk.corpus import reuters
from nltk.data import find

import matplotlib.pyplot as plt
import transformers
from transformers import BertTokenizer, TFBertModel

import re
import gensim
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel

In [None]:
# Set Directory
# Make the shared data folder the working directory.
# Example of a full file path inside it: '/content/drive/MyDrive/wzx/data/train.csv'
os.chdir('/content/drive/MyDrive/wzx/data')

In [None]:
# List the current working directory to confirm which data files are present.
os.listdir()

['new_balanced_df.csv',
 'train.csv',
 'valid.csv',
 'test.csv',
 'twitter_processed.csv']

## Data

In [None]:
# # Get Data
# d = pd.read_csv('new_balanced_df.csv')

# data = d[['reviewText','rating_label']]
# data.head()

In [None]:
# # delete missing value
# data = data.dropna()

In [None]:
# # Data Distribution (365,608 vs 365,608)
# # from sklearn.model_selection import train_test_split

# xxx, xtest, yyy, ytest = train_test_split(data['reviewText'],data['rating_label'],test_size = 0.2,random_state = 32)
# xtrain,xvalid,ytrain,yvalid = train_test_split(xxx,yyy,test_size = 0.2,random_state = 21)
# # Training & Test Data
# final_train = pd.concat([xtrain,ytrain],axis = 1).reset_index(drop=True)
# final_valid = pd.concat([xvalid,yvalid],axis = 1).reset_index(drop=True)
# final_test = pd.concat([xtest,ytest],axis=1).reset_index(drop=True)

In [None]:
# # # # Save Data
# final_train.to_csv('train.csv',index=False)
# final_valid.to_csv('valid.csv',index=False)
# final_test.to_csv('test.csv',index=False)

---------------------------------------------------------------------------

In [None]:
# Load the processed Twitter CSV. The trailing commented-out .sample(n = 100)
# can be re-enabled to work on a small random subset instead of the full file.
twitter_data = pd.read_csv('/content/drive/MyDrive/wzx/data/twitter_processed.csv')#.sample(n = 100)
twitter_data.head()

Unnamed: 0,rating_label,reviewText
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Author's note: the dataset is balanced, 800,000 vs. 800,000 examples per class.
from sklearn.model_selection import train_test_split

# First hold out 20% of all rows as the test split ...
xxx, xtest, yyy, ytest = train_test_split(
    twitter_data['reviewText'],
    twitter_data['rating_label'],
    test_size=0.2,
    random_state=32,
)
# ... then hold out 20% of the remainder as the validation split.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    xxx,
    yyy,
    test_size=0.2,
    random_state=21,
)

# Re-join each split's text and label columns into one frame with a fresh 0..n-1 index.
train, valid, test = (
    pd.concat([texts, labels], axis=1).reset_index(drop=True)
    for texts, labels in ((xtrain, ytrain), (xvalid, yvalid), (xtest, ytest))
)

In [None]:
# Read Data
# train = pd.read_csv('train.csv')
# valid = pd.read_csv('valid.csv')
# test = pd.read_csv('test.csv')

In [None]:
# Count missing values per column of the training split.
train.isnull().sum()

reviewText      0
rating_label    0
dtype: int64

In [None]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,reviewText,rating_label
0,Wanna know why I miss Roi being on that stage?...,0
1,Watching Bill Bailey: Tinselworm on 4+1. Hilar...,1
2,Omg i just figured out how to use twitter from...,1
3,english teacher wasn't here....have to wait ti...,0
4,@isacullen its offair the dude must be changi...,0


In [None]:
# Turn each split into two parallel Python lists: the texts and their labels.
# The columns are converted directly with .tolist() rather than looping over
# DataFrame.iterrows(): the resulting lists hold the same values in the same
# order, but no per-row Series has to be built for ~1.6M rows.

# Training
train_list = train['reviewText'].tolist()
train_label = train['rating_label'].tolist()

# Valid
valid_list = valid['reviewText'].tolist()
valid_label = valid['rating_label'].tolist()

# Testing
test_list = test['reviewText'].tolist()
test_label = test['rating_label'].tolist()

print("Number of Training Data:",len(train_label))
print("Number of Valid Data:",len(valid_label))
print("Number of Test Data:",len(test_label))

Number of Training Data: 1024000
Number of Valid Data: 256000
Number of Test Data: 320000


## Model Training

In [None]:
# Data Preprocessing
# Load the tokenizer for 'bert-base-cased', the same checkpoint name that
# create_baseline_model loads its BERT weights from.
bert_tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
max_length = 50  # sequence length fed to BERT; can set to 100


def _encode_texts(texts):
    """Tokenize a list of texts with the settings shared by every split.

    Each text is truncated and padded to exactly ``max_length`` tokens and the
    encoding is returned as TensorFlow tensors.
    """
    return bert_tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )


# Encoded inputs and their labels, one pair per split.
x_train = _encode_texts(train_list)
y_train = train_label

x_valid = _encode_texts(valid_list)
y_valid = valid_label

x_test = _encode_texts(test_list)
y_test = test_label

In [None]:
## Baseline model definition
def create_baseline_model(train_layers=-1,
                          hidden_size = 100, 
                          dropout=0.3,
                          learning_rate=0.00005,
                          sequence_length=None):
    """
    Build and compile a simple binary classifier on top of pretrained BERT,
    using BERT's pooled output as the sentence representation.

    Args:
        train_layers: Currently unused; accepted only so existing callers keep
            working. NOTE(review): presumably meant to control how many BERT
            layers are fine-tuned -- confirm before relying on it.
        hidden_size: Units in the ReLU hidden layer placed after BERT.
        dropout: Dropout rate applied to the hidden layer's output.
        learning_rate: Learning rate for the Adam optimizer.
        sequence_length: Length of each of the three input sequences. Defaults
            to the notebook-level ``max_length`` so it matches the padding
            length used when tokenizing.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[input_ids, token_type_ids,
        attention_mask]`` and producing one sigmoid probability per example.
    """
    # Fall back to the notebook-wide setting when no explicit length is given.
    if sequence_length is None:
        sequence_length = max_length

    bert_model = TFBertModel.from_pretrained('bert-base-cased')

    # Inputs: one integer tensor per tokenizer output field.
    input_ids = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int64, name='input_ids_layer_baseline')
    token_type_ids = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int64, name='token_type_ids_layer_baseline')
    attention_mask = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int64, name='attention_mask_layer_baseline')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    bert_out = bert_model(bert_inputs)

    # Disabled alternative: average the first BERT output over the sequence
    # axis instead of using the pooled output.
    #avg_token = tf.math.reduce_mean(bert_out[0],axis=1)

    # Second BERT output: the pooled representation used for classification.
    pooled_token = bert_out[1]

    # Dense hidden layer followed by dropout.
    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer_baseline')(pooled_token)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    # Single sigmoid unit for the binary label.
    classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer_baseline')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])

    # The model already outputs probabilities, hence from_logits=False.
    # Metrics are passed as a list, the form documented for Model.compile.
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                            loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                            metrics=['accuracy'])

    return classification_model

In [None]:
# Build the baseline classifier with its default hyperparameters and print its layer summary.
baseline_model = create_baseline_model()
baseline_model.summary()

Downloading tf_model.h5:   0%|          | 0.00/502M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask_layer_baseline   [(None, 50)]        0           []                               
 (InputLayer)                                                                                     
                                                                                                  
 input_ids_layer_baseline (Inpu  [(None, 50)]        0           []                               
 tLayer)                                                                                          
                                                                                                  
 token_type_ids_layer_baseline   [(None, 50)]        0           []                               
 (InputLayer)                                                                                 

In [None]:
# polarity_model_path = '/content/drive/MyDrive/wzx/polarity_model/yelp_polarity_best_weights.h5'


# polarity_model = create_polarity_model()
# polarity_model.load_weights(polarity_model_path)

In [None]:
### Baseline MODEL TRAINING
# Weights-only checkpoint: the file is overwritten only when an epoch reaches
# a new maximum of validation accuracy.
checkpoint_path = '/content/drive/MyDrive/wzx/baseline/twitter_best_weights.h5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

# Model inputs in the order the model was built with: ids, token types, attention mask.
train_inputs = [x_train.input_ids, x_train.token_type_ids, x_train.attention_mask]
valid_inputs = [x_valid.input_ids, x_valid.token_type_ids, x_valid.attention_mask]

# Labels are Python lists, so they are converted to arrays for Keras.
baseline_model.fit(
    x=train_inputs,
    y=np.array(y_train),
    validation_data=(valid_inputs, np.array(y_valid)),
    batch_size=64,
    epochs=5,
    callbacks=[model_checkpoint_callback],
)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

In [None]:
# Evaluate on the held-out test split.
# The labels are wrapped in np.array, exactly as in the fit() call: y_test is a
# plain Python list of ints, and Keras cannot pair that with tensor inputs.
# NOTE(review): this scores the weights left in memory after the last epoch.
# The best-val_accuracy weights exist only in the checkpoint file; load them
# first if that is the model whose score should be reported.
score = baseline_model.evaluate(
    [x_test.input_ids, x_test.token_type_ids, x_test.attention_mask],
    np.array(y_test))

print('Test loss:', score[0]) 
print('Test accuracy:', score[1])