In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
df_fashion = pd.read_csv('generated_reviews_fashion.csv')

In [3]:
len(df_fashion.index)

46

In [4]:
df_fashion.head()

Unnamed: 0,input_text,target_text,generated_text
0,cushioning | husband | insert | bone | plantar...,Pinnacle seems to have more cushioning so my h...,I have been using cushioning for the last 10 y...
1,price | product | size | chart | order | buy |...,"great price for the product, though the sizes ...",I was excited to find this product at such a c...
2,fit | wash | water | shrink | inch | inseam | ...,Good fit even after washing in hot water to fo...,I have a very nice Fit Wash that I can order a...
3,shoe | comfort | performance | quality | time ...,Great shoe. I've had Nike's before and have al...,I have been using these for over 30 years and ...
4,bit | picture | light | way | box | paper | fi...,It looks a bit nicer on the picture. Its very ...,I like this bit of a picture of the light way ...


In [5]:
df_software = pd.read_csv('generated_reviews_software.csv', names=['input_text','target_text','generated_text'])

In [6]:
len(df_software.index)

880

In [7]:
df_software.head()

Unnamed: 0,input_text,target_text,generated_text
0,version | software | pay | middle | update | f...,I just recently converted to this version from...,I have been using this version because I didn'...
1,tech | support | fact | method | product | res...,If you have any problems you will not be able ...,I really love using these products. I didn't r...
2,look | hood | information | engine | user | in...,"Because, while I'm not about to go looking und...",I was looking for a product that will help min...
3,desktop | sound | video | editing | software |...,Corel VideoStudio Ultimate X8 installed on my ...,I have used Desktop for years and loved it.......
4,experience | accounting | product | version | ...,My last experience with a Peachtree accounting...,I have had experience with accounting products...


In [8]:
df_appliances = pd.read_csv('generated_reviews_appliances.csv', names=['input_text','target_text','generated_text'])

In [9]:
len(df_appliances.index)

52

In [10]:
df = df_fashion.append(df_appliances, ignore_index=True)

In [11]:
df = df.append(df_software, ignore_index=True)

In [12]:
df.head()

Unnamed: 0,input_text,target_text,generated_text
0,cushioning | husband | insert | bone | plantar...,Pinnacle seems to have more cushioning so my h...,I have been using cushioning for the last 10 y...
1,price | product | size | chart | order | buy |...,"great price for the product, though the sizes ...",I was excited to find this product at such a c...
2,fit | wash | water | shrink | inch | inseam | ...,Good fit even after washing in hot water to fo...,I have a very nice Fit Wash that I can order a...
3,shoe | comfort | performance | quality | time ...,Great shoe. I've had Nike's before and have al...,I have been using these for over 30 years and ...
4,bit | picture | light | way | box | paper | fi...,It looks a bit nicer on the picture. Its very ...,I like this bit of a picture of the light way ...


In [13]:
len(df.index)

978

In [14]:
import nltk
nltk.download('stopwords')
import re
import string
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
def clean_text(tweet:str) -> str:
    """
    Convert all text to lowercase, remove stock market tickers, RT symbol, hyperlinks and the hastag symbol
    :param tweet: tweet by a unique user
    :return: cleaned string without hashtags, emojis, and punctuation
    """
    # make text lower case
    tweet = tweet.lower()
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', str(tweet))
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', str(tweet))
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', str(tweet))
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', str(tweet))
    
    # remove punctuation
    punct = set(string.punctuation)
    tweet = "".join(ch for ch in tweet if ch not in punct)
    
    # remove stopwords
    stop_words = set(stopwords.words("english"))
    tweet = " ".join(word for word in tweet.split() if word not in stop_words)
    
    return tweet

In [16]:
df["generated_text"] = df["generated_text"].apply(clean_text)

In [17]:
df["target_text"] = df["target_text"].apply(clean_text)

In [18]:
# Roberta model
df_bert = pd.DataFrame()
column_names = ["titletext", "label"]


In [19]:
df_bert.head()

In [20]:
df.head()

Unnamed: 0,input_text,target_text,generated_text
0,cushioning | husband | insert | bone | plantar...,pinnacle seems cushioning husband likes better...,using cushioning last 10 years first introduce...
1,price | product | size | chart | order | buy |...,great price product though sizes tend bigger b...,excited find product cheap price couldnt live ...
2,fit | wash | water | shrink | inch | inseam | ...,good fit even washing hot water force shrinkin...,nice fit wash order time small inseam much big...
3,shoe | comfort | performance | quality | time ...,great shoe ive nikes always pleased comfort pe...,using 30 years delighted surprised tried found...
4,bit | picture | light | way | box | paper | fi...,looks bit nicer picture light way held box won...,like bit picture light way pictures show simpl...


In [21]:
df['input_text'][1]

'price | product | size | chart | order | buy | difference'

In [22]:
for i in range (len(df['target_text'])):
  j = len(df_bert.index)
  df_bert.loc[j, 'titletext'] = df['target_text'][i]
  df_bert.loc[j, 'label'] = int(0)

In [23]:
for i in range (len(df['generated_text'])):
  j = len(df_bert.index)
  df_bert.loc[j, 'titletext'] = df['generated_text'][i]
  df_bert.loc[j, 'label'] = int(1)

In [24]:
df_bert['label'] = df_bert['label'].astype('int')

In [25]:
df_bert.head()

Unnamed: 0,titletext,label
0,pinnacle seems cushioning husband likes better...,0
1,great price product though sizes tend bigger b...,0
2,good fit even washing hot water force shrinkin...,0
3,great shoe ive nikes always pleased comfort pe...,0
4,looks bit nicer picture light way held box won...,0


In [26]:
df_bert['label'].value_counts()

0    978
1    978
Name: label, dtype: int64

In [27]:
df_bert.to_csv("reviews.csv")

In [29]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 9.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 49.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 62.2 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uni

In [34]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np

import transformers
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer

In [31]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [32]:
def bert_encode(input_text, max_len):
    input_ids = []
    attension_masks = []
    for text in input_text:
        output_dict = tokenizer.encode_plus(
            text, 
            add_special_tokens = True,
            truncation=True,
            max_length = max_len,
            pad_to_max_length = True,
            return_attention_mask = True
        )
        input_ids.append(output_dict['input_ids'])
        attension_masks.append(output_dict['attention_mask'])
    return np.array(input_ids), np.array(attension_masks)

In [35]:
text = df_bert['titletext']
target = df_bert['label']
train_input_ids, train_attention_masks = bert_encode(text, 60)



In [39]:
def create_model(bert_model):
    input_ids = tf.keras.Input(shape= (60,), dtype= 'int32')
    attention_masks = tf.keras.Input(shape= (60,), dtype= 'int32')
    
    output = bert_model([input_ids, attention_masks])
    output = output[1]
    output = tf.keras.layers.Dense(32, activation= 'relu')(output)
    output = tf.keras.layers.Dropout(0.2)(output)
    output = tf.keras.layers.Dense(1, activation= 'sigmoid')(output)
    
    model = tf.keras.models.Model(inputs= [input_ids, attention_masks], outputs= output)
    model.compile(Adam(learning_rate=1e-5), loss= 'binary_crossentropy', metrics= ['accuracy'])
    return model

In [40]:
from transformers import TFBertModel
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [41]:
model = create_model(bert_model)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 60)]         0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 60)]         0           []                               
                                                                                                  
 tf_bert_model_1 (TFBertModel)  TFBaseModelOutputWi  109482240   ['input_3[0][0]',                
                                thPoolingAndCrossAt               'input_4[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 60,                                          

In [42]:
%%time

history = model.fit(
    [train_input_ids, train_attention_masks],
    target, 
    validation_split = 0.2,
    epochs = 3,
    batch_size = 10
)

Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 1min 30s, sys: 9.44 s, total: 1min 40s
Wall time: 1min 26s
