In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
df_fashion = pd.read_csv('generated_reviews_fashion.csv')

In [3]:
len(df_fashion.index)

46

In [3]:
df_fashion.head()

Unnamed: 0,input_text,target_text,generated_text
0,cushioning | husband | insert | bone | plantar...,Pinnacle seems to have more cushioning so my h...,I have been using cushioning for the last 10 y...
1,price | product | size | chart | order | buy |...,"great price for the product, though the sizes ...",I was excited to find this product at such a c...
2,fit | wash | water | shrink | inch | inseam | ...,Good fit even after washing in hot water to fo...,I have a very nice Fit Wash that I can order a...
3,shoe | comfort | performance | quality | time ...,Great shoe. I've had Nike's before and have al...,I have been using these for over 30 years and ...
4,bit | picture | light | way | box | paper | fi...,It looks a bit nicer on the picture. Its very ...,I like this bit of a picture of the light way ...


In [4]:
df_fashion['target_text'][1]

"great price for the product, though the sizes tend to be bigger (based on mens size i think).  there wasn't a size chart to refer to when i was ordering, so i ended up buying two, each at a difference size."

In [5]:
df_software = pd.read_csv('generated_reviews_software.csv', names=['input_text','target_text','generated_text'])

In [6]:
len(df_software.index)

880

In [7]:
df_software.head()

Unnamed: 0,input_text,target_text,generated_text
0,version | software | pay | middle | update | f...,I just recently converted to this version from...,I have been using this version because I didn'...
1,tech | support | fact | method | product | res...,If you have any problems you will not be able ...,I really love using these products. I didn't r...
2,look | hood | information | engine | user | in...,"Because, while I'm not about to go looking und...",I was looking for a product that will help min...
3,desktop | sound | video | editing | software |...,Corel VideoStudio Ultimate X8 installed on my ...,I have used Desktop for years and loved it.......
4,experience | accounting | product | version | ...,My last experience with a Peachtree accounting...,I have had experience with accounting products...


In [8]:
df_appliances = pd.read_csv('generated_reviews_appliances.csv', names=['input_text','target_text','generated_text'])

In [9]:
len(df_appliances.index)

52

In [10]:
df = df_fashion.append(df_appliances, ignore_index=True)

In [13]:
df['target_text'][1]

"great price for the product, though the sizes tend to be bigger (based on mens size i think).  there wasn't a size chart to refer to when i was ordering, so i ended up buying two, each at a difference size."

In [11]:
df = df.append(df_software, ignore_index=True)

In [12]:
df.head()

Unnamed: 0,input_text,target_text,generated_text
0,cushioning | husband | insert | bone | plantar...,Pinnacle seems to have more cushioning so my h...,I have been using cushioning for the last 10 y...
1,price | product | size | chart | order | buy |...,"great price for the product, though the sizes ...",I was excited to find this product at such a c...
2,fit | wash | water | shrink | inch | inseam | ...,Good fit even after washing in hot water to fo...,I have a very nice Fit Wash that I can order a...
3,shoe | comfort | performance | quality | time ...,Great shoe. I've had Nike's before and have al...,I have been using these for over 30 years and ...
4,bit | picture | light | way | box | paper | fi...,It looks a bit nicer on the picture. Its very ...,I like this bit of a picture of the light way ...


In [35]:
# df = pd.concat([df_fashion, df_appliances, df_software], axis=0, ignore_index=True)

In [38]:
# df.head()

In [14]:
len(df.index)

978

In [15]:
df['target_text'][1]

"great price for the product, though the sizes tend to be bigger (based on mens size i think).  there wasn't a size chart to refer to when i was ordering, so i ended up buying two, each at a difference size."

In [16]:
df['generated_text'][1]

"I was excited to find this product at such a cheap price, I couldn't live without it! I have a bigger chart on the product page, and I can see why the heck it is doing is to make it more than"

In [None]:
df['input_text'][1]

'price | product | size | chart | order | buy | difference'

In [17]:
import nltk
nltk.download('stopwords')
import re
import string
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
def clean_text(tweet:str) -> str:
    """
    Convert all text to lowercase, remove stock market tickers, RT symbol, hyperlinks and the hastag symbol
    :param tweet: tweet by a unique user
    :return: cleaned string without hashtags, emojis, and punctuation
    """
    # make text lower case
    tweet = tweet.lower()
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', str(tweet))
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', str(tweet))
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', str(tweet))
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', str(tweet))
    
    # remove punctuation
    punct = set(string.punctuation)
    tweet = "".join(ch for ch in tweet if ch not in punct)
    
    # remove stopwords
    stop_words = set(stopwords.words("english"))
    tweet = " ".join(word for word in tweet.split() if word not in stop_words)
    
    return tweet

In [19]:
df["generated_text"] = df["generated_text"].apply(clean_text)

In [None]:
df['generated_text'][1]

'excited find product cheap price couldnt live without bigger chart product page see heck make'

In [20]:
df["target_text"] = df["target_text"].apply(clean_text)

In [None]:
df['target_text'][1]

'great price product though sizes tend bigger based mens size think wasnt size chart refer ordering ended buying two difference size'

In [None]:
# Tokenizer
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
# "Run" Tokenizer
df['target_text_tokens'] = df['target_text'].map(tokenizer.tokenize)
df['generated_text_tokens'] = df['generated_text'].map(tokenizer.tokenize)

In [None]:
df.head()

Unnamed: 0,input_text,target_text,generated_text,target_text_tokens,generated_text_tokens
0,cushioning | husband | insert | bone | plantar...,pinnacle seems cushioning husband likes better...,using cushioning last 10 years first introduce...,"[pinnacle, seems, cushioning, husband, likes, ...","[using, cushioning, last, 10, years, first, in..."
1,price | product | size | chart | order | buy |...,great price product though sizes tend bigger b...,excited find product cheap price couldnt live ...,"[great, price, product, though, sizes, tend, b...","[excited, find, product, cheap, price, couldnt..."
2,fit | wash | water | shrink | inch | inseam | ...,good fit even washing hot water force shrinkin...,nice fit wash order time small inseam much big...,"[good, fit, even, washing, hot, water, force, ...","[nice, fit, wash, order, time, small, inseam, ..."
3,shoe | comfort | performance | quality | time ...,great shoe ive nikes always pleased comfort pe...,using 30 years delighted surprised tried found...,"[great, shoe, ive, nikes, always, pleased, com...","[using, 30, years, delighted, surprised, tried..."
4,bit | picture | light | way | box | paper | fi...,looks bit nicer picture light way held box won...,like bit picture light way pictures show simpl...,"[looks, bit, nicer, picture, light, way, held,...","[like, bit, picture, light, way, pictures, sho..."


In [None]:
# Importing lemmatizer 
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
# Instantiating lemmatizer 
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
lemmatize_words=[]
for i in range (len(df['target_text_tokens'])):
  word=''
  for j in range(len(df['target_text_tokens'][i])):
    lemm_word=lemmatizer.lemmatize(df['target_text_tokens'][i][j])#lemmatize
    word=word + ' '+lemm_word # joining tokens into sentence    
  lemmatize_words.append(word) # store in list

In [None]:
#creating a new column to store the result
df['target_text_lemmatized']=lemmatize_words

In [None]:
lemmatize_words=[]
for i in range (len(df['generated_text_tokens'])):
  word=''
  for j in range(len(df['generated_text_tokens'][i])):
    lemm_word=lemmatizer.lemmatize(df['generated_text_tokens'][i][j])#lemmatize
    word=word + ' '+lemm_word # joining tokens into sentence    
  lemmatize_words.append(word) # store in list

In [None]:
#creating a new column to store the result
df['generated_text_lemmatized']=lemmatize_words

In [None]:
len(df.index)

46

In [None]:
df.head()

Unnamed: 0,input_text,target_text,generated_text,target_text_tokens,generated_text_tokens,target_text_lemmatized,generated_text_lemmatized
0,cushioning | husband | insert | bone | plantar...,pinnacle seems cushioning husband likes better...,using cushioning last 10 years first introduce...,"[pinnacle, seems, cushioning, husband, likes, ...","[using, cushioning, last, 10, years, first, in...",pinnacle seems cushioning husband like better...,using cushioning last 10 year first introduce...
1,price | product | size | chart | order | buy |...,great price product though sizes tend bigger b...,excited find product cheap price couldnt live ...,"[great, price, product, though, sizes, tend, b...","[excited, find, product, cheap, price, couldnt...",great price product though size tend bigger b...,excited find product cheap price couldnt live...
2,fit | wash | water | shrink | inch | inseam | ...,good fit even washing hot water force shrinkin...,nice fit wash order time small inseam much big...,"[good, fit, even, washing, hot, water, force, ...","[nice, fit, wash, order, time, small, inseam, ...",good fit even washing hot water force shrinki...,nice fit wash order time small inseam much bi...
3,shoe | comfort | performance | quality | time ...,great shoe ive nikes always pleased comfort pe...,using 30 years delighted surprised tried found...,"[great, shoe, ive, nikes, always, pleased, com...","[using, 30, years, delighted, surprised, tried...",great shoe ive nike always pleased comfort pe...,using 30 year delighted surprised tried found...
4,bit | picture | light | way | box | paper | fi...,looks bit nicer picture light way held box won...,like bit picture light way pictures show simpl...,"[looks, bit, nicer, picture, light, way, held,...","[like, bit, picture, light, way, pictures, sho...",look bit nicer picture light way held box won...,like bit picture light way picture show simpl...


In [None]:
#imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [None]:
df_new = pd.DataFrame()
column_names = ["input", "output"]

In [None]:
df_new.head()

In [None]:
for i in range (len(df['target_text_lemmatized'])):
  j = len(df_new.index)
  df_new.loc[j, 'input'] = df['target_text_lemmatized'][i]
  df_new.loc[j, 'output'] = int(0)

In [None]:
df_new.head()

Unnamed: 0,input,output
0,pinnacle seems cushioning husband like better...,0.0
1,great price product though size tend bigger b...,0.0
2,good fit even washing hot water force shrinki...,0.0
3,great shoe ive nike always pleased comfort pe...,0.0
4,look bit nicer picture light way held box won...,0.0


In [None]:
for i in range (len(df['generated_text_lemmatized'])):
  j = len(df_new.index)
  df_new.loc[j, 'input'] = df['generated_text_lemmatized'][i]
  df_new.loc[j, 'output'] = int(1)



In [None]:
len(df_new.index)

92

In [None]:
df_new.tail()

Unnamed: 0,input,output
87,like capri comfortable medium size calf muscl...,1.0
88,received ringtones first time couldnt believe...,1.0
89,disappointed avon discontinued product didnt ...,1.0
90,pair pair flat footshoes comfortable dont fee...,1.0
91,fault reading fiber find way fix use fiber pr...,1.0


In [None]:
df_new['output'] = df_new['output'].astype('int')

In [None]:
df_new.head()

Unnamed: 0,input,output
0,pinnacle seems cushioning husband like better...,0
1,great price product though size tend bigger b...,0
2,good fit even washing hot water force shrinki...,0
3,great shoe ive nike always pleased comfort pe...,0
4,look bit nicer picture light way held box won...,0


In [None]:
#defining X and y for the model
X = df_new['input']
y = df_new['output']
# Spliting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
y_train.value_counts()

1    35
0    34
Name: output, dtype: int64

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [None]:
pipe = Pipeline([('bow', CountVectorizer()), 
                 ('tfid', TfidfTransformer()),  
                 ('model', LogisticRegression())])
pipe.fit(X_train, y_train)

Pipeline(steps=[('bow', CountVectorizer()), ('tfid', TfidfTransformer()),
                ('model', LogisticRegression())])

In [None]:
y_pred = pipe.predict(X_test)

In [None]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred)

0.6521739130434783

In [None]:
metrics.confusion_matrix(y_test, y_pred)

array([[9, 3],
       [5, 6]])

In [21]:
# Roberta model
df_roberta = pd.DataFrame()
column_names = ["input", "output"]


In [22]:
df_roberta.head()

In [23]:
df.head()

Unnamed: 0,input_text,target_text,generated_text
0,cushioning | husband | insert | bone | plantar...,pinnacle seems cushioning husband likes better...,using cushioning last 10 years first introduce...
1,price | product | size | chart | order | buy |...,great price product though sizes tend bigger b...,excited find product cheap price couldnt live ...
2,fit | wash | water | shrink | inch | inseam | ...,good fit even washing hot water force shrinkin...,nice fit wash order time small inseam much big...
3,shoe | comfort | performance | quality | time ...,great shoe ive nikes always pleased comfort pe...,using 30 years delighted surprised tried found...
4,bit | picture | light | way | box | paper | fi...,looks bit nicer picture light way held box won...,like bit picture light way pictures show simpl...


In [24]:
df['input_text'][1]

'price | product | size | chart | order | buy | difference'

In [25]:
for i in range (len(df['target_text'])):
  j = len(df_roberta.index)
  df_roberta.loc[j, 'input'] = df['target_text'][i]
  df_roberta.loc[j, 'output'] = int(0)

In [26]:
for i in range (len(df['generated_text'])):
  j = len(df_roberta.index)
  df_roberta.loc[j, 'input'] = df['generated_text'][i]
  df_roberta.loc[j, 'output'] = int(1)

In [27]:
df_roberta['output'] = df_roberta['output'].astype('int')

In [28]:
df_roberta.head()

Unnamed: 0,input,output
0,pinnacle seems cushioning husband likes better...,0
1,great price product though sizes tend bigger b...,0
2,good fit even washing hot water force shrinkin...,0
3,great shoe ive nikes always pleased comfort pe...,0
4,looks bit nicer picture light way held box won...,0


In [127]:
#defining X and y for the model
X = df_roberta['input']
y = df_roberta['output']
# Spliting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
y_train.value_counts()

1    734
0    733
Name: output, dtype: int64

In [53]:
type(X_train)

pandas.core.series.Series

In [128]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)


In [129]:
y_test = y_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [131]:
len(y_train)

1467

In [30]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 6.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 76.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 75.3 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling P

In [31]:
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel

In [35]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

#metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

In [33]:
tokenizer_roberta = RobertaTokenizerFast.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [36]:
# First, we check the length of the longest tokenized sentence by roberta tokenizer:


token_lens = []

for txt in X_train:
  tokens = tokenizer_roberta.encode(txt, max_length=512, truncation=True)
  token_lens.append(len(tokens))
max_length=np.max(token_lens)
max_length

512

In [132]:
MAX_LEN=128

In [133]:
def tokenize_roberta(data,max_len=MAX_LEN) :
  input_ids = []
  attention_masks = []
  for i in range(len(data)):
    encoded = tokenizer_roberta.encode_plus(data[i],add_special_tokens=True,max_length=128,pad_to_max_length = True,truncation=True,return_attention_mask=True)
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])
    roberta_input_ids = tf.convert_to_tensor(input_ids)
    roberta_attention_masks = tf.convert_to_tensor(attention_masks)
  return roberta_input_ids, roberta_attention_masks

In [134]:
train_input_ids, train_attention_masks = tokenize_roberta(X_train, 128)
# val_input_ids, val_attention_masks = tokenize_roberta(X_valid, MAX_LEN)
test_input_ids, test_attention_masks = tokenize_roberta(X_test, 128)



In [135]:
train_input_ids

<tf.Tensor: shape=(1467, 128), dtype=int32, numpy=
array([[    0,  9502, 15441, ...,     1,     1,     1],
       [    0, 43395,   235, ...,  2333,  2594,     2],
       [    0,  5881,  1181, ...,     1,     1,     1],
       ...,
       [    0, 20982,  2579, ...,     1,     1,     1],
       [    0,  2088,   667, ...,   770,  1296,     2],
       [    0,   428, 12807, ...,     1,     1,     1]], dtype=int32)>

In [168]:
def create_model(bert_model, max_len=MAX_LEN):
    
    opt = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-7)
    loss = tf.keras.losses.BinaryCrossentropy()
    accuracy = tf.keras.metrics.BinaryAccuracy()

    input_ids = tf.keras.Input(shape=(max_len,),dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_len,),dtype='int32')
    output = bert_model([input_ids,attention_masks])
    output = output[1]
    output = tf.keras.layers.Dense(3, activation=tf.nn.softmax)(output)
    model = tf.keras.models.Model(inputs = [input_ids,attention_masks],outputs = output)
    model.compile(opt, loss=loss, metrics=accuracy)
    return model

In [149]:
# def create_model_1(bert_model, max_len=MAX_LEN):
#   opt = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-7)
#   loss = tf.keras.losses.BinaryCrossentropy()
#   accuracy = tf.keras.metrics.BinaryAccuracy()

#   input_ids = tf.keras.Input(shape=(max_len,),dtype='int32')
#   attention_masks = tf.keras.Input(shape=(max_len,),dtype='int32')
#   x = bert_model(input_ids,attention_masks)
    
#   x1 = tf.keras.layers.Dropout(0.1)(x[0])
#   x1 = tf.keras.layers.Conv1D(128, 2, padding='same')(x1)
#   x1 = tf.keras.layers.ReLU()(x1)
#   x1 = tf.keras.layers.Conv1D(64, 2, padding='same')(x1)
#   x1 = tf.keras.layers.Dense(1)(x1)
#   x1 = tf.keras.layers.Flatten()(x1)
#   x1 = tf.keras.layers.Activation('softmax')(x1)
    
#   x2 = tf.keras.layers.Dropout(0.1)(x[0])
#   x2 = tf.keras.layers.Conv1D(128, 2, padding='same')(x2)
#   x2 = tf.keras.layers.ReLU()(x2)
#   x2 = tf.keras.layers.Conv1D(64, 2, padding='same')(x2)
#   x2 = tf.keras.layers.Dense(1)(x2)
#   x2 = tf.keras.layers.Flatten()(x2)
#   x2 = tf.keras.layers.Activation('softmax')(x2)
#   model = tf.keras.models.Model(inputs = [input_ids,attention_masks],outputs = [x1, x2])
#   model.compile(opt, loss=loss, metrics=accuracy)
  # return model

In [169]:
roberta_model = TFRobertaModel.from_pretrained('roberta-base')

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [171]:
model = create_model(roberta_model, 128)
model.summary()

Model: "model_15"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_39 (InputLayer)          [(None, 128)]        0           []                               
                                                                                                  
 input_40 (InputLayer)          [(None, 128)]        0           []                               
                                                                                                  
 tf_roberta_model_3 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_39[0][0]',               
 odel)                          thPoolingAndCrossAt               'input_40[0][0]']               
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                        

In [124]:
# from sklearn import preprocessing
# ohe = preprocessing.OneHotEncoder()
# y_train = ohe.fit_transform(np.array(y_train).reshape(-1, 1)).toarray()
# # y_valid = ohe.fit_transform(np.array(y_valid).reshape(-1, 1)).toarray()
# y_test = ohe.fit_transform(np.array(y_test).reshape(-1, 1)).toarray()

In [100]:
# type(y_train)

numpy.ndarray

In [69]:
# # y_train = np.asarray(y_train).astype('float32')
# # y_test = np.asarray(y_test).astype('float32')
# y_train = np.array([np.array(val) for val in y_train])
# y_test = np.array([np.array(val) for val in y_test])

In [138]:
type(train_attention_masks)

tensorflow.python.framework.ops.EagerTensor

In [154]:
history_2 = model.fit([train_input_ids,train_attention_masks], y_train, epochs=4, batch_size=30)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [155]:
result_roberta = model.predict([test_input_ids,test_attention_masks])

In [160]:
result_roberta

array([[0.33197898, 0.33066154, 0.33735952],
       [0.3284555 , 0.3308874 , 0.34065706],
       [0.32942405, 0.33277822, 0.33779776],
       ...,
       [0.33142993, 0.333027  , 0.3355431 ],
       [0.33124235, 0.33431697, 0.33444068],
       [0.33523467, 0.32393187, 0.3408335 ]], dtype=float32)

In [156]:
y_pred_roberta =  np.zeros_like(result_roberta)
y_pred_roberta[np.arange(len(y_pred_roberta)), result_roberta.argmax(1)] = 1

In [159]:
y_pred_roberta

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [172]:
# conf_matrix(y_test.argmax(1),y_pred_roberta.argmax(1),'RoBERTa Sentiment Analysis\nConfusion Matrix')

In [173]:
# print('\tClassification Report for RoBERTa:\n\n',classification_report(y_test,y_pred_roberta.argmax(1)))