# Encoding

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [14]:
# Show the kernel's current working directory.
os.getcwd()

'/content'

In [15]:
# Load the training reviews from the mounted Drive folder and preview them.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# previously saved index -- confirm whether it should be kept.
train_csv_path = '/content/drive/MyDrive/sentiment_analysis/train.csv'
train = pd.read_csv(train_csv_path)
train.head()

Unnamed: 0,id,text,rating
0,0,This was the first televised episode of the Co...,7
1,1,"Jim Carrey is good as usual, and even though t...",10
2,2,"I saw ""A Page of Madness"" in a silent film cou...",10
3,3,A very close and sharp discription of the bubb...,10
4,4,Terry Gilliam's and David Peoples' teamed up t...,10


## Transformers

In [17]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

For tokenization and encoding I'll use a pretrained RoBERTa sentiment model (`cardiffnlp/twitter-roberta-base-sentiment`).

In [18]:
# Checkpoint name passed to `from_pretrained` for both objects below, so the
# tokenizer and the classifier always come from the same checkpoint.
MODEL = 'cardiffnlp/twitter-roberta-base-sentiment'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [19]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [20]:
# Move the classifier onto the selected device.
model = model.to(device)

In [21]:
# Smoke test: tokenize the first training text (truncated to at most 512
# tokens), move the tensors to the model's device and run one forward pass.
sample_inputs = tokenizer(
    train.text[0],
    return_tensors='pt',
    truncation=True,
    max_length=512,
).to(device)
output = model(**sample_inputs)
output

SequenceClassifierOutput(loss=None, logits=tensor([[-0.5318,  0.6192, -0.0440]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

The `roberta` function maps a text to the model's raw scores (logits) for the three sentiment classes, in the order `['neg', 'neu', 'pos']`.

In [27]:
def roberta(text):
    """Return the sentiment classifier's raw logits for one text.

    The text is tokenized with the notebook-level ``tokenizer`` (truncated to
    at most 512 tokens), moved to ``device`` and passed through ``model``.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    numpy.ndarray
        1-D array of logits, one per class. The notebook reads index 0, 1
        and 2 as the 'neg', 'neu' and 'pos' scores.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512).to(device)
    # Inference only: without no_grad every call records an autograd graph
    # that is never used, wasting time and (GPU) memory across the dataset.
    with torch.no_grad():
        output = model(**encoded)
    # First (and only) item of the batch, copied to host memory as numpy.
    scores = output.logits[0].cpu().numpy()
    return scores

In [28]:
# Score every review, one text per call to `roberta`; the returned value is
# stored per row in a new 'scores' column on `train`.
train['scores'] = train['text'].apply(roberta)
train.head()

Unnamed: 0,id,text,rating,scores
0,0,This was the first televised episode of the Co...,7,"[-0.53184545, 0.6192214, -0.0440272]"
1,1,"Jim Carrey is good as usual, and even though t...",10,"[-1.1195484, -0.347246, 1.9182004]"
2,2,"I saw ""A Page of Madness"" in a silent film cou...",10,"[-0.12421866, 0.33715022, -0.20795557]"
3,3,A very close and sharp discription of the bubb...,10,"[-2.5340319, 0.12851578, 2.7396328]"
4,4,Terry Gilliam's and David Peoples' teamed up t...,10,"[-1.682061, 0.4536087, 1.4423971]"


In [30]:
# Work on a copy so later column edits on `train_` leave `train` untouched.
train_ = train.copy()

In [31]:
# Preview the working copy.
train_.head()

Unnamed: 0,id,text,rating,scores
0,0,This was the first televised episode of the Co...,7,"[-0.53184545, 0.6192214, -0.0440272]"
1,1,"Jim Carrey is good as usual, and even though t...",10,"[-1.1195484, -0.347246, 1.9182004]"
2,2,"I saw ""A Page of Madness"" in a silent film cou...",10,"[-0.12421866, 0.33715022, -0.20795557]"
3,3,A very close and sharp discription of the bubb...,10,"[-2.5340319, 0.12851578, 2.7396328]"
4,4,Terry Gilliam's and David Peoples' teamed up t...,10,"[-1.682061, 0.4536087, 1.4423971]"


In [32]:
# Unpack each row's score vector into one column per sentiment class:
# position 0 -> 'neg', 1 -> 'neu', 2 -> 'pos'.
for position, label in enumerate(('neg', 'neu', 'pos')):
    # Bind `position` as a default so each lambda keeps its own index.
    train_[label] = train_['scores'].apply(lambda x, position=position: x[position])

train_.head()

Unnamed: 0,id,text,rating,scores,neg,neu,pos
0,0,This was the first televised episode of the Co...,7,"[-0.53184545, 0.6192214, -0.0440272]",-0.531845,0.619221,-0.044027
1,1,"Jim Carrey is good as usual, and even though t...",10,"[-1.1195484, -0.347246, 1.9182004]",-1.119548,-0.347246,1.9182
2,2,"I saw ""A Page of Madness"" in a silent film cou...",10,"[-0.12421866, 0.33715022, -0.20795557]",-0.124219,0.33715,-0.207956
3,3,A very close and sharp discription of the bubb...,10,"[-2.5340319, 0.12851578, 2.7396328]",-2.534032,0.128516,2.739633
4,4,Terry Gilliam's and David Peoples' teamed up t...,10,"[-1.682061, 0.4536087, 1.4423971]",-1.682061,0.453609,1.442397


In [33]:
# Remove the packed score vector and the raw text/id columns in one call.
train_.drop(columns=['scores', 'text', 'id'], inplace=True)

Saving files

In [36]:
# Write the train feature table to Drive; index=False keeps the DataFrame
# index out of the file.
# NOTE(review): the 'Unnamed: 0' column from the input CSV is never dropped,
# so it is written here too -- confirm that is intended.
train_.to_csv('/content/drive/MyDrive/sentiment_analysis/roberta_train.csv', index=False)

In [37]:
# Same pipeline as for the training set: load, score each text with
# `roberta`, then unpack the score vector into 'neg' / 'neu' / 'pos'.
test = pd.read_csv('/content/drive/MyDrive/sentiment_analysis/test.csv')
test['scores'] = test['text'].apply(roberta)
for position, label in enumerate(('neg', 'neu', 'pos')):
    # Bind `position` as a default so each lambda keeps its own index.
    test[label] = test['scores'].apply(lambda x, position=position: x[position])
test.head()

Unnamed: 0,id,text,rating,scores,neg,neu,pos
0,2,The final installment sees Sho Aikawa and Riki...,7,"[-1.0409685, 0.7310924, 0.32952094]",-1.040969,0.731092,0.329521
1,3,Homicide: The Movie proved to be a good wrap-u...,10,"[-1.7990221, -0.14866282, 2.2115705]",-1.799022,-0.148663,2.211571
2,4,As a father of four in his forties I thought t...,7,"[-1.1740414, 0.24264841, 1.0934855]",-1.174041,0.242648,1.093485
3,5,A wonderful movie about people. I first saw Fo...,10,"[-2.4649632, -0.27782518, 3.324916]",-2.464963,-0.277825,3.324916
4,6,"Until today, I thought there only three people...",9,"[0.02642145, 0.43213725, -0.47955355]",0.026421,0.432137,-0.479554


In [38]:
# Keep only the rating and the three sentiment columns (plus whatever other
# columns the CSV carried) by removing the id, raw text and packed scores.
test.drop(columns=['id', 'text', 'scores'], inplace=True)
test.head()

Unnamed: 0,rating,neg,neu,pos
0,7,-1.040969,0.731092,0.329521
1,10,-1.799022,-0.148663,2.211571
2,7,-1.174041,0.242648,1.093485
3,10,-2.464963,-0.277825,3.324916
4,9,0.026421,0.432137,-0.479554


In [39]:
# Write the test feature table to Drive; index=False keeps the DataFrame
# index out of the file.
# NOTE(review): the preview above still shows an 'Unnamed: 0' column, so it
# is written here as well -- confirm that is intended.
test.to_csv('/content/drive/MyDrive/sentiment_analysis/roberta_test.csv', index=False)