In [1]:
import pandas as pd
from transformers import AutoTokenizer, pipeline
from datasets import load_dataset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Review dataset, fetched by its dataset identifier.
data = load_dataset("jahjinx/IMDb_movie_reviews")

# One checkpoint name is shared by the pipeline and the standalone tokenizer
# so that text truncation and inference use the same vocabulary.
model_name = "SamLowe/roberta-base-go_emotions"

# top_k=None is forwarded to the pipeline unchanged; the printed results in
# later cells show a list of label/score dicts per review.
classifier = pipeline(
    "text-classification",
    model=model_name,
    top_k=None,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Device set to use cuda:0


In [3]:
# Raw review strings for the train and test splits, in that order.
train_data, test_data = (data[split_name]['text'] for split_name in ('train', 'test'))

In [4]:
def preprocess_text(text, max_length=512):
    """Truncate ``text`` via the tokenizer and return the result as a string.

    The text is encoded with the module-level ``tokenizer`` with truncation
    enabled and ``max_length`` as the limit; the resulting token ids are then
    decoded back into text with special tokens skipped.

    Args:
        text: The text to shorten.
        max_length: Length limit handed to the tokenizer. Defaults to 512.

    Returns:
        The decoded string of the first encoded sequence.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
    )
    # The encoding is batched; index 0 is the single sequence built from `text`.
    first_sequence_ids = encoded["input_ids"][0]
    return tokenizer.decode(first_sequence_ids, skip_special_tokens=True)

In [5]:
def process_review(text, max_length=512):
    """Classify the emotions expressed in a single review.

    The review is first shortened with ``preprocess_text`` and then passed to
    the module-level ``classifier`` pipeline.

    Args:
        text: The raw review text.
        max_length: Token limit applied both when shortening the text and when
            the pipeline tokenizes it again. Defaults to 512.

    Returns:
        Whatever the ``classifier`` pipeline returns for the shortened text.
    """
    preprocessed_text = preprocess_text(text, max_length=max_length)
    # The pipeline re-tokenizes the decoded string. A decode -> encode round
    # trip is not guaranteed to yield the same number of tokens, so truncation
    # is requested here as well to keep the model input within `max_length`.
    outputs = classifier(preprocessed_text, truncation=True, max_length=max_length)
    return outputs

In [6]:
# Classify every training review, one at a time, with a progress bar.
train_res_data = [
    process_review(review)
    for review in tqdm(train_data, desc="Processing reviews")
]

# Pair each review with its classifier output, row for row.
processed_df = pd.DataFrame({"review": train_data, "emotions": train_res_data})

print(processed_df.head())


Processing reviews:   0%|          | 7/36000 [00:01<1:43:00,  5.82it/s] You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing reviews: 100%|██████████| 36000/36000 [08:23<00:00, 71.52it/s]

                                              review  \
0  Beautifully photographed and ably acted, gener...   
1  Well, where to start describing this celluloid...   
2  I first caught the movie on its first run on H...   
3  I love Umberto Lenzi's cop movies -- ROME ARME...   
4  I generally won't review movies I haven't seen...   

                                            emotions  
0  [[{'label': 'admiration', 'score': 0.458035469...  
1  [[{'label': 'curiosity', 'score': 0.6023563146...  
2  [[{'label': 'surprise', 'score': 0.68916881084...  
3  [[{'label': 'disappointment', 'score': 0.27864...  
4  [[{'label': 'amusement', 'score': 0.4213815927...  





In [7]:
# Classify every test review, one at a time, with a progress bar.
test_res_data = [
    process_review(review)
    for review in tqdm(test_data, desc="Processing test reviews")
]

# Pair each test review with its classifier output, row for row.
processed_test_df = pd.DataFrame({"review": test_data, "emotions": test_res_data})

print(processed_test_df.head())


Processing test reviews: 100%|██████████| 10000/10000 [02:20<00:00, 71.09it/s]

                                              review  \
0  With No Dead Heroes you get stupid lines like ...   
1  I thought maybe... maybe this could be good. A...   
2  An elite American military team which of cours...   
3  Ridiculous horror film about a wealthy man (Jo...   
4  Well, if you are one of those Katana's film-nu...   

                                            emotions  
0  [[{'label': 'neutral', 'score': 0.400691390037...  
1  [[{'label': 'disapproval', 'score': 0.59797060...  
2  [[{'label': 'disapproval', 'score': 0.51812630...  
3  [[{'label': 'disappointment', 'score': 0.28626...  
4  [[{'label': 'admiration', 'score': 0.634632229...  



