### 라이브러리

In [1]:
import os

import pandas as pd
import torch
from scipy.special import softmax
from tqdm import trange
from transformers import AutoTokenizer, AutoModelForSequenceClassification

### 데이터 확인

In [2]:
# Read the collected tweets. The line terminator is given explicitly as '\n',
# and low_memory=False asks pandas to parse the file without chunking.
data = pd.read_csv(
    './queen_notshort.csv',
    lineterminator='\n',
    low_memory=False,
)
data.head(3)

Unnamed: 0,index,id,conversation_id,created_at,date,time,timezone,user_id,username,name,...,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,tweet_len
0,0,1568167503947235331,1568167503947235331,2022-09-09 14:50:42 India Standard Time,2022-09-09,14:50:42,530,963099618547589121,inpd_,In Professional Development,...,0,0,[],[],https://twitter.com/INPD_/status/1568167503947...,False,,1,https://pbs.twimg.com/media/FcM_yTPWQAAqAck.jpg,244
1,1,1568167501334253568,1568167501334253568,2022-09-09 14:50:42 India Standard Time,2022-09-09,14:50:42,530,26475981,ukpostbox,UK Postbox,...,0,0,[],[],https://twitter.com/UKPostbox/status/156816750...,False,,1,https://pbs.twimg.com/media/FcM_7H_XgAEDifo.jpg,219
2,2,1568167500134731776,1568167500134731776,2022-09-09 14:50:41 India Standard Time,2022-09-09,14:50:41,530,868028006610153472,brandminds,BRAND MINDS,...,0,0,"['queenelizabeth', 'worldchanger', 'leadership']",[],https://twitter.com/brandminds/status/15681675...,False,,1,https://pbs.twimg.com/media/FcNABxgWIAAyuK2.jpg,171


### 트윗 텍스트 감성 분석

In [3]:
# Load the sentiment-analysis checkpoint and the tokenizer that goes with it
roberta = "cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(roberta)
model = AutoModelForSequenceClassification.from_pretrained(roberta)

# Turn off the tokenizers library's internal parallelism for this process
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [4]:
def sentiment_analysis(text, max_length=512):
    """Score a single piece of text with the notebook-level sentiment model.

    Uses the module-level ``tokenizer`` and ``model`` objects.

    Args:
        text: The text to classify (one tweet in this notebook).
        max_length: Maximum number of tokens passed to the model. Longer
            inputs are truncated instead of raising inside the model.
            The default of 512 is assumed to match the RoBERTa-base
            sequence limit -- adjust if a different checkpoint is loaded.

    Returns:
        A list of ``(score, label)`` tuples in the order
        Negative, Neutral, Positive, where the scores are the softmax of
        the model's output for ``text``.
    """
    encoded_tweet = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )

    # Inference only: disabling autograd avoids building a graph on every
    # call, which matters when this runs once per row of a large dataset.
    with torch.no_grad():
        output = model(**encoded_tweet)

    # output[0] is the batch of raw scores; [0] picks the only sequence in it.
    scores = softmax(output[0][0].numpy())

    labels = ['Negative', 'Neutral', 'Positive']
    return list(zip(scores, labels))

def insert_sentiment_column(data):
    """Add a ``sentiment`` column with the model scores for every tweet.

    Each value of ``data['tweet']`` is passed to ``sentiment_analysis``.
    Rows whose text cannot be scored get the sentinel ``['예외']`` so the
    run continues past individual failures.

    Args:
        data: DataFrame with a ``tweet`` column. It is modified in place.

    Returns:
        The same DataFrame, now carrying the ``sentiment`` column.
    """
    # Extract the column once; indexing data.iloc[i]['tweet'] inside the loop
    # would materialise a full row Series on every iteration.
    tweets = data['tweet'].tolist()

    sentiment_column = []
    for i in trange(len(tweets)):
        try:
            sentiment = sentiment_analysis(tweets[i])
        except Exception:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt / SystemExit can still stop a long run.
            sentiment = ['예외']
        sentiment_column.append(sentiment)

    data['sentiment'] = sentiment_column
    return data

In [5]:
# Example: sentiment analysis of a single tweet's text (row label 2)
print('텍스트')
print(data.loc[2, 'tweet'])
print('\n')

print('감성 분석 후')
print(sentiment_analysis(data.loc[2, 'tweet']))

텍스트
"When life seems hard, the courageous do not lie down and accept defeat; instead, they are all the more determined to struggle for a better future."   - Queen Elizabeth II


감성 분석 후
[(0.01912036, 'Negative'), (0.2795863, 'Neutral'), (0.70129335, 'Positive')]


In [6]:
# Run sentiment analysis on the tweet text of the whole dataset, then save it
data = insert_sentiment_column(data)
data.to_csv(
    './queen_sentiment.csv',
    index=False,
    encoding='utf-8-sig',
)

100%|████████████████████████████████████████████████████████████████████████| 142348/142348 [4:37:36<00:00,  8.55it/s]
