In [1]:
# VADER sentiment analyzer. In the visible notebook it is referenced only by
# the commented-out translate-then-score cell further down.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [2]:
from transformers import pipeline

# Hugging Face model id of the multilingual sentiment classifier used below.
SENTIMENT_MODEL_ID = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"

# return_all_scores=True asks the pipeline for a score per label instead of
# only the top label; get_compound_score iterates over all returned labels.
distilled_student_sentiment_classifier = pipeline(
    model=SENTIMENT_MODEL_ID,
    return_all_scores=True,
)



In [3]:
# Cap the input length (in characters, not model tokens) before classification
def truncate_string(string, max_chars=200):
    """Return at most the first ``max_chars`` characters of ``string``.

    The cut is made on characters of the Python string, not on tokenizer
    tokens, so it only bounds the length of the text passed on.

    Args:
        string: Text to shorten.
        max_chars: Maximum number of characters to keep. Defaults to 200.

    Returns:
        ``string`` unchanged when it has ``max_chars`` characters or fewer,
        otherwise its first ``max_chars`` characters.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    if max_chars < 0:
        # A negative bound would slice from the end and silently drop the tail.
        raise ValueError("max_chars must be non-negative")
    # Slicing already returns the whole string when it is short enough.
    return string[:max_chars]

# Use a weighted sum approach to get compound sentiment score
def get_compound_score(string):
    """Collapse the classifier's per-label scores into one signed score.

    The text is shortened with ``truncate_string`` and passed to
    ``distilled_student_sentiment_classifier``. Each returned label's score
    is multiplied by a fixed weight (positive = +1, neutral = 0,
    negative = -1) and the products are summed, so the result is the
    positive score minus the negative score.

    An earlier version also summed the weights and divided by that sum when
    it was non-zero. With all three labels present the weights cancel to 0,
    so the division never ran; with a label missing it would have divided by
    +1 or -1, turning a lone negative score into a positive one. The division
    is therefore removed. The result for a three-label response is unchanged.

    Args:
        string: Text to score.

    Returns:
        The sum of ``weight * score`` over the returned labels. Assuming the
        scores are probabilities that sum to 1, this lies in [-1, 1].

    Raises:
        KeyError: If the classifier returns a label other than 'positive',
            'neutral' or 'negative'.
    """
    response = distilled_student_sentiment_classifier(truncate_string(string))

    # Signed contribution of each sentiment label to the compound score
    weights = {'positive': 1, 'neutral': 0, 'negative': -1}

    # response[0] holds the label/score entries for the single input text.
    return sum(
        entry['score'] * weights[entry['label']]
        for entry in response[0]
    )

In [4]:
# Sanity check on one short Japanese sentence ("I like music"); a clearly
# positive sentence is expected to score well above zero.
print(get_compound_score('私は音楽が好きです'))

0.8893415778875351


In [None]:
# Deprecated approach (translate and then assign sentiment score)

# from translate import Translator

# def translate_and_analyze_sentiment(string):

#     translator = Translator(from_lang = "ja", to_lang="en")
#     translation = translator.translate(truncate_string(string))
    
#     if isinstance(translation, str):  # Check if the text is a string
#         vs = analyzer.polarity_scores(translation)
#         return vs['compound']
#     else:
#         return 0.0

In [5]:
import pandas as pd
from tqdm import tqdm

# Load the lyrics dataset and report how many rows were read.
file_path = './songs_lyrics.csv'
df = pd.read_csv(file_path, encoding='utf-8')
print(f'{len(df)} rows')

# tqdm.pandas() registers progress_apply, which behaves like apply but shows
# a progress bar while every lyric is scored.
tqdm.pandas()
df['sentiment_score'] = df['lyrics'].progress_apply(get_compound_score)
df.head()

16650 rows


  0%|                                                                                         | 0/16650 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|█████████████████████████████████████████████████████████████████████████████| 16650/16650 [06:38<00:00, 41.82it/s]


Unnamed: 0,id,artist,song,lyrics,img_src,sentiment_score
0,0,米津玄師,Lemon,夢ならばどれほどよかったでしょう未だにあなたのことを夢にみる忘れた物を取りに帰るように古びた...,https://m.media-amazon.com/images/I/51ZsVIMARh...,-0.669566
1,1,back number,クリスマスソング,どこかで鐘が鳴ってらしくない言葉が浮かんで寒さが心地よくてあれ　なんで恋なんかしてんだろう聖...,https://m.media-amazon.com/images/I/31eRU7YYby...,-0.066641
2,2,GReeeeN,キセキ,明日、今日よりも好きになれる　溢れる想いが止まらない今もこんなに好きでいるのに　言葉に出来な...,https://m.media-amazon.com/images/I/51L0WT553N...,0.651807
3,3,back number,花束,どう思う？これから2人でやっていけると思う？んんどうかなぁでもとりあえずは一緒にいたいと思っ...,https://m.media-amazon.com/images/I/51SHzjh0dC...,-0.711921
4,4,RADWIMPS,前前前世 (movie ver.),やっと眼を覚ましたかい　それなのになぜ眼も合わせやしないんだい？「遅いよ」と怒る君　これでも...,https://m.media-amazon.com/images/I/51h10DfD1o...,-0.626399


In [6]:
# Write the scored frame to a new CSV rather than overwriting the input file.
# index=False keeps pandas from writing the row index as an extra column.
file_path = 'songs_with_sentiment_score.csv'
df.to_csv(file_path, index=False, encoding='utf-8')