In [1]:
import pandas as pd
import numpy as np

In [2]:
original_rating = pd.read_json("Amazon_Fashion.jsonl", lines=True)
original_item = pd.read_json("meta_Amazon_Fashion.jsonl", lines=True)

In [3]:
rating = original_rating.copy()
item = original_item.copy()

In [4]:
rating = rating.rename(columns={'parent_asin': 'item_id'})
item = item.rename(columns={'parent_asin': 'item_id'})

# Item 1차

In [5]:
item = item.drop(columns=['main_category', 'price', 'images', 'videos', 'store', 'categories', 'details', 'bought_together'])

In [6]:
item[item.select_dtypes(include=['object']).columns] = item.select_dtypes(include=['object']).astype(str)
item = item.applymap(lambda x: x.replace('[', '').replace(']', '') if isinstance(x, str) else x)
item = item.fillna('')

In [7]:
item['item_text']=item['title']+item['features']+item['description']

In [8]:
item = item.drop(columns=['title', 'average_rating', 'rating_number', 'features', 'description'])

In [9]:
item = item[item['item_text'] != '']

# Rating 1차

In [10]:
rating = rating.drop(columns=['images', 'verified_purchase', 'helpful_vote', 'timestamp'])

In [11]:
# Generate review column
rating['review']=rating['title']+rating['text']

In [12]:
# drop 'asin'-'parent_asin' unmatching columns
rating = (rating[rating['asin'] == rating['item_id']])
rating = rating.drop(columns=['asin', 'title', 'text'])

In [13]:
# eliminate dups
rating=rating.drop_duplicates() #row: 2,171,294

In [14]:
# filtering the data with item_id that is erased in item table
items_in_item = item['item_id'].unique()
rating=rating[rating['item_id'].isin(items_in_item)]

In [15]:
# delete users that purchased items only once or twice
user_counts = rating.groupby('user_id').size()

once_users = user_counts[user_counts == 1].index  
twice_users = user_counts[user_counts == 2].index

rating = rating[~rating['user_id'].isin(once_users)]
rating = rating[~rating['user_id'].isin(twice_users)] #row: 268,985

In [16]:
rating = rating.reindex(columns=['user_id', 'item_id', 'rating', 'review'])

In [17]:
rating.to_csv('rating2.csv', index=False)

# Item 2차

In [18]:
# filtering items that are NOT included in rating table
items_in_rating = rating['item_id'].unique()
item = item[item['item_id'].isin(items_in_rating)]

In [19]:
item.to_csv('item2.csv', index=False)

In [22]:
rating

Unnamed: 0,user_id,item_id,rating,review
12,AHREXOGQPZDA6354MHH4ETSF3MCQ,B09DQ5M2BB,2,Not what was expectedThere are lots of raw edg...
13,AHREXOGQPZDA6354MHH4ETSF3MCQ,B095M3HHTJ,5,Nice shirtI really liked this shirt when I rec...
14,AHREXOGQPZDA6354MHH4ETSF3MCQ,B089PWHFVW,3,Not for the large bustedIf you are large chest...
15,AHREXOGQPZDA6354MHH4ETSF3MCQ,B097DQPCP2,5,Cute TI really like this Tshirt. Quality fabr...
16,AHREXOGQPZDA6354MHH4ETSF3MCQ,B092J4ZT1V,5,So softI received this T the other day. I too...
...,...,...,...,...
2500533,AEUD5NAEIO4DX3AFQZQOEBLZ5ZEA,B00GNFBMZI,4,Four StarsLittle large but as pictured for mos...
2500534,AEUD5NAEIO4DX3AFQZQOEBLZ5ZEA,B018XGAOIK,5,Really love this romperReally love this romper...
2500917,AFSZ775MACIF4MBK7YXC7AM35VXQ,B00GQVDUY0,1,would not waste your money on itCHEAP CHEASEY ...
2500918,AFSZ775MACIF4MBK7YXC7AM35VXQ,B00IYF3YFG,4,"love the detailGorgeous necklace, love the det..."


# Sentiment Analysis

In [95]:
texts=list(rating['review'])

In [96]:
# Sentiment Analysis model load
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

Device set to use cuda:0


In [97]:
# function definition
def sentiment_analysis(classifier, text: str) -> float:
    candidate_labels = ["positive", "neutral", "negative"]
    result = classifier(text, candidate_labels)
    
    # choose highest score label
    highest = result['scores'].index(max(result['scores']))
    
    if result['labels'][highest] == 'neutral':
        second_highest = sorted(range(len(result['scores'])), key=lambda i: result['scores'][i], reverse=True)[1]
        if result['labels'][second_highest]=='positive':
            return result['scores'][second_highest]
        else:
            return -result['scores'][second_highest]
    elif result['labels'][highest] == 'positive':
        return result['scores'][highest]
    else:
        return -result['scores'][highest]

In [98]:
#list to save the result
sentiment_scores=[]

In [None]:
import time

start_time = time.time()

for idx, text in enumerate(texts):
    sentiment_scores.append(sentiment_analysis(classifier, text))
    if (idx+1)%10000==0:
        mid_time = time.time()
        print(f"{idx+1} texts are completed, spending time: {mid_time - start_time} seconds")

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Total Spending Time: {elapsed_time} seconds")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
if len(sentiment_scores)==rating.shape[0]:
    rating2=rating.copy()
    rating2['sentiment']=sentiment_scores
    rating2.to_csv('rating_with_sentiment.csv', index=False)
else:
    print(len(sentiment_scores), rating.shape[0])