## **Data Pre-Processing**

### **Import Library and Dataset**

In [1]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
!pwd

/content


In [6]:
data = pd.read_csv('drive/My Drive/skincare/skincares.csv')
# data = pd.read_csv('data_sampling.csv')

### **Data Cleansing**

In [7]:
data = data[['user', 'rate', 'id']]
data

Unnamed: 0,user,rate,id
0,niluhseptiany,2,29993
1,f_ar,5,29993
2,Melinasilfia,2,29993
3,Annacute,4,29993
4,monissacitra,3,29993
...,...,...,...
82517,yuradithaap,5,91834
82518,yuradithaap,5,82360
82519,yuradithaap,5,87682
82520,yuradithaap,3,76067


In [9]:
data_clean = data.dropna().drop_duplicates()

In [10]:
data_clean = data_clean[~(data_clean['user'] == ' ')]

In [11]:
id_count = pd.crosstab(index=data_clean.user,columns='count').sort_values(by='count',ascending=True)

name_r = id_count[id_count['count']>1]
name_u = name_r.index.to_list()
data_clean = data_clean[data_clean.user.isin(name_u)]

In [13]:
data_clean['rate'] = pd.to_numeric(data_clean['rate'], errors='coerce')
data_clean

Unnamed: 0,user,rate,id
0,niluhseptiany,2.0,29993
1,f_ar,5.0,29993
2,Melinasilfia,2.0,29993
4,monissacitra,3.0,29993
5,Anggitakarima,4.0,29993
...,...,...,...
82517,yuradithaap,5.0,91834
82518,yuradithaap,5.0,82360
82519,yuradithaap,5.0,87682
82520,yuradithaap,3.0,76067


## **Similarity**

### **Transform Data into Matrix**

In [14]:
data_pivot = pd.pivot_table(data_clean, values='rate', index='user', columns='id').fillna(0)
data_pivot

id,100288,100477,100544,100897,101206,101207,101209,101364,101414,101439,...,98643,98644,98648,98778,99073,99074,99476,99482,99483,99486
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01lely,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01yupe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
04Kiky,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
082234521020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
08_dwifadillah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zuvie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zweiazzahra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zynnn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zyxyayeaye,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### **Pearson Correlation**

In [15]:
corr = data_pivot.T.corr().round(2)
corr

user,01lely,01yupe,04Kiky,082234521020,08_dwifadillah,0kt4v1ana,12tom12,15Rania,18novitasari11,1nandayoo,...,zulvinaar,zulvveh,zuma,zurayoochun,zuuuuu,zuvie,zweiazzahra,zynnn,zyxyayeaye,zzarah
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01lely,1.00,-0.01,-0.01,-0.01,0.14,-0.01,-0.01,-0.01,-0.01,-0.01,...,-0.01,-0.02,-0.01,-0.01,-0.01,-0.02,-0.01,-0.01,-0.01,-0.01
01yupe,-0.01,1.00,-0.01,1.00,-0.00,-0.00,-0.01,-0.00,-0.00,-0.01,...,-0.00,-0.01,-0.01,-0.00,-0.01,-0.01,-0.00,-0.00,-0.01,-0.00
04Kiky,-0.01,-0.01,1.00,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,...,-0.01,0.21,-0.01,-0.01,0.27,-0.01,-0.01,0.36,-0.01,-0.01
082234521020,-0.01,1.00,-0.01,1.00,-0.00,-0.00,-0.01,-0.00,-0.00,-0.01,...,-0.00,-0.01,-0.01,-0.00,-0.01,-0.01,-0.00,-0.00,-0.01,-0.00
08_dwifadillah,0.14,-0.00,-0.01,-0.00,1.00,-0.00,-0.01,-0.00,-0.00,-0.01,...,-0.00,0.32,-0.01,-0.00,-0.01,-0.01,-0.00,-0.00,-0.01,-0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zuvie,-0.02,-0.01,-0.01,-0.01,-0.01,-0.01,0.03,-0.01,-0.01,0.17,...,-0.01,-0.02,-0.01,-0.01,0.07,1.00,-0.01,-0.01,0.14,0.22
zweiazzahra,-0.01,-0.00,-0.01,-0.00,-0.00,-0.00,-0.01,-0.00,-0.00,-0.01,...,-0.00,-0.01,-0.01,0.50,-0.01,-0.01,1.00,-0.00,-0.01,-0.00
zynnn,-0.01,-0.00,0.36,-0.00,-0.00,-0.00,-0.01,-0.00,-0.00,-0.01,...,-0.00,-0.01,-0.01,-0.00,-0.01,-0.01,-0.00,1.00,-0.01,-0.00
zyxyayeaye,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,...,-0.01,-0.02,-0.01,-0.01,-0.01,0.14,-0.01,-0.01,1.00,-0.01


## **Positive Reviews**

### **Import Dataset**

In [16]:
data = pd.read_csv('drive/My Drive/skincare/skincares.csv')

In [17]:
data = data.dropna().drop_duplicates()

### **Data Pre-Processing**

In [18]:
def lowercase(review_text):
  low = review_text.lower()
  return low

data['reviews'] = data['reviews'].apply(lambda low:lowercase(str(low)))

In [19]:
import re
def remove_emoji(review_text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           u"\U0001f926-\U0001f937"
                           u"\U00010000-\U0010ffff"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', review_text)

data['reviews'] = data['reviews'].apply(lambda emoji: remove_emoji(emoji))

In [20]:
def remove_hashtag(review_text, default_replace=""):
  hashtag = re.sub(r'#\w+', default_replace, review_text)
  return hashtag

data['reviews'] = data['reviews'].apply(lambda hashtag: remove_hashtag(hashtag))

In [21]:
def remove_number(review_text, default_replace=" "):
  num = re.sub(r'\d+', default_replace, review_text)
  return num

data['reviews'] = data['reviews'].apply(lambda num: remove_number(num))

In [22]:
import string
def remove_punctuation(review_text, default_text=" "):
  list_punct = string.punctuation
  delete_punct = str.maketrans(list_punct,' '*len(list_punct))
  new_review = ' '.join(review_text.translate(delete_punct).split())

  return new_review

data['reviews'] = data['reviews'].apply(lambda punct: remove_punctuation(punct))

In [23]:
def remove_superscript(review_text):
  number = re.compile("["u"\U00002070"
                      u"\U000000B9"
                      u"\U000000B2-\U000000B3"
                      u"\U00002074-\U00002079"
                      u"\U0000207A-\U0000207E"
                      u"U0000200D"
                      "]+", flags=re.UNICODE)
  return number.sub(r'', review_text)

data['reviews'] = data['reviews'].apply(lambda num: remove_superscript(num))

In [24]:
def word_repetition(review_text):
  review = re.sub(r'(.)\1+', r'\1\1', review_text)
  return review

data['reviews'] = data['reviews'].apply(lambda word: word_repetition(word))

In [25]:
def repetition(review_text):
  repeat = re.sub(r'\b(\w+)(?:\W\1\b)+', r'\1',review_text, flags=re.IGNORECASE)
  return repeat

data['reviews'] = data['reviews'].apply(lambda word: repetition(word))

In [26]:
def remove_extra_whitespaces(review_text):
  review = re.sub(r'\s+',' ', review_text)
  return review

data['reviews'] = data['reviews'].apply(lambda extra_spaces: remove_extra_whitespaces(extra_spaces))

In [27]:
reviews = data[['id', 'user', 'rate','reviews']]

In [28]:
lexicon_positive = pd.read_csv('drive/My Drive/skincare/kata_positif.csv')
lexicon_positive_dict = {}
for index, row in lexicon_positive.iterrows():
    if row[0] not in lexicon_positive_dict:
        lexicon_positive_dict[row[0]] = row[1]

lexicon_negative = pd.read_csv('drive/My Drive/skincare/kata_negatif.csv')
lexicon_negative_dict = {}
for index, row in lexicon_negative.iterrows():
    if row[0] not in lexicon_negative_dict:
        lexicon_negative_dict[row[0]] = row[1]

  if row[0] not in lexicon_positive_dict:
  lexicon_positive_dict[row[0]] = row[1]
  if row[0] not in lexicon_negative_dict:
  lexicon_negative_dict[row[0]] = row[1]


In [29]:
reviews['rate'] = pd.to_numeric(reviews['rate'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['rate'] = pd.to_numeric(reviews['rate'], errors='coerce')


### **Count Positive Reviews**

In [30]:
def sentiment_analysis_lexicon_indonesia(ulasan, rating):
        if isinstance(ulasan, str):
            score = 0
            for word in ulasan.split():
                if word in lexicon_positive_dict:
                    score += lexicon_positive_dict[word]
                if word in lexicon_negative_dict:
                    score += lexicon_negative_dict[word]

            if rating >= 4:
                score *= 0.8
                score = np.abs(score)
            elif rating <= 3:
                score *= 0.2

            sentiment = 'positif' if score > 0 else ('negatif' if score < 0 else 'netral')
            return score, sentiment
        else:
            return None, 'tidak valid'

results = reviews.apply(lambda row: sentiment_analysis_lexicon_indonesia(row['reviews'], row['rate']), axis=1)
results = list(zip(*results))

In [31]:
reviews['skor'] = results[0]
reviews['label'] = results[1]
new_reviews = reviews[['id','reviews','label']]

In [46]:
produk_positif = new_reviews[new_reviews['label'] == 'positif'].groupby(['id']).size().reset_index(name='positif_reviews')
produk_positif

Unnamed: 0,id,positif_reviews
0,100288,10
1,100455,2
2,100477,38
3,100544,83
4,100897,5
...,...,...
448,99074,87
449,99476,17
450,99482,3
451,99483,4


In [47]:
total_reviews_per_product = reviews.groupby('id')['reviews'].count()
threshold = total_reviews_per_product * 0.5
top_produk_positif = []
for product in total_reviews_per_product.index:
    product_threshold = threshold[product]
    produk = produk_positif[(produk_positif['id'] == product) & (produk_positif['positif_reviews'] >= product_threshold)]
    top_produk_positif.append(produk)
top_produk_positif = pd.concat(top_produk_positif)
top_produk_positif['positif_reviews'] = pd.to_numeric(top_produk_positif['positif_reviews'], errors='coerce')
top_produk_positif = top_produk_positif.set_index('id')['positif_reviews'].to_dict()

In [48]:
top_produk_positif_df = pd.DataFrame.from_dict(top_produk_positif, orient='index', columns=['Positive Reviews'])
top_produk_positif_df

Unnamed: 0,Positive Reviews
100288,10
100455,2
100477,38
100544,83
100897,5
...,...
99074,87
99476,17
99482,3
99483,4


## **Calculate Prediction**

### **Weighted Sum**

In [49]:
def predict_ratings_for_user(similarity_matrix, user_ratings, user_id, produk_positif):
    predicted_ratings = {}
    for product_p in user_ratings.columns:
        prediction = weighted_sum_similarity(similarity_matrix, user_ratings, user_id, product_p)
        if product_p in produk_positif:
          predicted_ratings[product_p] = prediction
    return predicted_ratings

In [122]:
def weighted_sum_similarity(similarity_matrix, user_ratings, user_i, product_p):
    similarities = similarity_matrix[user_i]
    other_user_ratings = user_ratings.loc[:, product_p]
    positive_similarities = similarities[similarities > 0.3]
    weighted_sum_similarity = np.sum(positive_similarities * other_user_ratings[similarities > 0.3]) / np.sum(
        np.abs(positive_similarities))
    return weighted_sum_similarity

### **Input User ID**

In [123]:
user_id = str(input("Enter the user id to whom you want to recommend: "))
# user_id = 'yura'

Enter the user id to whom you want to recommend: akangskincare


In [124]:
predicted_ratings = predict_ratings_for_user(corr, data_pivot, user_id, top_produk_positif)

In [125]:
df = pd.DataFrame(list(predicted_ratings.items()), columns=['id', 'predicted_rate']).round(2)
df1 = pd.DataFrame(list(top_produk_positif.items()), columns=['id', 'positif_reviews'])

In [126]:
product = pd.merge(df, df1, on='id', how='left')
product

Unnamed: 0,id,predicted_rate,positif_reviews
0,100288,0.00,10
1,100477,0.00,38
2,100544,0.00,83
3,100897,0.00,5
4,101206,0.95,36
...,...,...,...
442,99074,4.76,87
443,99476,0.00,17
444,99482,0.00,3
445,99483,0.00,4


### **Sorted Predicted Rate Then Positive Reviews**

In [127]:
product = product[product['predicted_rate'] > 0]
top_predicted = product.sort_values(by=['predicted_rate','positif_reviews'], ascending=[False, False])
top_predicted_dict = top_predicted.set_index('id')[['predicted_rate']].apply(tuple, axis=1).to_dict()

In [128]:
product = df[df['predicted_rate'] > 0]
top_predicted_dict = product.set_index('id')[['predicted_rate']].apply(tuple, axis=1).to_dict()

In [129]:
actual_ratings = data_pivot.loc[user_id]
rated_products = actual_ratings[actual_ratings != 0]
rated_products_df = pd.DataFrame(rated_products)
rated_products_df.columns = ['rate']
rated_products_df = rated_products_df.reset_index()
rated_products_tuples = [tuple(row) for row in rated_products_df.values]

In [130]:
print(f"Produk yang sudah dirating oleh pengguna {user_id}:")
rated_products_df

Produk yang sudah dirating oleh pengguna akangskincare:


Unnamed: 0,id,rate
0,18167,3.0
1,35258,3.0
2,46388,5.0
3,48129,3.0
4,51667,5.0
...,...,...
73,98398,5.0
74,98643,4.0
75,98644,4.0
76,99073,5.0


In [131]:
print(f"Produk yang direkomendasikan untuk pengguna {user_id}:")
top_predicted = top_predicted.head(10)
top_predicted

Produk yang direkomendasikan untuk pengguna akangskincare:


Unnamed: 0,id,predicted_rate,positif_reviews
338,89590,5.0,411
164,69896,5.0,378
335,89336,5.0,366
433,98398,5.0,289
402,94749,5.0,263
411,95584,5.0,22
96,51667,4.76,946
233,79706,4.76,776
442,99074,4.76,87
302,86796,4.53,211


### **MAE and Precision**

In [132]:
def precision_at_k(predicted_ratings, actual_ratings, k):
    top_predicted_products = dict(sorted(predicted_ratings.items(), key=lambda item: item[1], reverse=True)[:k])
    rated_products = set(actual_ratings.keys())
    relevant_products = set(top_predicted_products.keys()).intersection(rated_products)
    precision = (len(relevant_products )/ k)
    return precision

In [133]:
def mean_absolute_error(predicted_ratings, actual_ratings):
    errors = []
    for product, prediction in predicted_ratings.items():
        if product in actual_ratings:
            error = np.abs(prediction - actual_ratings[product])
            errors.append(error)
    mae = np.mean(errors).round(2)
    return mae

In [134]:
mae = mean_absolute_error(top_predicted_dict, rated_products)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 0.84


In [135]:
k = 10
precision = precision_at_k(top_predicted_dict, rated_products, k)
print(f"Precision at {k}: {precision}")

Precision at 10: 1.0
