# **Collaborative Filtering**

In [None]:
# Google Drive can only be mounted from inside Google Colab. Guard the import
# so that "Restart & Run All" also works on a local Jupyter kernel, where the
# datasets are read through relative paths instead.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive/')

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns

In [2]:
# Base path prefix; "./" is the current working directory.
path= "./"

In [3]:
# Show every column when a DataFrame is displayed (no "..." truncation).
# `path` is already assigned in the previous cell, so it is not repeated here.
pd.set_option('display.max_columns', None)

In [5]:
# Read the ratings export; fields in the file are separated by semicolons.
df_ratings = pd.read_csv(filepath_or_buffer='dataset/ratings.csv', delimiter=';')
df_ratings

In [6]:
# Inspect every rating row that belongs to one specific user.
is_target_user = df_ratings['user'].isin(['lobsterratesig'])
subset = df_ratings.loc[is_target_user]
subset

In [7]:
# Number of distinct (non-null) users that appear in the ratings table.
unique_users_count = df_ratings['user'].nunique(dropna=True)
unique_users_count

In [9]:
# Number of rating rows that have a non-null album link.
df_ratings['link_album'].count()

In [10]:
# Number of distinct album links that occur in the ratings table.
df_ratings['link_album'].nunique()

In [11]:
# List the column names available in the ratings table.
df_ratings.columns

In [12]:
# Bar chart of how often each album-rating value occurs.
fig, ax = plt.subplots(figsize=(20, 5))
sns.countplot(x=df_ratings["rating_album"], ax=ax)
ax.set_xlabel("Rating Album")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Rating Album")
fig.tight_layout()
plt.show()

In [13]:
# Bar chart of how many rating rows each user link contributes.
plt.figure(figsize=(10, 5))

# Pass the Series explicitly as ``x=`` (as the rating plot does): the first
# positional parameter of ``countplot`` is ``data`` in current seaborn, so a
# positional Series is not interpreted as the x variable.
sns.countplot(x=df_ratings["link_user"])
plt.xlabel("Link User")
plt.ylabel("Frequency")
plt.title("Distribution of Link User")

plt.tight_layout()
plt.show()

In [14]:
# User x album matrix of ratings. pivot_table aggregates with the mean by
# default, so repeated (user, album) pairs collapse to their average rating.
interaction_matrix = pd.pivot_table(
    df_ratings,
    index='user',
    columns='link_album',
    values='rating_album',
)
interaction_matrix

In [15]:
# Replace the missing (user, album) cells with 0, then preview the matrix
# with albums as rows.
df_filled = interaction_matrix.fillna(value=0)
df_filled.transpose()

In [16]:
def standardize(row):
    """Mean-centre a Series and scale it by its value range (max - min).

    Parameters
    ----------
    row : pandas.Series
        Numeric values to normalise. Despite the name, ``DataFrame.apply``
        with its default axis passes one *column* at a time.

    Returns
    -------
    pandas.Series
        ``(row - row.mean()) / (row.max() - row.min())``. When every value is
        identical the range is 0; the centred values (all zeros) are returned
        as-is instead of the NaNs that a 0/0 division would produce.
    """
    centered = row - row.mean()
    value_range = row.max() - row.min()
    if value_range == 0:
        # Constant input: nothing to scale, and dividing would yield NaN.
        return centered
    return centered / value_range

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# Normalise each album column, zero out any NaN left by the normalisation,
# then compute album-to-album cosine similarity (albums become rows after
# the transpose).
ratings_std = df_filled.apply(standardize).fillna(0)
item_similarity = cosine_similarity(ratings_std.transpose())
item_similarity

In [18]:
# Wrap the raw similarity array in a DataFrame labelled by album link on
# both axes so that albums can be looked up by name.
album_labels = ratings_std.columns
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=album_labels,
    columns=album_labels,
)

item_similarity_df.T

In [19]:
def get_similar_album(album_name, user_rating):
    """Score every album by its similarity to a single rated album.

    The similarity column for ``album_name`` is multiplied by
    ``user_rating - 50``, so a rating above 50 gives similar albums a positive
    score and a rating below 50 gives them a negative one.

    Returns the scores as a Series sorted from highest to lowest.
    """
    weight = user_rating - 50
    weighted = item_similarity_df[album_name] * weight
    return weighted.sort_values(ascending=False)

In [20]:
# Example: rank albums for a listener who rated this album 80 (weight 80 - 50 = 30).
get_similar_album('https://www.albumoftheyear.org/album/100249-the-brobecks-violent-things.php', 80)

In [21]:
def get_similar_more_albums(user_ratings):
    """Combine similarity scores over several rated albums.

    ``user_ratings`` is an iterable of ``(album_link, rating)`` pairs. For each
    pair the album's similarity column is weighted by ``rating - 50`` and the
    weighted columns are summed album by album.

    Returns the summed scores as a Series sorted from highest to lowest; an
    empty input yields an empty float Series.
    """
    accumulated = pd.Series(dtype=float)
    for album_link, user_score in user_ratings:
        contribution = item_similarity_df[album_link] * (user_score - 50)
        accumulated = accumulated.add(contribution, fill_value=0)
    return accumulated.sort_values(ascending=False)


In [22]:
# Sample profile of (album link, rating) pairs with one high and two low ratings.
# NOTE(review): the next cell reassigns `user_ratings` before it is first used,
# so this profile never reaches the recommender unless that cell is skipped.
user_ratings = [
    ("https://www.albumoftheyear.org/album/100249-the-brobecks-violent-things.php", 5),
    ("https://www.albumoftheyear.org/album/100035-young-fathers-cocoa-sugar.php", 100),
    ("https://www.albumoftheyear.org/album/101215-mom-jeans-best-buds.php", 10)
]

In [23]:
# Try a profile made up entirely of Taylor Swift albums, each rated 100.
user_ratings = [('https://www.albumoftheyear.org/album/541510-taylor-swift-midnights.php',100),
       ('https://www.albumoftheyear.org/album/934464-taylor-swift-the-tortured-poets-department-the-anthology.php', 100),
       ('https://www.albumoftheyear.org/album/313572-taylor-swift-evermore.php',100),
       ('https://www.albumoftheyear.org/album/264058-taylor-swift-folklore.php',100)]

In [24]:
# Aggregate the similarity scores once (the original computed them twice and
# discarded the first result).
hasil_data = get_similar_more_albums(user_ratings)

# Build the two-column table directly from the values and the index labels.
# This does not depend on whether the Series happens to carry a name, unlike
# `pd.DataFrame(series, columns=['score'])`, and yields a fresh 0..n-1 index.
hasil = pd.DataFrame({
    'score': hasil_data.to_numpy(),
    'link_album': hasil_data.index,
})

hasil

In [25]:
# Display the recommendation score table again.
hasil

In [27]:
# Read the album metadata export; fields in the file are separated by semicolons.
df_albums = pd.read_csv(filepath_or_buffer='dataset/albums.csv', delimiter=';')
df_albums

In [28]:
# Check the column dtypes of the recommendation score table.
hasil.dtypes

In [29]:
# Attach each album's score to its metadata row. Albums without a score
# keep their row and get NaN in `score` (left join on the album link).
scores_by_link = hasil.set_index("link_album")
df_hasil = df_albums.join(scores_by_link, on='link_album')
df_hasil

In [30]:
# List the album links stored for the artist "Taylor Swift".
is_taylor_swift = df_hasil['artis'].eq('Taylor Swift')
taylor_swift_data = df_hasil.loc[is_taylor_swift]
taylor_swift_data['link_album'].unique()

In [31]:
# Ten albums with the highest collaborative-filtering score.
# The sorted frame gets its own name: binding it to `sorted` would shadow the
# builtin and break every later `sorted(...)` call in this kernel session.
df_hasil_sorted = df_hasil.sort_values(by='score', ascending=False)
top_10 = df_hasil_sorted.head(10)
top_10

In [32]:
# Number of distinct non-null scores among the albums.
df_hasil.score.nunique()

# **Content-Based Filtering**

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
# Text columns that describe an album; joined together they form the album's
# document for TF-IDF.
feature_columns = ['genre', 'artis', 'label', 'produser', 'penulis']

# Literal substitutions, applied in this order to every feature value.
# Spaces, hyphens and apostrophes become "_" and "!", "?", "/" are removed;
# finally ";|" is turned into a space.
# NOTE(review): ";|" is presumably the multi-value separator used in
# albums.csv - confirm against the data.
token_replacements = [
    (' ', '_'),
    ('!', ''),
    ('?', ''),
    ('/', ''),
    ('-', '_'),
    ("'", '_'),
    (';|', ' '),
]

tabel = df_albums[['link_album'] + feature_columns].copy()

# Only the feature columns are normalised. `link_album` is an identifier and
# is left untouched so it still equals the links used in the other tables
# (the previous version also stripped "/" and "-" from it).
for column in feature_columns:
    for old, new in token_replacements:
        # regex=False: every pattern is a literal string, independent of the
        # pandas version's default for `regex`.
        tabel[column] = tabel[column].str.replace(old, new, regex=False)

# Join the non-missing feature values of each album into one string. Missing
# values are skipped here instead of being stringified to "nan" and stripped
# afterwards, which also removed "nan" from inside real words.
corpus_text = tabel[feature_columns].apply(
    lambda row: ' '.join(row.dropna().astype(str)),
    axis=1,
)

combined = pd.DataFrame({
    'link_album': tabel['link_album'],
    'corpus': corpus_text,
}).set_index('link_album')
combined

In [35]:
# Plain list of the per-album documents. Preview only the first few entries;
# echoing the whole list would flood the cell output.
corpus = combined['corpus'].tolist()
corpus[:5]

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_similarity(query):
    """Rank every album in ``combined`` by TF-IDF similarity to ``query``.

    A ``TfidfVectorizer`` with default settings is fitted on
    ``combined['corpus']``; the query string is transformed with the same
    vocabulary and compared to every document through a dot product of the
    TF-IDF vectors.

    Parameters
    ----------
    query : str
        Whitespace-separated tokens describing what to look for.

    Returns
    -------
    pandas.DataFrame
        One row per album, indexed by ``link_album`` (labels taken unchanged
        from ``combined.index``), with a single float column ``tfidf_score``,
        ordered from highest to lowest score.
    """
    tfidf_vectorizer = TfidfVectorizer()
    corpus_tfidf = tfidf_vectorizer.fit_transform(combined['corpus'].values)
    query_tfidf = tfidf_vectorizer.transform([query])

    # (1 x n_albums) sparse product, flattened to a 1-D array of scores.
    scores = query_tfidf.dot(corpus_tfidf.T).toarray()[0]

    # argsort is ascending; reverse it to list the best matches first.
    order = np.argsort(scores)[::-1]

    result = pd.DataFrame({
        'link_album': combined.index[order],
        'tfidf_score': scores[order],
    }).set_index('link_album')

    # The frame is returned as built. The previous version ended with
    # `.str.replace` applied to the float `tfidf_score` column, which raises
    # AttributeError because `.str` only works on string data.
    return result

In [39]:
# Seed albums: recommendations should resemble these.
links = [
    'https://www.albumoftheyear.org/album/541510-taylor-swift-midnights.php',
    'https://www.albumoftheyear.org/album/934464-taylor-swift-the-tortured-poets-department-the-anthology.php',
    'https://www.albumoftheyear.org/album/313572-taylor-swift-evermore.php',
    'https://www.albumoftheyear.org/album/264058-taylor-swift-folklore.php'
    ]

# Locate the seeds by row position through the unmodified links in
# `df_albums`. `combined` holds one row per album in the same order, so the
# lookup does not depend on how the index labels of `combined` were normalised.
seed_mask = df_albums['link_album'].isin(links).to_numpy()
if len(seed_mask) != len(combined):
    raise ValueError("`combined` and `df_albums` must contain the same rows in the same order.")
if not seed_mask.any():
    raise ValueError("None of the seed links were found in df_albums['link_album'].")

seed_labels = combined.index[seed_mask]
column_values = combined.loc[seed_mask, 'corpus'].astype(str)

# Build the query from the distinct tokens of all seed documents, keeping
# first-seen order so the query string is the same on every run.
words = ' '.join(column_values).split()
unique_words = list(dict.fromkeys(words))
query = ' '.join(unique_words)

result = tfidf_similarity(query)

# Remove the seed albums themselves by label rather than assuming they occupy
# the first len(links) rows of the ranking.
result = result.drop(index=seed_labels, errors='ignore')
result.head(10)