In [18]:
# Import the required packages

from collections import Counter

import pandas as pd
import numpy as np

In [3]:
# Read the review table, keep only rows that have a review and whose album
# was released in 2023, then renumber the remaining rows from zero

df = (
    pd.read_csv('reviews.csv')
    .dropna(subset=['review'])
    .loc[lambda frame: frame['year_released'] == 2023]
    .reset_index(drop=True)
)

In [36]:
# Tokenise every review: lowercase the text, then split it on runs of
# non-word characters. The resulting lists are stored in a new 'words' column
# (splitting can leave empty strings at the edges of a review).

df['words'] = (
    df['review']
    .str.lower()
    .str.split(r'\W+')
)

# Remove short words (by default, anything shorter than three characters)
def remove_short_words(words, min_length=3):
    """Return the words that are at least ``min_length`` characters long.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single review.
    min_length : int, default 3
        Minimum number of characters a word needs in order to be kept. The
        default drops empty strings and one- and two-letter words.

    Returns
    -------
    list of str
        The kept words, in their original order.
    """
    return [word for word in words if len(word) >= min_length]

# Run each review's token list through remove_short_words
df['words'] = df['words'].map(remove_short_words)

In [37]:
# Create the vocabulary: every unique word that appears in any review

vocab = set()

# NOTE: `words` and `word` remain defined at notebook scope after this loop.
for words in df['words']:
    for word in words:
        vocab.add(word)

# Sort so the vocabulary (and every vector indexed by it) has the same order
# on every run; iterating a set of strings can give a different order each
# time the kernel is restarted.
vocab = sorted(vocab)

In [38]:
# Calculate term frequency for each review

def tf(words, vocab):
    """Compute the term-frequency vector of a single review.

    Parameters
    ----------
    words : list of str
        Tokens of one review.
    vocab : list of str
        Terms to count; it becomes the index of the returned Series.

    Returns
    -------
    pandas.Series
        Float Series indexed by ``vocab``. Each value is the number of times
        the term occurs in ``words`` divided by ``len(words)``. Tokens that
        are not in ``vocab`` are not counted but still contribute to
        ``len(words)``. An empty ``words`` list yields all zeros.
    """
    # Count every token in a single pass; looking terms up in the Counter
    # avoids scanning the vocab list once per token.
    counts = Counter(words)
    vec = pd.Series([counts[term] for term in vocab], index=vocab, dtype=float)

    # Empty review: return the zero vector rather than dividing by zero.
    if len(words) == 0:
        return vec
    return vec / len(words)

# Call tf(words, vocab) for every review and collect the results in tf_df
tf_df = df['words'].apply(tf, args=(vocab,))

In [40]:
# Calculate inverse document frequency and multiply to get TF-IDF

# Document frequency: the number of reviews that contain each term. Wrapping
# the tokens in set() makes a term count once per review, however often it is
# repeated inside that review.
doc_freq = Counter()
for review_words in df['words']:
    doc_freq.update(set(review_words))

doc_counts = pd.Series([doc_freq[term] for term in vocab], index=vocab, dtype=float)

# Smoothed IDF: log((N + 1) / (doc_count + 1)) + 1, where N is the number of
# reviews. The added ones avoid division by zero and keep every weight >= 1.
idf = np.log((len(df) + 1) / (doc_counts + 1)) + 1

# Scale each term-frequency column by that term's IDF (idf aligns on columns).
tf_idf = tf_df * idf

In [43]:
# Create cosine similarity function and calculate similarity matrix

def cosine(a, b):
    """Return the cosine similarity of two equal-length numeric vectors.

    Parameters
    ----------
    a, b : array-like
        One-dimensional numeric vectors of the same length.

    Returns
    -------
    float
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero norm
        the ratio is undefined, and 0.0 is returned instead of dividing by
        zero.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero-norm vector has no direction; report "no similarity" rather than
    # letting 0 / 0 produce NaN and a RuntimeWarning.
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

# Cosine similarity of every pair of reviews, computed as one matrix product:
# scale each TF-IDF row to unit length, after which the dot product of two
# rows is their cosine similarity.
tfidf_values = tf_idf.to_numpy(dtype=float)
row_norms = np.linalg.norm(tfidf_values, axis=1)

# Divide all-zero rows by 1 rather than 0 so their similarities come out as 0.
safe_norms = np.where(row_norms == 0, 1.0, row_norms)
unit_rows = tfidf_values / safe_norms[:, np.newaxis]

# Rows and columns are both labelled with the review index of df.
similarity_matrix = pd.DataFrame(
    unit_rows @ unit_rows.T,
    index=df.index,
    columns=df.index,
)

In [60]:
# Write the similarity matrix and the filtered reviews to CSV files
# (the row index is left out of both files)

outputs = (
    ('similarity.csv', similarity_matrix),
    ('filtered_reviews.csv', df),
)
for path, frame in outputs:
    frame.to_csv(path, index=False)