In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gzip
import nltk
import spacy
import string
import re
import warnings
import json


from nltk.corpus import stopwords
from nltk.collocations import *
from collections import Counter
from nltk.stem import porter
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC







# Download NLTK data packages (each call logs its own progress lines).
for _resource in ('stopwords', 'wordnet', 'genesis', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

# English stop words as a set, and a Porter stemmer instance.
stop_words = set(stopwords.words('english'))
stemmer = porter.PorterStemmer()

# Silence FutureWarning and DeprecationWarning messages for the whole session.
for _category in (FutureWarning, DeprecationWarning):
    warnings.simplefilter(action='ignore', category=_category)

# spaCy pipeline loaded by name.
nlp = spacy.load("en_core_web_sm")

nltk.download('punkt')
# Kept as the final expression so the cell still displays its return value.
nltk.download('brown')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package genesis to /root/nltk_data...
[nltk_data]   Unzipping corpora/genesis.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

## Importing 'reviews' dataset

In [2]:
import os
import shutil

from keras.utils import get_file

# The access token for raw.githubusercontent.com is a credential and must not be
# stored in the notebook. Provide it through the GITHUB_RAW_TOKEN environment
# variable when the repository requires one; without it the plain URL is used.
_reviews_url = "https://raw.githubusercontent.com/shahrzadko/NLPDigitalMusic/Recommender/reviews_Digital_Music_5.json.gz"
_token = os.environ.get("GITHUB_RAW_TOKEN")

# An explicit fname keeps the query string (and so the token) out of the name
# of the downloaded file.
dir1 = get_file(
    fname="reviews_Digital_Music_5.json.gz",
    origin=f"{_reviews_url}?token={_token}" if _token else _reviews_url,
)

# Move the download into the working directory; the destination path returned
# by shutil.move is the value displayed by this cell.
shutil.move(dir1, "./reviews_Digital_Music_5.json.gz")

Downloading data from https://raw.githubusercontent.com/shahrzadko/NLPDigitalMusic/Recommender/reviews_Digital_Music_5.json.gz?token=GHSAT0AAAAAAB6LOL42SKHPCCMIVYPOC4REY7AKE2Q


'./reviews_Digital_Music_5.json.gz'

In [3]:
# The file holds one JSON record per line, hence lines=True with orient='records'.
full_df = pd.read_json(
    "reviews_Digital_Music_5.json.gz",
    lines=True,
    orient='records',
)


In [4]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64706 entries, 0 to 64705
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   reviewerID      64706 non-null  object
 1   asin            64706 non-null  object
 2   reviewerName    64529 non-null  object
 3   helpful         64706 non-null  object
 4   reviewText      64706 non-null  object
 5   overall         64706 non-null  int64 
 6   summary         64706 non-null  object
 7   unixReviewTime  64706 non-null  int64 
 8   reviewTime      64706 non-null  object
dtypes: int64(2), object(7)
memory usage: 4.4+ MB


In [5]:
full_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A3EBHHCZO6V2A4,5555991584,"Amaranth ""music fan""","[3, 3]","It's hard to believe ""Memory of Trees"" came ou...",5,Enya's last great album,1158019200,"09 12, 2006"
1,AZPWAXJG9OJXV,5555991584,bethtexas,"[0, 0]","A clasically-styled and introverted album, Mem...",5,Enya at her most elegant,991526400,"06 3, 2001"
2,A38IRL0X2T4DPF,5555991584,bob turnley,"[2, 2]",I never thought Enya would reach the sublime h...,5,The best so far,1058140800,"07 14, 2003"
3,A22IK3I6U76GX0,5555991584,Calle,"[1, 1]",This is the third review of an irish album I w...,5,Ireland produces good music.,957312000,"05 3, 2000"
4,A1AISPOIIHTHXX,5555991584,"Cloud ""...""","[1, 1]","Enya, despite being a successful recording art...",4,4.5; music to dream to,1200528000,"01 17, 2008"


In [6]:
# Derive a three-way sentiment label from the star rating:
# above 3 -> 'Positive', below 3 -> 'Negative', anything else -> 'Neutral'.
rating = full_df['overall']
full_df['Sentiment'] = np.select(
    [rating > 3, rating < 3],
    ['Positive', 'Negative'],
    default='Neutral',
)


In [7]:
#showing that the data is not balanced
full_df['Sentiment'].value_counts() 

Positive    52116
Neutral      6789
Negative     5801
Name: Sentiment, dtype: int64

In [8]:
# Build 'NewReview' as "<reviewText> <summary>" (single space in between) and
# store the number of whitespace-separated words of the combined text.
cols = ['reviewText','summary']
review_part, summary_part = (full_df[name].astype(str) for name in cols)
full_df['NewReview'] = (review_part + ' ' + summary_part).astype(str)
full_df['review_length'] = full_df['NewReview'].map(lambda text: len(text.split()))

In [9]:
# Columns not carried into the working frame (the two text columns were already
# combined into 'NewReview').
unused_columns = ['reviewerName', 'reviewTime', 'unixReviewTime', 'reviewText', 'summary']
music_df = full_df.drop(columns=unused_columns)

In [10]:
#music_df = music_df.sample(n=6000, random_state=42)

## clean text

In [11]:
#!pip install demoji
#!pip install langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: demoji
Successfully installed demoji-1.1.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993243 sha256=07e4c642ad336712da08e1f40dae2d5dc7751f4c590ed5049bfbc6c18ef6dcd8
  Stored 

In [12]:
###

#drop non-English comments
from langdetect import DetectorFactory, detect

# langdetect's detection is randomised unless a seed is fixed; pinning it makes
# the English / non-English split (and every row count after it) reproducible.
DetectorFactory.seed = 0

def is_english(text):
    """Return True when `detect` classifies `text` as English ('en').

    Any ordinary exception raised while detecting is treated as "not English"
    and yields False, so the row is filtered out instead of aborting the run.
    KeyboardInterrupt and SystemExit are deliberately not swallowed, so a long
    DataFrame-wide apply can still be interrupted.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        return False

# Keep only the rows whose combined review text is detected as English.
english_mask = music_df['NewReview'].map(is_english)
music_df = music_df.loc[english_mask]



In [None]:

#music_df.to_json("music_df_en_only.json.gz", orient='records',lines=True)

In [13]:
import demoji
from nltk.stem import *

# Normalisation helpers used by clean_text().
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# NLTK English stop words.
# NOTE(review): this rebinds the STOPWORDS name imported from wordcloud in the
# first cell; from here on STOPWORDS is the NLTK set.
STOPWORDS = set(stopwords.words('english'))

# Define function for text cleaning

# Words that negate the token following them.
_NEGATION_WORDS = frozenset(['not', 'no', 'never', 'neither', 'nor'])

# A 10-digit phone number anywhere in the text: optional parentheses around the
# first three digits, optional space/hyphen separators, not glued to other digits.
_PHONE_RE = re.compile(r'(?<!\d)\(?\d{3}\)?[\s-]?\d{3}[\s-]?\d{4}(?!\d)')


def _is_punctuation(token):
    """Return True when every character of `token` is in string.punctuation."""
    return bool(token) and all(ch in string.punctuation for ch in token)


def _mark_negations(tokens):
    """Merge each negation word into the token after it as ``NOT_<token>``.

    The negation word and the token it negates are replaced by one combined
    token. A negation word at the end of the list, or directly followed by a
    punctuation token, is left unchanged.
    """
    marked = []
    pos = 0
    while pos < len(tokens):
        token = tokens[pos]
        nxt = tokens[pos + 1] if pos + 1 < len(tokens) else None
        if token in _NEGATION_WORDS and nxt is not None and not _is_punctuation(nxt):
            marked.append(f'NOT_{nxt}')
            pos += 2  # the negated token has been consumed
        else:
            marked.append(token)
            pos += 1
    return marked


def clean_text(text):
    """Normalise one review into a space-joined string of processed tokens.

    Steps, in order: lowercase; strip URLs, the currency symbols £ and $,
    phone numbers and e-mail addresses; remove emojis; tokenize; drop stop
    words (negation words are kept); merge every negation word with the token
    that follows it into ``NOT_<token>``; lemmatize; stem.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove currency symbols
    text = re.sub(r'£|\$', '', text)

    # Remove phone numbers wherever they occur in the text
    text = _PHONE_RE.sub('', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove emojis
    text = demoji.replace(text, '')

    # Tokenize text into words (already lowercase at this point)
    words = nltk.word_tokenize(text)

    # Remove stopwords, but keep negation words: some of them may themselves be
    # in STOPWORDS and would otherwise be gone before negation is handled.
    words = [word for word in words
             if word not in STOPWORDS or word in _NEGATION_WORDS]

    # Handle negation
    words = _mark_negations(words)

    # Lemmatize words
    words = [lemmatizer.lemmatize(word) for word in words]

    # Stem words
    words = [stemmer.stem(word) for word in words]

    # Join words back into text
    return ' '.join(words)


In [14]:
###

from multiprocessing import cpu_count, Pool

tasks = music_df['NewReview'].tolist()

# One worker per CPU. map() blocks until every review is cleaned and returns
# the results in input order; the context manager then shuts the workers down
# instead of leaving them alive for the rest of the session.
with Pool(cpu_count()) as pool:
    done = pool.map(clean_text, tasks)  # parallel processing

music_df['processed_review'] = done

In [15]:
# The raw combined text is no longer needed once 'processed_review' exists.
music_df = music_df.drop('NewReview', axis='columns')

In [None]:
# code for saving
#music_df.to_json("music_df_en_only_cleaned.json.gz", orient='records',lines=True)

### skip route A: load cleaned df

In [30]:
import os
import shutil

from keras.utils import get_file

# The access token for raw.githubusercontent.com is a credential and must not be
# stored in the notebook. Provide it through the GITHUB_RAW_TOKEN environment
# variable when the repository requires one; without it the plain URL is used.
_cleaned_url = "https://raw.githubusercontent.com/shahrzadko/NLPDigitalMusic/Recommender/music_df_en_only_cleaned.json.gz"
_token = os.environ.get("GITHUB_RAW_TOKEN")

# An explicit fname keeps the query string (and so the token) out of the name
# of the downloaded file.
dir1 = get_file(
    fname="music_df_en_only_cleaned.json.gz",
    origin=f"{_cleaned_url}?token={_token}" if _token else _cleaned_url,
)

# Move the download into the working directory; the destination path returned
# by shutil.move is the value displayed by this cell.
shutil.move(dir1, "./music_df_en_only_cleaned.json.gz")

Downloading data from https://raw.githubusercontent.com/shahrzadko/NLPDigitalMusic/Recommender/music_df_en_only_cleaned.json.gz?token=GHSAT0AAAAAAB6LOL42SKHPCCMIVYPOC4REY7AKE2Q


'./music_df_en_only_cleaned.json.gz'

In [31]:
# The cleaned reviews are stored one JSON record per line.
music_df = pd.read_json(
    "music_df_en_only_cleaned.json.gz",
    lines=True,
    orient='records',
)


In [32]:
music_df.head()

Unnamed: 0,reviewerID,asin,helpful,overall,Sentiment,review_length,processed_review
0,A3EBHHCZO6V2A4,5555991584,"[3, 3]",5,Positive,157,'s hard believ `` memori tree '' came 11 year ...
1,AZPWAXJG9OJXV,5555991584,"[0, 0]",5,Positive,105,"clasically-styl introvert album , memori tree ..."
2,A38IRL0X2T4DPF,5555991584,"[2, 2]",5,Positive,88,not_thought thought enya would reach sublim he...
3,A22IK3I6U76GX0,5555991584,"[1, 1]",5,Positive,95,third review irish album write today ( other c...
4,A1AISPOIIHTHXX,5555991584,"[1, 1]",4,Positive,719,"enya , despit success record artist , n't broa..."


### Load metadata (for 'items')

In [17]:
# Imported here as well so this cell does not depend on an earlier download
# cell having been executed.
import os
import shutil

from keras.utils import get_file

# The access token for raw.githubusercontent.com is a credential and must not be
# stored in the notebook. Provide it through the GITHUB_RAW_TOKEN environment
# variable when the repository requires one; without it the plain URL is used.
_meta_url = "https://raw.githubusercontent.com/shahrzadko/NLPDigitalMusic/Recommender/meta_Digital_Music.json.gz"
_token = os.environ.get("GITHUB_RAW_TOKEN")

# An explicit fname keeps the query string (and so the token) out of the name
# of the downloaded file.
dir1 = get_file(
    fname="meta_Digital_Music.json.gz",
    origin=f"{_meta_url}?token={_token}" if _token else _meta_url,
)

# Move the download into the working directory; the destination path returned
# by shutil.move is the value displayed by this cell.
shutil.move(dir1, "./meta_Digital_Music.json.gz")

Downloading data from https://raw.githubusercontent.com/shahrzadko/NLPDigitalMusic/Recommender/meta_Digital_Music.json.gz?token=GHSAT0AAAAAAB6LOL42SKHPCCMIVYPOC4REY7AKE2Q


'./meta_Digital_Music.json.gz'

In [18]:
import ast
import html

path = 'meta_Digital_Music.json.gz'
i = 0          # number of lines read; stays 0 for an empty file
df_dict = {}   # asin -> metadata record (first occurrence of each asin)

# The context manager closes the gzip handle even if a record fails to parse.
# The file is read in binary mode and decoded per line so that lines are split
# on b'\n' only.
with gzip.open(path, 'rb') as jgz:
    for i, line in enumerate(jgz, start=1):
        # Each line is evaluated as a Python literal, not parsed as JSON.
        d = ast.literal_eval(line.decode('utf-8'))

        if not d.get('title'): continue  # remove records with a missing or empty title
        if d['asin'] in df_dict: continue  # remove duplicate items

        # Turn HTML entities in the title back into plain characters.
        d['title'] = html.unescape(d['title'])
        df_dict[d['asin']] = d

mdata_df = pd.DataFrame.from_dict(df_dict, orient='index')
print(i)

279899


In [19]:
mdata_df.shape

(7321, 9)

In [20]:
mdata_df.head()

Unnamed: 0,asin,title,price,imUrl,related,salesRank,categories,description,brand
5555991584,5555991584,Memory of Trees,9.49,http://ecx.images-amazon.com/images/I/51b5WDjd...,"{'also_bought': ['B000002LRT', 'B000002LRR', '...",{'Music': 939190},"[[CDs & Vinyl, New Age, Celtic New Age], [CDs ...",,
6308051551,6308051551,Don't Drink His Blood,8.91,http://ecx.images-amazon.com/images/I/31LT2n7Q...,,,"[[Digital Music, Alternative Rock, Indie & Lo-...",NEW Combo BLUWAVS CD and FLAC FILE,
7901622466,7901622466,On Fire,11.33,http://ecx.images-amazon.com/images/I/21W29WZw...,"{'also_bought': ['B00000282A', 'B0084O8O9S', '...",{'Music': 58799},"[[CDs & Vinyl, Christian, Rock & Alternative],...",,
B0000000ZW,B0000000ZW,Changing Faces,23.64,http://ecx.images-amazon.com/images/I/51H3Bp-3...,"{'also_bought': ['B00000010Z', 'B0000039Q5', '...",{'Music': 68784},"[[CDs & Vinyl, Dance & Electronic], [CDs & Vin...",,
B00000016W,B00000016W,Pet Sounds,9.49,http://ecx.images-amazon.com/images/I/51pDGkXj...,"{'also_bought': ['B0025KVLTM', 'B00007FOMP', '...",{'Music': 77205},"[[CDs & Vinyl, Classic Rock], [CDs & Vinyl, Po...",,


##  Recommender -
### Inspired by: Hariri N. et al. (2011), "Context-aware recommendation based on review mining"
### With some modifications, e.g. using LDA instead of L-LDA, because our data has no labels.

### Latent Dirichlet Allocation (LDA)

In [47]:
n_sample = 60000 # @param {type:"integer"}

# A fixed random_state makes the drawn subset the same on every run.
# NOTE(review): the recorded outputs of the following cells report 6000 rows,
# so they appear to come from a run with a different n_sample — confirm and re-run.
music_df_s = music_df.sample(random_state=42, n=n_sample)

In [41]:
music_df.shape

(64630, 7)

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF representation of the sampled, already-cleaned reviews.
processed_reviews = music_df_s['processed_review']
tfv = TfidfVectorizer(max_df=1.0, analyzer='word')
tfidf_matrix = tfv.fit_transform(processed_reviews)
tfidf_matrix.shape

(6000, 28634)

In [45]:
# Feature names of the fitted TF-IDF vectorizer; used further down to turn
# topic weight positions back into terms.
vocab = tfv.get_feature_names_out()
#vocab[:90]

In [49]:
n_topics = 20 # @param {type:"integer"}

from sklearn.decomposition import LatentDirichletAllocation

# Fit LDA with n_topics components on the TF-IDF matrix and keep the per-review
# topic weights returned by fit_transform.
# NOTE(review): LDA is usually fitted on raw term counts; TF-IDF input is used
# here — confirm this is intentional.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=20,
    random_state=42,
    n_jobs=-1,
)
topics_of_reviews = lda.fit_transform(tfidf_matrix)
topics_of_reviews.shape

(6000, 20)

In [27]:
topic_words = lda.components_
vocab_terms = np.array(vocab)

# For every topic, print its eight highest-weighted terms, strongest first.
for topic in topic_words:
  top_idx = np.argsort(topic)[:-9:-1]
  print(vocab_terms[top_idx])

['odb' 'cathi' 'denni' 'poppa' 'olivia' 'muscl' 'ant' 'combo']
['jimi' 'hendrix' 'weezer' 'haze' 'feminem' 'cl' 'mitch' 'curti']
['opeth' 'morningris' 'maxwel' 'pra' 'puck' 'rul' 'mjg' 'not_compil']
['cannonbal' 'omg' 'jem' 'nyro' 'autumn' 'porcupin' 'zandt' 'radioact']
['2001' 'chronic' 'waylon' 'jamiroquai' 'sermon' 'albulm' 'massacr' 'hort']
['devo' '52' 'imbruglia' '54' 'chamillionair' '59' '512' 'huff']
['sweat' 'lightweight' 'oscar' 'luda' 'gr8' 'loverboy' 'cher' 'daughtri']
['imagin' 'joni' 'composit' 'thriller' 'mccartney' 'hall' 'costello'
 'elvi']
['taupin' 'captain' 'madman' 'duffi' 'freeway' 'tumblewe' 'chateau'
 'bleek']
['swoon' 'diddi' 'silversun' 'porcupin' 'beatz' 'marvin' 'penni' 'court']
['prison' 'blondi' 'jen' 'folsom' 'bolton' 'carli' 'cher' 'superstit']
['phil' 'outkast' 'collin' 'quik' 'murder' 'marshal' 'luda' 'twista']
['abba' 'chic' '8217' 'letoya' 'duran' 'rihanna' 'bootleg' 'clay']
['quot' 'album' 'song' 'like' 'cd' 'one' 'great' 'good']
['clay' 'gould' 'do

### k Nearest Neighbors (k-NN)

In [50]:
from collections import defaultdict

# Start from a clean slate so that re-running this cell replaces, rather than
# appends to, whatever an earlier run stored on the metadata records.
for _item in df_dict.values():
    _item.pop('rv_topics', None)

# Group the topic vector of every sampled review under its item (asin).
rows_by_asin = defaultdict(list)
found, no_mdata = 0, 0
for asin, rv in zip(music_df_s['asin'], topics_of_reviews):
    if not asin or asin not in df_dict:
        no_mdata += 1  # review of an item without usable metadata
        continue
    rows_by_asin[asin].append(rv)
    found += 1

# Store all vectors of an item as one 2-D array, one row per review, stacked
# once per item instead of once per review.
for asin, rows in rows_by_asin.items():
    df_dict[asin]['rv_topics'] = np.vstack(rows)

found, no_mdata

(5007, 993)

In [51]:
items_w_rv = dict()     # row index -> item record with its averaged topic vector
title_lookup = dict()   # lower-cased title -> row index
asin_lookup = dict()    # asin -> row index
topic_rows = []         # averaged topic vector per item, in row-index order

for d in df_dict.values():
  if 'rv_topics' not in d:
    continue
  # Average over the item's reviews; a single 1-D vector is returned unchanged.
  mean_topics = np.atleast_2d(d['rv_topics']).mean(axis=0)
  idx = len(topic_rows)
  topic_rows.append(mean_topics)
  # Store a copy carrying the averaged vector so the records in df_dict are not
  # modified by this cell.
  items_w_rv[idx] = {**d, 'rv_topics': mean_topics}
  # If several items share a title, the one processed last wins.
  title_lookup[d['title'].lower()] = idx
  asin_lookup[d['asin']] = idx

# Stack once at the end instead of growing the matrix inside the loop.
if topic_rows:
  topic_matrix = np.vstack(topic_rows)
else:
  topic_matrix = np.empty((0, n_topics), 'float64')

topic_matrix.shape


(2179, 20)

In [52]:
import pickle

# Bundle everything the recommender needs at query time into one tuple.
items_to_pickle = items_w_rv, topic_matrix, title_lookup, asin_lookup
#with open("lda_model.pickle", 'wb') as pkf:
#  pickle.dump(items_to_pickle, pkf)

### skip route B - like... almost done

In [53]:
import os
import shutil

from keras.utils import get_file

# The access token for raw.githubusercontent.com is a credential and must not be
# stored in the notebook. Provide it through the GITHUB_RAW_TOKEN environment
# variable when the repository requires one; without it the plain URL is used.
_model_url = "https://raw.githubusercontent.com/shahrzadko/NLPDigitalMusic/Recommender/lda_model.pickle"
_token = os.environ.get("GITHUB_RAW_TOKEN")

# An explicit fname keeps the query string (and so the token) out of the name
# of the downloaded file.
dir1 = get_file(
    fname="lda_model.pickle",
    origin=f"{_model_url}?token={_token}" if _token else _model_url,
)

# Move the download into the working directory; the destination path returned
# by shutil.move is the value displayed by this cell.
shutil.move(dir1, "./lda_model.pickle")

Downloading data from https://raw.githubusercontent.com/shahrzadko/NLPDigitalMusic/Recommender/lda_model.pickle?token=GHSAT0AAAAAAB6LOL42SKHPCCMIVYPOC4REY7AKE2Q


'./lda_model.pickle'

In [54]:
## load model option
import pandas as pd

import pickle

# NOTE(review): pickle.load can execute arbitrary code while deserialising.
# Only load "lda_model.pickle" if it comes from a source you trust.
with open("lda_model.pickle", 'rb') as pkf:
  items_w_rv, topic_matrix, title_lookup, asin_lookup = pickle.load(pkf)

In [55]:
pd.DataFrame.from_dict(items_w_rv, orient='index')

Unnamed: 0,asin,title,price,imUrl,related,salesRank,categories,rv_topics,brand,description
0,5555991584,Memory of Trees,9.49,http://ecx.images-amazon.com/images/I/51b5WDjd...,"{'also_bought': ['B000002LRT', 'B000002LRR', '...",{'Music': 939190},"[[CDs & Vinyl, New Age, Celtic New Age], [CDs ...","[0.005944301246778397, 0.005724376118965009, 0...",,
1,B0000000ZW,Changing Faces,23.64,http://ecx.images-amazon.com/images/I/51H3Bp-3...,"{'also_bought': ['B00000010Z', 'B0000039Q5', '...",{'Music': 68784},"[[CDs & Vinyl, Dance & Electronic], [CDs & Vin...","[0.007947564104963068, 0.007947564145385768, 0...",,
2,B00000016W,Pet Sounds,9.49,http://ecx.images-amazon.com/images/I/51pDGkXj...,"{'also_bought': ['B0025KVLTM', 'B00007FOMP', '...",{'Music': 77205},"[[CDs & Vinyl, Classic Rock], [CDs & Vinyl, Po...","[0.008476647700033773, 0.007341975014550384, 0...",,
3,B00000016T,Cars,14.63,http://ecx.images-amazon.com/images/I/51pmW%2B...,"{'also_bought': ['B000002GWX', 'B000002GXD', '...",{'Music': 458151},"[[CDs & Vinyl, Alternative Rock, New Wave & Po...","[0.00804670266352813, 0.009927344553832798, 0....",,
4,B00000017R,Jazz Samba,8.99,http://ecx.images-amazon.com/images/I/51kAU0Ty...,"{'also_bought': ['B0000046WB', 'B0000047CX', '...",{'Music': 561174},"[[CDs & Vinyl, Jazz, Bebop], [CDs & Vinyl, Jaz...","[0.007089352362394035, 0.006163632120142277, 0...",,
...,...,...,...,...,...,...,...,...,...,...
2665,B000LXHGBC,Back Numbers,11.88,http://ecx.images-amazon.com/images/I/51N%2ByT...,"{'also_bought': ['B00005YN6S', 'B00HVF1344', '...",{'Music': 124106},"[[CDs & Vinyl, Alternative Rock, Indie & Lo-Fi...","[0.006629234050171659, 0.006629234067383804, 0...",,The thoroughly excellent sophomore release by ...
2666,B000M5B6GK,All of a Sudden I Miss Everyone,9.49,http://ecx.images-amazon.com/images/I/618p7PKX...,"{'also_bought': ['B0006Q2TPC', 'B0000DJYMF', '...",{'Music': 640730},"[[CDs & Vinyl, Alternative Rock, Indie & Lo-Fi...","[0.006477516765231249, 0.005191601906141134, 0...",,Sometimes Explosions in the Sky start with a w...
2667,B000MGBTIE,Alpha,13.83,http://ecx.images-amazon.com/images/I/51NLyLG1...,"{'also_bought': ['B000B8QF6O', 'B0013K1ALY', '...",{'Music': 5783},"[[CDs & Vinyl, Alternative Rock, Goth & Indust...","[0.00709600084750042, 0.005779527698819136, 0....",Alpha,"Alpha, the latest outing from this veteran act..."
2668,B000MGUZ9I,Undiscovered,10.88,http://ecx.images-amazon.com/images/I/51SAKEc0...,"{'also_bought': ['B001EOQTV0', 'B005ILWOZA', '...",{'Music': 8872},"[[CDs & Vinyl, Alternative Rock], [CDs & Vinyl...","[0.005569142681427098, 0.005569142662887866, 0...",,"Young James Morrison has ""success story"" writt..."


In [56]:
n_recommendations = 10 # @param {type:"integer"}

from sklearn.neighbors import NearestNeighbors

# Ask for one neighbour more than the number of recommendations wanted, so the
# queried item itself can be left out of the results.
knn = NearestNeighbors(n_jobs=-1, n_neighbors=n_recommendations + 1)
knn.fit(topic_matrix)


In [57]:
def recommand_by_title(title):
  """Print the titles of the items closest to `title` in topic space.

  The title is looked up as given first and, failing that, stripped and
  lower-cased (the keys of `title_lookup` are lower-cased titles). When the
  title is unknown, an apology is printed and nothing else happens.

  The queried item is removed from the neighbour list by its index, not by
  its position, so it is never recommended to itself even when other items
  have an identical topic vector and are ranked ahead of it.

  Parameters
  ----------
  title : str
      Title of the item to get recommendations for.

  Returns
  -------
  None
      The recommendations are printed, one title per line.
  """
  key = title if title in title_lookup else title.strip().lower()
  if key not in title_lookup:
    print("\nSorry, we don’t have recommendations for", title)
    return
  print(f"\nRecommandations for - {title}:\n")

  idx = title_lookup[key]
  neighbours = knn.kneighbors([topic_matrix[idx]],return_distance=False)[0]
  # Drop the queried item, then keep one fewer than the neighbours requested.
  recom_list = [ridx for ridx in neighbours if ridx != idx][:len(neighbours) - 1]
  for ridx in recom_list:
    print(items_w_rv[ridx]['title'])


### Recommendation loop (run at least from skip route B)

In [58]:
# Keep prompting for titles until the user submits an empty line or 'exit'.
while True:
  print("\nInput a title (use 'exit' to exit): ", end='')
  user_input = input().lower()
  if not user_input or user_input == 'exit':
    break
  recommand_by_title(user_input)


Input a title (use 'exit' to exit): Undiscovered

Recommandations for - undiscovered:

Talk Talk Talk
Hearts and Bones
Paradise & Lunch
Nested
Both Sides
Chinese Wall
Living in Clip
Brand New Me
Everglow
Non-Stop Erotic Cabaret

Input a title (use 'exit' to exit): 
