## hetrec2011-lastfm-2k EDA

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
from ydata_profiling import ProfileReport
from pathlib import Path

### user_artists.dat
This file contains the artists listened by each user.
        
It also provides a listening count for each [user, artist] pair

userID \t artistID \t weight
2	51	13883

In [2]:
user_artists = pd.read_csv(Path() / "../../data/raw/hetrec2011-lastfm-2k/user_artists.dat", sep="\t")

In [7]:
ids = set(user_artists.userID)

In [8]:
user_artists.userID

1892

In [None]:
user_artists_report = ProfileReport(user_artists, title="User-artists dataset report")
user_artists_report.to_file("../reports/user_artists_profile_report.html")

### user_friends.dat
These files contain the friend relations between users in the database.

userID \t friendID
2	275

In [None]:
user_friends = pd.read_csv(Path() / "../../data/raw/hetrec2011-lastfm-2k/user_friends.dat", sep="\t")

In [None]:
user_friends_report = ProfileReport(user_friends, title="User-friends dataset report")
user_friends_report.to_file("../reports/user_friends_profile_report.html")

### user_taggedartists-timestamps.dat
These files contain the tag assignments of artists provided by each particular user.
        
They also contain the timestamps when the tag assignments were done.

user_taggedartists-timestamps.dat

userID \t artistID \t tagID \t timestamp
2	52	13	1238536800000

In [None]:
user_taggedartists = pd.read_csv(Path() / "../../data/raw/hetrec2011-lastfm-2k/user_taggedartists-timestamps.dat", sep="\t")

In [None]:
user_taggedartists_report = ProfileReport(user_taggedartists, title="User tagged-artists dataset report")
user_taggedartists_report.to_file("../reports/user_taggedartists_profile_report.html")

### tags.dat
This file contains the set of tags available in the dataset.

tagID \t tagValue
1	metal

In [None]:
tags = pd.read_csv(Path() / "../../data/raw/hetrec2011-lastfm-2k/tags.dat", sep="\t", encoding='latin-1')

In [None]:
tags_report = ProfileReport(tags, title="Tags dataset report")
tags_report.to_file("../reports/tags_profile_report.html")

### artists.dat
This file contains information about music artists listened and tagged by the users.

id \t name \t url \t pictureURL

Example:
707	Metallica	http://www.last.fm/music/Metallica	http://userserve-ak.last.fm/serve/252/7560709.jpg

In [None]:
artists = pd.read_csv(Path() / "../../data/raw/hetrec2011-lastfm-2k/artists.dat", sep="\t", encoding='latin-1')

In [None]:
artists_report = ProfileReport(artists, title="Artists dataset report")
artists_report.to_file("../reports/artists_profile_report.html")

### fixing skeweness of user_artists.weight

In [None]:
from sklearn.preprocessing import  QuantileTransformer

In [None]:
qt = QuantileTransformer(n_quantiles=500, output_distribution='normal')

In [None]:
user_artists.weight = pd.Series((qt.fit_transform(np.array(user_artists.weight).reshape(-1, 1))).flatten())

In [None]:
sns.histplot(user_artists.weight, bins = 50, kde = True)

In [None]:
user_artists.loc[user_artists["weight"] >= 0, "weight"] = 1
user_artists.loc[user_artists["weight"] < 0, "weight"] = 0

In [None]:
user_artists

### deleting missing values from artists

In [None]:
artists = artists.dropna()

In [None]:
artists["id"].isin([1001, 993, 187, 997, 193])

In [None]:
artists.head()

In [7]:
user_taggedartists = pd.read_csv("/Users/polina/study/THESIS/recommender-system/data/raw/hetrec2011-lastfm-2k/user_taggedartists-timestamps.dat", sep="\t")

In [8]:
user_taggedartists

Unnamed: 0,userID,artistID,tagID,timestamp
0,2,52,13,1238536800000
1,2,52,15,1238536800000
2,2,52,18,1238536800000
3,2,52,21,1238536800000
4,2,52,41,1238536800000
...,...,...,...,...
186474,2100,16437,4,1277935200000
186475,2100,16437,292,1272664800000
186476,2100,16437,2087,1277935200000
186477,2100,16437,2801,1272664800000


In [32]:
grouped_df = user_taggedartists.groupby('artistID')['tagID'].agg(pd.Series.mode).to_frame()

In [33]:
grouped_df.iloc[0]

tagID    139
Name: 1, dtype: object

In [26]:
grouped_df.head(10)

Unnamed: 0_level_0,tagID
artistID,Unnamed: 1_level_1
1,139
2,575
3,4
4,139
5,575
6,7
7,1
8,139
9,61
10,86


In [21]:
# Get the top 5 tags for each artist
top_tags_df = grouped_df.apply(lambda x: x.sort_values(ascending=False).head(5).index.tolist(), axis=1)

# Rename columns and reset index for the final result
top_tags_df.columns = ['tag1', 'tag2', 'tag3', 'tag4', 'tag5']
top_tags_df.reset_index(inplace=True)

# Merge with the original DataFrame to get timestamps
result_df = pd.merge(top_tags_df, user_taggedartists[['artistID']], on='artistID')


TypeError: Cannot reset_index inplace on a Series to create a DataFrame