In [1]:
import ast
import os
import pandas as pd
import re

from dotenv import load_dotenv
from preprocessing import get_path, prep
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import chain

In [2]:
# Column names and the dataset file name are read from the local env file,
# so the notebook itself does not hard-code the dataset schema.
load_dotenv("variable.env")
artists_env = os.getenv("art_col")
genres_env = os.getenv("gen_col")
tags_env = os.getenv("tag_col")

df = pd.read_csv(filepath_or_buffer = get_path(f"{os.getenv('dataset')}.csv"))

# Break multi-artist credits ("a & b,c") into individual names.
# Each part is stripped because splitting on "," / "&" leaves the surrounding
# spaces behind ("a " / " b"), which would otherwise yield near-duplicate
# entries; empty fragments and missing artist values are skipped.
artists = {
    part.strip()
    for artist in df[artists_env].dropna().unique()
    for part in re.split(r"[,&]", artist)
    if part.strip()
}

# Tags and genres are stored as stringified Python literals; rows without
# tags are treated as an empty list before parsing.
# NOTE(review): the genre column is parsed without a fillna, so it is assumed
# to have no missing values — confirm against the dataset.
df[tags_env] = df[tags_env].fillna("[]").apply(ast.literal_eval)
df[genres_env] = df[genres_env].apply(ast.literal_eval)

# Flatten every row's genres into one vocabulary, split hyphenated names into
# their parts (keeping "r-n-b" whole), then discard the fragments
# "age", "alt", "n" and "new".
genres = set(chain.from_iterable(df[genres_env]))
genres = {
    part
    for genre in genres
    for part in ([genre] if genre == "r-n-b" else genre.split("-"))
} - {"age", "alt", "n", "new"}

In [3]:
# Extra (pattern, replacement) pair: the pattern matches a word followed by
# one or more space-separated repeats of itself, and the replacement keeps a
# single occurrence.
# NOTE(review): presumably applied by tag_prep as a regex substitution — confirm in preprocessing.prep.
add = [(r"\b(\w+)(?: \1\b)+", r"\1")]

# NOTE(review): the meaning of these two cut-offs is defined inside tag_prep — confirm there.
thresholds = [95, 80]

tag_tfidf = prep(df, {})
tag_tfidf.tag_prep(
    tags_env,
    artists,
    genres,
    "en_core_web_sm",
    "english",
    add,
    thresholds,
)

# Preview the processed tag lists for the first five rows.
tag_tfidf.tags[:5]

[['raggae', '70', '70', 'root2', 'raggae', 'jamaican'],
 ['fun', '80', 'pop', 'prince', 'rok'],
 ['newb', 'wave', '80', 'rok', 'dinner', 'songwriter', 'pitchfork', '500'],
 ['saul', 'fun', '70', 'rnb', 'yellow'],
 ['dinner', 'songwriter', 'rok', 'newb', 'wave', 'sad', 'british']]

In [4]:
# One whitespace-joined document per processed tag list.
# NOTE(review): assumes tag_tfidf.tags has one entry per row of df, in the
# same order, since the result is later joined to df by index — confirm against prep.tag_prep.
tag_documents = [" ".join(sublist) for sublist in tag_tfidf.tags]

# Keep only terms that appear in at least 10 documents and in at most 10% of them.
vectorizer = TfidfVectorizer(min_df = 10,
                             max_df = 0.1)
X = vectorizer.fit_transform(tag_documents)

# Dense TF-IDF table with one "tag_<term>" column per retained term.
tfidf_columns = [f"tag_{tag}" for tag in vectorizer.get_feature_names_out()]
tfidf_frame = pd.DataFrame(data = X.toarray(),
                           columns = tfidf_columns)

# Attach the title and artist columns by index so each row stays identifiable.
df_tags_tfidf = pd.merge(left = df[[os.getenv("tit_col"), artists_env]],
                         right = tfidf_frame,
                         left_index = True,
                         right_index = True)
df_tags_tfidf.head(5)

Unnamed: 0,name,artists,tag_00,tag_10,tag_1001819731063,tag_1001854528204,tag_123,tag_1980s,tag_20,tag_200,...,tag_yellow,tag_yes,tag_yolala,tag_york,tag_young,tag_youth,tag_yr,tag_zealand,tag_zone,tag_zucchero
0,i shot the sheriff,the wailers,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,computer blue,prince,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,beyond belief,elvis costello & the attractions,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,sweet thing,"rufus & chaka khan,chaka khan",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.577194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,i want you,elvis costello & the attractions,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Persist the TF-IDF table as tags_tfidf.csv under the location returned by
# get_path(''); the index is not written.
# NOTE(review): get_path('') is presumably the project data directory — confirm in preprocessing.
output_path = f"{get_path('')}/tags_tfidf.csv"
df_tags_tfidf.to_csv(path_or_buf = output_path,
                     index = False)