In [1]:
import pandas as pd
import numpy as np

from unidecode import unidecode
import re

# Training dataset

In [2]:
df_train_cat = pd.read_csv("dataset/df_train_cat_rs_42.csv", index_col=0)
df_train_num = pd.read_csv("dataset/df_train_num_rs_42.csv", index_col=0)

df_train_cat = pd.concat([df_train_num.iloc[:, 0], df_train_cat], axis=1)

## Text Cleaning

There are songs appearing multiple times.

In [3]:
df_train_cat["track_name"].value_counts()

track_name
Breathe                               18
Alive                                 14
Lost                                  13
Poison                                13
Dance Monkey                          12
                                      ..
Cold Hearted                           1
...Til the Cops Come Knockin'          1
Espelho Meu - Ao Vivo                  1
Sound Of The Future - Original Mix     1
Merry Go Round                         1
Name: count, Length: 16125, dtype: int64

An example of a song that appears multiple times in different albums and playlist.

In [4]:
df_train_cat.loc[df_train_cat[df_train_cat["track_name"] == "Poker Face"].index]

Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
2892,6,6w1hZMlo2SVuq29liebhb8,Poker Face,Lady Gaga,482HqzMMfrkFXAGujtwFrm,Poker Face,2008-09-26,Electropop - Pop,5TiiHps0hNCyQ6ijVkNZQs,pop,electropop,4,1
2134,55,70vvnTUamBXOc0vRk7BBDu,Poker Face,Lady Gaga,7j7iPq5rokadGr1ZdJRGgE,The Fame Monster (International Deluxe),2009-01-01,BALLARE - ÿ±ŸÇÿµ,1CMvQ4Yr5DlYvYzI0Vc2UE,pop,post-teen pop,4,1
30083,42,4ngLDnC9X0ARJATJTXljdC,Poker Face,Lady Gaga,5drTYB3sPmtgLTnj04FUad,The Fame,2008-01-01,Pop Hits 2000-2019,6mtYuOxzl58vSGnEDtZ9uB,edm,pop edm,4,1
2548,14,0le79px2kIBXQDxUe8ZUGP,Poker Face,Lady Gaga,67j3NJodNRI8USUwKwTZA6,The Fame Monster (International Deluxe),2009-01-01,Electropop Hits 2017-2020,7kyvBmlc1uSqsTL0EuNLrx,pop,electropop,4,1
20708,69,0WfKDYeUAoLA3vdvLKKWMW,Poker Face,Lady Gaga,2FBA8NCSuQNi8jaR2Xjbal,The Fame,2008-01-01,Today's Hits 2000-Present,6a66cg3HcsjYkisYyQcov6,latin,latin hip hop,4,1


Duplicate entries can add noise to the dataset and harm model performance, so we keep only one record by keeping the one with the highest track popularity.

In [5]:
mask = df_train_cat.groupby("track_name")["track_popularity"].idxmax()
df_train_cat = df_train_cat.loc[mask]

### 1. Song Title

We notice that some song titles contain sub-information, which are in '( ... )' or behind '-', this type of information is usually about the featuring artists, remixing artists, remastered songs, sub-titles, and etc.

In [6]:
# title ÁªìÂ∞æÂ∏¶ (...)
# title ÁªìÂ∞æÂ∏¶ (... feat ...) ÊòØ # title ÁªìÂ∞æÂ∏¶ (...) ÁöÑ subset
process_df = df_train_cat[df_train_cat["track_name"].str.contains(r"\([^()]*\)\s*$", na=False)]
process_df.sample(10, random_state=42)

Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
6406,10,43IfdE75ficQgsI8QGBvwb,Mama/Show Love (feat. YBN Cordae),Logic,6GeHCNwwqMMUrpxuGTRYcf,Confessions of a Dangerous Mind,2019-05-10,This Is Logic,37i9dQZF1DWTIuAboZgTMf,rap,hip hop,0,0
11213,85,1wJRveJZLSb1rjhnUHQiv6,Swervin (feat. 6ix9ine),A Boogie Wit da Hoodie,3r5hf3Cj3EMh1C2saQ8jyt,Hoodie SZN,2018-12-21,Trap Americana,7tkgK1tm9hYkWp7EFyOcAr,rap,trap,9,1
8147,65,6HMHgBHdLBQ0QYIaOp2gse,I Don't Get Tired (#IDGT) (feat. August Alsina),Kevin Gates,4if4IsX36odNlcZF2kqgt3,Luca Brasi 2: Gangsta Grillz,2014-12-15,HIP&HOP,5DyJsJZOpMJh34WvUrQzMV,rap,southern hip hop,1,1
2924,3,48nmKZkd67ATPI8ROOJhkK,Lady (feat. Pitbull),Austin Mahone,0PMO0t7hroaEi9A6SX15qZ,For Me+You,2016-12-30,Electropop - Pop,5TiiHps0hNCyQ6ijVkNZQs,pop,electropop,10,0
31967,56,1BmdqDrBU5eohtRwHBls6C,joy. (R3HAB Remix),for KING & COUNTRY,2gGMD56Y8DAvIzpoYfA014,joy. (R3HAB Remix),2019-01-25,CHRISTIAN ELECTRO / DANCE / EDM,0MhTMIo1bgH6zzPh7BdChT,edm,progressive electro house,0,0
25612,46,2kYf8lZWXI4Ajd7ZhVhKhu,YOU'RE THE ONE (feat. Syd),KAYTRANADA,3XmoA9zwTpg4VZJCcOGkNZ,99.9%,2016-05-06,NEO-soul,3q3M4VCymcMoxJ3Tl7mRqN,r&b,neo soul,2,1
202,48,6pu5jQqenr37n0z6eBKOh0,Dreams Of You (feat. Rae Morris),Icarus,26et8vS0ynfWtbIFKKtZi4,Dreams Of You (feat. Rae Morris),2019-10-04,Dance Room,37i9dQZF1DX2ENAPP1Tyed,pop,dance pop,6,0
9325,54,7pf6YT5CmmQxs6Uz7dABNB,Doggy Dogg World (feat. The Dramatics & Tha Do...,Snoop Dogg,7f9KDGqY7X2VLBM5aA66KM,Doggystyle,1993,90's Gangster Rap,1g3APxk2mLVNU2TuHCPli0,rap,gangster rap,7,1
7356,66,1RjU7UhRICmdFPKxCucLgq,Hell on Earth (Front Lines),Mobb Deep,6BWf3fxsgSDhES4Cm4oyy5,Hell On Earth (Explicit),1996-11-19,‚ó§ Hip Hop Dance Music ‚Äì Urban ‚Äì Trap ‚Äì Breakin...,0Hr2h94pKN8QAGVAgD6BsD,rap,southern hip hop,0,1
2353,71,4YMqbFcDIFiCBd02PzUBcM,Thrift Shop (feat. Wanz),Macklemore & Ryan Lewis,6XO9dbsH9zhuQgFGH0hUrb,The Heist,2012-10-10,post teen pop,6rjxP7GQKoqqgoakzxl3PY,pop,post-teen pop,6,0


In [7]:
# title Â∏¶ '-' 
df_train_cat[df_train_cat["track_name"].str.contains(r"-", case=False, na=False)].sample(10, random_state=42)

Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
24712,5,4piae89WoVnwucdJzWrB7M,O.P.P. (Re-Recorded),Naughty By Nature,6smksVgwxaPFGdSsIloCjo,Anthem Inc.,2011-12-13,"Back in the day - R&B, New Jack Swing, Swingbe...",3a9y4eeCJRmG9p4YKfqYIx,r&b,new jack swing,4,0
7583,62,0jSMveIWvnhDIvzqN74Uc7,Tipsy - Club Mix,J-Kwon,4yTlfDLsW1dK2kG1qoNsrv,Hood Hop,2004,The Sound of Southern Hip Hop,18jT9NMRZifv6cMtK2jWD4,rap,southern hip hop,0,1
29014,12,2XjO02pppiExUbCmHyd94m,Universe - Radio Edit,THRML,2oSQAfsEnp4IYMXJomxP4x,Universe (Radio Edit),2019-10-18,Sick Big Room House Drops | EZUMI,4R4c3WdN1Any2Q1NSuec3r,edm,big room,2,0
12320,16,4cCQ0wBIc9YHhvOWvWNj7S,Hotel California - Live at The Los Angeles For...,Eagles,2A7TYsmTXaHAbQx6C3phKM,Hotel California (Live at The Los Angeles Foru...,1976,Rock Classics,37i9dQZF1DWXRqgorJj26U,rock,classic rock,2,1
12584,56,6n5lHdpuBDdA7pYDpiKzEI,Can't Get Enough - 2015 Remaster,Bad Company,4fuDpBLCt9ChDlqw48bj7p,Bad Company (Deluxe),1974,Classic Rock Workout,37i9dQZF1DWYNSm3Z3MxiM,rock,classic rock,0,1
28215,59,1q2yJEPlzwaJmDrXeof7z0,LaLaLaLaLa - Gaullin Remix,Dwin,1whtnfVa5yF1Lozl3UgKHE,LaLaLaLaLa (Gaullin Remix),2017-09-04,üîäBASSBOOSTEDüîä‚ö°ELECTRO HOUSE‚ö°üî•EDM CAR MUSIC2018...,4GSiiL8tcMgvoV7K1IADb8,edm,electro house,4,0
13565,52,36xEjbl8DtevPJgw6i9IuY,Feels like the First Time - 2008 Remaster,Foreigner,3RYi1mBYapOaGWQvwRjRjr,No End in Sight: The Very Best of Foreigner (E...,2008,Classic Rock Radio,4lIywN6kXl9KPm3OQ8u8G7,rock,classic rock,7,1
3550,4,08smDzfxppUCFABj6FU2Xh,Lebensborn - Update,[:SITD:],64Xx93nBRKzWuw2l26DC9p,Icon:Koru,2011-11-18,Gothic / Industrial / Mittelalter / EBM / Futu...,53CmFroG6MWR5reOOXJX6B,pop,electropop,8,1
13851,29,0s1PewZjIU8DJqxktjeZaX,"I Had a Dream, Joe - 2010 Remastered Version",Nick Cave & The Bad Seeds,5WtxUsdAkyi39GlR9yhD1k,Henry's Dream (2010 Remastered Version),1992,Permanent Wave,5Go0Jsxj1UnsU7Om841BEo,rock,permanent wave,7,0
11332,53,7kipZd4tWx6Mu8kBgB2Z2r,God - Remastered 2010,John Lennon,0DFYbYCcHCEJPcN1hODG6K,Plastic Ono Band,1970-12-11,The Black Album,0KNLLunbFh9XgitPMOGmQg,rock,album rock,4,0


We only care about the main title, so we remove all the unnecessary information.

Steps

1. Process the titles with '-'

2. Process the titles with '()'

In [8]:
df_train_cat["track_name"] = df_train_cat["track_name"].fillna("")
df_train_cat["track_album_name"] = df_train_cat["track_album_name"].fillna("")

df_train_cat["track_name"] = df_train_cat["track_name"].str.lower() # lowercase song title
df_train_cat["track_name"] = df_train_cat["track_name"].map(lambda x: unidecode(x)) # Â§ÑÁêÜÁ±ªËã±ÊñáletterÔºåËá≥Ëã±Êñá

print("Number of title with none English letter: ", df_train_cat["track_name"].str.contains(r"[^\x00-\x7F]+", regex=True).sum())

Number of title with none English letter:  0


In [9]:
# title Â∏¶ '-' ÁöÑ‰∏ÄÂÖ± 3591
process_df = df_train_cat[df_train_cat["track_name"].str.contains(r"-", case=False, na=False)]

print("\n rows: ", process_df.shape[0])
process_df.head()


 rows:  2645


Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
10430,49,66HnjBqHKWinYBFkOBJkTs,$$$ - remix,Saramalacara,42mpC1T2KCU8gCSXwB8bm0,$$$ (Remix),2019-11-08,Trapperz Argentina,37i9dQZF1DWXrVH01e3PIE,rap,trap,1,0
14503,60,65NTcXUtOb27NHKQ4fAcw0,'39 - 2011 mix,Queen,75eP8LZolyNBpqIRyB5pvB,A Night At The Opera (Deluxe Edition 2011 Rema...,1975-11-21,I didn‚Äôt know perm stood for permanent (wave),3e6gYPyrTbaB8BWgSHCt5j,rock,permanent wave,8,1
16760,51,1VpoazEusm1ki8hoSK49ua,'74 - '75 (feat. susan tyler) - radio edit,Paolo Pellegrino,7IMYSlNJvGAztqUyPalK7C,'74 - '75 (feat. Susan Tyler),2019-02-08,Tropical House üèù 2020 Hits,2SRbIs0eBQwHeTP7kErjwo,latin,tropical,11,0
16104,1,6fZO4Uk2yM4MwrY13n2zBI,(don't fear) the reaper - single version,Blue √ñyster Cult,24atu8BmjlxBrHAS8aWQS1,The Singles Collection,2005,Hard Rock Classics 1967-1991 (Party Edition),1NIX36ZFWEtgXSbSNghoue,rock,hard rock,0,1
11758,76,2PzU4IB8Dr6mxV3lHuaG34,(i can't get no) satisfaction - mono version,The Rolling Stones,2Q5MwpTmtjscaS34mJFXQQ,Out Of Our Heads,1965-07-30,House Of The Rising Sun,1bMYfBHYBCRHY5LGkjlpSy,rock,album rock,2,1


In [10]:
pattern = r"-.*(mix|remaster|version|edit).*?$" # '- ... mix ...', '- ... remaster ...', '- ... version ...' , '- ... edit ...'
df_train_cat[df_train_cat["track_name"].str.contains(pattern, case = False, na = False)]

  df_train_cat[df_train_cat["track_name"].str.contains(pattern, case = False, na = False)]


Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
10430,49,66HnjBqHKWinYBFkOBJkTs,$$$ - remix,Saramalacara,42mpC1T2KCU8gCSXwB8bm0,$$$ (Remix),2019-11-08,Trapperz Argentina,37i9dQZF1DWXrVH01e3PIE,rap,trap,1,0
14503,60,65NTcXUtOb27NHKQ4fAcw0,'39 - 2011 mix,Queen,75eP8LZolyNBpqIRyB5pvB,A Night At The Opera (Deluxe Edition 2011 Rema...,1975-11-21,I didn‚Äôt know perm stood for permanent (wave),3e6gYPyrTbaB8BWgSHCt5j,rock,permanent wave,8,1
16760,51,1VpoazEusm1ki8hoSK49ua,'74 - '75 (feat. susan tyler) - radio edit,Paolo Pellegrino,7IMYSlNJvGAztqUyPalK7C,'74 - '75 (feat. Susan Tyler),2019-02-08,Tropical House üèù 2020 Hits,2SRbIs0eBQwHeTP7kErjwo,latin,tropical,11,0
16104,1,6fZO4Uk2yM4MwrY13n2zBI,(don't fear) the reaper - single version,Blue √ñyster Cult,24atu8BmjlxBrHAS8aWQS1,The Singles Collection,2005,Hard Rock Classics 1967-1991 (Party Edition),1NIX36ZFWEtgXSbSNghoue,rock,hard rock,0,1
11758,76,2PzU4IB8Dr6mxV3lHuaG34,(i can't get no) satisfaction - mono version,The Rolling Stones,2Q5MwpTmtjscaS34mJFXQQ,Out Of Our Heads,1965-07-30,House Of The Rising Sun,1bMYfBHYBCRHY5LGkjlpSy,rock,album rock,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28826,36,6tLnjekLODkxkRV4fWRv9z,modern anxiety - banx & ranx remix,Josef Salvat,5IKaSJt6PKeXf6PdAP9icN,modern anxiety (Banx & Ranx Remix),2019-12-12,Dancefloor Beats,6mMk6QCzEgT3QGaCV1R4S5,edm,big room,2,1
14777,55,0zEKOcdfzNkvW1OFnazAQO,sugar honey ice & tea - edit,Bring Me The Horizon,2442fVHcs5gEuDXgRHespx,sugar honey ice & tea (Edit),2019-07-26,Rock Hard,37i9dQZF1DWWJOmJ7nRx0C,rock,hard rock,0,0
10160,43,7DqrY4tkiK3Nm1DLdYQ5NQ,wowowo - remix,chillwagon,07Qg40VH1l6gUBmsQb4n6e,wowowo - remix,2019-08-26,Trap strefa,37i9dQZF1DWY3UKS6jvnxr,rap,trap,11,0
426,16,1RJPz4eFFP7N6VVKMYUS7O,mizunonakahe - car tax remix,yu-re:meu,03sSeRZnThaQK7SSsHhTSy,Unstable Unstoppable,2020-01-11,Dance Pop: Japan,37i9dQZF1DXahYFr91pFvG,pop,dance pop,2,1


In [11]:
#'- ... mix ...', '- ... remaster ...', '- ... version ...' , '- ... edit ...' Âç† Â∏¶ '-' ÁöÑ proportion
print("Proportion of titles that have sub-title in the form of '- ... mix ...', '- ... remaster ...', '- ... version ...' , or '- ... edit ...': ",
      round(df_train_cat[df_train_cat["track_name"].str.contains(pattern, case = False, na = False)].shape[0] / df_train_cat[df_train_cat["track_name"].str.contains(r"-", case=False, na=False)].shape[0], 3)
      )

Proportion of titles that have sub-title in the form of '- ... mix ...', '- ... remaster ...', '- ... version ...' , or '- ... edit ...':  0.826


  round(df_train_cat[df_train_cat["track_name"].str.contains(pattern, case = False, na = False)].shape[0] / df_train_cat[df_train_cat["track_name"].str.contains(r"-", case=False, na=False)].shape[0], 3)


ÈúÄË¶ÅÁâπÊÆäÂ§ÑÁêÜÁöÑ‰∏ÄÁªÑtitl

In [12]:
process_df.loc[~ process_df["track_name"].str.contains(pattern, case = False, na = False)]

  process_df.loc[~ process_df["track_name"].str.contains(pattern, case = False, na = False)]


Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
6421,81,5tz69p7tJuGPeMGwNTxYuV,1-800-273-8255,Logic,1HiN2YXZcc3EjmVZ4WjfBk,Everybody,2017-05-05,This Is Logic,37i9dQZF1DWTIuAboZgTMf,rap,hip hop,5,0
9567,1,7CMAwm7mEkZkkaBl82DicE,2 of amerikaz most wanted - (explicit),2Pac,3oaGekoJ37K0waoYjDzaRr,The Best of 2Pac - Pt. 1: Thug,2007-12-04,Gangsta Rap/90's Hip-Hop,62spXXfUxBed8nbd5xvH2O,rap,gangster rap,1,1
31192,33,49rr7j8CCRq5ntOSTScupk,3am - tr377,Eli Brown,15IcffEIE3gIHkpimAN8Xj,Toolroom Radio EP377 - Presented By Mark Knight,2017-06-16,Selected House,2JPzPB9jnvJLAYtmCbvZy8,edm,progressive electro house,0,1
3162,19,2upFJKo7EvWdMx5732mhuc,5 8 6 - live,New Order,3mZDjow2ZIxfWewSXum1d8,NOMC15 (Live),2017-12-01,Electropop And Play,7p30DzTAgW6OhspSXHTI88,pop,electropop,5,0
30346,73,01wavOnjFBNgdMWBFYIAP4,50 vezes - ao vivo,Sorriso Maroto,29zZdrNrxlZsolrlspfmDA,"Ao Cubo, Ao Vivo, Em Cores",2019-03-29,Ver√£o 2020 | Pop | Funk | Sertanejo | EDM | To...,5HmZtuuIDMtIy21kylqhx6,edm,pop edm,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26344,26,2QIxieRCegkCSS2nDsBFRG,Jun noha-tohamarinburu-,„Ç™„É°„Ç¨„Éà„É©„Ç§„Éñ,6HM0SPu1eIv721SbsO8l3o,River's Island,1984-03-21,Japanese Funk/Soul/NEO/Jazz/Acid,4zNayWuATXCAA9gaXvnFnq,r&b,neo soul,4,0
11428,30,4twOuEvYNHNUc9dHUqhhUY,Lian niNi rete -crazy about you-,Junko Ohashi,10BIzys0QO4l9UQGay0iRI,THE BEST SONGS of JUNKO OHASHI,1998,City Pop 1985 „Ç∑„ÉÜ„Ç£„Éº„Éù„ÉÉ„Éó,3j2osvmecEao5nmo9jZ5df,rock,album rock,5,0
26323,30,2FavFnagG4fHny2f6GSyhr,Chun hatowaniMu Jue meru - ver.2,Mondo Grosso,3IhDlS0GsKJHNCcJA7xocM,‰ΩïÂ∫¶„Åß„ÇÇÊñ∞„Åó„ÅèÁîü„Åæ„Çå„Çã,2017-06-06,Japanese Funk/Soul/NEO/Jazz/Acid,4zNayWuATXCAA9gaXvnFnq,r&b,neo soul,1,1
26362,24,59ec82BVYaafQKsam7SYId,Zhu *modera-to,Masayoshi Takanaka,19H1VDeB8hMPmpG9LVKtHM,SUPER COLLECTION ~EMI Years~,1985,Japanese Funk/Soul/NEO/Jazz/Acid,4zNayWuATXCAA9gaXvnFnq,r&b,neo soul,1,0


Ëøô‰∫õ index ÊòØÈúÄË¶ÅÁã¨Á´ãÂ§ÑÁêÜÁöÑ

In [13]:
process_special_idx_lis = [24673, 5047, 9202, 13702, 6712, 5259, 7585, 11323, 11792, 9485, 32426, 2238, 26097, 1768, 26383, 6421, 410, 16902, 
                           11428, 13593, 29454, 7864, 12723, 23680, 14491, 4411, 27410, 9458, 8603, 15405, 20613, 11521, 7197, 4822, 8262, 8699, 
                           445, 19854, 15053, 18927, 14250, 5765, 9543, 25796, 10252, 23046, 754, 6158, 12026, 9551, 8991, 27330, 9696, 3235, 
                           19366, 9156, 9552, 9965, 12198, 14435, 6085, 25101, 818, 10976, 20648, 9479, 27092, 5785, 8963, 9533, 12861, 7995, 
                           30791, 9775, 16932, 8185, 6992, 6901, 3414, 11849, 20616, 5590, 442, 17709, 6317, 21121, 5782, 15463, 9160, 12046, 
                           26414, 12595, 24077, 6150, 27025, 8007, 24253]

len(process_special_idx_lis)

#remove_idx = [9202, 32426, 26097, 11521, 23046, 6158, 19366, 9156, 6992, 6317, 9160]
remove_idx = [9202, 32426, 26097, 6158, 19366, 9156, 6992, 6317, 9160,
              5259, 2238, 1768, 9458, 7197, 754, 27330, 14435, 30791, 9775, 16932, 8185, 442, 27025]
process_special_idx_lis = pd.Series(process_special_idx_lis)
process_special_idx_lis = process_special_idx_lis[~process_special_idx_lis.isin(remove_idx)].tolist()

In [14]:
process_df_sub = process_df[process_df["track_name"].str.contains(pattern, case = False, na = False)] # Áõ¥Êé•ÂéªÊéâ ‚Äò-‚Äô, row 3001
process_df_sub_1 = process_df.loc[~ process_df["track_name"].str.contains(pattern, case = False, na = False)] # ÈúÄÁâπÂú∞Â§ÑÁêÜ, row 590

  process_df_sub = process_df[process_df["track_name"].str.contains(pattern, case = False, na = False)] # Áõ¥Êé•ÂéªÊéâ ‚Äò-‚Äô, row 3001
  process_df_sub_1 = process_df.loc[~ process_df["track_name"].str.contains(pattern, case = False, na = False)] # ÈúÄÁâπÂú∞Â§ÑÁêÜ, row 590


Â§ÑÁêÜ 3410 row ÁöÑÈÉ®ÂàÜÔºåÁõ¥Êé•ÂéªÊéâ Âê´'-'ÂêéÈù¢ÁöÑÂÜÖÂÆπ

In [15]:
process_df_sub

Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
10430,49,66HnjBqHKWinYBFkOBJkTs,$$$ - remix,Saramalacara,42mpC1T2KCU8gCSXwB8bm0,$$$ (Remix),2019-11-08,Trapperz Argentina,37i9dQZF1DWXrVH01e3PIE,rap,trap,1,0
14503,60,65NTcXUtOb27NHKQ4fAcw0,'39 - 2011 mix,Queen,75eP8LZolyNBpqIRyB5pvB,A Night At The Opera (Deluxe Edition 2011 Rema...,1975-11-21,I didn‚Äôt know perm stood for permanent (wave),3e6gYPyrTbaB8BWgSHCt5j,rock,permanent wave,8,1
16760,51,1VpoazEusm1ki8hoSK49ua,'74 - '75 (feat. susan tyler) - radio edit,Paolo Pellegrino,7IMYSlNJvGAztqUyPalK7C,'74 - '75 (feat. Susan Tyler),2019-02-08,Tropical House üèù 2020 Hits,2SRbIs0eBQwHeTP7kErjwo,latin,tropical,11,0
16104,1,6fZO4Uk2yM4MwrY13n2zBI,(don't fear) the reaper - single version,Blue √ñyster Cult,24atu8BmjlxBrHAS8aWQS1,The Singles Collection,2005,Hard Rock Classics 1967-1991 (Party Edition),1NIX36ZFWEtgXSbSNghoue,rock,hard rock,0,1
11758,76,2PzU4IB8Dr6mxV3lHuaG34,(i can't get no) satisfaction - mono version,The Rolling Stones,2Q5MwpTmtjscaS34mJFXQQ,Out Of Our Heads,1965-07-30,House Of The Rising Sun,1bMYfBHYBCRHY5LGkjlpSy,rock,album rock,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28826,36,6tLnjekLODkxkRV4fWRv9z,modern anxiety - banx & ranx remix,Josef Salvat,5IKaSJt6PKeXf6PdAP9icN,modern anxiety (Banx & Ranx Remix),2019-12-12,Dancefloor Beats,6mMk6QCzEgT3QGaCV1R4S5,edm,big room,2,1
14777,55,0zEKOcdfzNkvW1OFnazAQO,sugar honey ice & tea - edit,Bring Me The Horizon,2442fVHcs5gEuDXgRHespx,sugar honey ice & tea (Edit),2019-07-26,Rock Hard,37i9dQZF1DWWJOmJ7nRx0C,rock,hard rock,0,0
10160,43,7DqrY4tkiK3Nm1DLdYQ5NQ,wowowo - remix,chillwagon,07Qg40VH1l6gUBmsQb4n6e,wowowo - remix,2019-08-26,Trap strefa,37i9dQZF1DWY3UKS6jvnxr,rap,trap,11,0
426,16,1RJPz4eFFP7N6VVKMYUS7O,mizunonakahe - car tax remix,yu-re:meu,03sSeRZnThaQK7SSsHhTSy,Unstable Unstoppable,2020-01-11,Dance Pop: Japan,37i9dQZF1DXahYFr91pFvG,pop,dance pop,2,1


In [16]:
def strip_leading_paren(title: str) -> str:
    cleaned = re.sub(r"^\s*\([^)]*\)\s*", "", title)
    return cleaned


def clean_title_part_1(column): # pd df column, ÊõøÊç¢ '-' to ''

    column = column.str.replace(r"\s*-\s*.*$", "", regex=True)
    column = column.map(strip_leading_paren) # ÂÖàÂéªÊéâ () Âú®ÂºÄÂ§¥ÁöÑtitle
    column = column.str.replace("'", "")
    column = column.str.replace(r"\s*\(.*$", "", regex=True) # ÂéªÊéâ ( ‰ª•ÂèäÂêéÈù¢ÁöÑÂÜÖÂÆπ
    column = column.str.replace(r"[?.,'&`]", "", regex=True) #  ÂéªÈô§ ? , . ' & ` 
    column = column.str.replace(r"[!:]$", "", regex=True) # ÂéªÊéâÁ¥ßË∑üÁöÑ ! Âíå :, : ÂåπÈÖç‰∏çÂà∞
    column = column.str.replace(r":", "", regex=True)
    column = column.str.replace(r"\s*\[[^\]]*\]", "", regex=True) # ÁßªÈô§ [...]
    column = column.str.replace(r"#\d+", "", regex=True) # ÁßªÈô§ #number

    column = column.str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™

    column = column.str.strip()

    return column

process_df_sub.loc[:, "track_name"] = clean_title_part_1(process_df_sub.loc[:, "track_name"])

ÂàÜ‰∏∫ special Âíå non-special

In [17]:
process_df_sub_1_special = process_df_sub_1.loc[process_special_idx_lis] # row 97
process_df_sub_1_non_special = process_df_sub_1.loc[~ process_df_sub_1.index.isin(process_special_idx_lis)] # row 493

In [18]:
process_df_sub_1_non_special

Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
9567,1,7CMAwm7mEkZkkaBl82DicE,2 of amerikaz most wanted - (explicit),2Pac,3oaGekoJ37K0waoYjDzaRr,The Best of 2Pac - Pt. 1: Thug,2007-12-04,Gangsta Rap/90's Hip-Hop,62spXXfUxBed8nbd5xvH2O,rap,gangster rap,1,1
31192,33,49rr7j8CCRq5ntOSTScupk,3am - tr377,Eli Brown,15IcffEIE3gIHkpimAN8Xj,Toolroom Radio EP377 - Presented By Mark Knight,2017-06-16,Selected House,2JPzPB9jnvJLAYtmCbvZy8,edm,progressive electro house,0,1
3162,19,2upFJKo7EvWdMx5732mhuc,5 8 6 - live,New Order,3mZDjow2ZIxfWewSXum1d8,NOMC15 (Live),2017-12-01,Electropop And Play,7p30DzTAgW6OhspSXHTI88,pop,electropop,5,0
30346,73,01wavOnjFBNgdMWBFYIAP4,50 vezes - ao vivo,Sorriso Maroto,29zZdrNrxlZsolrlspfmDA,"Ao Cubo, Ao Vivo, Em Cores",2019-03-29,Ver√£o 2020 | Pop | Funk | Sertanejo | EDM | To...,5HmZtuuIDMtIy21kylqhx6,edm,pop edm,7,1
379,48,4suDPyEfKYtqDR2OGblbTi,9to4Fen no3Fan Xian deJun woDai tsu (run away)...,TOMORROW X TOGETHER,0q7bDr9lPLduHdECJOYZ8B,MAGIC HOUR,2020-01-14,Dance Pop: Japan,37i9dQZF1DXahYFr91pFvG,pop,dance pop,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9474,8,1D7FQQfKf4o3Xd2QTm1441,musi pusi 2 - fenomenbeats,–°–µ—Ä–µ–∂–∞ –ú–µ—Å—Ç–Ω—ã–π,3oG2yCj5yUHdKOUgKBoZPR,–ü–æ–ø–µ—Ä–µ–º–µ–Ω–Ω–æ,2013-11-11,RUSSIAN Gangster Rap,0Jw0HckkxCfIrOvpN081eV,rap,gangster rap,1,1
1030,37,5VRtz5coFkV9EqVLGEsg6k,garireo*ga-ru,NONA REEVES,1anEfewz5CYK3KQlQYysKc,Êú™Êù•,2019-03-13,Best of 2019 Dance Pop: Japan,37i9dQZF1DXdOtZGKonFlM,pop,dance pop,9,1
26344,26,2QIxieRCegkCSS2nDsBFRG,Jun noha-tohamarinburu-,„Ç™„É°„Ç¨„Éà„É©„Ç§„Éñ,6HM0SPu1eIv721SbsO8l3o,River's Island,1984-03-21,Japanese Funk/Soul/NEO/Jazz/Acid,4zNayWuATXCAA9gaXvnFnq,r&b,neo soul,4,0
26323,30,2FavFnagG4fHny2f6GSyhr,Chun hatowaniMu Jue meru - ver.2,Mondo Grosso,3IhDlS0GsKJHNCcJA7xocM,‰ΩïÂ∫¶„Åß„ÇÇÊñ∞„Åó„ÅèÁîü„Åæ„Çå„Çã,2017-06-06,Japanese Funk/Soul/NEO/Jazz/Acid,4zNayWuATXCAA9gaXvnFnq,r&b,neo soul,1,1


Â§ÑÁêÜ special

In [19]:
def clean_title_part_2(column): # pd df column, ÊõøÊç¢ '-' to ''
    column = column.str.replace("-", " ")
    column = column.str.replace(r"'s\b", "", regex=True) # "'s\b" Áî®ÂçïËØçËæπÁïåÁ°Æ‰øùÂè™ÂëΩ‰∏≠ÁúüÊ≠£ÁöÑÁº©ÂÜôÔºå‰∏ç‰ºöËØØÂà† songs ÈáåÁöÑ s„ÄÇ
    column = column.str.replace(r"[()?,.'/]", "", regex=True) #  ÂéªÈô§ ( ) ? , . 
    column = column.str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™
    column = column.str.strip()

    return column

process_df_sub_1_special.loc[:,"track_name"] = clean_title_part_2(process_df_sub_1_special.loc[:,"track_name"])

Â§ÑÁêÜ non-special

In [20]:
def clean_title_part_3(column): # pd df column, ÊõøÊç¢ '-' to ''

    column = column.str.replace(r"\s*-\s*.*$", "", regex=True)
    column = column.str.replace("'", "")
    column = column.str.replace(r"\s*\(.*$", "", regex=True) # ÂéªÊéâ ( ‰ª•ÂèäÂêéÈù¢ÁöÑÂÜÖÂÆπ
    column = column.str.replace(r"[?.,'&]", "", regex=True) #  ÂéªÈô§ ? , . ' & 
    column = column.str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™
    column = column.str.strip()

    return column

process_df_sub_1_non_special.loc[:,"track_name"] = clean_title_part_3(process_df_sub_1_non_special.loc[:,"track_name"])

ËΩ¨ÁßªÂà∞ process_df

In [21]:
process_df.loc[process_df_sub.index, "track_name"] = process_df_sub["track_name"]
process_df.loc[process_df_sub_1_special.index, "track_name"] = process_df_sub_1_special["track_name"]
process_df.loc[process_df_sub_1_non_special.index, "track_name"] = process_df_sub_1_non_special["track_name"]

df_train_cat.loc[process_df.index, "track_name"] = process_df["track_name"] #ËΩ¨ÁßªÂà∞ df_train_cat

In [22]:
pattern = r"[^\w\s]"  # ÈùûÂ≠óÊØçÊï∞Â≠ó„ÄÅÈùûÁ©∫ÁôΩÁöÑÂ≠óÁ¨¶
punctu_lis = df_train_cat.loc[~ df_train_cat["track_name"].str.contains(r"[()]", regex=True), "track_name"].str.findall(pattern).explode().unique()
punctu_lis = punctu_lis[1:]
punctu_lis

array(['.', '#', '$', nan, "'", '*', '+', '/', '%', ':', '[', ']', '@',
       ',', '&', '!', '?', '`', '~', '|'], dtype=object)

In [23]:
def clean_title_part_4(column): # pd df column, ÊõøÊç¢ '-' to ''

    column = column.map(strip_leading_paren) # ÂÖàÂéªÊéâ () Âú®ÂºÄÂ§¥ÁöÑtitle
    column = column.str.replace(r"\(([A-Za-z])\)", r"\1", regex=True) # d(r)own to drown
    column = column.str.replace(r"\s*\([^)]*\)", "", regex=True) # ÂéªÊéâ (...) ‰ª•ÂèäÂêéÈù¢ÁöÑÂÜÖÂÆπ
    column = column.str.replace(r"[)]+$", "", regex=True) # ÂéªÊéâ () ÊÆã‰Ωô
    column = column.str.replace(r"\s*\[[^\]]*\]", "", regex=True) # ÁßªÈô§ [...]
    column = column.str.replace(r"\s*\[.*$", "", regex=True) # ÁßªÈô§ÊÆãÁïô [...
    column = column.str.replace("'", "")
    column = column.str.replace(r"[?.!,&%+`\":~|<>]", "", regex=True) #  ÂéªÈô§ ? . ! , & % + " : ~ | < >
    column = column.str.replace(r"[/]", " ", regex=True) 
    column = column.str.replace(r"#\d+", "", regex=True) # ÂéªÈô§ # + Êï∞Â≠ó e.g. #1, #2, ...
    column = column.str.replace("#", "", regex=False) 

    column = column.str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™
    column = column.str.strip()

    return column

df_train_cat.loc[:, "track_name"] = clean_title_part_4(df_train_cat.loc[:, "track_name"])

In [24]:
punctu_lis

array(['.', '#', '$', nan, "'", '*', '+', '/', '%', ':', '[', ']', '@',
       ',', '&', '!', '?', '`', '~', '|'], dtype=object)

In [25]:
df_train_cat.loc[~ df_train_cat["track_name"].str.contains(r"[()]", regex=True), "track_name"].str.findall(pattern).explode().unique()

array([nan, '$', '*', '@'], dtype=object)

Â§ÑÁêÜ $

In [26]:
df_train_cat.loc[df_train_cat["track_name"].str.contains("$", regex=False), "track_name"]

10430                                            $$$
12685                                       $20 fine
5223                                         $dreams
23318                                 $ave dat money
10253                                           a$ap
8270                                    a$ap forever
18393                                      a$ian boy
22319                                      dat $tick
25756                                       love$ick
8316                                              m$
27548    midnight hour with boys noize ty dolla $ign
9259                          sie wollen meine loui$
3128                                           wing$
19515                                      cuanto e$
Name: track_name, dtype: object

In [27]:
skip_idx = [10430, 5223, 12685]

mask = ~df_train_cat.index.isin(skip_idx)
df_train_cat.loc[mask, "track_name"] = df_train_cat.loc[mask, "track_name"].str.replace("$", "s", regex=False)

#df_train_cat.loc[21143, "track_name"] = df_train_cat.loc[21143, "track_name"].replace(" ", "")
df_train_cat.loc[skip_idx[1:], "track_name"] = df_train_cat.loc[skip_idx[1:], "track_name"].str.replace("$", "", regex=False)

Â§ÑÁêÜ *

In [28]:
process_df = df_train_cat[df_train_cat["track_name"].str.contains("*", regex=False)]

In [29]:
def expand_stars(text):
    # ÊØèÂåπÈÖçÂà∞‰∏ÄÊÆµÔºårepl ‰ºöË¢´Ë∞ÉÁî®‰∏ÄÊ¨°Ôºåmatch Â∞±ÊòØËøôÊÆµÂåπÈÖçÂØπË±°
    # match.group(0) ÂèñÂá∫Êï¥ÊÆµÊòüÂè∑ÔºàÊØîÂ¶Ç "*", "**", "***"Ôºâ
    def repl(match): 
        stars = match.group(0)
        k = len(stars)
        return {1: "u", 2: "uc", 3: "uck"}.get(k, stars)

    return re.sub(r"\*{1,3}", repl, text)

pattern = r"f\*+(?:ck)?k?"
process_df.loc[process_df["track_name"].str.contains(pattern, regex=True), "track_name"]= process_df.loc[process_df["track_name"].str.contains(pattern, regex=True), "track_name"].map(expand_stars)

process_df.loc[6541, "track_name"] = re.sub(r"[*]", "i", process_df.loc[6541, "track_name"])
process_df.loc[[9206, 27122], "track_name"] = process_df.loc[[9206, 27122], "track_name"].str.replace(r"\*\*\*", "hit", regex=True)

process_df.loc[8790, "track_name"] = re.sub(r"\*\*", "gg", process_df.loc[8790, "track_name"])
process_df.loc[8782, "track_name"] = re.sub(r"\*\*\*\*", "igga", process_df.loc[8782, "track_name"])
process_df.loc[6183, "track_name"] = re.sub(r"[*]", "i", process_df.loc[6183, "track_name"])

process_df.loc[21930, "track_name"] = re.sub(r"[*]", "u", process_df.loc[21930, "track_name"])
process_df.loc[24620, "track_name"] = re.sub(r"(?<=a)\*\*", "ss", process_df.loc[24620, "track_name"])
process_df.loc[24620, "track_name"] = re.sub(r"\*\*", "it", process_df.loc[24620, "track_name"])
process_df.loc[25285, "track_name"] = re.sub(r"[*]", "i", process_df.loc[25285, "track_name"])

In [30]:
process_df.loc[:,"track_name"] = process_df.loc[:,"track_name"].str.replace("*", " ")
process_df.loc[:,"track_name"] = process_df.loc[:,"track_name"].str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™
process_df.loc[:,"track_name"] = process_df.loc[:,"track_name"].str.strip()

df_train_cat.loc[process_df.index, "track_name"] = process_df["track_name"] #ËΩ¨ÁßªÂà∞ df_train_cat

@ ‰øùÁïô, df_train_cat text cleaning done.

### 2. Album name

In [31]:
df_train_cat.loc[:, "track_album_name"] = df_train_cat.loc[:, "track_album_name"].str.lower()
df_train_cat.loc[:, "track_album_name"] = df_train_cat.loc[:, "track_album_name"].map(lambda x: unidecode(x))

print("Number of album name with none English letter: ", df_train_cat["track_album_name"].str.contains(r"[^\x00-\x7F]+", regex=True).sum())

Number of album name with none English letter:  0


In [32]:
pattern = r"[^\w\s]" 
punctu_lis = df_train_cat["track_album_name"].str.findall(pattern).explode().unique()
#punctu_lis = punctu_lis[1:]
punctu_lis

array([nan, '#', '(', ')', '[', '.', ']', '$', "'", '-', ':', '&', '%',
       '|', '@', '*', '+', '?', '!', '/', ',', '"', '`', ';', '~', '>'],
      dtype=object)

In [33]:
def clean_album_name(column): # pd df column, ÊõøÊç¢ '-' to ''

    column = column.map(strip_leading_paren) # ÂÖàÂéªÊéâ () Âú®ÂºÄÂ§¥ÁöÑtitle
    column = column.str.replace(r"\s*\([^)]*\)", "", regex=True) # ÂéªÊéâ (...) ‰ª•ÂèäÂêéÈù¢ÁöÑÂÜÖÂÆπ
    column = column.str.replace(r"\s*\(.*$", "", regex=True) # ÂéªÊéâ (... ÊÆãÁïô
    column = column.str.replace(r"\s*\[[^\]]*\]", "", regex=True) # ÁßªÈô§ [...]
    column = column.str.replace(r"\s*\[.*$", "", regex=True) # ÁßªÈô§ [... ÊÆãÁïô
    column = column.str.replace(r"\s*remix\)\s*$", "", regex=True) # ÂéªÊéâ remix)

    column = column.str.replace(r"(?<=\b[A-Za-z])\.(?=[A-Za-z]\b)", "", regex=True) # Â§ö‰∏ÄÊ≠•Â§ÑÁêÜ . , Â§ÑÁêÜ r.o.s.e ËøôÊ†∑ÁöÑ
    column = column.str.replace(r"[-:>/.]", " ", regex=True)
    column = column.str.replace(r'''[?!,'"%+`~|<>{}=)]''', "", regex=True)
    column = column.str.replace(r"#\d+", "", regex=True) # ÂéªÈô§ # + Êï∞Â≠ó e.g. #1, #2, ...
    column = column.str.replace("#", "", regex=False) 

    column = column.str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™
    column = column.str.strip()

    return column

df_train_cat.loc[:, "track_album_name"] = clean_album_name(df_train_cat.loc[:, "track_album_name"])

In [34]:
df_train_cat.loc[:, "track_album_name"].str.findall(pattern).explode().unique()

array([nan, '$', '&', '@', '*'], dtype=object)

$ *

‰∏ìÈó®Â§ÑÁêÜ &

@ ‰øùÁïô

Â§ÑÁêÜ &ÔºåÂè™‰øùÁïô r&b

In [35]:
mask = df_train_cat["track_album_name"].str.contains(r"(?i)r&b", regex=True)

df_train_cat.loc[~mask, "track_album_name"] = (
df_train_cat.loc[~mask, "track_album_name"]
       .str.replace("&", " ", regex=False)
     .str.replace(r"\s{2,}", " ", regex=True)
        .str.strip()
)

Â§ÑÁêÜ $, Êç¢Êàê s

In [36]:
skip_idx = [10430]

mask = ~df_train_cat.index.isin(skip_idx)
df_train_cat.loc[mask, "track_album_name"] = df_train_cat.loc[mask, "track_album_name"].str.replace("$", "s", regex=False)

Â§ÑÁêÜ *

In [37]:
process_df = df_train_cat[df_train_cat["track_album_name"].str.contains("*", regex=False)]

pattern = r"f\*+(?:ck)?k?"
process_df.loc[process_df["track_album_name"].str.contains(pattern, regex=True), "track_album_name"]= process_df.loc[process_df["track_album_name"].str.contains(pattern, regex=True), "track_album_name"].map(expand_stars)

process_df.loc[[26330, 6496, 1040, 26329], "track_album_name"] = process_df.loc[[26330, 6496, 1040, 26329], "track_album_name"].str.replace(r"[*]", " ", regex=True)

process_df.loc[[9206, 27122], "track_album_name"] = process_df.loc[[9206, 27122], "track_album_name"].str.replace(r"\*\*\*", "hit", regex=True)

process_df.loc[21930, "track_album_name"] = re.sub(r"[*]", "u", process_df.loc[21930, "track_album_name"])
process_df.loc[6183, "track_album_name"] = re.sub(r"[*]", "i", process_df.loc[6183, "track_album_name"])

df_train_cat.loc[process_df.index, "track_album_name"] = process_df.loc[:, "track_album_name"]

### 3. genre and sub-genre

In [38]:
def clean_genre(column): # Á°Æ‰øùÊ≤°ÊúâÂ§ö‰ΩôÁöÑÁ©∫Ê†º

    column = column.str.replace(r"[-]", " ", regex = True) # ÂØπ sub genre

    column = column.str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™
    column = column.str.strip()

    return column

df_train_cat.loc[:, "playlist_genre"] = clean_genre(df_train_cat.loc[:, "playlist_genre"])
df_train_cat.loc[:, "playlist_subgenre"] = clean_genre(df_train_cat.loc[:, "playlist_subgenre"])

### Concatenate All the Texts

In [39]:
text_cols = df_train_cat.columns[[2, 5, 9, 10]]

for i in text_cols:
    print(f"{i} na number: ", df_train_cat[i].isna().sum())
print("")
for i in text_cols:
    print(f"0 length string in {i}: ", 0 in df_train_cat[i].str.len().values)

track_name na number:  0
track_album_name na number:  0
playlist_genre na number:  0
playlist_subgenre na number:  0

0 length string in track_name:  True
0 length string in track_album_name:  True
0 length string in playlist_genre:  False
0 length string in playlist_subgenre:  False


In [40]:
df_train_cat["combined_text"] = (df_train_cat[text_cols].agg(" ".join, axis=1) # str concatenate
                                 .str.replace(r"\s{2,}", " ", regex=True)
                                 .str.strip())

### Output

In [41]:
df_train_cat.to_csv("dataset/df_train_cat_rs_42_processed.csv", index=True, encoding="utf-8")

# Testing dataset

In [42]:
df_test_cat = pd.read_csv("dataset/df_test_cat_rs_42.csv", index_col=0)
df_test_num = pd.read_csv("dataset/df_test_num_rs_42.csv", index_col=0)

df_test_cat = pd.concat([df_test_num.iloc[:, 0], df_test_cat], axis=1)

In [43]:
df_test_cat.head()

Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
30056,45,697MjF1454XKvZmTuqkWmD,I Miss You,Jeriqo,2vJ6FDg6ZMS56U8Wbiw2Oz,I Miss You,2019-02-05,Pop Hits 2000-2019,6mtYuOxzl58vSGnEDtZ9uB,edm,pop edm,0,1
11827,17,3x2bXiU0o4WbsPkawXlfDA,Who Are You,The Who,6LRJF97hgXHj8uMLHyCDbh,Who Are You (Remastered),1978-08-18,House Of The Rising Sun,1bMYfBHYBCRHY5LGkjlpSy,rock,album rock,9,1
23571,30,0jEaPBjFAWjQTMVouRwaHi,Happy,The Beef Seeds,4IQn9XpweytNX2cUe2NBUH,Keepin' it Beefy,2015-01-19,Bluegrass Covers,37i9dQZF1DX56crgoe4TG3,r&b,hip pop,10,1
14741,35,5EKUb1FKsyYVaSXb41YBIj,ONE,Rev Theory,0gGic19XvEiHKKWBV7M4YM,ONE,2020-01-15,Rock Hard,37i9dQZF1DWWJOmJ7nRx0C,rock,hard rock,4,0
25570,62,300DUx4tdtCdGEUXR032jA,Palace/Curse,The Internet,69g3CtOVg98TPOwqmI2K7Q,Ego Death,2015-06-26,NEO-soul,3q3M4VCymcMoxJ3Tl7mRqN,r&b,neo soul,10,0


In [44]:
mask = df_test_cat.groupby("track_name")["track_popularity"].idxmax()
df_test_cat = df_test_cat.loc[mask]

## Text Cleaning

### 1. Song Title

In [45]:
# title ÁªìÂ∞æÂ∏¶ (...)
# title ÁªìÂ∞æÂ∏¶ (... feat ...) ÊòØ # title ÁªìÂ∞æÂ∏¶ (...) ÁöÑ subset
process_df = df_test_cat[df_test_cat["track_name"].str.contains(r"\([^()]*\)\s*$", na=False)]
process_df.sample(10, random_state=42)

Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
23282,85,7ef4DlsgrMEH11cDZd32M6,One Kiss (with Dua Lipa),Calvin Harris,7GEzhoTiqcPYkOprWQu581,One Kiss (with Dua Lipa),2018-04-06,Hip pop,1Sc7bobknESH7SXQcnmoX5,r&b,hip pop,9,0
5953,72,0W9E3s2G4szLUwXsE17x5E,Beach Ballin' (feat. blackbear),Yung Pinch,1P6UQ3wWvVyFH7Y7cuDu7M,Beach Ballin' (feat. blackbear),2020-01-03,Hip-Hop Central,37i9dQZF1DWY6tYEFs22tT,rap,hip hop,1,0
5135,74,46lFttIf5hnUZMGvjK0Wxo,Runaway (U & I),Galantis,4QcXq4vTVN7dFb7bZa9jG2,Pharmacy,2015-06-05,Indie Poptimism,2QiMewRbSavfZ9MSAYz2h6,pop,indie poptimism,1,1
8658,45,48tybOd2kxWaMjnYrEWTvY,Tupac Back (feat. Rick Ross),Meek Mill,1xE3Obhzdpw4aIqPDMf4I4,"MMG Presents: Self Made, Vol. 1",2011-05-23,Gangster Rap Workout,0ZRwrJ2EDGyKR6YgQPWXeO,rap,gangster rap,1,0
20129,85,2IRZnDFmlqMuOrYOLnZZyc,Going Bad (feat. Drake),Meek Mill,6UYZEYjpN1DYRW0kqFy9ZE,Championships,2018-11-30,Global Top 50 | 2020 Hits,1KNl4AYfgZtOVm9KHkhPTF,latin,latin hip hop,4,0
747,10,3a94DkjR4idw3bp8yzai59,Mambo No. 5 (A Little Bit of...),Lou Bega,0F2v1g7RUEmK077TfZGeoS,Ultimate Holiday Hits,2008-07-25,90s Dance Hits,50cXvxgMGrvoUeLhkzCyIO,pop,dance pop,5,0
4981,30,689izEMmyU0vp280Y0JLMl,Lost (Lenno Remix),Scavenger Hunt,0llgzDj4NBSZ4i5dYHJw6d,Scavenger Hunt - EP,2015-01-09,Indie Poptimism!,4hKPJNFIhdAg4pAksn78FJ,pop,indie poptimism,0,1
345,50,0FC7eAz8CW9HAWnBr8Yo42,Daylight (Airbeat One Anthem 2018),Fluex,2Rd9MkQmupPCqDMpHWFapL,Daylight (Airbeat One Anthem 2018),2018-06-22,Pop Warmup 130 BPM,37i9dQZF1DX3PIAZMcbo2T,pop,dance pop,7,0
30041,81,5j1yOqWONR9T6l43AzJ6Es,Tip Toe (feat. A Boogie Wit da Hoodie),Roddy Ricch,0uyDZAeB7oa8CM6G9PjSOf,Tip Toe (feat. A Boogie Wit da Hoodie),2019-11-25,Pop Hits 2000-2019,6mtYuOxzl58vSGnEDtZ9uB,edm,pop edm,1,1
5550,82,2Xo3rLHjXfdb4avN68aQyi,BEST ON EARTH (feat. BIA),Russ,2TBJtc64Y76xWs98Mtko07,BEST ON EARTH (feat. BIA),2019-10-18,RapCaviar,37i9dQZF1DX0XUsuxWHRQd,rap,hip hop,5,0


In [46]:
# title Â∏¶ '-' 
df_test_cat[df_test_cat["track_name"].str.contains(r"-", case=False, na=False)].sample(10, random_state=42)

Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
24699,28,6Gfq4pVJVnrsHVxkmFpo09,Make It Last Forever (with Jacci McGhee) - Ext...,Keith Sweat,6GXk2hxl4q5GoPHarlUet8,The Best of Keith Sweat: Make You Sweat,2004-01-13,"Back in the day - R&B, New Jack Swing, Swingbe...",3a9y4eeCJRmG9p4YKfqYIx,r&b,new jack swing,7,1
29961,65,0P5lXk4fg0682IgDyK7GFM,I'm Blue - Club Mix,Kush Kush,785itgCjNtYOiRpu1SIDoo,I'm Blue (Club Mixes),2019-09-13,EDM 2020 House & Dance,25ButZrVb1Zj1MJioMs09D,edm,pop edm,0,0
27755,63,4Sp0jLZN8MgPnoGDufJS7N,On My Way - Da Tweekaz Remix,Alan Walker,1ODOCRQWwjfJRqJ34RtrKa,On My Way (Da Tweekaz Remix),2019-06-25,Crossfit‚Äè‚Äè‚Äã‚Äã ‚Äç,5GiPRvTccToqwOzkoAcDrY,edm,electro house,1,0
705,81,0Cn8NxJZz7zUlsaA3rXoIU,Easy - Remix,Jhay Cortez,3Y00aLNDKDq9rXR3cIcbDo,Easy (Remix),2019-11-19,Todo √âxitos,2ji5tRQVfnhaX1w9FhmSzk,pop,dance pop,1,0
26824,69,5JCi3pyggvk4B0yd0BK3ow,Stay High - Habits Remix,Tove Lo,5Z5O36p7BivXzkucc0PAfw,Queen Of The Clouds,2014-09-24,Electro House 2020,317O0e8iWJLClLGDKtieRe,edm,electro house,4,0
30332,69,76gCEZkHTzU8lY3aY6mFRM,Sumiu do Mapa - Ao Vivo,Lucas Lucco,5K1DtZURandKZhczQfHAMr,De Bar em Bar (Ao Vivo em Goi√¢nia),2019-10-18,Ver√£o 2020 | Pop | Funk | Sertanejo | EDM | To...,5HmZtuuIDMtIy21kylqhx6,edm,pop edm,7,1
13608,66,4igIYHF3B5VBxEafHauVo3,Fat Bottomed Girls - Remastered 2011,Queen,21HMAUrbbYSj9NiPPlGumy,Jazz (Deluxe Remastered Version),1978-11-10,Classic Rock Playlist.,7CESj1xKbxdArRzloygvkl,rock,classic rock,2,1
9163,24,6BX3nIZqQ9Ji0ZK8QCDpCs,Call 9-1-1,Westside Connection,5CdIIrp4pEVIVEJi5hAk1f,The Best Of Westside Connection,2007-11-30,Oldschool Gangsta Rap üëë,0t9TeyXE2mWe70JT3YwSoH,rap,gangster rap,2,0
32385,39,7yklZhCvqV4mWhaWv89hEO,Teardrops - Rapless Video Edit,Crew Cardinal,7Auv7o5h4h5oOgQUW7hkjK,Teardrops,2013-11-29,Gym (Melbourne Bounce/Progressive House),5BqOZpVEqRDfZScvW1QUyA,edm,progressive electro house,6,1
20575,30,015ljpgRzoOEwCEST5CGbb,Fascination (Re-Recorded / Remastered),Company B,0uM6dDXnSRPGhmxZonDKqT,"12"" Booty Shakin' Hits",2009-04-01,80's Freestyle/Disco Dance Party (Set Crossfad...,1oReEujyWpQv2OX68BVPPA,latin,latin hip hop,10,0


Same as processing training dataset

Ê†∏ÂøÉÔºöÂè™‰øùÁïôÊØèÈ¶ñÊ≠åÁöÑmain title

1. Â§ÑÁêÜ title Â∏¶ '-' ÁöÑ

2. Â§ÑÁêÜ title Â∏¶ '()'ÁöÑ

In [47]:
df_test_cat["track_name"] = df_test_cat["track_name"].fillna("")
df_test_cat["track_album_name"] = df_test_cat["track_album_name"].fillna("")

df_test_cat["track_name"] = df_test_cat["track_name"].str.lower() # lowercase song title
df_test_cat["track_name"] = df_test_cat["track_name"].map(lambda x: unidecode(x)) # Â§ÑÁêÜÁ±ªËã±ÊñáletterÔºåËá≥Ëã±Êñá

print("Number of title with none English letter: ", df_test_cat["track_name"].str.contains(r"[^\x00-\x7F]+", regex=True).sum())

Number of title with none English letter:  0


In [48]:
# title Â∏¶ '-' ÁöÑ‰∏ÄÂÖ± 879
process_df = df_test_cat[df_test_cat["track_name"].str.contains(r"-", case=False, na=False)]

print("\n rows: ", process_df.shape[0])
process_df.head()


 rows:  634


Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
13295,1,6fZO4Uk2yM4MwrY13n2zBI,(don't fear) the reaper - single version,Blue √ñyster Cult,24atu8BmjlxBrHAS8aWQS1,The Singles Collection,2005,Workday: Rock Classics,37i9dQZF1DX1lwxXvReaux,rock,classic rock,0,1
12325,76,2PzU4IB8Dr6mxV3lHuaG34,(i can't get no) satisfaction - mono version,The Rolling Stones,2Q5MwpTmtjscaS34mJFXQQ,Out Of Our Heads,1965-07-30,Rock Classics,37i9dQZF1DWXRqgorJj26U,rock,classic rock,2,1
18115,56,4DCHszO57DZHgvqalfoITR,1973 - acoustic,James Blunt,3obQJSWpREwvi19TTAvM5v,All The Lost Souls,2007-09-17,Unplugged Hits üì£,5NTm3injIRkUMROsZr3C2O,latin,latin pop,9,1
26803,30,3J2gbESlzrz86jonlkE2Qg,2020 - radio edit,Chris Burke,35MzwNrVLXOUVoIsBSIiI2,2020,2019-12-31,Electro House 2020,317O0e8iWJLClLGDKtieRe,edm,electro house,5,0
3931,3,2sHY6sT0EomTon1pVVLwjF,2am - matoma remix,Astrid S,4e9bmpAF2A8gvImxaTUXil,2AM (Remixes),2015-05-12,Chillout & Remixes üíú,4NlAd9NpIa92IjErMyAriM,pop,indie poptimism,8,1


In [49]:
 # '- ... mix ...', '- ... remaster ...', '- ... version ...' , '- ... edit ...'

pattern = r"-.*(mix|remaster|version|edit).*?$"
df_test_cat[df_test_cat["track_name"].str.contains(pattern, case = False, na = False)]

  df_test_cat[df_test_cat["track_name"].str.contains(pattern, case = False, na = False)]


Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
13295,1,6fZO4Uk2yM4MwrY13n2zBI,(don't fear) the reaper - single version,Blue √ñyster Cult,24atu8BmjlxBrHAS8aWQS1,The Singles Collection,2005,Workday: Rock Classics,37i9dQZF1DX1lwxXvReaux,rock,classic rock,0,1
12325,76,2PzU4IB8Dr6mxV3lHuaG34,(i can't get no) satisfaction - mono version,The Rolling Stones,2Q5MwpTmtjscaS34mJFXQQ,Out Of Our Heads,1965-07-30,Rock Classics,37i9dQZF1DWXRqgorJj26U,rock,classic rock,2,1
26803,30,3J2gbESlzrz86jonlkE2Qg,2020 - radio edit,Chris Burke,35MzwNrVLXOUVoIsBSIiI2,2020,2019-12-31,Electro House 2020,317O0e8iWJLClLGDKtieRe,edm,electro house,5,0
3931,3,2sHY6sT0EomTon1pVVLwjF,2am - matoma remix,Astrid S,4e9bmpAF2A8gvImxaTUXil,2AM (Remixes),2015-05-12,Chillout & Remixes üíú,4NlAd9NpIa92IjErMyAriM,pop,indie poptimism,8,1
21327,64,6IoKSUyNOOheJRjiuGb1ew,47 - remix,Anuel AA,0uwIB5LyXa2riRyWUwKiJA,47 (Remix),2017-03-31,Los Cangri,2hTs6GuPwhmg4ApNqxWXLE,latin,latin hip hop,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15978,72,36lkJSDyMEZoWTqtRj8Q8q,you're the inspiration - 2006 remaster,Chicago,1ICKrl6sDjJD1YdR9VDfPR,Chicago 17 (Expanded & Remastered),1984-05-14,Rock Ballads 80s 90s | Best Rock Love Songs 80...,0y8MUle9Lf8VDNjPsskzJB,rock,hard rock,11,1
30049,52,3T7ZHIgO30bKeASQppPtM6,young ones - rudelies remix,Avenza,7ilK2JYZS3cuHLRbu91JWM,Young Ones (RudeLies Remix) [feat. Johnning],2019-10-11,Pop Hits 2000-2019,6mtYuOxzl58vSGnEDtZ9uB,edm,pop edm,4,0
31127,13,1rP7ICGrYCqWYwB7SnblXd,your soul - original mix,Gary Caos,4DbWQ5CLRbo8GwmC2Z1NED,Your Soul,2019-02-15,Selected House,2JPzPB9jnvJLAYtmCbvZy8,edm,progressive electro house,3,0
29423,51,7qfZTBNZQqvg0hMamrAQB1,zombie - bassjackers remix,Ran-D,7DnPC3dlNFBRgoqMPYGswZ,Zombie (Remixes),2019-01-11,Big Room 2019,6SrHyxIxWfQx9ISEr6yowk,edm,big room,4,0


In [50]:
#'- ... mix ...', '- ... remaster ...', '- ... version ...' , '- ... edit ...' Âç† Â∏¶ '-' ÁöÑ proportion
print("Proportion of titles that have sub-title in the form of '- ... mix ...', '- ... remaster ...', '- ... version ...' , or '- ... edit ...': ",
      round(df_test_cat[df_test_cat["track_name"].str.contains(pattern, case = False, na = False)].shape[0] / df_test_cat[df_test_cat["track_name"].str.contains(r"-", case=False, na=False)].shape[0], 3)
      )

Proportion of titles that have sub-title in the form of '- ... mix ...', '- ... remaster ...', '- ... version ...' , or '- ... edit ...':  0.834


  round(df_test_cat[df_test_cat["track_name"].str.contains(pattern, case = False, na = False)].shape[0] / df_test_cat[df_test_cat["track_name"].str.contains(r"-", case=False, na=False)].shape[0], 3)


ÈúÄË¶ÅÁâπÊÆäÂ§ÑÁêÜÁöÑ‰∏ÄÁªÑtitle

In [51]:
process_df.loc[~ process_df["track_name"].str.contains(pattern, case = False, na = False)]

  process_df.loc[~ process_df["track_name"].str.contains(pattern, case = False, na = False)]


Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
18115,56,4DCHszO57DZHgvqalfoITR,1973 - acoustic,James Blunt,3obQJSWpREwvi19TTAvM5v,All The Lost Souls,2007-09-17,Unplugged Hits üì£,5NTm3injIRkUMROsZr3C2O,latin,latin pop,9,1
30370,82,1uryupl9hqVZYurJwH4G9k,3 batidas - ao vivo,Guilherme & Benuto,2s2XNO2C38JP3NVOp8zkm4,3 Batidas (Ao Vivo),2019-08-16,Ver√£o 2020 | Pop | Funk | Sertanejo | EDM | To...,5HmZtuuIDMtIy21kylqhx6,edm,pop edm,8,1
7880,18,507rhTuNu5YPpxQTT287Y2,30's-n-lows,Bass Patrol,6LcsgWx1N0HZHlVLgdZw7N,The Kings Of Bass,2004,Minitruckin Playlist,0VVH2Nzj6kBVGK3WIUQMAw,rap,southern hip hop,1,1
13069,43,00MI0oGDVJYM1qWbyUOIhH,867-5309 / jenny,Tommy Tutone,6lqQzf2MGsTQ577cGXIfcK,867-5309 / Jenny,2008-04-09,80s / Classic Rock,29dTrOurPDrMcrnio2q6hZ,rock,classic rock,2,1
30358,78,4gDvg62vG96w0qJaF6wzcd,a culpa e do meu grau (feat. ze neto & cristia...,Diego & Victor Hugo,2lmLfahVMp5C3w9HW5TxbL,Diego & Victor Hugo Ao Vivo em Bras√≠lia - EP1,2019-03-22,Ver√£o 2020 | Pop | Funk | Sertanejo | EDM | To...,5HmZtuuIDMtIy21kylqhx6,edm,pop edm,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14276,32,3rINKmrSbky6jUVEvNtIX8,you're in the army now - status quo in concert,Status Quo,5Nx780igKtyoLCLAxPorqE,Live At The BBC,2010-01-01,Permanent Wave,6dqoVI7qwYLyNqbg0MPad3,rock,permanent wave,4,0
12865,37,0DgevHFzyhzIWN8hzrO98C,young lust - live,Roger Waters,5iokLBTiQQAh5zdUdsj3Gp,Roger Waters The Wall,2015-11-20,Classic Rock Legends,3NcxM1LJJdua8AcRxtijNY,rock,classic rock,2,1
18566,65,0ePiYPlCq3dhwfAjUttjT2,el era perfecto - en vivo,Mar√≠a Jos√©,3hBpaFjb8x2JvzTvEdaf5J,Conexi√≥n (En Vivo),2019-06-07,Exitos 2020 - Latin Billboard,0k2IDDkcVyIpIgE6d1woDM,latin,latin pop,8,1
26308,44,6AKoio5a5qRiIK6QvYJJrr,reidei*buru-su,LUCKY TAPES,42eeqAnEc2zjh8ND4IrVb6,CIGARETTE & ALCOHOL,2016-07-06,Japanese Funk/Soul/NEO/Jazz/Acid,4zNayWuATXCAA9gaXvnFnq,r&b,neo soul,2,0


Ëøô‰∫õ index ÊòØÈúÄË¶ÅÁã¨Á´ãÂ§ÑÁêÜÁöÑ

In [52]:
process_special_idx_lis = [13069, 25834, 11401, 9779, 6039, 5638, 8045, 12649, 26710, 26308, 10643, 7231, 397, 18497, 16481,
                           14874, 7880, 9163, 26332, 7822, 17235, 26362, 13943, 25752, 8958, 4634, 32459
                           ]
len(process_special_idx_lis)

#remove_idx = [8958, 32459]
remove_idx = [8958, 32459, 26362, 13943, 25752, 4634]
process_special_idx_lis = pd.Series(process_special_idx_lis)
process_special_idx_lis = process_special_idx_lis[~process_special_idx_lis.isin(remove_idx)].tolist()

In [53]:
process_df_sub = process_df[process_df["track_name"].str.contains(pattern, case = False, na = False)] # Áõ¥Êé•ÂéªÊéâ ‚Äò-‚Äô, row 741
process_df_sub_1 = process_df.loc[~ process_df["track_name"].str.contains(pattern, case = False, na = False)] # ÈúÄÁâπÂú∞Â§ÑÁêÜ, row 138

  process_df_sub = process_df[process_df["track_name"].str.contains(pattern, case = False, na = False)] # Áõ¥Êé•ÂéªÊéâ ‚Äò-‚Äô, row 741
  process_df_sub_1 = process_df.loc[~ process_df["track_name"].str.contains(pattern, case = False, na = False)] # ÈúÄÁâπÂú∞Â§ÑÁêÜ, row 138


Â§ÑÁêÜ 741 row ÁöÑÈÉ®ÂàÜÔºåÁõ¥Êé•ÂéªÊéâ Âê´'-'ÂêéÈù¢ÁöÑÂÜÖÂÆπ

In [54]:
process_df_sub

Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
13295,1,6fZO4Uk2yM4MwrY13n2zBI,(don't fear) the reaper - single version,Blue √ñyster Cult,24atu8BmjlxBrHAS8aWQS1,The Singles Collection,2005,Workday: Rock Classics,37i9dQZF1DX1lwxXvReaux,rock,classic rock,0,1
12325,76,2PzU4IB8Dr6mxV3lHuaG34,(i can't get no) satisfaction - mono version,The Rolling Stones,2Q5MwpTmtjscaS34mJFXQQ,Out Of Our Heads,1965-07-30,Rock Classics,37i9dQZF1DWXRqgorJj26U,rock,classic rock,2,1
26803,30,3J2gbESlzrz86jonlkE2Qg,2020 - radio edit,Chris Burke,35MzwNrVLXOUVoIsBSIiI2,2020,2019-12-31,Electro House 2020,317O0e8iWJLClLGDKtieRe,edm,electro house,5,0
3931,3,2sHY6sT0EomTon1pVVLwjF,2am - matoma remix,Astrid S,4e9bmpAF2A8gvImxaTUXil,2AM (Remixes),2015-05-12,Chillout & Remixes üíú,4NlAd9NpIa92IjErMyAriM,pop,indie poptimism,8,1
21327,64,6IoKSUyNOOheJRjiuGb1ew,47 - remix,Anuel AA,0uwIB5LyXa2riRyWUwKiJA,47 (Remix),2017-03-31,Los Cangri,2hTs6GuPwhmg4ApNqxWXLE,latin,latin hip hop,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15978,72,36lkJSDyMEZoWTqtRj8Q8q,you're the inspiration - 2006 remaster,Chicago,1ICKrl6sDjJD1YdR9VDfPR,Chicago 17 (Expanded & Remastered),1984-05-14,Rock Ballads 80s 90s | Best Rock Love Songs 80...,0y8MUle9Lf8VDNjPsskzJB,rock,hard rock,11,1
30049,52,3T7ZHIgO30bKeASQppPtM6,young ones - rudelies remix,Avenza,7ilK2JYZS3cuHLRbu91JWM,Young Ones (RudeLies Remix) [feat. Johnning],2019-10-11,Pop Hits 2000-2019,6mtYuOxzl58vSGnEDtZ9uB,edm,pop edm,4,0
31127,13,1rP7ICGrYCqWYwB7SnblXd,your soul - original mix,Gary Caos,4DbWQ5CLRbo8GwmC2Z1NED,Your Soul,2019-02-15,Selected House,2JPzPB9jnvJLAYtmCbvZy8,edm,progressive electro house,3,0
29423,51,7qfZTBNZQqvg0hMamrAQB1,zombie - bassjackers remix,Ran-D,7DnPC3dlNFBRgoqMPYGswZ,Zombie (Remixes),2019-01-11,Big Room 2019,6SrHyxIxWfQx9ISEr6yowk,edm,big room,4,0


In [55]:
def strip_leading_paren(title: str) -> str:
    cleaned = re.sub(r"^\s*\([^)]*\)\s*", "", title)
    return cleaned


def clean_title_part_1(column): # pd df column, ÊõøÊç¢ '-' to ''

    column = column.str.replace(r"\s*-\s*.*$", "", regex=True)
    column = column.map(strip_leading_paren) # ÂÖàÂéªÊéâ () Âú®ÂºÄÂ§¥ÁöÑtitle
    column = column.str.replace("'", "")
    column = column.str.replace(r"\s*\(.*$", "", regex=True) # ÂéªÊéâ ( ‰ª•ÂèäÂêéÈù¢ÁöÑÂÜÖÂÆπ
    column = column.str.replace(r"\*\*[^*]*\*\*", "", regex=True) # ÂéªÊéâ ** ... **
    column = column.str.replace(r"[?.,'&`]", "", regex=True) #  ÂéªÈô§ ? , . ' & ` 
    column = column.str.replace("feat", "", regex=False)
    column = column.str.replace(r"[!:]$", "", regex=True) # ÂéªÊéâÁ¥ßË∑üÁöÑ ! Âíå :, : ÂåπÈÖç‰∏çÂà∞
    column = column.str.replace(r"!", "", regex=True)
    column = column.str.replace(r"\s*\[[^\]]*\]", "", regex=True) # ÁßªÈô§ [...]
    column = column.str.replace(r"#", "", regex=True) # ÁßªÈô§ #number

    column = column.str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™

    column = column.str.strip()

    return column

process_df_sub.loc[:, "track_name"] = clean_title_part_1(process_df_sub.loc[:, "track_name"])

ÂàÜ‰∏∫ special Âíå non-special

In [56]:
process_df_sub_1_special = process_df_sub_1.loc[process_special_idx_lis] # row 27
process_df_sub_1_non_special = process_df_sub_1.loc[~ process_df_sub_1.index.isin(process_special_idx_lis)] # row 111

Â§ÑÁêÜ special

In [57]:
process_df_sub_1_special.loc[7231, "track_name"] = process_df_sub_1_special.loc[7231, "track_name"].replace("---", "uck")
process_df_sub_1_special.loc[397, "track_name"] = process_df_sub_1_special.loc[397, "track_name"].replace("-", "")
process_df_sub_1_special.loc[9163, "track_name"] = process_df_sub_1_special.loc[9163, "track_name"].replace("-", "")
def clean_title_part_2(column): # pd df column, ÊõøÊç¢ '-' to ''
    column = column.str.replace("-", " ")
    column = column.str.replace(r"'s\b", "", regex=True) # "'s\b" Áî®ÂçïËØçËæπÁïåÁ°Æ‰øùÂè™ÂëΩ‰∏≠ÁúüÊ≠£ÁöÑÁº©ÂÜôÔºå‰∏ç‰ºöËØØÂà† songs ÈáåÁöÑ s„ÄÇ
    column = column.str.replace(r"[:*]", " ", regex=True)
    column = column.str.replace(r"[,.'/]", "", regex=True) #  ÂéªÈô§ ( ) ? , . 
    column = column.str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™
    column = column.str.strip()

    return column

process_df_sub_1_special.loc[:,"track_name"] = clean_title_part_2(process_df_sub_1_special.loc[:,"track_name"])

Â§ÑÁêÜ non-special

In [58]:
def clean_title_part_3(column): # pd df column, ÊõøÊç¢ '-' to ''

    column = column.str.replace(r"\s*-\s*.*$", "", regex=True)
    column = column.str.replace("'", "")
    column = column.str.replace(r"\s*\(.*$", "", regex=True) # ÂéªÊéâ ( ‰ª•ÂèäÂêéÈù¢ÁöÑÂÜÖÂÆπ
    column = column.str.replace(r"[?.,'&!]", "", regex=True) #  ÂéªÈô§ ? , . ' & 
    column = column.str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™
    column = column.str.strip()

    return column

process_df_sub_1_non_special.loc[:,"track_name"] = clean_title_part_3(process_df_sub_1_non_special.loc[:,"track_name"])

ËΩ¨ÁßªÂà∞ process_df

In [59]:
process_df.loc[process_df_sub.index, "track_name"] = process_df_sub["track_name"]
process_df.loc[process_df_sub_1_special.index, "track_name"] = process_df_sub_1_special["track_name"]
process_df.loc[process_df_sub_1_non_special.index, "track_name"] = process_df_sub_1_non_special["track_name"]

df_test_cat.loc[process_df.index, "track_name"] = process_df["track_name"] #ËΩ¨ÁßªÂà∞ df_train_cat

In [60]:
pattern = r"[^\w\s]"  # ÈùûÂ≠óÊØçÊï∞Â≠ó„ÄÅÈùûÁ©∫ÁôΩÁöÑÂ≠óÁ¨¶
punctu_lis = df_test_cat.loc[~ df_test_cat["track_name"].str.contains(r"[()]", regex=True), "track_name"].str.findall(pattern).explode().unique()
punctu_lis = punctu_lis[1:]
punctu_lis

array(["'", nan, '.', '/', '?', ',', '$', ':', '&', '!', '@', '+', '*',
       '[', ']'], dtype=object)

In [61]:
def clean_title_part_4(column): # pd df column, ÊõøÊç¢ '-' to ''

    column = column.map(strip_leading_paren) # ÂÖàÂéªÊéâ () Âú®ÂºÄÂ§¥ÁöÑtitle
    #column = column.str.replace(r"\(([A-Za-z])\)", r"\1", regex=True) # d(r)own to drown
    column = column.str.replace(r"\s*\([^)]*\)", "", regex=True) # ÂéªÊéâ (...) ‰ª•ÂèäÂêéÈù¢ÁöÑÂÜÖÂÆπ
    column = column.str.replace(r"\s*&[^)]*\)$", ")", regex=True) # ÂéªÊéâÁâπÊÆäÁöÑ‰∏ÄË°å & ...
    column = column.str.replace(r"[)]+$", "", regex=True) # ÂéªÊéâ () ÊÆã‰Ωô
    column = column.str.replace(r"'s\b", "", regex=True) # "'s\b" Áî®ÂçïËØçËæπÁïåÁ°Æ‰øùÂè™ÂëΩ‰∏≠ÁúüÊ≠£ÁöÑÁº©ÂÜôÔºå‰∏ç‰ºöËØØÂà† songs ÈáåÁöÑ s„ÄÇ
    
    column = column.str.replace(r"\s*\[[^\]]*\]", "", regex=True) # ÁßªÈô§ [...]
    column = column.str.replace("feat.", "", regex=False)
    
    #column = column.str.replace(r"\s*\[.*$", "", regex=True) # ÁßªÈô§ÊÆãÁïô [...
    column = column.str.replace("'", "")
    column = column.str.replace(r"[?.!,&%+`\":~|<>=]", "", regex=True) #  ÂéªÈô§ ? . ! , & % + " : ~ | < >
    column = column.str.replace(r"[/]", " ", regex=True) 
    column = column.str.replace(r"#\d+", "", regex=True) # ÂéªÈô§ # + Êï∞Â≠ó e.g. #1, #2, ...
    column = column.str.replace("#", "", regex=False) 

    column = column.str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™
    column = column.str.strip()

    return column

df_test_cat.loc[:, "track_name"] = clean_title_part_4(df_test_cat.loc[:, "track_name"])

In [62]:
df_test_cat.loc[~ df_test_cat["track_name"].str.contains(r"[()]", regex=True), "track_name"].str.findall(pattern).explode().unique()

array([nan, '$', '@', '*'], dtype=object)

Â§ÑÁêÜ $

In [63]:
df_test_cat.loc[df_test_cat["track_name"].str.contains("$", regex=False), "track_name"]

22266              $100
6148     $ave dat money
21524         a$ian boy
29477              ca$h
6328           no tru$t
Name: track_name, dtype: object

In [64]:
#mask = [6328, 29477, 6148, 21524, 3128]
mask = [6328, 29477, 6148, 21524]
df_test_cat.loc[mask, "track_name"] = df_test_cat.loc[mask, "track_name"].str.replace("$", "s", regex=False)

df_test_cat.loc[22266, "track_name"] = df_test_cat.loc[22266, "track_name"].replace("$", "")

Â§ÑÁêÜ *

In [65]:
process_df = df_test_cat[df_test_cat["track_name"].str.contains("*", regex=False)]

In [66]:
process_df.loc[7348, "track_name"] = re.sub(r"\*\*", "gg", process_df.loc[7348, "track_name"])
process_df.loc[8639, "track_name"] = re.sub(r"\*\*", "uc", process_df.loc[8639, "track_name"])
process_df.loc[8762, "track_name"] = re.sub(r"\*\*\*\*", "igga", process_df.loc[8762, "track_name"])
process_df.loc[26334, "track_name"] = process_df.loc[26334, "track_name"].replace("*", " ")
process_df.loc[26328, "track_name"] = process_df.loc[26328, "track_name"].replace("*", " ")
process_df.loc[8085, "track_name"] = re.sub(r"\*\*", "gg", process_df.loc[8085, "track_name"])
process_df.loc[26320, "track_name"] = process_df.loc[26320, "track_name"].replace("*", " ")
#process_df.loc[9693, "track_name"] = process_df.loc[9693, "track_name"].replace("*", "")
#process_df.loc[5407, "track_name"] = process_df.loc[5407, "track_name"].replace("*", "")
#process_df.loc[26427, "track_name"] = process_df.loc[26427, "track_name"].replace("*", "")

process_df.loc[:,"track_name"] = process_df.loc[:,"track_name"].str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™
process_df.loc[:,"track_name"] = process_df.loc[:,"track_name"].str.strip()

df_test_cat.loc[process_df.index, "track_name"] = process_df["track_name"] #ËΩ¨ÁßªÂà∞ df_test_cat

Â§ÑÁêÜ@ 

In [67]:
#df_test_cat.loc[1827, "track_name"] = df_test_cat.loc[1827, "track_name"].replace("@", "")

df_test_cat.loc[:,"track_name"] = df_test_cat.loc[:,"track_name"].str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™
df_test_cat.loc[:,"track_name"] = df_test_cat.loc[:,"track_name"].str.strip()

### 2. Album name

In [68]:
df_test_cat.loc[:, "track_album_name"] = df_test_cat.loc[:, "track_album_name"].str.lower()
df_test_cat.loc[:, "track_album_name"] = df_test_cat.loc[:, "track_album_name"].map(lambda x: unidecode(x))

print("Number of album name with none English letter: ", df_test_cat["track_album_name"].str.contains(r"[^\x00-\x7F]+", regex=True).sum())

Number of album name with none English letter:  0


In [69]:
pattern = r"[^\w\s]" 
punctu_lis = df_test_cat["track_album_name"].str.findall(pattern).explode().unique()
#punctu_lis = punctu_lis[1:]
punctu_lis

array(['#', '(', ')', nan, "'", '.', '-', '&', ':', ',', '[', ']', '/',
       '!', '$', '"', '?', '=', '@', '+', '`', '*', '<', '>', '%', '|'],
      dtype=object)

In [70]:
def clean_album_name(column): # pd df column, ÊõøÊç¢ '-' to ''

    column = column.map(strip_leading_paren) # ÂÖàÂéªÊéâ () Âú®ÂºÄÂ§¥ÁöÑtitle
    column = column.str.replace(r"\s*\([^)]*\)", "", regex=True) # ÂéªÊéâ (...) ‰ª•ÂèäÂêéÈù¢ÁöÑÂÜÖÂÆπ
    column = column.str.replace(r"'s\b", "", regex=True)
    column = column.str.replace(r"\s*\(.*$", "", regex=True) # ÂéªÊéâ (... ÊÆãÁïô
    column = column.str.replace(r"\s*\[[^\]]*\]", "", regex=True) # ÁßªÈô§ [...]
    column = column.str.replace(r"\s*\[.*$", "", regex=True) # ÁßªÈô§ [... ÊÆãÁïô
    #column = column.str.replace(r"\s*remix\)\s*$", "", regex=True) # ÂéªÊéâ remix)

    column = column.str.replace(r"(?<=\b[A-Za-z])\.(?=[A-Za-z]\b)", "", regex=True) # Â§ö‰∏ÄÊ≠•Â§ÑÁêÜ . , Â§ÑÁêÜ r.o.s.e ËøôÊ†∑ÁöÑ
    column = column.str.replace(r"[-:>/.]", " ", regex=True)
    column = column.str.replace(r'''[?!,'"%+`~|<>{}=)]''', "", regex=True)
    column = column.str.replace(r"#\d+", "", regex=True) # ÂéªÈô§ # + Êï∞Â≠ó e.g. #1, #2, ...
    column = column.str.replace("#", "", regex=False) 

    column = column.str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™
    column = column.str.strip()

    return column

df_test_cat.loc[:, "track_album_name"] = clean_album_name(df_test_cat.loc[:, "track_album_name"])

In [71]:
df_test_cat.loc[:, "track_album_name"].str.findall(pattern).explode().unique()

array([nan, '&', '$', '@', '*'], dtype=object)

$ *

‰∏ìÈó®Â§ÑÁêÜ &

@ ‰øùÁïô

Â§ÑÁêÜ &ÔºåÂè™‰øùÁïô r&b

In [72]:
mask = df_test_cat["track_album_name"].str.contains(r"(?i)r&b", regex=True)

df_test_cat.loc[~mask, "track_album_name"] = (
df_test_cat.loc[~mask, "track_album_name"]
       .str.replace("&", " ", regex=False)
     .str.replace(r"\s{2,}", " ", regex=True)
        .str.strip()
)

Â§ÑÁêÜ $, Êç¢Êàê s

In [73]:
#mask = [24662, 29477, 21524, 8317, 8317, 9929, 6087, 9926]
#mask = [29477, 21524, 8317, 8317, 9929, 6087, 9926]
mask = [29477, 21524, 8317, 8317, 9929, 6087]
df_test_cat.loc[mask, "track_album_name"] = df_test_cat.loc[mask, "track_album_name"].str.replace("$", "s", regex=False)

df_test_cat.loc[23943, "track_album_name"] = df_test_cat.loc[23943, "track_album_name"].replace("b4 da $$", "bad ass")

Â§ÑÁêÜ *

In [74]:
process_df = df_test_cat[df_test_cat["track_album_name"].str.contains("*", regex=False)]

process_df.loc[7348, "track_album_name"] = re.sub(r"\*\*", "gg", process_df.loc[7348, "track_album_name"])
process_df.loc[8762, "track_album_name"] = re.sub(r"\*\*\*\*", "igga", process_df.loc[8762, "track_album_name"])
process_df.loc[26328, "track_album_name"] = process_df.loc[26328, "track_album_name"].replace("*", " ")
process_df.loc[9040, "track_album_name"] = re.sub(r"\*\*\*", "igg", process_df.loc[9040, "track_album_name"])
#process_df.loc[5407, "track_album_name"] = process_df.loc[5407, "track_album_name"].replace("*", "")
#process_df.loc[26427, "track_album_name"] = process_df.loc[26427, "track_album_name"].replace("*", "")

process_df.loc[:,"track_album_name"] = process_df.loc[:,"track_album_name"].str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™
process_df.loc[:,"track_album_name"] = process_df.loc[:,"track_album_name"].str.strip()

df_test_cat.loc[process_df.index, "track_album_name"] = process_df["track_album_name"] #ËΩ¨ÁßªÂà∞ df_test_cat

### 3. genre and sub-genre

In [75]:
def clean_genre(column): # Á°Æ‰øùÊ≤°ÊúâÂ§ö‰ΩôÁöÑÁ©∫Ê†º

    column = column.str.replace(r"[-]", " ", regex = True) # ÂØπ sub genre

    column = column.str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™
    column = column.str.strip()

    return column

df_test_cat.loc[:, "playlist_genre"] = clean_genre(df_test_cat.loc[:, "playlist_genre"])
df_test_cat.loc[:, "playlist_subgenre"] = clean_genre(df_test_cat.loc[:, "playlist_subgenre"])

## Concatenate All the Texts

In [76]:
text_cols = df_test_cat.columns[[2, 5, 9, 10]]

for i in text_cols:
    print(f"{i} na number: ", df_test_cat[i].isna().sum())
print("")
for i in text_cols:
    print(f"0 length string in {i}: ", 0 in df_test_cat[i].str.len().values)

track_name na number:  0
track_album_name na number:  0
playlist_genre na number:  0
playlist_subgenre na number:  0

0 length string in track_name:  True
0 length string in track_album_name:  True
0 length string in playlist_genre:  False
0 length string in playlist_subgenre:  False


In [77]:
df_test_cat["combined_text"] = (df_test_cat[text_cols].agg(" ".join, axis=1) # str concatenate
                                 .str.replace(r"\s{2,}", " ", regex=True)
                                 .str.strip())

## Output

In [78]:
df_test_cat.to_csv("dataset/df_test_cat_rs_42_processed.csv", index=True, encoding="utf-8")

# Validation dataset 

In [79]:
df_validate_cat = pd.read_csv("dataset/df_validate_cat_rs_42.csv", index_col=0)
df_validate_num = pd.read_csv("dataset/df_validate_num_rs_42.csv", index_col=0)

df_validate_cat = pd.concat([df_validate_num.iloc[:, 0], df_validate_cat], axis=1)

In [80]:
mask = df_validate_cat.groupby("track_name")["track_popularity"].idxmax()
df_validate_cat = df_validate_cat.loc[mask]

## Text Cleaning

### 1. Song Title    

In [81]:
# title ÁªìÂ∞æÂ∏¶ (...)
# title ÁªìÂ∞æÂ∏¶ (... feat ...) ÊòØ # title ÁªìÂ∞æÂ∏¶ (...) ÁöÑ subset
process_df = df_validate_cat[df_validate_cat["track_name"].str.contains(r"\([^()]*\)\s*$", na=False)]
process_df

Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
7283,26,0I7EJmk6S47mpVh73dB02r,1st & 10 (Featuring Infamous 2-0 & Fate Wilson),Ludacris,2kT80DHqRtHQzDhQ2RCoIV,Back For The First Time,2000-01-01,90s-2000s Southern Hip Hop / Crunk,5wsWBmQgDtKa8CEg7wTEMi,rap,southern hip hop,9,1
16728,70,0EdgK7ASb4kfRkW8pVMN02,2 Hearts (feat. Gia Koka),Sam Feldt,798RdmwmE3IZcZnW0jT7kc,2 Hearts (feat. Gia Koka),2020-01-10,Tropical House üèù 2020 Hits,2SRbIs0eBQwHeTP7kErjwo,latin,tropical,7,1
4350,64,4DAaQ5InUO23d8yNRbB0Yj,6's to 9's (feat. Rationale),Big Wild,7LZAgsQQr0cm61SmUdgKlz,Superdream,2019-02-01,2019 in Indie Poptimism,16RNbqnNCCLlBJti7JU5nc,pop,indie poptimism,8,1
2550,9,0Wv5wuenRLI3BcwgT3HPIP,A Different Way (with Lauv),DJ Snake,4urwuJbStO1K5ph63U7AZv,A Different Way (with Lauv),2017-09-21,Electropop Hits 2017-2020,7kyvBmlc1uSqsTL0EuNLrx,pop,electropop,8,1
18039,26,5t5v3w3RWdwDLJ7pqti9Kk,A Escondidas (feat. Jessi Leon),Bonka,5JQIT6JvMHDXzXeHWytfTy,M√Ås Que Ayer,2015-07-07,Latin Pop antiguo,4BmnnqWEQAamNoVjtjrQJP,latin,latin pop,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9383,35,4JGypFKEHrlufSjF8DPl9X,Yeah Ya Know (Takers),T.I.,1GBhTm8ndgzGhw1jD7dFcK,"We Run This, Vol. 5 (mixed by Mr. E of RPS Fam)",2010-12-03,RAP Gangsta,1Z1gW89x4MSBjkvVjGg7DQ,rap,gangster rap,1,1
2641,51,3Iqbhajo4nRQJU7ThH77Rt,You Remind Me (feat. Stanaj),Gryffin,4VOj6KgAaOqAUS9bp6z6jp,Gravity Pt. 1,2018-12-14,Electropop 2019,4Bi8VLtaSu0JILliif8lH6,pop,electropop,3,0
2379,84,0Ryd8975WihbObpp5cPW1t,boyfriend (with Social House),Ariana Grande,3zVB99XMdbP9HTVNg0GJwV,boyfriend,2019-08-02,post teen pop,3kvwJXPULhSUalL6ykJbn5,pop,post-teen pop,10,0
8678,22,1Elr2krap8BHDCofTxr4FH,–ì–∞–≥–∞—Ä–∏–Ω (feat. –ö–∞—Å–ø–∏–π—Å–∫–∏–π –≥—Ä—É–∑),Advaita,0dsmBtZ16GMKcRMfKzdUA1,–ö–æ–Ω—Ç—Ä–æ–ª—å–Ω—ã–π,2015-07-31,Russian Gangster Rap,0u0qLLE8MZc679RZWCk1TT,rap,gangster rap,5,0


In [82]:
# title Â∏¶ '-' 
df_validate_cat[df_validate_cat["track_name"].str.contains(r"-", case=False, na=False)]

Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
7283,26,0I7EJmk6S47mpVh73dB02r,1st & 10 (Featuring Infamous 2-0 & Fate Wilson),Ludacris,2kT80DHqRtHQzDhQ2RCoIV,Back For The First Time,2000-01-01,90s-2000s Southern Hip Hop / Crunk,5wsWBmQgDtKa8CEg7wTEMi,rap,southern hip hop,9,1
14255,56,0uJyE3M3ecGZgzltsQKs3a,A Design for Life - Remastered,Manic Street Preachers,3jvKfPU4A2j7tVOFkhcEi2,Everything Must Go 10th Anniversary Edition,1996-05-20,Permanent Wave,5glAD13obyL0G9SH9ukBz2,rock,permanent wave,0,1
31421,9,6lheVNyIjiezy9IgLfWqq6,A Million Lights - feat. Zo√´ Badwi [Original],Grant Smillie,1keHPlXQTCd2dIVD8K7yRE,A Million Lights (feat. Zo√´ Badwi),2012-09-07,Vocal House,5PCAWKfUWAUj8VeY8G7xRQ,edm,progressive electro house,9,1
18176,54,2WXTF0qgKbczC2O8VymeLO,A Sky Full of Stars - Live at the Royal Albert...,Coldplay,1hNS0RsxPTFjmKXCgmjSLS,Ghost Stories Live 2014,2014-11-21,Unplugged Hits üì£,5NTm3injIRkUMROsZr3C2O,latin,latin pop,6,1
17868,17,3T5QSIgoaPIh07SwixRDp7,A Solas - Remix,Lunay,4kkZsCc8FiqMmq1sLBk6Na,A Solas Remix,2018-12-07,LATIN POP 2020 üî• Pop latino actual,2kKzN3kRYDzBctlaWs7CP2,latin,latin pop,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,72,7KksdXBhdufqWDxGxyt4I7,no14 - feat. Dani Faiv,tha Supreme,42wflBbrb9OchJfd3qiGRO,23 6451,2019-11-15,Zona Trap,37i9dQZF1DWXU2naFUn37x,rap,trap,10,0
9983,74,7gRPkCEa3YMaxET5ZYT7Ni,occh1 purpl3 - feat. Marracash,tha Supreme,42wflBbrb9OchJfd3qiGRO,23 6451,2019-11-15,Zona Trap,37i9dQZF1DWXU2naFUn37x,rap,trap,2,1
30354,60,4v6SAnzwEr7s2m0gFQnJFJ,√Åudio - Ao Vivo em Bras√≠lia,Diego & Victor Hugo,6PlcH8kJmbJIagbWVuqHz7,Diego & Victor Hugo Ao Vivo em Bras√≠lia,2019-07-12,Ver√£o 2020 | Pop | Funk | Sertanejo | EDM | To...,5HmZtuuIDMtIy21kylqhx6,edm,pop edm,11,1
27025,11,6jHeunClv5nFVXQ6jtnaWm,√âtude post-baroque,Nhyx,7xyLot8qvUDna1e1VyY8Jj,Z√©phyr,2020-01-17,Electro Pos√© - Discoveries,6nZaTh6K1SwhdELFTmA99C,edm,electro house,2,1


Same as processing training dataset

Ê†∏ÂøÉÔºöÂè™‰øùÁïôÊØèÈ¶ñÊ≠åÁöÑmain title

1. Â§ÑÁêÜ title Â∏¶ '-' ÁöÑ

2. Â§ÑÁêÜ title Â∏¶ '()'ÁöÑ

In [83]:
df_validate_cat["track_name"] = df_validate_cat["track_name"].fillna("")
df_validate_cat["track_album_name"] = df_validate_cat["track_album_name"].fillna("")

df_validate_cat["track_name"] = df_validate_cat["track_name"].str.lower() # lowercase song title
df_validate_cat["track_name"] = df_validate_cat["track_name"].map(lambda x: unidecode(x)) # Â§ÑÁêÜÁ±ªËã±ÊñáletterÔºåËá≥Ëã±Êñá

print("Number of title with none English letter: ", df_validate_cat["track_name"].str.contains(r"[^\x00-\x7F]+", regex=True).sum())

Number of title with none English letter:  0


In [84]:
# title Â∏¶ '-' ÁöÑ‰∏ÄÂÖ± 670
process_df = df_validate_cat[df_validate_cat["track_name"].str.contains(r"-", case=False, na=False)]

print("\n rows: ", process_df.shape[0])
process_df


 rows:  670


Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
7283,26,0I7EJmk6S47mpVh73dB02r,1st & 10 (featuring infamous 2-0 & fate wilson),Ludacris,2kT80DHqRtHQzDhQ2RCoIV,Back For The First Time,2000-01-01,90s-2000s Southern Hip Hop / Crunk,5wsWBmQgDtKa8CEg7wTEMi,rap,southern hip hop,9,1
14255,56,0uJyE3M3ecGZgzltsQKs3a,a design for life - remastered,Manic Street Preachers,3jvKfPU4A2j7tVOFkhcEi2,Everything Must Go 10th Anniversary Edition,1996-05-20,Permanent Wave,5glAD13obyL0G9SH9ukBz2,rock,permanent wave,0,1
31421,9,6lheVNyIjiezy9IgLfWqq6,a million lights - feat. zoe badwi [original],Grant Smillie,1keHPlXQTCd2dIVD8K7yRE,A Million Lights (feat. Zo√´ Badwi),2012-09-07,Vocal House,5PCAWKfUWAUj8VeY8G7xRQ,edm,progressive electro house,9,1
18176,54,2WXTF0qgKbczC2O8VymeLO,a sky full of stars - live at the royal albert...,Coldplay,1hNS0RsxPTFjmKXCgmjSLS,Ghost Stories Live 2014,2014-11-21,Unplugged Hits üì£,5NTm3injIRkUMROsZr3C2O,latin,latin pop,6,1
17868,17,3T5QSIgoaPIh07SwixRDp7,a solas - remix,Lunay,4kkZsCc8FiqMmq1sLBk6Na,A Solas Remix,2018-12-07,LATIN POP 2020 üî• Pop latino actual,2kKzN3kRYDzBctlaWs7CP2,latin,latin pop,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30354,60,4v6SAnzwEr7s2m0gFQnJFJ,audio - ao vivo em brasilia,Diego & Victor Hugo,6PlcH8kJmbJIagbWVuqHz7,Diego & Victor Hugo Ao Vivo em Bras√≠lia,2019-07-12,Ver√£o 2020 | Pop | Funk | Sertanejo | EDM | To...,5HmZtuuIDMtIy21kylqhx6,edm,pop edm,11,1
27025,11,6jHeunClv5nFVXQ6jtnaWm,etude post-baroque,Nhyx,7xyLot8qvUDna1e1VyY8Jj,Z√©phyr,2020-01-17,Electro Pos√© - Discoveries,6nZaTh6K1SwhdELFTmA99C,edm,electro house,2,1
9458,18,5cDO8fIEgNUT2bMlWvrQnH,mai 45-go,StaFFord63,6XMf8HgW12QA6Xlb64KVF7,–ú–∞–π 45-–≥–æ,2019-05-03,RUSSIAN Gangster Rap,0Jw0HckkxCfIrOvpN081eV,rap,gangster rap,9,0
26338,30,1DWfjSM4lKBjg7hQLwYkr2,hitoXia notapesutori-,Tomoko Aran,7fa6VpNPvW2DVbtohJ0x44,ÊµÆÈÅäÁ©∫Èñì,1983,Japanese Funk/Soul/NEO/Jazz/Acid,4zNayWuATXCAA9gaXvnFnq,r&b,neo soul,10,1


In [85]:
 # '- ... mix ...', '- ... remaster ...', '- ... version ...' , '- ... edit ...'

pattern = r"-.*(mix|remaster|version|edit).*?$"
df_validate_cat[df_validate_cat["track_name"].str.contains(pattern, case = False, na = False)]

  df_validate_cat[df_validate_cat["track_name"].str.contains(pattern, case = False, na = False)]


Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
14255,56,0uJyE3M3ecGZgzltsQKs3a,a design for life - remastered,Manic Street Preachers,3jvKfPU4A2j7tVOFkhcEi2,Everything Must Go 10th Anniversary Edition,1996-05-20,Permanent Wave,5glAD13obyL0G9SH9ukBz2,rock,permanent wave,0,1
17868,17,3T5QSIgoaPIh07SwixRDp7,a solas - remix,Lunay,4kkZsCc8FiqMmq1sLBk6Na,A Solas Remix,2018-12-07,LATIN POP 2020 üî• Pop latino actual,2kKzN3kRYDzBctlaWs7CP2,latin,latin pop,6,0
8514,48,6Uw77XfBxwg7VX6mMdqAmT,arrogant american freestyle - remix,Riff Raff,7wzqmeAHlPmQBcoEaB8TTg,ARROGANT AMERiCAN FREESTYLE (REMIX),2020-01-10,Gangster Rap,5joQabz9ys3XPGbSx5CaKv,rap,gangster rap,1,0
18717,38,3IH2zs1TSPFA2Iy7e5HCmT,ace - radio edit,Rewind,68rUoEpgweglqsBF4fOSRp,Ace,2019-10-13,Great Pops,5SO7xYih5w8X5FEpKvU7Rt,latin,latin pop,1,1
12158,35,6nnH5zeFZOH3HMpEOG5DLg,action this day - remastered 2011,Queen,6reTSIf5MoBco62rk8T7Q1,Hot Space (2011 Remaster),1982-05-03,The Queen - La Discografia Completa,3E88dLx4fgFYY70gdGzdnB,rock,album rock,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28286,62,3bmbNLqSJjGDRbbqlJUt4y,your body - cat dealers radio edit,Tom Novy,5gCptfTCvRbWxTp10t1Yt0,Your Body (feat. Michael Marshall) [Cat Dealer...,2017-07-28,üîäBASSBOOSTEDüîä‚ö°ELECTRO HOUSE‚ö°üî•EDM CAR MUSIC2018...,4GSiiL8tcMgvoV7K1IADb8,edm,electro house,9,1
27865,50,6R9MD6OD2EGOTnblzvi3mw,your mind - will clarke remix,Adam Beyer,6bk7IZwVsHrcoJi94rGuyN,"A-Sides, Vol. 8",2019-09-23,üíäELECTRO-HOUSE-TECHüíä,0AFYmoSuoMQiGGjzvBwr6u,edm,electro house,7,1
15654,32,3PPJVaa7maBPrYA4OS6gyG,ziggy stardust - 1997 remaster,David Bowie,3htuY0OiwfkMfM6NJCRyrI,The Best Of David Bowie 1969-74,1997-10-27,Classic Hard Rock,1gAeQGQ7hr2q7IYmaejDW6,rock,hard rock,7,1
17327,11,0W5SiYQOGUsgpmL86KkPJz,zipolite - remix,Zilvano,6oklpW9cdzpRpdGyz6w4cf,Zipolite (Remix),2020-01-10,Para√≠so Tropical,2v41efarffe6iY8qpMK5Gj,latin,tropical,9,1


In [86]:
#'- ... mix ...', '- ... remaster ...', '- ... version ...' , '- ... edit ...' Âç† Â∏¶ '-' ÁöÑ proportion
print("Proportion of titles that have sub-title in the form of '- ... mix ...', '- ... remaster ...', '- ... version ...' , or '- ... edit ...': ",
      round(df_validate_cat[df_validate_cat["track_name"].str.contains(pattern, case = False, na = False)].shape[0] / df_validate_cat[df_validate_cat["track_name"].str.contains(r"-", case=False, na=False)].shape[0], 3)
      )

Proportion of titles that have sub-title in the form of '- ... mix ...', '- ... remaster ...', '- ... version ...' , or '- ... edit ...':  0.851


  round(df_validate_cat[df_validate_cat["track_name"].str.contains(pattern, case = False, na = False)].shape[0] / df_validate_cat[df_validate_cat["track_name"].str.contains(r"-", case=False, na=False)].shape[0], 3)


ÈúÄË¶ÅÁâπÊÆäÂ§ÑÁêÜÁöÑ‰∏ÄÁªÑtitle

In [87]:
process_df.loc[~ process_df["track_name"].str.contains(pattern, case = False, na = False)]

  process_df.loc[~ process_df["track_name"].str.contains(pattern, case = False, na = False)]


Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
7283,26,0I7EJmk6S47mpVh73dB02r,1st & 10 (featuring infamous 2-0 & fate wilson),Ludacris,2kT80DHqRtHQzDhQ2RCoIV,Back For The First Time,2000-01-01,90s-2000s Southern Hip Hop / Crunk,5wsWBmQgDtKa8CEg7wTEMi,rap,southern hip hop,9,1
31421,9,6lheVNyIjiezy9IgLfWqq6,a million lights - feat. zoe badwi [original],Grant Smillie,1keHPlXQTCd2dIVD8K7yRE,A Million Lights (feat. Zo√´ Badwi),2012-09-07,Vocal House,5PCAWKfUWAUj8VeY8G7xRQ,edm,progressive electro house,9,1
18176,54,2WXTF0qgKbczC2O8VymeLO,a sky full of stars - live at the royal albert...,Coldplay,1hNS0RsxPTFjmKXCgmjSLS,Ghost Stories Live 2014,2014-11-21,Unplugged Hits üì£,5NTm3injIRkUMROsZr3C2O,latin,latin pop,6,1
7592,48,6cnufgF6qLJGZT2PRFZBr1,ain't i - main voc up,Yung L.A.,4wCjej94XDH2kKIc6vBqbI,Ain't I,2008-01-01,The Sound of Southern Hip Hop,18jT9NMRZifv6cMtK2jWD4,rap,southern hip hop,11,1
23505,17,4muAGMNipTEtIoYfMa9QL0,amsterdam - recorded at spotify studios nyc,Mandolin Orange,774yLThHwXXqrrH2bgF912,Spotify Singles,2018-04-04,Bluegrass Covers,37i9dQZF1DX56crgoe4TG3,r&b,hip pop,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30354,60,4v6SAnzwEr7s2m0gFQnJFJ,audio - ao vivo em brasilia,Diego & Victor Hugo,6PlcH8kJmbJIagbWVuqHz7,Diego & Victor Hugo Ao Vivo em Bras√≠lia,2019-07-12,Ver√£o 2020 | Pop | Funk | Sertanejo | EDM | To...,5HmZtuuIDMtIy21kylqhx6,edm,pop edm,11,1
27025,11,6jHeunClv5nFVXQ6jtnaWm,etude post-baroque,Nhyx,7xyLot8qvUDna1e1VyY8Jj,Z√©phyr,2020-01-17,Electro Pos√© - Discoveries,6nZaTh6K1SwhdELFTmA99C,edm,electro house,2,1
9458,18,5cDO8fIEgNUT2bMlWvrQnH,mai 45-go,StaFFord63,6XMf8HgW12QA6Xlb64KVF7,–ú–∞–π 45-–≥–æ,2019-05-03,RUSSIAN Gangster Rap,0Jw0HckkxCfIrOvpN081eV,rap,gangster rap,9,0
26338,30,1DWfjSM4lKBjg7hQLwYkr2,hitoXia notapesutori-,Tomoko Aran,7fa6VpNPvW2DVbtohJ0x44,ÊµÆÈÅäÁ©∫Èñì,1983,Japanese Funk/Soul/NEO/Jazz/Acid,4zNayWuATXCAA9gaXvnFnq,r&b,neo soul,10,1


Ëøô‰∫õ index ÊòØÈúÄË¶ÅÁã¨Á´ãÂ§ÑÁêÜÁöÑ

In [88]:
process_special_idx_lis = [32459, 22334, 442, 6992, 26097, 8185, 7197, 14435, 9775, 16932, 30791, 5259, 27025, 9458
                           ]
len(process_special_idx_lis)

#remove_idx = [8958, 32459]
#remove_idx = [8958, 32459, 26362, 13943, 25752, 4634]
#process_special_idx_lis = pd.Series(process_special_idx_lis)
#process_special_idx_lis = process_special_idx_lis[~process_special_idx_lis.isin(remove_idx)].tolist()

14

In [89]:
process_df_sub = process_df[process_df["track_name"].str.contains(pattern, case = False, na = False)] # Áõ¥Êé•ÂéªÊéâ ‚Äò-‚Äô, row 570
process_df_sub_1 = process_df.loc[~ process_df["track_name"].str.contains(pattern, case = False, na = False)] # ÈúÄÁâπÂú∞Â§ÑÁêÜ, row 100

  process_df_sub = process_df[process_df["track_name"].str.contains(pattern, case = False, na = False)] # Áõ¥Êé•ÂéªÊéâ ‚Äò-‚Äô, row 570
  process_df_sub_1 = process_df.loc[~ process_df["track_name"].str.contains(pattern, case = False, na = False)] # ÈúÄÁâπÂú∞Â§ÑÁêÜ, row 100


Â§ÑÁêÜ 570 row ÁöÑÈÉ®ÂàÜÔºåÁõ¥Êé•ÂéªÊéâ Âê´'-'ÂêéÈù¢ÁöÑÂÜÖÂÆπ

In [90]:
process_df_sub.loc[:, "track_name"] = clean_title_part_1(process_df_sub.loc[:, "track_name"])

ÂàÜ‰∏∫ special Âíå non-special

In [91]:
process_df_sub_1_special = process_df_sub_1.loc[process_special_idx_lis] # row 27
process_df_sub_1_non_special = process_df_sub_1.loc[~ process_df_sub_1.index.isin(process_special_idx_lis)] # row 111

Â§ÑÁêÜ special

In [92]:
process_df_sub_1_special.loc[:,"track_name"] = clean_title_part_2(process_df_sub_1_special.loc[:,"track_name"])

Â§ÑÁêÜ non-special

In [93]:
process_df_sub_1_non_special.loc[:,"track_name"] = clean_title_part_3(process_df_sub_1_non_special.loc[:,"track_name"])

ËΩ¨ÁßªÂà∞ process_df

In [94]:
process_df.loc[process_df_sub.index, "track_name"] = process_df_sub["track_name"]
process_df.loc[process_df_sub_1_special.index, "track_name"] = process_df_sub_1_special["track_name"]
process_df.loc[process_df_sub_1_non_special.index, "track_name"] = process_df_sub_1_non_special["track_name"]

df_validate_cat.loc[process_df.index, "track_name"] = process_df["track_name"] #ËΩ¨ÁßªÂà∞ df_train_cat

In [95]:
pattern = r"[^\w\s]"  # ÈùûÂ≠óÊØçÊï∞Â≠ó„ÄÅÈùûÁ©∫ÁôΩÁöÑÂ≠óÁ¨¶
punctu_lis = df_validate_cat.loc[~ df_validate_cat["track_name"].str.contains(r"[()]", regex=True), "track_name"].str.findall(pattern).explode().unique()
punctu_lis = punctu_lis[1:]
punctu_lis

array(['$', '.', "'", nan, '%', ':', '/', '!', '[', ']', ',', '"', '&',
       '?', '*', '+', '|', '<', '>'], dtype=object)

In [96]:
df_validate_cat.loc[:, "track_name"] = clean_title_part_4(df_validate_cat.loc[:, "track_name"])

In [97]:
df_validate_cat.loc[~ df_validate_cat["track_name"].str.contains(r"[()]", regex=True), "track_name"].str.findall(pattern).explode().unique()

array([nan, '$', '*'], dtype=object)

Â§ÑÁêÜ $

In [98]:
df_validate_cat.loc[df_validate_cat["track_name"].str.contains("$", regex=False), "track_name"]

21143    $ a n t e r i a
10595             $enhor
29206          a$ian boy
21664     bernice burgo$
10448              fre$h
22276                 m$
Name: track_name, dtype: object

In [99]:
#mask = [6328, 29477, 6148, 21524, 3128]
mask = [10595, 29206, 21664, 10448, 22276, 21143]
df_validate_cat.loc[mask, "track_name"] = df_validate_cat.loc[mask, "track_name"].str.replace("$", "s", regex=False)

df_validate_cat.loc[21143, "track_name"] = df_validate_cat.loc[21143, "track_name"].replace(" ", "")

Â§ÑÁêÜ *


In [100]:
process_df = df_validate_cat[df_validate_cat["track_name"].str.contains("*", regex=False)]

In [101]:
process_df

Unnamed: 0,track_popularity,track_id,track_name,track_artist,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,key,mode
28754,29,0B5GzF48NCJ4ltvBLpVcLi,f*ck,Bassjackers,5AVczO9dRt4JKMvQhDSCh2,F*CK (Dimitri Vegas & Like Mike Edit),2016-06-10,BIG-ROOM NEVER DIES !,4QtO2XySphMOJt7pX0yaVi,edm,big room,9,0
9353,68,49zD0wr2S3d0lZPib0K4e1,f*ck up some commas,Future,0fUy6IdLHDpGNwavIlhEsl,DS2 (Deluxe),2015-07-17,RAP Gangsta,1Z1gW89x4MSBjkvVjGg7DQ,rap,gangster rap,11,0
7936,1,45kgqsq1g9liM5tvdKNhLa,f*ckwithmeyouknowigotit,JAY-Z,37uqAKt9dLsLob7yomDWY4,Magna Carta... Holy Grail,2013-07-10,Hip-Hop 'n RnB,0275i1VNfBnsNbPl0QIBpG,rap,southern hip hop,10,0
28032,42,6nlCPdygwRryI79rUbWx4m,let get f*cked up,MAKJ,15JJXzr7hNE4sdUmpBgRiQ,Let's Get F*cked Up,2014-03-17,Fitness Workout Electro | House | Dance | Prog...,6KnQDwp0syvhfHOR4lWP7x,edm,electro house,0,1
31178,32,7d6c9PjkpsKnLOGPkeeBFU,what the f**k,Fatboy Slim,6L6QL2oG9BCvZcjPc1e3V3,Best of the Bootlegs,2010-09-13,Selected House,2JPzPB9jnvJLAYtmCbvZy8,edm,progressive electro house,7,1


In [102]:
process_df.loc[28754, "track_name"] = process_df.loc[28754, "track_name"].replace("*", "u")
process_df.loc[9353, "track_name"] = process_df.loc[9353, "track_name"].replace("*", "u")
process_df.loc[7936, "track_name"] = process_df.loc[7936, "track_name"].replace("*", "u")
process_df.loc[28032, "track_name"] = process_df.loc[28032, "track_name"].replace("*", "u")
process_df.loc[31178, "track_name"] = re.sub(r"\*\*", "uc", process_df.loc[31178, "track_name"])

process_df.loc[:,"track_name"] = process_df.loc[:,"track_name"].str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™
process_df.loc[:,"track_name"] = process_df.loc[:,"track_name"].str.strip()

df_validate_cat.loc[process_df.index, "track_name"] = process_df["track_name"] #ËΩ¨ÁßªÂà∞ df_validate_cat

Â§ÑÁêÜ@ 

In [103]:
df_validate_cat["track_name"].str.contains('*', regex=False).sum()

np.int64(0)

### 2. Album name

In [104]:
df_validate_cat.loc[:, "track_album_name"] = df_validate_cat.loc[:, "track_album_name"].str.lower()
df_validate_cat.loc[:, "track_album_name"] = df_validate_cat.loc[:, "track_album_name"].map(lambda x: unidecode(x))

print("Number of album name with none English letter: ", df_validate_cat["track_album_name"].str.contains(r"[^\x00-\x7F]+", regex=True).sum())

Number of album name with none English letter:  0


In [105]:
pattern = r"[^\w\s]" 
punctu_lis = df_validate_cat["track_album_name"].str.findall(pattern).explode().unique()
#punctu_lis = punctu_lis[1:]
punctu_lis

array(['#', nan, '(', ')', '[', '.', ',', '&', ']', "'", '/', '!', '?',
       '-', '$', '"', ':', '+', '=', '*', '~', '%', '`', '{', '}', ';',
       '<', '>'], dtype=object)

In [106]:
df_validate_cat.loc[:, "track_album_name"] = clean_album_name(df_validate_cat.loc[:, "track_album_name"])

In [107]:
df_validate_cat.loc[:, "track_album_name"].str.findall(pattern).explode().unique()

array([nan, '$', '&', '*'], dtype=object)

$ *

‰∏ìÈó®Â§ÑÁêÜ &

In [108]:
mask = df_validate_cat["track_album_name"].str.contains(r"(?i)r&b", regex=True)

df_validate_cat.loc[~mask, "track_album_name"] = (
df_validate_cat.loc[~mask, "track_album_name"]
       .str.replace("&", " ", regex=False)
     .str.replace(r"\s{2,}", " ", regex=True)
        .str.strip()
)

Â§ÑÁêÜ $, Êç¢Êàê s

In [109]:
mask = [29206, 8315, 21664, 10448, 9935, 22276]
df_validate_cat.loc[mask, "track_album_name"] = df_validate_cat.loc[mask, "track_album_name"].str.replace("$", "s", regex=False)

Â§ÑÁêÜ *

In [110]:
process_df = df_validate_cat[df_validate_cat["track_album_name"].str.contains("*", regex=False)]

process_df.loc[28754, "track_album_name"] = re.sub(r"\*", "u", process_df.loc[28754, "track_album_name"])
process_df.loc[28032, "track_album_name"] = re.sub(r"\*", "u", process_df.loc[28032, "track_album_name"])

process_df.loc[:,"track_album_name"] = process_df.loc[:,"track_album_name"].str.replace(r"\s{2,}", " ", regex=True) # ‰∏§Ê≠•ÊääÂ§ö‰ΩôÁ©∫Ê†ºÂéãÊàê‰∏Ä‰∏™
process_df.loc[:,"track_album_name"] = process_df.loc[:,"track_album_name"].str.strip()

df_validate_cat.loc[process_df.index, "track_album_name"] = process_df["track_album_name"] #ËΩ¨ÁßªÂà∞ df_validate_cat

### 3. genre and sub-genre

In [111]:
df_validate_cat.loc[:, "playlist_genre"] = clean_genre(df_validate_cat.loc[:, "playlist_genre"])
df_validate_cat.loc[:, "playlist_subgenre"] = clean_genre(df_validate_cat.loc[:, "playlist_subgenre"])

## Concatenate All the Texts

In [112]:
text_cols = df_validate_cat.columns[[2, 5, 9, 10]]

for i in text_cols:
    print(f"{i} na number: ", df_validate_cat[i].isna().sum())
print("")
for i in text_cols:
    print(f"0 length string in {i}: ", 0 in df_validate_cat[i].str.len().values)

track_name na number:  0
track_album_name na number:  0
playlist_genre na number:  0
playlist_subgenre na number:  0

0 length string in track_name:  False
0 length string in track_album_name:  True
0 length string in playlist_genre:  False
0 length string in playlist_subgenre:  False


In [113]:
df_validate_cat["combined_text"] = (df_validate_cat[text_cols].agg(" ".join, axis=1) # str concatenate
                                 .str.replace(r"\s{2,}", " ", regex=True)
                                 .str.strip())

## Output

In [114]:
df_validate_cat.to_csv("dataset/df_validate_cat_rs_42_processed.csv", index=True, encoding="utf-8")