## Importing required libraries

In [33]:
import pandas as pd
import numpy as np
import re
from impyute.imputation.cs import mice
import warnings
warnings.simplefilter("ignore")

### Funcion to clean the data

In [34]:
def b2s(value):
    value = re.sub(r"^b'",'',value)
    value = re.sub(r'^b','',value)
    value = re.sub(r"'",'',value)
    value = re.sub(r'\"','',value)
    value = re.sub(r" ?\([^)]+\)", "", value)
    value = re.sub(r'\,','',value)
    value = re.sub(r'\’','',value)
    return value

# Load The data

In [35]:
dataset = pd.read_csv('Datasets/Million.csv').drop(['Unnamed: 0'],axis = 1)
df_bb = pd.read_csv('Datasets/Billboardtop100.csv').drop('Unnamed: 0',axis = 1)
df_y = pd.read_csv('Datasets/Years.csv').drop('Unnamed: 0',axis = 1)
df_y['Title'] = df_y['Title'].apply(lambda x: re.sub('[^0-9a-zA-Z\s]','',str(x)))

# Pre-Processing

#### 1. Stripping [] and b''

In [36]:
for col in dataset:
    if dataset[col].dtype==object:
        dataset[col]=dataset[col].str.replace("b'","")
        dataset[col]=dataset[col].str.replace("'","")
        dataset[col]=dataset[col].str.strip('[]')

#### 2. Cleaning the title column of billboard dataset

In [37]:
df_bb['Title'] = df_bb['Title'].apply(lambda x:b2s(x))
df_bb['Title']= df_bb['Title'].drop_duplicates(keep='first')
df_bb = df_bb.dropna()

3. Function to add song popularity from bill board charts

In [38]:
def hits(data):
    if data['title'] in list(df_bb['Title']):
        return 1
    else:
        return 0
dataset['song_hotttnesss'].describe()

count    5648.000000
mean        0.342822
std         0.247220
min         0.000000
25%         0.000000
50%         0.360371
75%         0.537504
max         1.000000
Name: song_hotttnesss, dtype: float64

In [39]:
dataset['song_hotttnesss'] = dataset['song_hotttnesss'].fillna(0) 

#### 4. Classifying songs as popular which have popularity over 75 percentile

In [40]:
for i in range(0,len(dataset['song_hotttnesss'])):
    if dataset['song_hotttnesss'][i]< 0.538:
        dataset['song_hotttnesss'][i] = 0
    elif (dataset['song_hotttnesss'][i] >= 0.538):
        dataset['song_hotttnesss'][i] = 1

##### Classify the songs found as popular (hottness=1)

In [41]:
df_comp = pd.DataFrame(dataset[dataset['song_hotttnesss']==0]['title'])
df_comp['song_hottness'] = df_comp.apply(hits,axis = 1)
df_comp = df_comp[df_comp['song_hottness']==1]

In [42]:
print(df_comp.shape)

(388, 2)


##### Merge Songs which have the same title

In [43]:
for i in range(0,10000):
    if dataset['title'][i] in list(df_comp['title']):
        dataset['song_hotttnesss'][i] = 1
print(dataset['song_hotttnesss'].value_counts())

0.0    8203
1.0    1797
Name: song_hotttnesss, dtype: int64


#### 5. Function to clean the symbols from array columns

In [54]:
def clean_symbols(value):
    for i in range(len(dataset)):
        dataset[value][i] = re.sub(r'  ',' ',str(dataset[value][i]))
        dataset[value][i] = re.sub(r'\[',' ',str(dataset[value][i]))
        dataset[value][i] = re.sub(r'\]',' ',str(dataset[value][i]))
        dataset[value][i] = re.sub(r'\.\.\.',' ',str(dataset[value][i]))
        l1 = str(dataset[value][i]).split()
        mean = np.mean([float(i) for i in l1])
        dataset[value][i] = mean

In [55]:
col = ['bars_start','beats_confidence','beats_start','beats_confidence','bars_confidence','sections_confidence','sections_start','segments_confidence','segments_loudness_max','segments_loudness_max_time','segments_loudness_start','segments_start','segments_pitches','tatums_confidence','tatums_start','segments_timbre']
for item in col:
    clean_symbols(item)    
dataset.head()

Unnamed: 0,analysis_sample_rate,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_mbid,artist_mbtags,...,tatums_confidence,tatums_start,tempo,time_signature,time_signature_confidence,title,track_7digitalid,track_id,transfer_note,year
0,22050.0,165270.0,0.581794,0.401998,ARD7TVE1187B99BFB1,,California - LA,,e77e51a5-4761-45b3-9847-2051f811e366,,...,0.455733,108.824,92.198,4.0,0.778,"b""I Didnt Mean To""",3401791.0,TRAAAAW128F429D538,transferred on Thu Jan 2 18:08:59 2020 from f...,0.0
1,22050.0,1998.0,0.63063,0.4175,ARMJAGH1187FB546F3,35.14968,"Memphis, TN",-90.04892,1c78ab62-db33-4433-8d0b-7c8dcf1849c2,classic pop and rock,...,0.616008,73.2011,121.274,4.0,0.384,Soul Deep,3400270.0,TRAAABD128F429CF47,transferred on Thu Jan 2 18:08:59 2020 from f...,1969.0
2,22050.0,290021.0,0.487357,0.343428,ARKRRTF1187B9984DA,,,,7a273984-edd9-4451-9c4d-39b38f05ebcd,,...,0.33256,86.6473,100.07,1.0,0.0,Amor De Cabaret,5703798.0,TRAAADZ128F9348C2E,transferred on Thu Jan 2 18:08:59 2020 from f...,0.0
3,22050.0,19072.0,0.630382,0.454231,AR7G5I41187FB4CE6C,,"London, England",,e188a520-9cb7-4f73-a3d7-2f70c6538e92,uk british english,...,0.260192,116.617,119.293,4.0,0.0,Something Girls,3226795.0,TRAAAEF128F4273421,transferred on Thu Jan 2 18:08:59 2020 from f...,1982.0
4,22050.0,30973.0,0.651046,0.401724,ARXR32B1187FB57099,,,,c6903a2e-063c-4f91-a284-17b8f421be7,,...,0.257529,102.531,129.738,4.0,0.562,Face the Ashes,6795666.0,TRAAAFD128F92F423A,transferred on Thu Jan 2 18:08:59 2020 from f...,2007.0


### Create a Feature column

In [46]:
features = dataset.drop(['analysis_sample_rate','artist_7digitalid','artist_latitude','artist_longitude','song_id','track_7digitalid','track_id','transfer_note','artist_id','artist_mbid','artist_playmeid','artist_mbtags','artist_mbtags_count','audio_md5','release_7digitalid','similar_artists','title','song_hotttnesss','artist_terms','artist_terms_freq','artist_terms_weight','release','artist_location','artist_name'],axis=1)
features.head()

Unnamed: 0,artist_familiarity,artist_hotttnesss,bars_confidence,bars_start,beats_confidence,beats_start,danceability,duration,end_of_fade_in,energy,...,segments_pitches,segments_start,segments_timbre,start_of_fade_out,tatums_confidence,tatums_start,tempo,time_signature,time_signature_confidence,year
0,0.581794,0.401998,0.172096,108.164,0.611462,108.983,0.0,218.93179,0.247,0.0,...,0.461889,108.114,-11.715,218.932,0.455733,108.824,92.198,4.0,0.778,0.0
1,0.63063,0.4175,0.122562,71.9683,0.730807,73.2008,0.0,148.03546,0.148,0.0,...,0.54725,72.8932,16.3049,137.915,0.616008,73.2011,121.274,4.0,0.384,1969.0
2,0.487357,0.343428,0.43055,86.7962,0.43055,86.7962,0.0,177.47546,0.282,0.0,...,0.377611,86.839,17.4227,172.304,0.33256,86.6473,100.07,1.0,0.0,0.0
3,0.630382,0.454231,0.118609,115.989,0.621877,116.743,0.0,233.40363,0.0,0.0,...,0.3085,112.635,30.0667,217.124,0.260192,116.617,119.293,4.0,0.0,1982.0
4,0.651046,0.401724,0.127936,101.84,0.435171,102.531,0.0,209.60608,0.066,0.0,...,0.471611,98.9574,24.9604,198.699,0.257529,102.531,129.738,4.0,0.562,2007.0


In [47]:
features = features.replace('',np.nan)

In [48]:
for i in features.columns:
    if features[i].dtype == 'O':
        features[i] = features[i].astype(float)

##### # Replacing 0 with nan values as there is a very little chance for mean to be 0. This implies that the values are missing.

In [49]:
for i in col:
    features[i] = features[i].replace(0,np.nan)

In [50]:
features['year'] = features['year'].astype(int)
features['year'] = features['year'].replace(0,np.nan)

In [51]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   artist_familiarity          9996 non-null   float64
 1   artist_hotttnesss           10000 non-null  float64
 2   bars_confidence             9935 non-null   float64
 3   bars_start                  9970 non-null   float64
 4   beats_confidence            9845 non-null   float64
 5   beats_start                 9975 non-null   float64
 6   danceability                10000 non-null  float64
 7   duration                    10000 non-null  float64
 8   end_of_fade_in              10000 non-null  float64
 9   energy                      10000 non-null  float64
 10  key                         10000 non-null  float64
 11  key_confidence              10000 non-null  float64
 12  loudness                    10000 non-null  float64
 13  mode                        1000

#### Filling missing values using mice

In [52]:
features_array = mice(np.array(features))

#### Save the featrues and dataset to csv

In [53]:
features = pd.DataFrame(features_array,columns=features.columns)
features.year = features.year.astype(int)
features.to_csv('Datasets/Feature_List.csv')
dataset.to_csv('Datasets/Million_final.csv')