# Data Wrangling

## Importing Libraries & Dataset

In [218]:
# Importing required libraries and packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, KFold, StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC, LinearSVC

In [68]:
# Read both track exports (disliked first, then liked) into their own DataFrames
disliked_data, liked_data = map(pd.read_csv, ('disliked_data.csv', 'liked_data.csv'))

In [69]:
# Preview the first five rows of the liked tracks
liked_data.head(n=5)

Unnamed: 0,#,TITLE,ARTIST,RELEASE,BPM,ENERGY,DANCE,LOUD,VALENCE,LENGTH,ACOUSTIC,POP.,RND
0,1,"Psycho, Pt. 2",Russ,11/8/2016,83,55,77,-4,42,2:42,78,79,9568
1,2,R.I.P. Fredo (feat. Young Nudy) - Notice Me,Playboi Carti,5/11/2018,140,76,91,-5,63,2:41,10,65,9739
2,3,ROXANNE,Arizona Zervas,10/10/2019,117,60,62,-6,46,2:44,5,93,138
3,4,A$AP Forever,A$AP Rocky,4/5/2018,126,78,47,-6,44,3:53,22,70,7739
4,5,Funkin Fun,Scotty Sire,9/16/2019,120,66,84,-7,65,3:07,8,50,8788


In [70]:
# Preview the first five rows of the disliked tracks
disliked_data.head(n=5)

Unnamed: 0,#,TITLE,ARTIST,RELEASE,BPM,ENERGY,DANCE,LOUD,VALENCE,LENGTH,ACOUSTIC,POP.,RND
0,1,We Were,Keith Urban,5/14/2019,79,80,44,-6,63,3:09,13,73,4164
1,2,What She Wants Tonight,Luke Bryan,10/24/2019,170,90,48,-4,72,3:07,6,72,2755
2,3,I Hope You’re Happy Now,Carly Pearce,9/27/2019,118,82,59,-5,31,3:19,17,71,2982
3,4,Heartache Medication,Jon Pardi,5/20/2019,95,79,62,-3,73,3:29,4,74,4149
4,5,Tip of My Tongue,Kenny Chesney,7/12/2019,95,56,67,-5,49,3:19,8,75,2327


In [71]:
# Label every liked track with the positive class (1)
liked_data.loc[:, 'Class'] = 1

In [72]:
# Label every disliked track with the negative class (0)
disliked_data.loc[:, 'Class'] = 0

In [73]:
# Stack the liked rows on top of the disliked rows.
# Each frame keeps its own index, so index labels repeat in the result.
combined_data = pd.concat(objs=[liked_data, disliked_data])

In [74]:
# Spot-check five randomly drawn rows of the combined frame (unseeded, so rows vary per run)
combined_data.sample(n=5)

Unnamed: 0,#,TITLE,ARTIST,RELEASE,BPM,ENERGY,DANCE,LOUD,VALENCE,LENGTH,ACOUSTIC,POP.,RND,Class
640,641,Someone You Loved,Lewis Capaldi,5/17/2019,110,41,50,-6,45,3:02,75,96,74,1
318,319,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",YG,5/25/2018,204,34,74,-8,12,3:58,1,6,7563,1
676,677,Lekko,Alonzo Gautier,7/5/2019,114,10,39,-24,22,2:23,99,64,7342,0
238,239,Hypnotize - 2014 Remaster,The Notorious B.I.G.,3/4/1997,94,70,90,-3,67,3:50,14,77,7776,1
648,649,Jesie?,Aleksy Wysocki,11/8/2019,148,2,41,-26,50,1:59,100,60,929,0


In [75]:
# Remove the '#' and 'RELEASE' columns in place
combined_data.drop(columns=['#', 'RELEASE'], inplace=True)

In [77]:
# Spot-check five random rows now that '#' and 'RELEASE' are gone
combined_data.sample(n=5)

Unnamed: 0,TITLE,ARTIST,BPM,ENERGY,DANCE,LOUD,VALENCE,LENGTH,ACOUSTIC,POP.,RND,Class
65,Roll Some Mo,Lucky Daye,75,51,40,-9,36,4:53,26,68,8705,0
211,Sacrifices,Drake,139,43,90,-12,56,5:08,5,65,6229,1
443,I Like Me Better,Lauv,92,51,75,-8,42,3:17,54,85,8231,1
226,SOSO,WINNER,116,85,71,-4,73,3:18,42,68,4335,0
648,Jesie?,Aleksy Wysocki,148,2,41,-26,50,1:59,100,60,929,0


## Data Cleaning and Preprocessing

In [88]:
# Report, column by column, whether at least one value is missing
combined_data.isna().any()

TITLE       False
ARTIST      False
BPM         False
ENERGY      False
DANCE       False
LOUD        False
VALENCE     False
LENGTH      False
ACOUSTIC    False
POP.        False
RND         False
Class       False
dtype: bool

In [89]:
# Flag whether any TITLE value occurs more than once
title_repeats = combined_data['TITLE'].duplicated()
any_duplicate = bool(title_repeats.any())
any_duplicate

True

In [90]:
# Remove rows with a repeated TITLE, in place.
# keep=False discards *every* copy of a repeated title rather than keeping one.
# NOTE(review): matching on TITLE alone also removes different songs that merely
# share a title — confirm this is intended.
combined_data.drop_duplicates(subset=['TITLE'], keep=False, inplace=True)

# Re-run the check to confirm no repeated titles remain
any_duplicate = bool(combined_data['TITLE'].duplicated().any())
any_duplicate

False

In [95]:
# Inspect the dtype of every column; LENGTH is the only non-numeric feature besides TITLE/ARTIST
combined_data.dtypes

TITLE       object
ARTIST      object
BPM          int64
ENERGY       int64
DANCE        int64
LOUD         int64
VALENCE      int64
LENGTH      object
ACOUSTIC     int64
POP.         int64
RND          int64
Class        int64
dtype: object

In [99]:
# Inspect the column dtypes again.
# NOTE(review): the recorded output lists LENGTH as datetime64[ns] while the
# previous dtypes cell shows object, and no cell in between changes it — this
# looks like out-of-order execution; confirm with Restart Kernel -> Run All.
combined_data.dtypes

TITLE               object
ARTIST              object
BPM                  int64
ENERGY               int64
DANCE                int64
LOUD                 int64
VALENCE              int64
LENGTH      datetime64[ns]
ACOUSTIC             int64
POP.                 int64
RND                  int64
Class                int64
dtype: object

In [117]:
# Convert duration from Min:Sec to just seconds
def _length_to_seconds(length):
    """Return a track length such as '3:07' (or '1:02:03') as whole seconds.

    Each colon-separated field is folded in base 60, so 'M:SS' yields
    M * 60 + SS and 'H:MM:SS' yields H * 3600 + MM * 60 + SS.
    Raises ValueError if a field is not an integer.
    """
    total_seconds = 0
    for field in str(length).strip().split(':'):
        total_seconds = total_seconds * 60 + int(field)
    return total_seconds


# Parse the text directly instead of going through pd.to_datetime: to_datetime
# reads 'M:SS' as hour:minute, which only gives the right number by coincidence,
# breaks once a track reaches 24 minutes, and miscounts 'H:MM:SS' values.
combined_data['DURATION'] = combined_data['LENGTH'].map(_length_to_seconds)

# LENGTH is now redundant; RND is dropped together with it.
combined_data.drop(columns=['LENGTH', 'RND'], inplace=True)

In [162]:
# Spot-check five random rows after the duration conversion
combined_data.sample(n=5)

Unnamed: 0,TITLE,ARTIST,BPM,ENERGY,DANCE,LOUD,VALENCE,ACOUSTIC,POP,DURATION,Class
475,Forrester,Adam Fincher,70,1,40,-24,29,100,59,146,0
440,Close To Me (with Diplo) (feat. Swae Lee),Ellie Goulding,144,76,57,-5,49,10,84,183,1
45,wokeuplikethis*,Playboi Carti,78,62,79,-7,48,1,76,236,1
395,fabienne,Klaus Weissmuller,134,6,40,-23,24,100,68,116,0
422,Hate Me (with Juice WRLD),Ellie Goulding,75,77,66,-5,76,11,89,186,1
