# Ratings Predictor - Project Group 6
Gillian Foster, Tristan Hsieh, Yulissa Montes, Ezra Rebollar

April 19, 2019 

50395

In [1]:
import warnings
warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time

### Step One: Organizing the Data

The data collected from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TVYEGI is divided into six CSV files. Here, the data is combined into a single set and whittled down to the relevant features.

In [2]:
def _load_and_report(csv_path):
    """Read one CSV file, print its per-column null counts, and return the frame."""
    frame = pd.read_csv(csv_path)
    print(frame.isnull().sum())
    return frame


# Load each source table in turn; every load is followed by its null-count report.
attributes = _load_and_report("attributes.csv")
ratings_mean = _load_and_report("ratings_mean.csv")
ratings_all = _load_and_report("ratings_all.csv")
track_info = _load_and_report("trackinfo.csv")

AttributeID      0
AttributeName    0
dtype: int64
TrackID        0
Valence        0
Arousal        0
Tension        0
Atmospheric    0
Happy          0
Dark           0
Sad            0
Angry          0
Sensual        0
Sentimental    0
dtype: int64
UserID         0
TrackID        0
Rating         0
AttributeID    0
dtype: int64
TrackID         0
LastfmID        0
Artist          0
Title           0
URL             0
SpotifyURL      0
ClipStart(s)    0
ClipEnd(s)      0
dtype: int64


There is no null data in the selected dataset.

Some data, like clip start and end times, is more useful when consolidated. ClipLength and TitleLength are both derived features.

In [3]:
# Add the two derived features (title length in characters, clip duration as
# end minus start), then drop the identifier/URL columns and the raw columns
# the derived features were built from.
track_info = (
    track_info
    .assign(
        TitleLength=track_info['Title'].str.len(),
        ClipLength=track_info["ClipEnd(s)"] - track_info["ClipStart(s)"],
    )
    .drop(columns=['LastfmID', 'URL', 'SpotifyURL', 'ClipStart(s)', 'ClipEnd(s)', 'Title'])
)
track_info.head(10)

Unnamed: 0,TrackID,Artist,TitleLength,ClipLength
0,1,Autechre,4,14.875
1,2,Lalo Schifrin,23,29.925
2,3,Daft Punk,11,14.875
3,4,Portishead,10,29.925
4,5,The Rolling Stones,13,29.925
5,6,Tindersticks,16,29.925
6,7,FischerSpooner,16,14.875
7,8,Horace Silver,23,29.95
8,9,Janet Jackson,24,14.875
9,10,Chet Baker,25,29.925


The AttributeID values in ratings_all are replaced with the attribute names they reference.

In [4]:
# Re-index the lookup table by AttributeID, then pull the
# AttributeID -> AttributeName mapping straight from that column.
attributes = attributes.set_index('AttributeID')
attributesDict = attributes['AttributeName'].to_dict()

In [5]:
# Swap each AttributeID for its name via the lookup dict, then rename the
# column to reflect its new contents.
ratings_all = (
    ratings_all
    .replace({'AttributeID': attributesDict})
    .rename(columns={'AttributeID': 'Attribute'})
)
ratings_all.head(10)

Unnamed: 0,UserID,TrackID,Rating,Attribute
0,1,272,6,Happy
1,1,272,2,Dark
2,1,272,3,Sad
3,1,272,2,Angry
4,1,272,7,Sensual
5,1,272,6,Sentimental
6,1,272,4,Arousal
7,1,272,6,Valence
8,1,272,3,Tension
9,1,272,7,Atmospheric


In [6]:
# Attach the per-track columns to every individual rating, matching on TrackID
# (inner join, which is the merge default).
data_ratings = ratings_all.merge(right=track_info, how='inner', on='TrackID')
data_ratings.head(15)

Unnamed: 0,UserID,TrackID,Rating,Attribute,Artist,TitleLength,ClipLength
0,1,272,6,Happy,Efterklang,8,14.875
1,1,272,2,Dark,Efterklang,8,14.875
2,1,272,3,Sad,Efterklang,8,14.875
3,1,272,2,Angry,Efterklang,8,14.875
4,1,272,7,Sensual,Efterklang,8,14.875
5,1,272,6,Sentimental,Efterklang,8,14.875
6,1,272,4,Arousal,Efterklang,8,14.875
7,1,272,6,Valence,Efterklang,8,14.875
8,1,272,3,Tension,Efterklang,8,14.875
9,1,272,7,Atmospheric,Efterklang,8,14.875


In [7]:
# Join the per-track columns with the mean ratings on TrackID, then discard
# the TrackID key from the combined frame.
data_means = (
    track_info
    .merge(right=ratings_mean, how='inner', on='TrackID')
    .drop(columns=['TrackID'])
)
data_means.head(10)

Unnamed: 0,Artist,TitleLength,ClipLength,Valence,Arousal,Tension,Atmospheric,Happy,Dark,Sad,Angry,Sensual,Sentimental
0,Autechre,4,14.875,3.7,5.2414,7.3,4.6667,2.4667,5.7333,3.069,4.5172,2.2667,1.8
1,Lalo Schifrin,23,29.925,7.6786,6.9286,3.75,4.6786,7.1429,1.4074,1.8929,1.3929,2.4286,2.8571
2,Daft Punk,11,14.875,6.6897,2.1333,2.6,7.2,4.8966,2.2,3.0345,1.2333,5.1333,4.1
3,Portishead,10,29.925,4.1,3.9333,5.1667,6.4667,2.4333,5.3333,5.9,2.8621,6.0,5.6333
4,The Rolling Stones,13,29.925,6.0345,3.8276,4.3793,5.4483,4.6667,2.2222,2.6897,2.6897,3.6786,4.1071
5,Tindersticks,16,29.925,5.8148,3.0714,3.5357,6.3793,3.8276,3.0,5.3793,1.6897,4.6552,5.7586
6,FischerSpooner,16,14.875,4.4828,6.2,6.4667,4.0345,3.4138,4.3667,3.7,3.3333,2.3793,2.6667
7,Horace Silver,23,29.95,5.931,5.4138,4.5517,5.6207,4.3214,2.0741,4.0,1.8077,3.7931,3.3929
8,Janet Jackson,24,14.875,6.931,7.25,4.1786,4.6207,6.5517,1.6552,2.4138,1.6552,4.5,3.9286
9,Chet Baker,25,29.925,5.3929,1.9643,2.8276,7.2069,2.7586,3.4138,5.7586,1.5862,6.0345,5.7586


### Step Two: Exploring the Data

Now that the data is combined into a single set, this section explores some patterns in it.

In [8]:
# Summary statistics and dimensions of data_means
print(f"data_means described: \n{data_means.describe()}")
print()
print(
    "dimensions of data_means:\n"
    f"number of rows: {data_means.shape[0]}\n"
    f"number of columns: {data_means.shape[1]}"
)
print()

data_means described: 
       TitleLength  ClipLength     Valence     Arousal     Tension  \
count   600.000000  600.000000  600.000000  600.000000  600.000000   
mean     14.913333   26.341208    5.436206    5.010788    4.771573   
std       7.928888    6.413236    1.195235    1.898004    1.453082   
min       3.000000   14.875000    1.866700    1.600000    2.069000   
25%      10.000000   29.925000    4.740700    3.178600    3.516100   
50%      14.000000   29.925000    5.631800    5.086200    4.638100   
75%      18.000000   29.925000    6.250000    6.720125    5.873250   
max      60.000000   30.000000    8.000000    8.551700    8.400000   

       Atmospheric       Happy        Dark         Sad       Angry  \
count   600.000000  600.000000  600.000000  600.000000  600.000000   
mean      5.248452    4.037959    3.452491    3.959013    2.772385   
std       1.424241    1.411678    1.486005    1.268322    1.588414   
min       1.928600    1.448300    1.137900    1.321400    1.192300

In [9]:
# Correlation coefficient of every other mood column against "Sentimental".
# The tuple fixes the order in which the results are printed.
for mood in ('Atmospheric', 'Sad', 'Arousal', 'Sensual', 'Tension',
             'Angry', 'Dark', 'Valence', 'Happy'):
    print(f"correlation coefficient for Sentimental vs. {mood}:",
          data_means['Sentimental'].corr(data_means[mood]))


correlation coefficient for Sentimental vs. Atmospheric: 0.7544399704393624
correlation coefficient for Sentimental vs. Sad: 0.7344198936886337
correlation coefficient for Sentimental vs. Arousal: -0.7146938692232553
correlation coefficient for Sentimental vs. Sensual: 0.7008375734092313
correlation coefficient for Sentimental vs. Tension: -0.6185503609410423
correlation coefficient for Sentimental vs. Angry: -0.5125949063539434
correlation coefficient for Sentimental vs. Dark: -0.1994732641335074
correlation coefficient for Sentimental vs. Valence: 0.17219959538823057
correlation coefficient for Sentimental vs. Happy: -0.08270548442427043


In [10]:
from sklearn.model_selection import train_test_split

# Fixed seed for the shuffle inside train_test_split. Without it the split
# differs on every run, so the rows printed below and anything computed from
# the split later could not be reproduced after a kernel restart.
SPLIT_SEED = 42

# Features: every column of data_means except the target "Sentimental".
data_features = data_means.drop(["Sentimental"], axis=1)

# Target: the "Sentimental" column only.
data_class = data_means["Sentimental"]

# Split data into training and test sets (proportions: 80 and 20).
features_train, features_test, class_train, class_test = train_test_split(
    data_features, data_class, test_size=0.20, random_state=SPLIT_SEED
)
print(type(features_train))
print("features_train=",features_train.shape, " class_train=", class_train.shape)
print("features_test=",features_test.shape, " class_test=", class_test.shape)
print(pd.DataFrame(class_test))
print("class test:", class_test)

<class 'pandas.core.frame.DataFrame'>
features_train= (480, 12)  class_train= (480,)
features_test= (120, 12)  class_test= (120,)
     Sentimental
63        2.5926
520       3.1724
103       4.6897
578       6.2857
240       5.4667
..           ...
56        3.7000
75        6.4333
450       3.7500
62        3.7333
203       5.2333

[120 rows x 1 columns]
class test: 63     2.5926
520    3.1724
103    4.6897
578    6.2857
240    5.4667
        ...  
56     3.7000
75     6.4333
450    3.7500
62     3.7333
203    5.2333
Name: Sentimental, Length: 120, dtype: float64


In [None]:
# TODO: derive bins for the model -- not implemented yet (TBD).
# Every line in this cell is commented out, so running it does nothing.
# The draft below is kept for reference only:
# df = data_means.groupby(["Valence", "Arousal", "Tension", "Atmospheric", "Happy", "Dark", "Sad", "Angry", "Sensual"]).sum().reset_index()
# print(df.head())
# print("describe:", df.describe())
# print()
# pd.qcut(df, q=3)
# NOTE(review): pandas documents qcut as taking a 1-D array or Series, so the
# draft call above (which passes the whole frame) presumably needs to target a
# single column -- confirm before enabling.