<h2><b>Generate mood labels for the main dataset</b></h2>

<h4>Import necessary libraries</h4>

In [13]:
import sklearn
import pandas as pd
from numpy import mean
from numpy import std
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from matplotlib import pyplot as plt

<h3>Load Labelled data</h3>

In [14]:
# Labelled tracks: audio features plus a 'mood' column used as the target.
data = pd.read_csv(filepath_or_buffer='data_moods.csv')
data.columns

Index(['name', 'album', 'artist', 'id', 'release_date', 'popularity', 'length',
       'danceability', 'acousticness', 'energy', 'instrumentalness',
       'liveness', 'valence', 'loudness', 'speechiness', 'tempo', 'key',
       'time_signature', 'mood'],
      dtype='object')

In [69]:
# Target: the mood label. Predictors: every numeric column of the labelled data.
y = data['mood']
X = data.select_dtypes(include='number')

<h3>Select Number of features</h3>

In [70]:
import os

def gen_models(model_name, min_features=2, max_features=9):
    """
    Build one RFE + decision-tree pipeline per candidate number of features.

    input: model_name   - an estimator *class* (not an instance); it is
                          instantiated with its default arguments and used as
                          the ranking estimator inside RFE
           min_features - smallest number of features to select (default 2)
           max_features - largest number of features to select, inclusive
                          (default 9)
    output: a dictionary mapping str(number_of_features) to an unfitted
            Pipeline with steps 's' (RFE) and 'm' (DecisionTreeClassifier),
            for every count in [min_features, max_features]
    raises: ValueError if the bounds do not describe a non-empty range of
            positive feature counts
    """
    if min_features < 1 or max_features < min_features:
        raise ValueError(
            'expected 1 <= min_features <= max_features, got %r and %r'
            % (min_features, max_features)
        )

    models = dict()
    for n_features in range(min_features, max_features + 1):
        # A fresh ranking estimator and a fresh final classifier per pipeline,
        # so no fitted state is shared between candidates.
        rfe = RFE(estimator=model_name(), n_features_to_select=n_features)
        model = DecisionTreeClassifier()
        models[str(n_features)] = Pipeline(steps=[('s', rfe), ('m', model)])
    return models



def evaluate_model(model, X, y, n_splits=10, n_repeats=3, random_state=1):
    """
    Runs repeated stratified k-fold cross validation using provided model, X and y.

    With the defaults this is 10-fold cross validation repeated 3 times
    (not 3-fold), with the splits seeded by random_state=1.
    Folds are evaluated in parallel on all available cores (n_jobs=-1) and
    any fitting error is re-raised instead of being scored (error_score='raise').

    Returns the accuracy scores produced by cross_val_score
    (one per fold per repeat).
    """
    cross_validation = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )
    scores = cross_val_score(
        model, X, y,
        scoring='accuracy',
        cv=cross_validation,
        n_jobs=-1,
        error_score='raise',
    )
    return scores



def plot_model_comparisons(model_names):
    """
    Compare RFE ranking estimators across feature counts on a 2X2 grid of box plots.

    input: array of exactly 4 estimator classes as defined in sklearn library
           (classes, not instances; each is passed to gen_models)
    output: nothing is returned; for every class one subplot is drawn with a
            box per feature count, the mean and standard deviation of each
            candidate's scores are printed, and the figure is saved as
            'model_comparisons.png' in the current working directory

    NOTE: the features and labels are read from the notebook-level variables
    X and y, so those must be defined before this function is called.
    """
    assert len(model_names) == 4, 'expected exactly 4 estimator classes for the 2X2 grid'
    fig, axs = plt.subplots(2, 2, figsize=(10, 8))

    for ax, model_name in zip(axs.flatten(), model_names):
        results, names = [], []
        for name, model in gen_models(model_name).items():
            scores = evaluate_model(model, X, y)
            results.append(scores)
            names.append(name)
            print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

        # Tick labels are set separately: boxplot's `labels` keyword is
        # deprecated in recent Matplotlib, while set_xticklabels works on both
        # old and new releases.
        ax.boxplot(results, showmeans=True)
        ax.set_xticklabels(names)
        # The class name is the title (same text the old str()-slicing produced).
        ax.set_title(model_name.__name__)
        ax.set_xlabel('number of features selected')
        ax.set_ylabel('accuracy')

    fig.tight_layout()
    fig.savefig(os.path.join(os.getcwd(), 'model_comparisons.png'))

In [71]:
%%capture

import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress ConvergenceWarnings
# NOTE(review): the saved output of this cell still shows "ITERATIONS REACHED
# LIMIT" solver messages, so this filter presumably does not reach the worker
# processes started by cross_val_score(n_jobs=-1) -- confirm. Scaling the
# features or raising max_iter (as the message itself suggests) would address
# the cause rather than hide the symptom.
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# Estimator classes (not instances) tried as the RFE ranking estimator;
# plot_model_comparisons asserts that exactly four are given (2X2 grid).
model_names = [DecisionTreeClassifier, LogisticRegression, Perceptron, GradientBoostingClassifier]
plot_model_comparisons(model_names=model_names)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [15]:
# Main (unlabelled) dataset that will receive the predicted moods.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The saved output of this cell echoes the read_csv
# line, which looks like a pandas warning -- presumably a mixed-type
# DtypeWarning; confirm on re-run.
big_data = pd.read_csv('data.csv', low_memory=False)
big_data.columns

  big_data = pd.read_csv('data.csv')


Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo', 'mood_prediction'],
      dtype='object')

In [53]:
# Numeric columns available in the main dataset (candidates for model input).
big_data.select_dtypes(include='number').columns

Index(['valence', 'year', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'explicit', 'instrumentalness', 'key', 'liveness', 'loudness',
       'mode', 'popularity', 'speechiness', 'tempo'],
      dtype='object')

<h3>Re-select the training features (dropping columns absent from the main dataset)</h3>

In [16]:
# Target labels come straight from the labelled data.
y = data['mood']
# Numeric predictors only, without 'length' and 'time_signature': neither
# column appears in big_data's column listing, so the model cannot use them.
X = data.select_dtypes('number').drop(columns=['length', 'time_signature'])

In [17]:
# Fixed seeds make the feature ranking and the fitted tree -- and therefore the
# mood labels generated from them -- reproducible across kernel restarts.
# The value 1 matches the seed already used for cross validation above.
rfe = RFE(estimator=DecisionTreeClassifier(random_state=1), n_features_to_select=9)
model = DecisionTreeClassifier(random_state=1)
pipeline = Pipeline(steps=[('s', rfe), ('m', model)])
pipeline.fit(X, y)

In [18]:
# Columns fed to the fitted pipeline, listed in the order used for prediction.
chosen_features = [
    'popularity',
    'danceability',
    'acousticness',
    'energy',
    'instrumentalness',
    'liveness',
    'valence',
    'loudness',
    'speechiness',
    'tempo',
    'key',
]

# Predict a mood for every row of big_data ...
predictions = pipeline.predict(big_data[chosen_features])

# ... and keep the result in a new 'predicted_mood' column of big_data.
big_data['predicted_mood'] = predictions

In [19]:
# Distinct mood labels produced by the model.
big_data['predicted_mood'].unique()

array(['Calm', 'Sad', 'Happy', 'Energetic'], dtype=object)

In [20]:
# (rows, columns) of the main dataset after the 'predicted_mood' column was added.
big_data.shape

(170653, 21)

In [21]:
# Persist the dataset together with the new 'predicted_mood' column.
# NOTE(review): this overwrites 'data.csv', the same file big_data was read
# from, so the original input is not preserved and re-running the notebook
# starts from an already-augmented file (the loaded frame already carries a
# 'mood_prediction' column). The path is kept because a later cell reads
# 'data.csv' and expects 'predicted_mood' in it.
big_data.to_csv('data.csv', index=False)

In [23]:
# How many tracks were assigned to each predicted mood.
big_data.predicted_mood.value_counts()

predicted_mood
Sad          81728
Happy        47404
Energetic    21457
Calm         20064
Name: count, dtype: int64

In [35]:
def predict_mood(model, features_list):
    """
    Predict the mood of a single track.

    input: fitted pipeline object, and features list (must match the features
           of the training dataset, in the same order)
    output: a new list containing the feature values followed by the
            predicted mood; the features_list passed in is left unchanged
    """
    # Work on a copy: appending to the caller's list would leave it with one
    # element too many for any later model.predict call.
    row = list(features_list)
    # predict expects a 2D input, hence the single-row wrapper.
    yhat = model.predict([row])
    return row + [yhat.item()]

<h3>How to use the predict_mood function</h3>

In [5]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

In [24]:
# load the data (the main dataset, including the 'predicted_mood' column)
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The saved output of this cell echoes the read_csv
# line, which looks like a pandas warning -- presumably a mixed-type
# DtypeWarning; confirm on re-run.
main_df = pd.read_csv('data.csv', low_memory=False)
main_df.columns

  main_df = pd.read_csv('data.csv')


Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo', 'mood_prediction', 'predicted_mood'],
      dtype='object')

In [27]:
# Class balance of the labels the demo model will be trained on.
main_df['predicted_mood'].value_counts()

predicted_mood
Sad          81728
Happy        47404
Energetic    21457
Calm         20064
Name: count, dtype: int64

In [28]:
# (rows, columns) of the reloaded main dataset.
main_df.shape

(170653, 21)

In [29]:
# Rows with no missing value in any column; the shape shows how few remain.
df_no_nan = main_df.dropna(axis=0, how='any')
df_no_nan.shape

(686, 21)

In [30]:
# The nine audio features used by the demo model, in training order.
chosen_features = [
    'danceability',
    'acousticness',
    'energy',
    'instrumentalness',
    'liveness',
    'valence',
    'loudness',
    'speechiness',
    'tempo',
]

In [31]:
# Training matrix: the chosen audio features of every track.
X_train = main_df.loc[:, chosen_features]
# Training target: the 'predicted_mood' column of the main dataset.
y_train = main_df.loc[:, 'predicted_mood']

In [32]:
# Fixed seeds make the fitted tree, and so the example prediction below,
# reproducible across kernel restarts.
# n_features_to_select=9 equals the number of chosen features, so RFE keeps
# all of them here; the step is retained so the pipeline has the same
# 's' / 'm' structure as the one used to generate the labels.
rfe = RFE(estimator=DecisionTreeClassifier(random_state=1), n_features_to_select=9)
model = DecisionTreeClassifier(random_state=1)
pipeline = Pipeline(steps=[('s', rfe), ('m', model)])
pipeline.fit(X_train, y_train)

In [34]:
# One example track as a single-row 2D list; values follow the order of
# chosen_features.
x = [[
    0.5,   # danceability
    0.6,   # acousticness
    0.78,  # energy
    0.3,   # instrumentalness
    0.9,   # liveness
    0.2,   # valence
    0.4,   # loudness
    0.1,   # speechiness
    0.2,   # tempo
]]
pipeline.predict(x).item()



'Happy'

In [38]:
# Same prediction through the helper: pass the fitted pipeline and one flat
# list of feature values; the result is the feature values followed by the
# predicted mood.
x_withMood = predict_mood(pipeline, x[0])
x_withMood



[0.5, 0.6, 0.78, 0.3, 0.9, 0.2, 0.4, 0.1, 0.2, 'Happy']