In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from sklearn.metrics import f1_score
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import BorderlineSMOTE

from scipy import stats


%matplotlib inline

In [3]:
# Load the tab-separated word-bag table, then keep one row per distinct lyric text
combined = pd.read_csv('./wordBag.csv', sep='\t')
data = combined.drop_duplicates(subset=['lyrics'])

In [8]:
class popularityModel(object):
    """
    Baseline classifier that always predicts the most frequent training label.

    Attributes are documented inline in ``__init__``.
    """
    def __init__(self):
        # Most frequent label seen by fit(); stays None until fit() succeeds.
        self.mode = None

    def fit(self, X, y):
        """
        Memorise the most frequent label in ``y``.

        X: (number of examples k, number of features d) array-like or sparse
           matrix; only ``X.shape[0]`` is used, as a consistency check.
        y: (number of examples,) ndarray, pandas Series or list of labels.
           Missing values (NaN/None) are ignored. Ties are broken by taking
           the smallest label in sort order.

        Prints a message and leaves the model unchanged when the row counts
        of X and y differ or when y has no non-missing label.
        """
        # Normalise every accepted container to a 1-D pandas Series so that
        # ndarray/list inputs work and string labels are supported (recent
        # SciPy releases reject non-numeric input in stats.mode and no longer
        # return an indexable array by default).
        labels = pd.Series(np.asarray(y).ravel())
        if X.shape[0] != labels.shape[0]:
            print("error, dim not match")
            return

        # Series.mode() drops missing values and returns the modes sorted.
        modes = labels.mode()
        if modes.empty:
            print("error, no valid label in y")
            return
        self.mode = modes.iloc[0]

    def predict(self, X):
        """
        Predict the memorised label for every row of ``X``.

        X: (number of examples k_1, number of features d) array-like or
           sparse matrix; only ``X.shape[0]`` is used.

        Returns an ndarray of length k_1 filled with the fitted mode, or
        ``None`` (after printing a message) when the model is not fitted.
        """
        # Identity check: `== None` would broadcast on array-like values.
        if self.mode is None:
            print("Please fit the model first.")
            return

        return np.array([self.mode] * X.shape[0])
        
    

## classification with top 10 artists

First, we try the popularity model, which is our baseline, and then an SVM to see if the model works.

In [4]:
# Song counts of the ten most frequent artists, then the rows belonging to them
artist_10 = data['artist'].value_counts().head(10)
data_10 = data[data['artist'].isin(artist_10.index)]

In [6]:
# Stratified 80/20 split on artist, so both parts keep the same class balance
train, test = train_test_split(
    data_10,
    test_size=0.2,
    random_state=124,
    stratify=data_10['artist'],
)
# The vectorizers expect plain Python strings, so cast the word-bag column explicitly
corpus = train['wordBag'].values.astype(str).tolist()
corpus_test = test['wordBag'].values.astype(str).tolist()

In [9]:
# Baseline classifier (always predicts the most frequent label)
popuralityMod = popularityModel()
# Bag-of-words features, vocabulary capped at 5000 terms
vectorizer = CountVectorizer(max_features=5000)

In [28]:
# Train the popularity model.
# fit_transform learns the vocabulary from the training corpus and vectorizes it in one
# step; the vectorizer is not fitted anywhere else in this notebook, so a bare
# transform() raises NotFittedError on a fresh kernel.
X = vectorizer.fit_transform(corpus)
y = train.artist
popuralityMod.fit(X, y)



In [29]:
# Score the popularity baseline with macro-averaged F1 on the held-out split
y_true = test['artist']
X_test = vectorizer.transform(corpus_test)
y_pred = popuralityMod.predict(X_test)

f1_score(y_true, y_pred, average='macro')

  'precision', 'predicted', average, warn_for)


0.023020408163265303

The popularity model gives a macro F1 score of 0.023.

Next, we try an SVM on the top 10 artists.

In [31]:
# Linear SVM classifier. random_state is fixed (same seed as the train/test split)
# because the solver selects features with a random number generator, so unseeded
# runs can give slightly different scores.
clf = LinearSVC(max_iter=5000, random_state=124)

In [32]:
# Fit the linear SVM on the bag-of-words counts and score it (macro F1)
y_true = test['artist']
y_pred = clf.fit(X, y).predict(X_test)

f1_score(y_true, y_pred, average='macro')

0.4372951443549836

Using a simple linear SVC we got an F1 score of 0.437. To improve the performance, we try tf-idf and SVD before passing the data to the classifier.

In [61]:
# tf-idf features restricted to the 550 most frequent unigrams
vectorizer_t = TfidfVectorizer(max_features=550, ngram_range=(1, 1))
# Project onto 200 latent components. The default solver is randomized, so the seed is
# fixed (same value as the train/test split) to make the projection reproducible.
svd = TruncatedSVD(n_components=200, random_state=124)

In [57]:
# Learn the SVD projection on the training counts only, then apply it to both splits
X_d = svd.fit(X).transform(X)
X_test_d = svd.transform(X_test)

In [58]:
# Fit the SVM on the SVD-reduced counts and score it (macro F1)
y_true = test['artist']
y_pred = clf.fit(X_d, y).predict(X_test_d)

f1_score(y_true, y_pred, average='macro')

KeyboardInterrupt: 

SVM with SVD increases the F1 score to 0.444. (Note: the saved output of the cell above shows a KeyboardInterrupt rather than this score.)

We try tf-idf to see if it could give better results.

In [62]:
# Learn the tf-idf vocabulary and weights on the training corpus, then vectorize both splits
X_t = vectorizer_t.fit(corpus).transform(corpus)
X_test_t = vectorizer_t.transform(corpus_test)

In [63]:
# Fit the SVM on the tf-idf features and score it (macro F1);
# y_true is the held-out artist column set in an earlier cell
y_pred = clf.fit(X_t, y).predict(X_test_t)

f1_score(y_true, y_pred, average='macro')

0.43886497740246905

Tf-idf does not seem to work well for our dataset, possibly because we have already removed the stop words from it. Alternatively, the number of features kept may be too small for our model.

We try tf-idf with SVD to see if SVD can help improve the performance.

In [54]:
# Re-learn the SVD projection on the tf-idf training matrix, then apply it to both splits
# (this overwrites the X_d / X_test_d computed from the raw counts)
X_d = svd.fit(X_t).transform(X_t)
X_test_d = svd.transform(X_test_t)

In [55]:
# Fit the SVM on the SVD-reduced features and score it (macro F1);
# y_true is the held-out artist column set in an earlier cell
y_pred = clf.fit(X_d, y).predict(X_test_d)

f1_score(y_true, y_pred, average='macro')

0.4180033829454892

With SVD we get a worse F1 score (0.418), which may also result from a mismatch between the number of tf-idf features and the number of SVD components.