In [44]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
import spacy
data=pd.read_excel("List_Songs-HIPPOP.xlsx")
data2=pd.concat([data, pd.get_dummies(data['Genre'])], axis=1)

In [45]:
#Remeber that vi importet Spacy (also EN version) in terminal here!
#Here vi will create our tokenizer

import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English


#Making the data readable for the machine to learn
# Punctuation marks list
punctuations = string.punctuation

# English tokenizer
parser = English()

# Stopwords
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Tokenizer function
def spacy_tokenizer(sentence):
    # Our token object:
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Remove stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return list of tokens
    return mytokens

In [46]:
#We transform our text to get rid of useless information that might increase uncertainty for the model.

class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [47]:
#BAG OF WORDS APPROACH. This vectorizer does not care about differences between documents, just the entire set of tokens.
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

In [48]:
#This vectorizer knows that it means something, if one word is mentioned multiple time in one "document",
#and takes this into its calculations. It also counts across documents though.
#Good explanation here: https://www.quora.com/What-is-the-difference-between-TfidfVectorizer-and-CountVectorizer-1
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [49]:
#DATA SPECIFIC FROM HEREON OUT:

In [50]:
#Lets look at our data again!
data2.head(2)

Unnamed: 0,Lyrics,Title,Year,Genre,Hip Hop,Pop
1,Uh uh uh\nI just gotta bring it to they attent...,#1,2002,Hip Hop,1,0
2,\n\nChorus (2x):(Mannie Fresh)\nAll the nigga...,#1,2002,Hip Hop,1,0


In [51]:
#We split our data into training and test data.
#We know that our lyrics are the features, and that the genre (hip hop/pop 0/1) is our label.
#Don't mind the "hip hop" label --> this is a dummy for hiphop/pop
#Maybe remove other column and rename?^

#THIS IS THE PLACE WHERE THE "RANDOMIZATION" TAKES PLACE. NB!

from sklearn.model_selection import train_test_split

ylabels = data2['Hip Hop'] # labels

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

In [52]:
#Classifier that is able to K-Fold the training proces (logistic regression)
from sklearn.linear_model import LogisticRegressionCV
classifier = LogisticRegressionCV(cv=5)

#Here we have the two different kinds of GENERALIZATIONS# l1=Lasso, l2=Ridge

#(penalty='l1', solver='liblinear')
#(penalty='l2')

# Create pipeline (change the vectorizer, depending on wether we choose TFIDF VECTOR or BOW VECTOR)
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfidf_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x1a2161c1d0>), ('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ng...    random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0))])

In [53]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model results
print("Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Precision:",metrics.precision_score(y_test, predicted))
print("Recall:",metrics.recall_score(y_test, predicted))

Accuracy: 0.8
Precision: 0.8688524590163934
Recall: 0.7464788732394366
