In [0]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer
import string
import re
nltk.download('wordnet')

In [33]:
# Define NLP functions

def tokenize(s):
    s = "".join(" " if x in string.punctuation else x for x in s.lower())
    return s.split()

# Lemmatize and Remove Stop words
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()
ls = LancasterStemmer()
ss = SnowballStemmer("english")

import nltk
nltk.download('stopwords')
stop_words = stopwords.words('english')

for s in string.punctuation:
    stop_words.append(s)

for s in '1234567890':
    stop_words.append(s)

def lemmatize(sentence):
    return [lemmatizer.lemmatize(word) for word in sentence]

def stemming(sentence):
    #return [ps.stem(word) for word in sentence]
    #return [ls.stem(word) for word in sentence]
    return [ss.stem(word) for word in sentence]
    #return [stem(word) for word in sentence]

def remove_stopwords(sentence):
    return [word for word in sentence if word not in stop_words]

def remove_low_freq_words(sentence):
    return [word for word in sentence if word not in low_freq_words]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
from google.colab import files
uploaded = files.upload()

Saving train.tsv to train (1).tsv


In [35]:
import pandas as pd
import io
df = pd.read_csv(io.StringIO(uploaded['train.tsv'].decode('utf-8')), sep='\t', header=None, names=["genre","lyrics"], index_col=False, dtype=str)
df.head()

Unnamed: 0,genre,lyrics
0,1000000,Waters running down By the silver moon rays To...
1,10000000,Churches should be there I think of adventures...
2,10000,My head in reverse Mind controlled Align wires...
3,10000000,The lost generations taking hold The last gene...
4,1000000,And shes dreaming about The view over the cast...


In [36]:
# Create train / test data
train, test = train_test_split(df, test_size=0.2, stratify=df["genre"])

# Preprocess train and test data
train["lyrics"] = train["lyrics"].apply(str.lower)
train["lyrics"] = train['lyrics'].apply(tokenize)
train["lyrics"] = train['lyrics'].apply(lemmatize)
train["lyrics"] = train['lyrics'].apply(remove_stopwords)
train = train[train["lyrics"].str.len() >= 200]

test["lyrics"] = test["lyrics"].apply(str.lower)
test["lyrics"] = test['lyrics'].apply(tokenize)
test["lyrics"] = test['lyrics'].apply(lemmatize)
test["lyrics"] = test['lyrics'].apply(remove_stopwords)
test = test[test["lyrics"].str.len() >= 200]

train_tagged = train.apply(lambda df: TaggedDocument(words=df.lyrics, tags=df.genre), axis=1)
test_tagged = test.apply(lambda df: TaggedDocument(words=df.lyrics, tags=df.genre), axis=1)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.ht

In [37]:
import multiprocessing
cores = multiprocessing.cpu_count()

model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=5, sample=1e-5, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

for epoch in range(20):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.025
    model_dbow.min_alpha = model_dbow.alpha

def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags, model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

100%|██████████| 24050/24050 [00:00<00:00, 1120412.87it/s]
100%|██████████| 24050/24050 [00:00<00:00, 1542683.84it/s]
100%|██████████| 24050/24050 [00:00<00:00, 1890069.54it/s]
100%|██████████| 24050/24050 [00:00<00:00, 2086092.67it/s]
100%|██████████| 24050/24050 [00:00<00:00, 2072676.32it/s]
100%|██████████| 24050/24050 [00:00<00:00, 2028250.52it/s]
100%|██████████| 24050/24050 [00:00<00:00, 2050431.16it/s]
100%|██████████| 24050/24050 [00:00<00:00, 1569201.98it/s]
100%|██████████| 24050/24050 [00:00<00:00, 1693039.92it/s]
100%|██████████| 24050/24050 [00:00<00:00, 1576239.31it/s]
100%|██████████| 24050/24050 [00:00<00:00, 2057834.95it/s]
100%|██████████| 24050/24050 [00:00<00:00, 2237892.65it/s]
100%|██████████| 24050/24050 [00:00<00:00, 2058338.83it/s]
100%|██████████| 24050/24050 [00:00<00:00, 1566448.40it/s]
100%|██████████| 24050/24050 [00:00<00:00, 1626642.98it/s]
100%|██████████| 24050/24050 [00:00<00:00, 2128436.93it/s]
100%|██████████| 24050/24050 [00:00<00:00, 1697484.41it/

In [0]:
# Obtain genre vector from X_train, y_train

results = dict()
cnt = dict()

for l in zip(X_train, y_train):

    if l[1] in results.keys():
        results[l[1]] += l[0]
        cnt[l[1]] += 1
    else:
        results.update({l[1]: l[0]})
        cnt[l[1]] = 1

for k in results.keys():
    results[k] = results[k] / cnt[k]


# prediction for X_test
from scipy import linalg, mat, dot

y_pred = list()

for x in X_test:

    a = mat(x)
    cos_sim = dict()

    # calculate cosine similarity for each genre vector
    for k in results.keys():
        b = mat(results[k])
        # cos_sim.update({k: np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))})
        cos_sim.update({k: dot(a, b.T) / linalg.norm(a) / linalg.norm(b)})

    # Append to prediction results
    y_pred.append(min(cos_sim, key=cos_sim.get))

In [0]:
# knn classifier
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [42]:
# Calculate Metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

print("accuracy_score = ", accuracy_score(y_pred, y_test))
print("recall_score = ", recall_score(y_pred, y_test, average='weighted'))
print("precision_score = ", precision_score(y_pred, y_test, average='weighted'))
print("f1_score = ", f1_score(y_pred, y_test, average='weighted'))
print("confusion_matrix = \n", confusion_matrix(y_pred, y_test))

accuracy_score =  0.39389463318562284
recall_score =  0.39389463318562284
precision_score =  0.5268124071539985
f1_score =  0.446844701722581
confusion_matrix = 
 [[   1    0    0    2    1    1    2   15   17    5]
 [   1    1    1    4    2    1    1   11   29   11]
 [   1    0    0    1    1    0    0    3    8    4]
 [   0    4    1    8   12    3    9   39  117   34]
 [   1    0    0    3    6    5    1   18   39   12]
 [   1    1    0    1    1    0    0    8   11    2]
 [   3    1    1    7    5    3    0   16   43    6]
 [  13   28    6   55   44   15   31  179  487  154]
 [  44   85   16  182  135   64  172  751 2157  652]
 [   2    2    1   17    7    5    6   56  133   48]]


In [40]:
# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

ValueError: ignored

In [31]:
# Calculate Metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

print("accuracy_score = ", accuracy_score(y_pred, y_test))
print("recall_score = ", recall_score(y_pred, y_test))
print("precision_score = ", precision_score(y_pred, y_test))
print("f1_score = ", f1_score(y_pred, y_test))
print("confusion_matrix = \n", confusion_matrix(y_pred, y_test))

Testing accuracy 0.4433833560709413
Testing F1 score: 0.3663808632163094


  'precision', 'predicted', average, warn_for)
