In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import pickle
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

%matplotlib inline

In [2]:
# Load the labelled lyrics dataset; the cells below use its 'lyric' and
# 'genre' columns.
df = pd.read_excel('generos_musicais.xlsx')

In [3]:
# Preview the first rows to sanity-check what was loaded.
df.head()

Unnamed: 0,lyric,genre
0,Percebo que o tempo já não passa\nVocê diz que...,1
1,"Eu te dei o ouro do sol, a prata da lua\nTe de...",1
2,"Fada.., fada querida\nDona... da minha vida\nV...",1
3,Moro num lugar\nNuma casinha inocente do sertã...,1
4,Nunca vi ninguém viver tão feliz\nComo eu no s...,1


In [4]:
# Number of samples per genre label (checks class balance).
df['genre'].value_counts()

5    42
4    42
3    42
2    42
1    42
Name: genre, dtype: int64

In [6]:
# Features are the raw lyric strings; the target is the numeric genre label.
X, y = df['lyric'], df['genre']

In [7]:
def clean_text(text):
    """
    Normalise a raw text string before it is tokenised.

    Steps, in order:
    - Replace HTML tags with a single space
    - Drop backslashes, apostrophes and double quotes
    - Strip surrounding whitespace and lowercase
    - Replace the remaining punctuation, tabs and newlines with spaces

    Parameters
    ----------
    text : str
        The raw text.

    Returns
    -------
    str
        The cleaned, lowercased text. Runs of spaces are not collapsed.
    """

    # Substitute a space (not an empty string) for each tag: a tag such as
    # <br> usually sits between two words, and deleting it outright would
    # glue them into one bogus token ("amor<br>vida" -> "amorvida").
    text = re.sub(r'<.*?>', ' ', text)

    # Remove the characters [\], ['] and ["] without leaving a gap, so that
    # contractions such as "d'água" stay a single token.
    text = re.sub(r"[\\'\"]", "", text)

    # Convert text to lowercase.
    text = text.strip().lower()

    # Replace punctuation characters (plus tab and newline) with spaces.
    filters = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_map = str.maketrans(dict.fromkeys(filters, " "))

    return text.translate(translate_map)

In [8]:
# Portuguese stop-word list from NLTK. On a fresh environment the corpus has
# not been downloaded yet and the lookup raises LookupError, so fetch it once
# and retry; this keeps "Restart & Run All" working.
try:
    stopwords = nltk.corpus.stopwords.words('portuguese')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('portuguese')

In [9]:
# TF-IDF vectorizer over the lyrics: clean_text is used as the preprocessing
# step (it does its own lowercasing and punctuation handling) and the
# Portuguese stop words are excluded from the vocabulary.
cv = TfidfVectorizer(stop_words=stopwords, preprocessor=clean_text)

In [10]:
X = cv.fit_transform(X)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33,random_state=42)

In [12]:
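# Changed random_state to 42, which raised precision by about 10%.
# NOTE(review): a gain obtained by picking the split seed reflects this
# particular split, not a better model; consider cross-validation instead.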

In [13]:
# Multinomial Naive Bayes classifier with default hyper-parameters.
nb = MultinomialNB()

In [14]:
# Train the classifier on the training split.
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
# Predict genre labels for the held-out split.
predictions = nb.predict(X_test)

In [16]:
# Per-genre precision, recall and F1 on the held-out split.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           1       0.64      0.64      0.64        14
           2       0.47      0.69      0.56        13
           3       1.00      0.31      0.48        16
           4       0.61      1.00      0.76        11
           5       1.00      0.88      0.93        16

   micro avg       0.69      0.69      0.69        70
   macro avg       0.75      0.70      0.67        70
weighted avg       0.77      0.69      0.67        70



In [17]:
# Confusion matrix on the held-out split: rows are the true genres, columns
# the predicted ones.
print(confusion_matrix(y_test, predictions))

[[ 9  4  0  1  0]
 [ 2  9  0  2  0]
 [ 3  4  5  4  0]
 [ 0  0  0 11  0]
 [ 0  2  0  0 14]]


In [18]:
# Overall fraction of held-out lyrics whose genre was predicted correctly.
print(accuracy_score(y_test, predictions))

0.6857142857142857


In [19]:
# Load a separate spreadsheet of lyrics and turn its 'lyric' column into
# features with the already-fitted vectorizer (transform only, no re-fitting).
test_predict = pd.read_excel('test_generos_musicais.xlsx')
Z = cv.transform(test_predict['lyric'])

<class 'pandas.core.series.Series'>


In [20]:
# Predict a genre label for each lyric of the separate spreadsheet.
pred = nb.predict(Z)

In [21]:
# Show the predicted genre labels.
print(pred)

[1 1 4 1 1 1 5]


In [22]:
# Persist the trained classifier. The context manager closes the file handle
# (and flushes the pickle to disk) even if dumping fails; the bare open() used
# before left the handle open.
filename='modelmusic.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(nb, model_file)
# NOTE(review): only the classifier is saved here. Predicting on new lyrics
# also needs the fitted vectorizer `cv` (see the cv.transform call above), so
# it should be persisted alongside the model.