# Modèle de prédiction du genre 

Cette préparation des données s'est fortement appuyée sur les cours de Ricco Rakotomalala <ricco.rakotomalala@univ-lyon2.fr> et sur les travaux de Javed Shaikh (https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a) portant sur la classification de documents en Python avec la bibliothèque scikit-learn (sklearn).

In [240]:
# Import des packages 
import pandas as pd


In [241]:
# Load the dataset from the CSV file 'df.csv' (path relative to the working directory)
df = pd.read_csv('df.csv')

In [242]:
# Genres removed from the dataset after reviewing the per-genre precision
list_del = [
    'pop',
    'country',
    'folk',
    'disco',
    'r-b',
    'techno',
    'chill',
    'indie',
    'rock',
    'blues',
    'classical-music',
]

In [310]:
# Number of genres that are being removed
len(list_del)

11

In [244]:
# Keep only the rows whose genre is NOT in the removal list
df_t = df.loc[~df['genre'].isin(list_del)]

In [245]:
# Genres kept for the model (distinct values remaining in the genre column)
df_t.genre.unique()

array(['electro', 'metal', 'reggae', 'trap', 'k-pop', 'gospel', 'hip-hop',
       'jazz'], dtype=object)

In [246]:
# Filtering left gaps in the index; rebuild it as 0..n-1 (the old index is
# discarded) so that label-based lookups such as df_t.lyrics[i] line up with
# row positions during cleaning
df_t = df_t.reset_index(drop=True)

In [247]:
# Preview the first 10 rows of the filtered data
df_t.head(10)

Unnamed: 0,lyrics,genre
0,"White shirt now red, my bloody nose\nSleepin',...",electro
1,Somethin' must've gone wrong in my brain\nGot ...,electro
2,It was great at the very start\nHands on each ...,electro
3,"Oh, she's sweet, but a psycho\nA little bit ps...",electro
4,If you don't wanna see me\n\nDid a full one-ei...,electro
5,"Everybody gets high sometimes, you know\nWhat ...",electro
6,You call me all friendly\nTellin' me how much ...,electro
7,This ain't for the best\nMy reputation's never...,electro
8,"Do you recall, not long ago\nWe would walk on ...",electro
9,Been sitting eyes wide open behind these four ...,electro


In [248]:
# Size of the filtered frame: one row per song and two columns, one for the
# lyrics and one for the genre. The recorded run shows 6307 rows; the 13476
# figure formerly quoted here presumably described the unfiltered data —
# TODO confirm
df_t.shape

(6307, 2)

In [249]:
# Number of lyrics entries (matches the row count of df_t)
len(df_t.lyrics)

6307

In [250]:
# Spot-check the genre stored at index label 1001
df_t.genre[1001]

'metal'

In [251]:
# Remove the embed footer left over from scraping Genius.
# A single column-level assignment replaces the former per-row chained
# assignment (df_t.lyrics[i] = ...), which pandas warns about and which can
# end up writing to a temporary object instead of df_t.
# regex=False makes the pattern a literal substring, as with str.replace.
df_t['lyrics'] = df_t['lyrics'].str.replace('EmbedShare URLCopyEmbedCopy', '', regex=False)

In [252]:
# Replace the line breaks left over from scraping Genius with single spaces.
# A single column-level assignment replaces the former per-row chained
# assignment (df_t.lyrics[i] = ...), which pandas warns about and which can
# end up writing to a temporary object instead of df_t.
# regex=False makes the pattern a literal substring, as with str.replace.
df_t['lyrics'] = df_t['lyrics'].str.replace('\n', ' ', regex=False)

In [253]:
# Check the first two rows after removing the footer and the line breaks
df_t.head(2)

Unnamed: 0,lyrics,genre
0,"White shirt now red, my bloody nose Sleepin', ...",electro
1,Somethin' must've gone wrong in my brain Got y...,electro


## Nettoyage du corpus

In [254]:
# Extract the lyrics column as a plain Python list, one document per song
corpus = df_t['lyrics'].tolist()
# Debug aid: evaluate corpus[0] to inspect the first document

In [255]:
# Lower-case every document
corpus = list(map(str.lower, corpus))
# Debug aid: evaluate corpus[0] to inspect the first document

In [256]:
# List of the ASCII punctuation characters, one character per element
import string
ponctuations = [*string.punctuation]
# Debug aid: evaluate ponctuations to see the characters

In [257]:
# Remove every punctuation character from each document.
# str.translate deletes all listed characters in one pass per document,
# instead of testing each character against the punctuation list.
# Apostrophes are punctuation too, so contractions are fused ("you're"
# becomes "youre").
punctuation_table = str.maketrans('', '', ''.join(ponctuations))
corpus = [doc.translate(punctuation_table) for doc in corpus]
# Debug aid: evaluate corpus[0] to inspect the first document

In [258]:
import re

In [259]:
# Remove every run of digits from each document
digit_run = re.compile(r'\d+')
corpus = [digit_run.sub('', doc) for doc in corpus]
# Debug aid: evaluate corpus[0] to inspect the first document

In [260]:
import nltk
# Download the "punkt" NLTK resource directly from the notebook
# (presumably the tokenizer data needed by word_tokenize — TODO confirm)
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [261]:
from nltk.tokenize import word_tokenize
# Split each document into its list of word tokens
corpus_tk = list(map(word_tokenize, corpus))
# Debug aid: evaluate corpus_tk[0] to inspect the first tokenised document

In [262]:
# Download the "wordnet" NLTK resource
# (presumably the data used by WordNetLemmatizer — TODO confirm)
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [263]:
# Lemmatise every token of every document
# (open question: it is unclear whether this matters for song lyrics)
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
corpus_lem = [list(map(lem.lemmatize, doc)) for doc in corpus_tk]
# Debug aid: evaluate corpus_lem[0] to inspect the first lemmatised document

In [264]:
# Download the "stopwords" NLTK resource (read through nltk.corpus.stopwords)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [265]:
from nltk.corpus import stopwords
# English stop-word list used to filter the tokens
stop_words = stopwords.words('english')


In [266]:
# Drop the English stop words from every lemmatised document.
# Membership is tested against a set (constant-time lookup) rather than by
# scanning the stop-word list once per token; the result is identical.
# NOTE(review): punctuation was stripped earlier, so fused contractions such
# as "youre" no longer match the stop-word list and are kept — confirm this
# is intended.
stop_word_set = set(stop_words)
corpus_st = [[mot for mot in doc if mot not in stop_word_set] for doc in corpus_lem]
# Debug aid: evaluate corpus_st[0] to inspect the first filtered document

In [267]:
# Discard very short tokens: only words of three or more characters are kept
corpus_st = [[token for token in tokens if len(token) > 2] for tokens in corpus_st]
# Debug aid: evaluate corpus_st[0] to inspect the first filtered document

In [268]:
# Write the cleaned text back into the frame used for modelling.
# NOTE: plain assignment does not copy — dfa and df_t are the same DataFrame,
# so df_t.lyrics also holds the cleaned text from here on (later cells rely
# on reading it through df_t).
dfa = df_t
# One column-level assignment replaces the former per-row chained assignment
# (dfa['lyrics'][i] = ...), which pandas warns about and which can end up
# writing to a temporary object. The old loop also copied each genre onto
# itself, which had no effect and is dropped.
dfa['lyrics'] = [' '.join(tokens) for tokens in corpus_st]

In [269]:
# Preview the first three rows after cleaning
dfa.head(3)

Unnamed: 0,lyrics,genre
0,white shirt red bloody nose sleepin youre tipp...,electro
1,somethin mustve gone wrong brain got chemical ...,electro
2,great start hand couldnt stand far apart close...,electro


In [270]:
import numpy as np
from sklearn.model_selection import train_test_split

In [319]:
import sklearn
# Record the scikit-learn version used to train the model
# (presumably needed to reload the pickled model in a compatible
# environment — TODO confirm)
print(sklearn.__version__)

1.0.1


In [311]:
# Build the training and test samples (lyrics as features, genre as target):
# 30% of the rows go to the test set, the split is stratified on the genre,
# and random_state=0 makes it reproducible
Xa_train, Xa_test, ya_train, ya_test = train_test_split(dfa.lyrics, dfa.genre, test_size=0.3, random_state=0, stratify=dfa.genre)

In [272]:
# Number of training labels
ya_train.shape

(4414,)

In [273]:
# Number of test labels
ya_test.shape

(1893,)

In [274]:
# Turn the training lyrics into a document-term matrix of word counts
# (exploratory step: the pipeline built later creates its own CountVectorizer)
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
Xa_train_counts = count_vect.fit_transform(Xa_train)
# (number of training documents, vocabulary size)
Xa_train_counts.shape

(4414, 61744)

In [275]:
# Re-weight the raw counts with TF-IDF, which reduces the weight of terms
# that appear in many documents
# (exploratory step: the pipeline built later creates its own TfidfTransformer)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
Xa_train_tfidf = tfidf_transformer.fit_transform(Xa_train_counts)
# Same shape as the count matrix: only the values change
Xa_train_tfidf.shape

(4414, 61744)

In [276]:
from sklearn.pipeline import Pipeline

## Modèle avec l'algorithme Support Vector Machines (SVM)

In [277]:
from sklearn.linear_model import SGDClassifier

In [282]:
# Text-classification pipeline: word counts -> TF-IDF weighting -> linear
# classifier trained with SGD (hinge loss and L2 penalty, i.e. a linear SVM)
text_clf_svm = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        (
            'clf-svm',
            SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=0),
        ),
    ],
)

In [283]:
# Fit the whole pipeline on the training lyrics and their genres
text_clf_svm.fit(Xa_train, ya_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf-svm', SGDClassifier(alpha=0.001, random_state=0))])

In [280]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [284]:
# Predict the genre of every test song with the fitted pipeline
predict_svm = text_clf_svm.predict(Xa_test)
# Overall accuracy on the test set
print(accuracy_score(ya_test, predict_svm))
# Per-genre precision, recall, f1-score and support
print(classification_report(ya_test, predict_svm))

0.5927099841521395
              precision    recall  f1-score   support

     electro       0.53      0.37      0.44       285
      gospel       0.68      0.67      0.68       209
     hip-hop       0.57      0.62      0.60       305
        jazz       0.56      0.53      0.54       291
       k-pop       0.72      0.59      0.65        90
       metal       0.61      0.63      0.62       239
      reggae       0.63      0.68      0.65       240
        trap       0.55      0.71      0.62       234

    accuracy                           0.59      1893
   macro avg       0.61      0.60      0.60      1893
weighted avg       0.59      0.59      0.59      1893



## Extraction et sauvegarde du modèle final

In [293]:
import pickle

In [314]:
# Record the pickle format version reported by this Python installation
print(pickle.format_version)

4.0


In [294]:
# File used to save and reload the model (path relative to the working directory)
filename = 'model-svm.sav'

In [295]:
# Save the fitted pipeline to disk.
# The context manager closes the file handle as soon as the dump finishes
# (or fails); the former bare open() call never closed it explicitly.
with open(filename, 'wb') as model_file:
    pickle.dump(text_clf_svm, model_file)

In [296]:
# Reload the saved pipeline from disk to check that it round-trips.
# The context manager closes the file handle once loading is done; the former
# bare open() call never closed it explicitly.
# NOTE: unpickling can execute arbitrary code — only load model files that
# come from a trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [320]:
# Smoke test of the reloaded model on a single song (index label 1 of df_t).
# The lyrics are wrapped in a one-element list, so one prediction is returned.
# df_t is the same object as dfa, so the text read here is the cleaned version.
# NOTE(review): this row may belong to the training split, so this is not an
# evaluation — confirm if that matters.
res = loaded_model.predict([df_t.lyrics[1]])

In [321]:
# First (and only) prediction returned for the single input above
res[0]

array(['electro'], dtype='<U7')