In [7]:
# Make the necessary imports

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
#import chart_studio.plotly as py


In [8]:
# Load the training data from subgenre.csv into a DataFrame.
# Later cells read its 'genre', 'url' and 'text' columns.

genre_df = pd.read_csv(filepath_or_buffer='subgenre.csv')

In [9]:
# Helper frame for the pie chart: per genre, how many rows carry a `url`
# value (presumably one row per album), sorted descending and limited to the
# 25 largest genres. `count` is a copy of `url` under a clearer name.

count_df = (
    genre_df
    .groupby('genre')
    .count()
    .sort_values(by='url', ascending=False)
    .head(25)
    .reset_index()[['genre', 'url']]
    .assign(count=lambda top: top['url'])
)

In [10]:
# Donut chart of the 15 most frequent genres (count_df is already sorted in
# descending order of count).
# No `labels=` argument is passed: px.pie documents `labels` as a dict that
# maps column names to display names, so a Series of genre values is not a
# valid value for it. The slice text comes from `names` combined with
# textinfo='label+percent'.

fig = px.pie(
    count_df.head(15),
    values='count',
    names='genre',
    hole=.3,
    height=600,
)
fig.update_layout(title_text='Breakdown of Subgenres of Jazz Present in Web Crawl')
fig.update_traces(textinfo='label+percent')
fig.show()
#py.plot(fig,filename=f'jazz-subgrenres',auto_open=True)


In [11]:
# Fit a TF-IDF vectorizer on the training articles and turn each article into
# a row of the resulting document-term matrix. English stop words are dropped,
# the vocabulary is capped at 1500 features, and min_df / max_df apply
# document-frequency cut-offs of .05 and .95.

vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1500,
    min_df=.05,
    max_df=.95,
)
vectors = vectorizer.fit_transform(genre_df['text'])

In [12]:
# Map every genre name to an integer class id. The fitted encoder `le` is kept
# so predictions can be decoded back to genre names later.

le = LabelEncoder()
y_encoded = le.fit_transform(genre_df.loc[:, 'genre'])

In [13]:
# Training matrix and parameter dict for XGBoost's native API
# (consumed by xgb.cv).

data_dmatrix = xgb.DMatrix(data=vectors, label=y_encoded)
params = {
    # State the multiclass objective explicitly instead of relying on defaults.
    'objective': 'multi:softmax',
    # Integer seed, under the parameter name documented for the native API.
    'seed': 42,
    # One class per label known to the encoder. len(count_df) must not be used
    # here: that plotting helper is truncated by .head(25), so it under-counts
    # as soon as the training data holds more than 25 genres.
    'num_class': len(le.classes_),
}

In [None]:
# 5-fold cross-validation with the native-API parameters, reporting the
# `merror` metric per boosting round. The resulting table is displayed as the
# cell output.
# NOTE(review): this evaluates the `params` configuration, not the
# XGBClassifier that is fitted separately -- confirm both are meant to match.

xgb_cv = xgb.cv(
    params=params,
    dtrain=data_dmatrix,
    nfold=5,
    metrics='merror',
)
xgb_cv

In [13]:
# Train the classifier used for prediction on the full training set
# (TF-IDF features -> encoded genre ids), with a fixed random_state.
model = xgb.XGBClassifier(random_state=42)
model.fit(X=vectors, y=y_encoded)


In [2]:
# Load the articles to be classified from jazz.csv.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index -- confirm whether it should be read with index_col=0.
jazz_df = pd.read_csv(filepath_or_buffer='jazz.csv')

In [3]:
# Preview the first five rows of jazz_df.
jazz_df.head(n=5)

Unnamed: 0,text,genre
0,Miles Davis and the Modern Jazz Giants - W...,jazz
1,Someday My Prince Will Come (Miles Davis a...,jazz
2,Miles & Monk at Newport - Wikipedia ...,jazz
3,Steamin' with the Miles Davis Quintet - Wi...,jazz
4,'Round About Midnight - Wikipedia ...,jazz


In [33]:
# Project the jazz articles into the TF-IDF feature space of the already
# fitted vectorizer (transform only; the vocabulary is not refitted).
jazz_vectors = vectorizer.transform(jazz_df.loc[:, 'text'])

In [34]:
# Predict an encoded genre id for every row of jazz_vectors.
y_pred = model.predict(jazz_vectors)

In [35]:
# Decode the numeric predictions back to genre names and store them on
# jazz_df as the 'predicted_genre' column.
jazz_df.loc[:, 'predicted_genre'] = le.inverse_transform(y_pred)

In [36]:
# Preview the first five rows, which now include 'predicted_genre'.
jazz_df.head(n=5)

Unnamed: 0,text,genre,predicted_genre
0,Miles Davis and the Modern Jazz Giants - W...,jazz,jazz fusion
1,Someday My Prince Will Come (Miles Davis a...,jazz,hard bop
2,Miles & Monk at Newport - Wikipedia ...,jazz,jazz fusion
3,Steamin' with the Miles Davis Quintet - Wi...,jazz,modal jazz
4,'Round About Midnight - Wikipedia ...,jazz,hard bop


In [41]:
# Helper frame for the second pie chart: per predicted genre, how many rows
# carry a `text` value, sorted descending and limited to the 25 largest
# groups. `count` is a copy of `text` (which holds the counts at this point).
jazz_count_df = (
    jazz_df
    .groupby('predicted_genre')
    .count()
    .sort_values(by='text', ascending=False)
    .head(25)
    .reset_index()[['predicted_genre', 'text']]
    .assign(count=lambda top: top['text'])
)

In [45]:
# Donut chart of the 15 most frequent predicted genres (jazz_count_df is
# already sorted in descending order of count).
# No `labels=` argument is passed: px.pie documents `labels` as a dict that
# maps column names to display names, so a Series of genre values is not a
# valid value for it. The slice text comes from `names` combined with
# textinfo='label+percent'.

fig2 = px.pie(
    jazz_count_df.head(15),
    values='count',
    names='predicted_genre',
    hole=.3,
    height=600,
)
fig2.update_layout(title_text='Breakdown of Predicted Subgenres of Jazz Present in Web Crawl')
fig2.update_traces(textinfo='label+percent')
fig2.show()
#py.plot(fig2,filename=f'jazz-predicted-subgrenres',auto_open=True)


In [47]:
# Spot-check: show up to 50 rows whose predicted genre is 'vocal jazz'.
jazz_df.loc[jazz_df['predicted_genre'] == 'vocal jazz'].head(50)

Unnamed: 0,text,genre,predicted_genre
36,Tenor Conclave - Wikipedia ...,jazz,vocal jazz
80,Lift Every Voice (Andrew Hill album) - Wik...,jazz,vocal jazz
98,Dizzy Gillespie at Newport - Wikipedia ...,jazz,vocal jazz
138,The Quintet (album) - Wikipedia ...,jazz,vocal jazz
152,Ella Fitzgerald and Billie Holiday at Newp...,jazz,vocal jazz
155,Sunshine of Your Love (album) - Wikipedia ...,jazz,vocal jazz
161,Newport Jazz Festival: Live at Carnegie Ha...,jazz,vocal jazz
162,Echoes of an Era - Wikipedia ...,jazz,vocal jazz
163,At the Opera House - Wikipedia ...,jazz,vocal jazz
168,Montreux '75 - Wikipedia ...,jazz,vocal jazz
