In [1]:
import pandas as pd
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

from sklearn.metrics import accuracy_score
from joblib import dump

In [2]:
def grid_search(model, param_grid, X=None, y=None, scoring='accuracy'):
    """Run a 5-fold cross-validated grid search and return the fitted search.

    Parameters
    ----------
    model : estimator
        Estimator (in this notebook: a scikit-learn pipeline) handed to
        ``GridSearchCV``.
    param_grid : dict
        Maps parameter names to the list of values to try.
    X, y : optional
        Data to fit the search on. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, which is what this function
        always did before these parameters existed.
    scoring : str, default 'accuracy'
        Metric used to rank candidates. The default is plain accuracy;
        pass ``'balanced_accuracy'`` to rank by balanced accuracy instead.

    Returns
    -------
    GridSearchCV
        The fitted search object (train scores are recorded as well).
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    # Named `search` so the local no longer shadows the function itself.
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        return_train_score=True,
        scoring=scoring,
        n_jobs=-1,  # use all available cores
    )
    search.fit(X, y)
    return search

In [3]:
def pickle_maker(model, name):
    """Serialize ``model`` with pickle into the file ``<name>.pickle``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten. Nothing is returned.
    """
    target_path = name + '.pickle'
    with open(target_path, 'wb') as handle:
        pickle.dump(model, handle)

In [4]:
# Load the lyrics table from CSV, then build a cleaned copy in which every
# row containing at least one missing value has been removed.

lyrics_raw = pd.read_csv(filepath_or_buffer='./lyrics_300.csv')
lyrics = lyrics_raw.dropna(axis=0, how='any')

In [5]:
# The 'Artist' column is the label, the 'Lyric' column is the single input feature.

y = lyrics['Artist']
X = lyrics['Lyric']

# Seed the shuffle so the train/test partition, and every score computed from
# it further down, is the same on each "Restart & Run All". The split
# proportions are left at the library defaults.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Random Forest

In [6]:
# Random-forest pipeline: TF-IDF features (uni- and bigrams, English stop
# words removed) feeding a depth-limited, class-weighted forest.

ran_vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=10,
    max_df=0.90,
    ngram_range=(1, 2),
)
ran_classifier = RandomForestClassifier(
    max_depth=6,
    n_estimators=300,
    bootstrap=True,
    class_weight='balanced',
)

model_ran = make_pipeline(ran_vectorizer, ran_classifier)
model_ran.fit(X_train, y_train)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_df=0.9, min_df=10, ngram_range=(1, 2),
                                 stop_words='english')),
                ('randomforestclassifier',
                 RandomForestClassifier(class_weight='balanced', max_depth=6,
                                        n_estimators=300))])

In [7]:
#model_ran.get_params()

In [8]:
# Cross-validated search over tree depth and class weighting for the
# random-forest pipeline; the winning combination is printed.

param_grid = dict(
    randomforestclassifier__max_depth=[5, 6, 7],
    randomforestclassifier__class_weight=[None, 'balanced'],
)

random_grid = grid_search(model_ran, param_grid)
print(random_grid.best_params_)

{'randomforestclassifier__class_weight': None, 'randomforestclassifier__max_depth': 7}


In [9]:
# Show which result columns the random-forest grid search recorded.
random_grid.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_randomforestclassifier__class_weight', 'param_randomforestclassifier__max_depth', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])

In [10]:
# Tabulate fit time, tree depth and mean train/test score for every
# candidate of the random-forest search.
results = pd.DataFrame(random_grid.cv_results_)
results.loc[:, [
    'mean_fit_time',
    'param_randomforestclassifier__max_depth',
    'mean_train_score',
    'mean_test_score',
]].abs()

Unnamed: 0,mean_fit_time,param_randomforestclassifier__max_depth,mean_train_score,mean_test_score
0,2.46626,5,0.906077,0.85116
1,2.66691,6,0.919175,0.871787
2,2.9416,7,0.928651,0.881829
3,3.18592,5,0.892976,0.841131
4,3.49494,6,0.907051,0.854518
5,3.05191,7,0.916527,0.866213


# Logistic Regression

In [11]:
# Logistic-regression pipeline: same TF-IDF settings as the random-forest
# pipeline, followed by a class-weighted logistic regression.

log_vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=10,
    max_df=0.90,
    ngram_range=(1, 2),
)
log_classifier = LogisticRegression(
    class_weight='balanced',
    max_iter=10000,
)

model_log = make_pipeline(log_vectorizer, log_classifier)
model_log.fit(X_train, y_train)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_df=0.9, min_df=10, ngram_range=(1, 2),
                                 stop_words='english')),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=10000))])

In [12]:
# model_log.get_params()

In [13]:
# Cross-validated search over the iteration cap and the regularisation
# parameter C for the logistic-regression pipeline; the best pair is printed.
param_grid = dict(
    logisticregression__max_iter=[500, 1000],
    logisticregression__C=[0.01, 0.1, 1, 10],
)
logistic_grid = grid_search(model_log, param_grid)
print(logistic_grid.best_params_)

{'logisticregression__C': 10, 'logisticregression__max_iter': 500}


In [14]:
# Show which result columns the logistic-regression grid search recorded.
logistic_grid.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_logisticregression__C', 'param_logisticregression__max_iter', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])

In [15]:
# Tabulate fit time, C, iteration cap and mean train/test score for every
# candidate of the logistic-regression search.
results = pd.DataFrame(logistic_grid.cv_results_)
results.loc[:, [
    'mean_fit_time',
    'param_logisticregression__C',
    'param_logisticregression__max_iter',
    'mean_train_score',
    'mean_test_score',
]].abs()

Unnamed: 0,mean_fit_time,param_logisticregression__C,param_logisticregression__max_iter,mean_train_score,mean_test_score
0,1.6563,0.01,500,0.886567,0.8322
1,1.90529,0.01,1000,0.886567,0.8322
2,1.93743,0.1,500,0.920011,0.864541
3,1.82549,0.1,1000,0.920011,0.864541
4,2.10113,1.0,500,0.977983,0.910798
5,2.15047,1.0,1000,0.977983,0.910798
6,2.61081,10.0,500,0.998188,0.928639
7,2.50091,10.0,1000,0.998188,0.928639


# Model evaluation and export

In [16]:
# Random forest evaluation on the held-out test split.
# NOTE: `model_ran` is the pipeline fitted directly in its "create model"
# cell, not `random_grid.best_estimator_`, so the hyperparameters found by
# the grid search are not what is being scored here.

y_pred = model_ran.predict(X_test)
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; `.round(2)` on the result raises AttributeError for a
# plain float.
acc = round(accuracy_score(y_test, y_pred), 2)
print('The acc for Random Forest is: ' , acc)

The acc for Random Forest is:  0.87


In [17]:
# Logistic regression evaluation on the held-out test split.
# NOTE: `model_log` is the pipeline fitted directly in its "create and fit"
# cell, not `logistic_grid.best_estimator_`, so the hyperparameters found by
# the grid search are not what is being scored here.

y_pred = model_log.predict(X_test)
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; `.round(2)` on the result raises AttributeError for a
# plain float.
acc = round(accuracy_score(y_test, y_pred), 2)
print('The acc for Logistic Regression is: ' , acc)

The acc for Logistic Regression is:  0.92


In [18]:
# Persist both fitted pipelines next to the notebook as
# model_ran.pickle and model_log.pickle.

pickle_maker(model=model_ran, name="model_ran")
pickle_maker(model=model_log, name="model_log")