In [1]:
import os.path
import pandas as pd
import warnings
from sklearn import metrics
warnings.simplefilter('ignore')

In [33]:
ds = pd.read_csv('../data/preprocessed.csv')

In [34]:
ds.shape

(28352, 26)

In [40]:
display(ds.columns)

Index(['Unnamed: 0', 'track_popularity', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'year', 'duration_s',
       'duration_m', 'track_id_encoded', 'track_name_encoded',
       'track_artist_encoded', 'track_album_id_encoded',
       'track_album_name_encoded', 'playlist_genre_encoded'],
      dtype='object')

In [None]:
# прибираю колонки, які дають забагато інформації (так як в назві плейлистів часто вказується жанр пісні, а колонка піджанру прямо пов'язана із жанром пісні)
ds = ds.drop(columns=['playlist_name_encoded',
       'playlist_id_encoded',
       'playlist_subgenre_encoded'])

In [43]:
from sklearn.model_selection import train_test_split

X = ds.drop('playlist_genre_encoded', axis=1)
Y = ds['playlist_genre_encoded']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.7)

In [44]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(random_state=1717)

In [45]:
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

In [47]:
print('classification report: ', metrics.classification_report(Y_test, Y_pred))

classification report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00      1502
           1       1.00      1.00      1.00      1242
           2       1.00      1.00      1.00      1512
           3       1.00      1.00      1.00      1313
           4       1.00      1.00      1.00      1573
           5       1.00      1.00      1.00      1364

    accuracy                           1.00      8506
   macro avg       1.00      1.00      1.00      8506
weighted avg       1.00      1.00      1.00      8506



#### Оскільки на цій моделі виникає перенавчання, обираю простішу модель

In [53]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

X = ds.drop('playlist_genre_encoded', axis=1)
Y = ds['playlist_genre_encoded']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.7)

abc = AdaBoostClassifier(random_state=1717)

abc.fit(X_train, Y_train)
Y_pred = abc.predict(X_test)
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1428
           1       0.00      0.00      0.00      1267
           2       1.00      1.00      1.00      1581
           3       0.00      0.00      0.00      1307
           4       0.29      1.00      0.45      1599
           5       0.00      0.00      0.00      1324

    accuracy                           0.54      8506
   macro avg       0.38      0.50      0.41      8506
weighted avg       0.41      0.54      0.44      8506



In [17]:
import pickle

In [55]:
def save_model(model, name):
    model_pkl_file = f"../model/{name}.pkl"
    with open(model_pkl_file, "wb") as file:
        pickle.dump(model, file)

def load_model(name):
    model_pkl_file = f"../model/{name}.pkl"
    with open(model_pkl_file, "rb") as file:
        model = pickle.load(file)
        return model

In [56]:
save_model(abc, "AdaBoostClassifier")

In [77]:
model = AdaBoostClassifier() #load_model("AdaBoostClassifier")
model.get_params()

{'algorithm': 'SAMME.R',
 'base_estimator': 'deprecated',
 'estimator': None,
 'learning_rate': 1.0,
 'n_estimators': 50,
 'random_state': None}

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [78]:
%%time
param_grid = {'learning_rate': [0.05, 0.1, 0.2],
            'n_estimators': [50, 100],
            }

random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=10, cv=5, scoring='accuracy')
random_search.fit(X, Y)

print(random_search.best_params_)
print(random_search.best_score_)


{'n_estimators': 50, 'learning_rate': 0.05}
0.830067346570021
CPU times: total: 6min 30s
Wall time: 6min 36s


In [23]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
param_grid = {'n_estimators': [20, 30, 50],
               'max_features': [0.5, 0.8],
               'min_samples_split': [5, 10, 15],
               'min_samples_leaf': [4, 6, 8],
               'random_state': [17],
               'bootstrap': [False]}

rf = RandomForestClassifier()

grid = GridSearchCV(estimator=rf, 
                    param_grid=param_grid, 
                    scoring='accuracy', 
                    cv=5,
                    verbose = 3,
                    return_train_score=True)

grid.fit(X_train, Y_train)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

In [26]:
grid_results = pd.concat([pd.DataFrame(grid.cv_results_["params"]),
                          pd.DataFrame(grid.cv_results_["mean_test_score"], 
                          columns=["accuracy"])],
                          axis=1)

grid_results

Unnamed: 0,max_features,min_samples_leaf,min_samples_split,n_estimators,accuracy
0,0.5,4,5,20,0.999892
1,0.5,4,5,30,0.999892
2,0.5,4,5,50,1.0
3,0.5,4,10,20,0.999964
4,0.5,4,10,30,0.999857
5,0.5,4,10,50,0.999928
6,0.5,4,15,20,0.999928
7,0.5,4,15,30,0.999928
8,0.5,4,15,50,0.999964
9,0.5,6,5,20,0.999928


In [27]:
grid_results.columns

Index(['max_features', 'min_samples_leaf', 'min_samples_split', 'n_estimators',
       'accuracy'],
      dtype='object')

In [28]:
grid_contour = grid_results.groupby([  'max_features', 'min_samples_leaf',
       'min_samples_split', 'n_estimators']).mean()
grid_contour

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,accuracy
max_features,min_samples_leaf,min_samples_split,n_estimators,Unnamed: 4_level_1
0.5,4,5,20,0.999892
0.5,4,5,30,0.999892
0.5,4,5,50,1.0
0.5,4,10,20,0.999964
0.5,4,10,30,0.999857
0.5,4,10,50,0.999928
0.5,4,15,20,0.999928
0.5,4,15,30,0.999928
0.5,4,15,50,0.999964
0.5,6,5,20,0.999928
