In [None]:
import torch
import pandas as pd
import numpy as np
import nltk
from torch.utils.data import Dataset, DataLoader
from nltk.corpus import stopwords
import math
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [None]:
target_id = {'Rock': 0,
             'Indie':1,
             'Ambient':2,
             'HipHop':3,
             'Blues':4,
             'Soul':5,
             'Metal':6,
             'Country':7,
             'Pop':8,
             'R&B':9,
             'Disco':10,
             'EDM':11,
             'Jazz':12,
             'Lofi':13,
             'Classical':14,
             'Funk':15
             }
inv_target = dict()
for key,val in target_id.items():
    inv_target[val] = key

In [None]:
meta1_path = "/content/meta_data_1.csv"
meta2_path = "/content/meta_data_2.csv"
train_path = "/content/train.csv"
test_path = "/content/test.csv"
meta1_data = pd.read_csv(meta1_path)
meta2_data = pd.read_csv(meta2_path)
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)
total_train = train_data.merge(meta1_data,left_on="song_id",right_on="id",how='left')
total_train = total_train.merge(meta2_data, on="id", how='left')
total_test = test_data.merge(meta1_data,left_on='song_id',right_on='id',how='left')
total_test = total_test.merge(meta2_data,on='id',how='left')

In [None]:
common_hip_hop_words = [
    'money', 'hustle', 'street', 'life', 'love', 'game', 'rhyme', 'time',
    'world', 'city', 'people', 'dream', 'power', 'real', 'style', 'mind',
    'night', 'music', 'party', 'heart'
]
common_funk_words = [
    "groove", "funky", "dance", "soul", "party", "music",  "get",  "feel",
    "good",  "love",  "baby",  "shake",  "time",  "live",  "bass", "rhythm",
    "jam",  "move",  "night",  "beat"]

common_words_in_classical = [
    "symphony", "sonata", "concerto", "opera", "overture",
    "chamber", "solo", "adagio", "allegro", "tempo", "cadenza",
    "movement", "prelude", "aria", "concertmaster", "counterpoint",
    "conductor", "harmony", "crescendo", "recitative"
    ]
common_words_in_lofi = [
    "chill", "relax", "vibe", "beat", "melody", "peace", "coffee", "rain",
    "nostalgia", "mellow", "jazz", "study", "lounge", "soul", "cozy",
    "sleep", "dream", "guitar", "calm", "serene"
    ]
common_words_in_jazz = [
    "jazz", "blues", "love", "life", "night", "song", "soul", "time",
    "heart", "dream", "light", "sweet", "dance", "swing", "star",
    "moon", "magic", "whisper", "melody", "rain"
    ]
common_words_in_edm = [
    "party", "dance", "love", "night", "music", "feel", "tonight",
    "life", "light", "heart", "beat", "world", "sun", "sky", "dream",
    "good", "electronic", "energy", "rave", "drop"
    ]
common_words_in_disco = [
    "dance", "love", "night", "music", "feel", "party", "get", "baby",
    "time", "groove", "light", "heart", "move", "soul", "tonight", "hot",
    "beat", "rhythm", "funky", "fun"
    ]
common_words_in_rnb = [
    "love", "baby", "heart", "time", "girl", "feel", "good",
    "night", "want", "need", "let", "way", "man", "life",
    "soul", "dance", "come", "tonight", "dream", "touch"
    ]
common_words_in_pop = [
    "love", "baby", "heart", "night", "feel", "time", "world", "dance",
    "dream", "life", "tonight", "together", "beautiful", "girl",
    "kiss", "party", "forever", "shine", "everybody", "fire"
    ]

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from collections import Counter

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import re

def remove_stopwords(data):
    data = data.lower()
    data = data.split(' ')
    data[-1] = re.sub(r"\d+Embed","",data[-1])
    data = " ".join(data)
    data = ' '.join([word for word in data.split() if word not in stop_words])
    return data

def remove_junk(data):
    begining = r'^[^\[]*\['
    brackets = r"[\(\[].*?[\)\]]"
    punctuation = r'[^\w\s]'
    final_word_noise = r'\s+\w+\s*$'
    data = re.sub(begining,"[",data)
    data = re.sub(brackets,"",data)
    data = re.sub(punctuation,"",data)
    #data = re.sub(final_word_noise,"",data)
    return data

In [None]:
total_train['lyrics'] = total_train['lyrics'].fillna(' ')
total_test['lyrics'] = total_test['lyrics'].fillna(' ')

In [None]:
stop_words = set(stopwords.words('english'))
train_words = nltk.word_tokenize(str(total_train['lyrics']))
test_words = nltk.word_tokenize(str(total_test['lyrics']))

In [None]:
total_train['lyrics'] = total_train['lyrics'].apply(
    lambda x: remove_junk(remove_stopwords(x)) if not pd.isna(x) else "").reset_index(drop=True)

total_test['lyrics'] = total_test['lyrics'].apply(
    lambda x: remove_junk(remove_stopwords(x)) if not pd.isna(x) else "").reset_index(drop=True)

In [None]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words= 'english',
    ngram_range=(1, 1),
    norm='l2',
    min_df=0,
    smooth_idf=False,
    max_features= 1000)

word_vectorizer.fit(pd.concat([ total_train.loc[:,'lyrics'],
                                total_test.loc[:,'lyrics']
                               ]))

In [None]:
train_word_features = word_vectorizer.transform(total_train['lyrics'])
test_word_features = word_vectorizer.transform(total_test['lyrics'])

In [None]:
traindf = pd.DataFrame(train_word_features.todense(),index=total_train['song_id'])
testdf = pd.DataFrame(test_word_features.todense(), index = total_test['song_id'])

In [None]:
traindf

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AA5aMeYP1klLv1BA,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
Wr3MLRGLm08yjrGN,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
XtEbP8zIOloM6r5I,0.050175,0.046700,0.049704,0.05073,0.049404,0.05442,0.071341,0.066018,0.050957,0.068927,...,0.057072,0.0,0.0,0.0,0.016289,0.034911,0.036735,0.047417,0.0,0.0
NW5kIVzyoiV0zJmJ,0.000000,0.000000,0.120803,0.00000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
1uhYBNywheqCdzrg,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eG4COUJlcK/ZMD+W,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
/ug0DPx1im8Xgh5C,0.092605,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.095399,0.0,0.0,0.0,0.000000,0.000000,0.194566,0.000000,0.0,0.0
hxSq3A9uzOItwFlx,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
aj+zlONILR95PDwX,0.109734,0.138556,0.132608,0.12849,0.131284,0.00000,0.000000,0.000000,0.084633,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.133095,0.000000,0.000000,0.0,0.0


In [None]:
# total_train[total_train['target'] == "Rock"]
total_train["target"] = total_train["target"].apply(lambda x: target_id[x])

In [None]:
Artists_train = {}
artist_train = list(total_train['artist'].unique())
for a in range(len(artist_train)):
    Artists_train[artist_train[a]] = a+1
len(Artists_train), len(artist_train)

(3309, 3309)

In [None]:
Artists_test = {}
new_artists = list(total_test['artist'].unique())
start = len(Artists_train) + 1
for artist in new_artists:
    if artist in artist_train:
        Artists_test[artist] = Artists_train[artist]
    else:
        Artists_test[artist] = start
        start += 1
len(Artists_test),

(2405,)

In [None]:
inf_index_list = []
for i, row in total_train.iterrows():
    if row["a41"] == float('inf') or row["a41"] == np.inf or row["a41"] == -np.inf:
        inf_index_list.append(i)
    if row["a46"] == float('inf') or row["a41"] == np.inf or row["a41"] == -np.inf:
        inf_index_list.append(i)

print(len(inf_index_list),len(total_train), inf_index_list)

2 4961 [4060, 4060]


In [None]:
total_train = total_train.drop(axis = 0,index = list(set(inf_index_list))).reset_index(drop=True)
total_train.shape

(4960, 74)

In [None]:
train_data_filtered = total_train.drop(columns=[ 'id','release_date','duration','total_tracks','lyrics','album','track'])
test_data_filtered = total_test.drop(columns=[ 'id','release_date','total_tracks','duration','lyrics','album','track'])

In [None]:
train_data_filtered["artist"] = train_data_filtered["artist"].apply(lambda x: Artists_train[x])
test_data_filtered["artist"] = test_data_filtered["artist"].apply(lambda x: Artists_test[x])

In [None]:
total_train_feat = train_data_filtered.merge(traindf,left_on='song_id',right_on='song_id' ,how= 'left')
total_test_feat = total_test.merge(testdf,left_on='song_id',right_on='song_id' ,how= 'left')

In [None]:
total_test_feat = test_data_filtered.merge(testdf, left_on='song_id', right_on='song_id', how='left')

In [None]:
# cv_scores = []
# submission = {}
# d_test = xgb.DMatrix(test_word_features)

# # for class_name in target_id.keys():
#     # train_target = total_train[total_train['target'] == class_name]
#     # print(train_target.shape)
#     # Split out a validation set
# X_train, X_valid, y_train, y_valid = train_test_split(
#     train_word_features, total_train['target'], test_size=0.25, random_state=23)

# xgb_params = {'eta': 0.3,
#               'num_class': 16,
#             'max_depth': 5,
#             'subsample': 0.8,
#             'colsample_bytree': 0.8,
#             'objective': 'multi:softmax',
#             'seed': 23
#             }

# d_train = xgb.DMatrix(X_train, y_train)
# d_valid = xgb.DMatrix(X_valid, y_valid)

# watchlist = [(d_valid, 'valid')]
# model = xgb.train(xgb_params, d_train, 500, watchlist, verbose_eval=False, early_stopping_rounds=30)
# # print("class Name: {}".format(class_name))
# print(model.attributes())

# cv_scores.append(float(model.attributes()['best_score']))
# vpreds = model.predict(d_valid)
# tpreds = model.predict(d_test)
# fscore = f1_score(y_valid, vpreds, average= 'weighted')
# print("f1 for valid:",fscore*100)
# acc = accuracy_score(y_valid, vpreds)
# print("acc for valid:", acc)
# print(fscore, acc, tpreds)

#     # del X_train, X_valid, y_train, y_valid

# print('Total CV score is {}'.format(np.mean(cv_scores)))
# # submission.to_csv('submission.csv', index=False)

In [None]:
# funk_words = list()
# for i in list(funk_lyrics):
#     funk_words += i
# funk_word = set(funk_words)
# print(funk_word), len(funk_word)

In [None]:
train_columns = list(total_train_feat.columns)
train_columns = [i for i in train_columns if i != "target" and i != 'song_id']
len(train_columns), print(train_columns),

['artist', 'adaptibility', 'danceability', 'energy', 'explicit', 'happening', 'instrumentalness', 'loudness', 'mode', 'naturality', 'positiveness', 'reputation', 'speechiness', 'tempo', 'time_signature', 'a1', 'a10', 'a11', 'a12', 'a13', 'a14', 'a15', 'a16', 'a17', 'a18', 'a19', 'a2', 'a20', 'a21', 'a22', 'a23', 'a24', 'a25', 'a26', 'a27', 'a28', 'a29', 'a3', 'a30', 'a31', 'a32', 'a33', 'a34', 'a35', 'a36', 'a37', 'a38', 'a39', 'a4', 'a40', 'a41', 'a42', 'a43', 'a44', 'a45', 'a46', 'a47', 'a48', 'a49', 'a5', 'a50', 'a6', 'a7', 'a8', 'a9', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 

(1065, None)

In [None]:
storing_mean, storing_std = {}, {}
for i in total_train_feat.columns:
    if (total_train_feat[i].dtypes == "object") or (i == "target") :
        continue
    # print(i,total[i].dtypes)
    storing_mean[i] = np.round(total_train_feat[i].mean(),3)
    storing_std[i] = np.round(total_train_feat[i].std(),3)
    if storing_mean[i] == np.inf:
        storing_mean[i] = 0.0
    if storing_std[i] == np.inf:
        storing_std[i] = 0.0
    total_train_feat[i] = (storing_mean[i] - total_train_feat[i])/ storing_std[i]
len(storing_mean), print(storing_std, storing_mean)

{'artist': 1.0, 'adaptibility': 1.0, 'danceability': 10.796, 'energy': 14.064, 'explicit': 0.999, 'happening': 1.004, 'instrumentalness': 1.0, 'loudness': 3.977, 'mode': 1.0, 'naturality': 1.001, 'positiveness': 0.988, 'reputation': 1.0, 'speechiness': 0.989, 'tempo': 23.864, 'time_signature': 1.001, 'a1': 111603.879, 'a10': 1.0, 'a11': 0.982, 'a12': 11.962, 'a13': 6.899, 'a14': 100.031, 'a15': 78328.694, 'a16': 1.032, 'a17': 111603.879, 'a18': 1.0, 'a19': 41.977, 'a2': 11.962, 'a20': 7.034, 'a21': 0.99, 'a22': 1.022, 'a23': 2.037, 'a24': 2.155, 'a25': 671.415, 'a26': 1.013, 'a27': 0.955, 'a28': 21.791, 'a29': 1.0, 'a3': 100.031, 'a30': 1.004, 'a31': 0.993, 'a32': 226.266, 'a33': 1.725, 'a34': 0.999, 'a35': 0.984, 'a36': 21.791, 'a37': 0.976, 'a38': 21.524, 'a39': 3029.035, 'a4': 42.102, 'a40': 7.605, 'a41': 0.992, 'a42': nan, 'a43': 2.858, 'a44': 0.999, 'a45': 2.704, 'a46': 0.893, 'a47': 183.642, 'a48': 32.167, 'a49': 0.998, 'a5': 56.877, 'a50': 0.987, 'a6': 9.412, 'a7': 14.179, 'a8':

(1065, None)

In [None]:
# for k,v in storing_mean.items():
#     if v == np.inf:
#         print(k)

In [None]:
for i in total_train_feat.columns:
    if i in storing_mean.keys():
        total_train_feat[i] = total_train_feat[i].fillna(storing_mean[i])
    val = total_train_feat[i].isna().sum()
    if val > 0:
        print(f"{i} , {val} ")

XGBoost training

In [None]:
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import xgboost as xgb
# read data
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(train_data_filtered.loc[:,(train_data_filtered.columns !="target")], train_data_filtered['target'], test_size=.2)

In [None]:
import random
random.seed(2023)
np.random.seed(2023)
def get_random(start,end,step, dtype = ''):
    if dtype == 'float':
        rand = random.uniform(start, end)
        # print(rand)
        randf = round(rand/step) * step
        rand = np.round(randf,3)
    elif dtype == '':
        rand = random.randrange(start,end,step)
    return rand

In [None]:
sss = StratifiedShuffleSplit(n_splits=1,test_size=0.2, random_state=500)
sss.get_n_splits()

1

In [None]:
def train(data, test_data):
    submission = {}

    # for class_name in target_id.values():
        # target_data = data[data['target'] == class_name].reset_index(drop=True)
        #print(target_data)
    target_data = data
    X = target_data.loc[:,train_columns]
    y = target_data['target']
    # test_d = xgb.DMatrix(test_data.loc[:,train_columns])
    best_score = -1
    bst_model = ''

    for i, (train_idx, val_idx) in enumerate(sss.split(X,y)):
        print(f'Fold {i}:')
        train_X, val_X, train_y, val_y = X.loc[train_idx], X.loc[val_idx], y.loc[train_idx], y.loc[val_idx]
        trainset = xgb.DMatrix(train_X, label=train_y, missing=np.inf)
        valset = xgb.DMatrix(val_X, label = val_y, missing = np.inf)
        # get parameters
        max_depth = get_random(2,5,1)
        subsample = get_random(0.5,0.9,0.1,'float')
        colsample_bylevel = get_random(0.5,0.9,0.1,'float')
        eta = get_random(0.01,0.9,0.1, 'float')
        gamma = get_random(0.1,0.8,0.1,'float')
        reg_lambda = get_random(1,3,1)
        alpha = get_random(1,3,1)
        min_child_weight = get_random(1,8,1)

        params = { 'num_class': 16, 'eta': eta, 'booster': 'gbtree',
                'max_depth': max_depth, 'subsample': subsample, 'colsample_bylevel': colsample_bylevel,
                'gamma': gamma, 'reg_lambda': reg_lambda, 'alpha': alpha,
                'min_child_weight': min_child_weight, 'sampling_method': 'uniform',
                'tree_method': 'exact', 'objective': 'multi:softmax'}

        num_round = get_random(100,200,50)
        # start training
        xgb_model = xgb.train(params, trainset, num_boost_round = num_round)
        # run predictions on validation and check the accuracy
        # val_preds = xgb_model.predict(valset, strict_shape=False)
        # dtrain = xgb_model.predict(trainset, output_margin=True)
        # dtest = xgb_model.predict(valset, output_margin=True)
        # set these as base margin
        # trainset.set_base_margin(dtrain)
        # valset.set_base_margin(dtest)
        # feature weights for each column
        # feature_score = xgb_model.get_fscore()
        # sortedScores = {k : v for k,v in sorted(feature_score.items(), key= lambda item : item[1], reverse = False)}
        # fw = []
        # base = 10
        # for col in train_columns:
        #     if col in feature_score.keys():
        #         fw.append(feature_score[col])
        #     else:
        #         fw.append(base)
        # trainset.set_info(feature_weights= fw)
        # # start training again
        # bst_model = xgb.train(params, trainset, num_round, xgb_model = xgb_model)
        final_preds = xgb_model.predict(valset)
        score = f1_score(val_y, final_preds, average = "weighted")
        # check the top score and save the best model
        if score > best_score:
            best_score = score
            # print("Score is:", np.round(best_score*100,2))
            # print("found the best model with params:", params, " and rounds:", num_round)
            acc = accuracy_score(val_y, final_preds)
            print("final score is ",round(score*100,2), "acc:", acc*100)
            xgb_model.save_model(("Train_5_best_model"  + ".json"))

                # xgb_model.
            # submission[class_name] = (best_score, acc)
    return xgb_model, params

best_model, params = train(total_train_feat, total_test_feat)

Fold 0:
final score is  59.06 acc: 59.2741935483871


In [None]:
baseline = pd.DataFrame(columns = ["song_id","target"])
for j in range(len(list(preds))):
    i = preds[j]
    out = inv_target[int(i)]
    song_id = total_test_feat.loc[j]["song_id"]
    result = {"song_id" : song_id, "target" : out } # "true_target": true_y}
    baseline = pd.concat([baseline, pd.DataFrame.from_records([result])])
baseline.to_csv("final.csv",index=False)

In [None]:
baseline

Unnamed: 0,song_id,target
0,7c61FpilqRU/3Ley,Pop
0,EmqUjbC3coby/LZy,EDM
0,lvF5H8aYwo+TlFJe,EDM
0,O+oGRFmYSUbebxCK,Pop
0,rUR7HzUw1p41lUUn,Pop
...,...,...
0,ObfXKLfo3N9IuZGw,Pop
0,qCxgC5trW/Xl/wC8,Funk
0,z8dKvyoqkEVA1aKZ,Disco
0,s2RNjtkc0Rzt5smL,EDM


In [None]:
preds = best_model.predict(xgb.DMatrix(total_test_feat.loc[:,train_columns]), strict_shape=True)
# list(preds)

In [None]:
def evaluator(data, model):
    baseline = pd.DataFrame(columns = ["song_id","target"])
    for index,row in tqdm(data.iterrows(),total=data.shape[0]):
        id = row["song_id"]
        X = data.loc[index][train_columns]
        for i in train_columns:

            if pd.isna(X[i]) or X[i] == np.inf or X[i] == -np.inf:
                X[i] = storing_mean[i]

        X = X.to_frame().T
        X = X.astype({i:'float' for i in X.columns})
        for i in list(X):
            print(X[i])
        output = model.predict(xgb.DMatrix(X))
        predicted_target = inv_target[int(output)]
        result = {"song_id" : id, "target" : predicted_target } # "true_target": true_y}
        baseline = pd.concat([baseline, pd.DataFrame.from_records([result])])
    return baseline

In [None]:
for i in list(total_test_feat.loc[18].values):
    if i == np.inf or i == -np.inf:
        print(i)

In [None]:
baseline_test = evaluator(total_test_feat.loc[18:20], best_model)

  0%|          | 0/3 [00:00<?, ?it/s]

18    1914.0
Name: artist, dtype: float64
18    2.0
Name: adaptibility, dtype: float64
18    24.5
Name: danceability, dtype: float64
18    67.0
Name: energy, dtype: float64
18    0.0
Name: explicit, dtype: float64
18    0.436
Name: happening, dtype: float64
18    0.843
Name: instrumentalness, dtype: float64
18   -1.165
Name: loudness, dtype: float64
18    6.0
Name: mode, dtype: float64
18    0.25
Name: naturality, dtype: float64
18    0.547
Name: positiveness, dtype: float64
18    30.0
Name: reputation, dtype: float64
18    0.0343
Name: speechiness, dtype: float64
18    66.805
Name: tempo, dtype: float64
18    4.0
Name: time_signature, dtype: float64
18    13827.188
Name: a1, dtype: float64
18    4.215
Name: a10, dtype: float64
18    0.4915
Name: a11, dtype: float64
18    33.4025
Name: a12, dtype: float64
18    2.499
Name: a13, dtype: float64
18    30.183
Name: a14, dtype: float64
18    300763.0
Name: a15, dtype: float64
18    2.0508
Name: a16, dtype: float64
18    13827.188
Name: a17,

  0%|          | 0/3 [00:01<?, ?it/s]


XGBoostError: ignored

In [None]:
baseline_test

In [None]:
# baseline_test = baseline_test.drop(columns=["true_target"])

In [None]:
baseline_test.to_csv("baseline_test2.csv",index=False)

In [None]:
predicted_y = xgb_model.predict(xgb.DMatrix(X_val),strict_shape=True)
correct = 0
y_val = list(y_val)
for i in range(len(predicted_y)):
    if int(predicted_y[i][0]) == y_val[i]:
        correct += 1
print(f"Acuracy : {correct/len(y_val)}")


Acuracy : 0.6002014098690835


In [None]:
testset = xgb.DMatrix(test_data_filtered.loc[:,test_data_filtered.columns != 'song_id'])
xgb_test_preds = xgb_model.predict(testset,strict_shape=True)

In [None]:
# create model instance
bst = XGBClassifier(n_estimators=10, num_class = 16,
                    max_depth=5, subsample = 0.7,
                    eta=0.01, gamma = 0.5, reg_lambda = 0.3, alpha= 1,
                    min_child_weight = 0.7,
                    sampling_method = 'uniform', tree_method = 'exact',
                    objective='multi:softmax')
