In [290]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
import sklearn.preprocessing
import seaborn as sns
import pandas as pd
import librosa
import time
import os

In [301]:
### 초기 설정

ROOT = "./data/input_voice"    # 입력음성이 들어갈 폴더 경로 지정

In [291]:
### 데이터 불러오기

df = pd.read_csv('./data/extract_music_prop_0727_5.csv')
df.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,ballad.001.wav,279,0.301502,0.0933,0.184661,0.011729,1963.542134,784310.738053,2297.176669,273473.563775,...,0.01157,0.447226,0.016688,0.439343,0.013556,0.434458,0.020871,0.501614,0.013606,ballad
1,ballad.002.wav,255,0.337139,0.08983,0.18368,0.009117,1725.984439,669023.938372,2128.629147,314091.675595,...,0.012803,0.383816,0.012987,0.430362,0.012014,0.435982,0.015363,0.455834,0.015025,ballad
2,ballad.003.wav,259,0.326523,0.087577,0.169846,0.00783,2367.544639,717853.043744,2568.842639,153526.987893,...,0.015067,0.594527,0.013845,0.540449,0.011723,0.465534,0.012815,0.524416,0.015153,ballad
3,ballad.004.wav,256,0.317857,0.096261,0.280054,0.020932,2114.669946,707448.978086,2368.320464,334406.59522,...,0.01854,0.451677,0.0179,0.474699,0.022657,0.502094,0.014164,0.453098,0.018838,ballad
4,ballad.005.wav,260,0.317303,0.086282,0.129756,0.005015,1831.443128,663532.484551,2098.729334,258789.276978,...,0.014569,0.423572,0.012559,0.441743,0.011408,0.462087,0.015345,0.491318,0.017491,ballad


In [292]:
### 전처리

X = df.drop(columns=['filename', 'length', 'label'])
y = df['label']
scaler = sklearn.preprocessing.MinMaxScaler()
np_scaled = scaler.fit_transform(X)
X = pd.DataFrame(np_scaled, columns=X.columns)

In [293]:
### 라벨 스케일링

y.replace('ballad', 0, inplace=True)
y.replace('dance', 1, inplace=True)
y.replace('hiphop', 2, inplace=True)
y.replace('trot', 3, inplace=True)

In [294]:
### 데이터셋 분할

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2021)

In [304]:
### 학습 및 검증

xgb = XGBClassifier(n_estimators=500, learning_rate=0.05)
xgb.fit(X_train, y_train)
y_preds = xgb.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_preds))

Accuracy: 0.75


In [308]:
### 히트맵 출력

cm = confusion_matrix(y_test, y_preds)
plt.figure(figsize=(16, 9))
sns.heatmap(
    cm,
    annot=True, np_scaled)
plt.show()

SyntaxError: positional argument follows keyword argument (629493368.py, line 7)

In [306]:
### 상관도 확인

# for feature, importance in zip(X_test.columns, xgb.feature_importances_):
#   print('%s: %.2f' % (feature, importance))

In [309]:
### 속성 추출 함수

def extract_music_prop(filename = None):
    path = f"{ROOT}/{filename}"
    audio_info = [None for _ in range(len(audio_header))]
    y, sr = librosa.load(path)
    y, _ = librosa.effects.trim(y=y)
    length = int(len(y) / sr)
    chromagram = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=512)
    rmsTest = librosa.feature.rms(y=y)
    spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
    rolloffTest = librosa.feature.spectral_rolloff(y=y, sr=sr)
    rolloffTest.mean()
    rolloffTest.var()
    zero_crossings = librosa.zero_crossings(y=y, pad=False)
    y_harm, y_perc = librosa.effects.hpss(y=y)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    
    audio_info[0] = filename
    audio_info[1] = length
    audio_info[2] = chromagram.mean() 
    audio_info[3] = chromagram.var()
    audio_info[4] = rmsTest.mean()
    audio_info[5] = rmsTest.var()
    audio_info[6] = spectral_centroids.mean()
    audio_info[7] = spectral_centroids.var()
    audio_info[8] = spectral_bandwidth.mean()
    audio_info[9] = spectral_bandwidth.var()
    audio_info[10] = rolloffTest.mean()
    audio_info[11] = rolloffTest.var()
    audio_info[12] = zero_crossings.mean()
    audio_info[13] = zero_crossings.var()
    audio_info[14] = y_harm.mean()
    audio_info[15] = y_harm.var()
    audio_info[16] = y_perc.mean()
    audio_info[17] = y_perc.var()
    audio_info[18] = tempo

    def normalize(x, axis=0):
        return sklearn.preprocessing.minmax_scale(x, axis=axis)
    mfccs = librosa.feature.mfcc(y=y, sr=sr)
    mfccs = normalize(mfccs, axis=1)

    for i in range(19, 59, 2):
        idx = int((i - 18) / 2)
        audio_info[i] = mfccs[idx].mean()
        audio_info[1 + i] = mfccs[idx].var()
        
    audio_info[59] = None
    return audio_info

In [326]:
### 입력받은 목소리 속성 추출 후 원본 데이터프레임에 추가

df = pd.read_csv('./data/extract_music_prop_0727_5.csv', index_col='filename')
filename = "M_000008.wav"    # 임시로 더미데이터 입력
audio_info = extract_music_prop(filename)
audio_info.pop(0)
df.loc[filename] = audio_info

In [327]:
### 01

labels = df[['label']]
df = df.drop(columns=['length', 'label'])
df_scaled = sklearn.preprocessing.scale(df)
df = pd.DataFrame(df_scaled, columns=df.columns)

In [328]:
### 02

similarity = cosine_similarity(df)
sim_df = pd.DataFrame(similarity, index=labels.index, columns=labels.index)
sim_df.head()

filename,ballad.001.wav,ballad.002.wav,ballad.003.wav,ballad.004.wav,ballad.005.wav,dance.001.wav,dance.002.wav,dance.003.wav,dance.004.wav,dance.005.wav,...,hiphop.002.wav,hiphop.003.wav,hiphop.004.wav,hiphop.005.wav,trot.001.wav,trot.002.wav,trot.003.wav,trot.004.wav,trot.005.wav,M_000008.wav
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ballad.001.wav,1.0,0.336855,-0.005917,0.242139,0.26461,-0.181773,-0.371896,-0.318825,0.01655,-0.174943,...,-0.228724,0.049483,-0.262026,-0.331283,-0.330891,-0.287532,0.045172,-0.357192,0.171774,0.115073
ballad.002.wav,0.336855,1.0,0.153395,0.204387,0.317638,-0.254432,-0.5536,-0.14524,-0.02553,-0.213249,...,-0.225665,0.01487,-0.149689,-0.336122,-0.374778,-0.279492,-0.036248,-0.377975,0.061033,0.320894
ballad.003.wav,-0.005917,0.153395,1.0,-0.190158,-0.020235,0.205952,-0.016674,-0.28897,-0.16478,-0.240846,...,0.230954,-0.025173,0.247441,0.20831,-0.121698,-0.254077,0.105681,-0.312506,-0.213394,-0.068607
ballad.004.wav,0.242139,0.204387,-0.190158,1.0,-0.023907,-0.223274,-0.271168,-0.20415,-0.198957,-0.426961,...,-0.168832,0.248978,-0.110205,-0.441385,-0.157163,-0.184846,-0.093786,-0.05814,-0.161983,0.280201
ballad.005.wav,0.26461,0.317638,-0.020235,-0.023907,1.0,0.065876,-0.355403,-0.061096,0.289105,-0.011817,...,-0.046564,-0.084302,-0.031094,-0.170642,-0.301123,-0.387871,-0.026203,-0.442958,0.1294,0.053093


In [332]:
### 입력받은 음성과 유사한 곡 추천

def find_similar_songs(name, n = 9):
    try:
        series = sim_df[name].sort_values(ascending=False)
    except:
        print("해당 곡은 존재하지 않습니다.")    
    series = series.drop(name)
    
    return series.head(n).to_frame()

similar_songs = find_similar_songs(filename)
print(similar_songs)

                M_000008.wav
filename                    
ballad.002.wav      0.320894
ballad.004.wav      0.280201
ballad.001.wav      0.115073
ballad.005.wav      0.053093
hiphop.003.wav      0.020704
hiphop.001.wav     -0.038884
dance.004.wav      -0.055895
ballad.003.wav     -0.068607
trot.003.wav       -0.097941


In [364]:
### 점수가 가장 높은 장르 추천

genre_score = {}

for v in similar_songs.iterrows():
    genre = v[0].split(".")[0]
    score = v[1][0]
    if genre in genre_score: genre_score[genre] += score
    else: genre_score[genre] = score

# max(genre_score, key=genre_score.get)
for i in genre_score.get():
    print(i)

TypeError: 'builtin_function_or_method' object is not iterable