In [14]:
## KNN
## Logistic Regression
## Multilayer Perceptron
## Support Vector Machine

from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import classification_report
from features_preprocessing import partial_dataset, full_dataset 
import pandas as pd
import numpy as np
from joblib import dump, load
import random 

def predict_test_dataset(model, test_dataset, test_corrupted_id, test_csv,
                         label_range=(1, 8), rng=None):
    """Build an id-sorted ``[id, label]`` table for a test set.

    Rows for ``test_csv`` carry the label predicted by ``model``; every id in
    ``test_corrupted_id`` gets a label drawn uniformly at random instead.

    Parameters
    ----------
    model : object exposing ``predict(test_dataset)``.
    test_dataset : features passed straight to ``model.predict``.
    test_corrupted_id : iterable of ids that receive a random label.
    test_csv : 1-D array of ids, aligned row-for-row with ``test_dataset``.
    label_range : ``(low, high)`` inclusive bounds of the random labels.
        Defaults to ``(1, 8)``, the range the original code hard-coded.
    rng : optional object with an inclusive ``randint(low, high)`` method,
        e.g. ``random.Random(seed)``. Defaults to the global ``random`` module.

    Returns
    -------
    numpy.ndarray of shape ``(n_rows, 2)`` and dtype ``np.int_``, ordered by
    the id column (stable sort, so ties keep their insertion order).
    """
    prediction = np.asarray(model.predict(test_dataset))
    rows = np.stack([test_csv, prediction], axis=1)

    draw = (rng if rng is not None else random).randint
    low, high = label_range
    # One randint call per corrupted id, in iteration order, so a seeded
    # generator yields the same labels as the former append-in-a-loop code.
    filler = [[track_id, draw(low, high)] for track_id in test_corrupted_id]
    if filler:
        # Single concatenate instead of np.append per id (which copied the
        # whole array on every iteration).
        rows = np.concatenate([rows, np.asarray(filler)], axis=0)

    order = np.argsort(rows[:, 0], kind='stable')
    return rows[order].astype(np.int_)
    

def create_models(X_train, y_train, cv=75, model_dir='./models/'):
    """Cross-validate each candidate pipeline and save its best fold estimator.

    For every candidate (KNN, LR, MLP, SVM, SVM_poly) this runs
    ``cross_validate`` with ``return_estimator=True``, picks the estimator of
    the fold with the highest test score, and dumps it to
    ``<model_dir>/<name>.joblib``.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``cross_validate``.
    cv : cross-validation setting forwarded to ``cross_validate``
        (default 75, the value previously hard-coded).
    model_dir : directory receiving the ``.joblib`` files (default
        ``'./models/'``). It is created when missing.

    Returns
    -------
    list of ``[name, estimator, best_fold_score]`` sorted by score, best first.

    Notes
    -----
    NOTE(review): the returned estimator is the one fitted for a single CV
    split, and the score is the maximum over folds rather than the mean, so it
    is presumably an optimistic estimate — confirm this selection is intended.
    """
    import os  # local import: the notebook's import cell does not provide os

    candidates = {
        # NOTE(review): p is documented as the Minkowski power parameter; with
        # metric='manhattan' it is presumably ignored — confirm and drop.
        'KNN': make_pipeline(
            RobustScaler(),
            KNeighborsClassifier(n_neighbors=25, weights='distance', p=2, metric='manhattan'),
        ),
        'LR': make_pipeline(
            RobustScaler(),
            LogisticRegression(random_state=50, max_iter=1200, n_jobs=-1),
        ),
        'MLP': make_pipeline(RobustScaler(), MLP(random_state=50, max_iter=800)),
        'SVM': make_pipeline(RobustScaler(), SVC(gamma=0.5, C=0.1)),
        'SVM_poly': make_pipeline(
            RobustScaler(),
            SVC(kernel='poly', coef0=1.0, degree=2, gamma=0.1, C=0.1),
        ),
    }

    # joblib.dump does not create parent directories; without this the first
    # dump fails on a fresh checkout.
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    models = []
    for name, pipeline in candidates.items():
        cv_result = cross_validate(
            pipeline, X_train, y_train,
            cv=cv, n_jobs=-1, return_estimator=True, verbose=5,
        )
        fold_scores = np.asarray(cv_result['test_score'], dtype=float)
        # nanargmax skips NaN scores instead of letting them win or lose
        # arbitrarily, and returns the first index on ties like list.index did.
        best_index = int(np.nanargmax(fold_scores))
        best_estimator = cv_result['estimator'][best_index]
        best_score = float(fold_scores[best_index])
        # Persist right away so a failure in a later candidate does not lose
        # the models already trained.
        dump(best_estimator, os.path.join(model_dir, name + '.joblib'))
        models.append([name, best_estimator, best_score])

    models.sort(key=lambda entry: entry[2], reverse=True)
    return models

def model_predict(X_train, X_test, y_train, y_test):
    """Train the candidate models and print each one's hold-out accuracy.

    Calls ``create_models(X_train, y_train)`` and, for every returned
    ``[name, estimator, score]`` entry in the order given, prints the name
    followed by the accuracy of ``estimator.predict(X_test)`` against
    ``y_test``. Returns ``None``.
    """
    for name, estimator, _cv_score in create_models(X_train, y_train):
        prediction = estimator.predict(X_test)
        # accuracy_score is documented as (y_true, y_pred); keep that order so
        # swapping in an asymmetric metric later stays correct.
        print(name, accuracy_score(y_test, prediction))

## Data generation

In [15]:
# partial_dataset() supplies test_corrupted_id (used below and when building
# the submission); its test_dataset is replaced by full_dataset()'s right after.
train_dataset, test_dataset, test_corrupted_id = partial_dataset()

X_train, test_dataset, y_train = full_dataset()

# Fixed random_state (same seed the models use) so the hold-out split, and
# therefore every reported accuracy, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=True, random_state=50
)

df = pd.read_csv(r'./data/test.csv')
test_csv = df['track_id'].values

# Drop every id listed in test_corrupted_id in one vectorised pass, then sort
# ascending (previously one full-array filter per id plus a list round-trip).
test_csv = np.sort(test_csv[~np.isin(test_csv, list(test_corrupted_id))])

(3995, 520) (4008, 519) (3995, 2) (106574, 519)
(4008, 519) test shape
(4006, 519) test shape
(3995, 518) (3995,) (4006, 518)


In [18]:
# Cross-validate all candidate pipelines, save the best fold estimator of each,
# and print every model's accuracy on the hold-out split.
model_predict(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [16]:
# predict_test_dataset labels the corrupted ids with random.randint; seed the
# stdlib RNG so results.csv is identical on every run.
random.seed(50)

# NOTE(review): joblib.load unpickles the file — only load model files
# produced by this notebook.
model = load('./models/MLP.joblib')

# Store the prediction table under its own name: overwriting test_csv (the 1-D
# id array) made this cell fail when run a second time.
submission = predict_test_dataset(model, test_dataset, test_corrupted_id, test_csv)
results = pd.DataFrame(
    data={'track_id': submission[:, 0], 'genre_id': submission[:, 1]},
    columns=['track_id', 'genre_id'],
)
results.to_csv("results.csv", index=False)