In [None]:
import json
from itertools import combinations
from itertools import compress
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from skmultilearn.adapt import MLkNN
from sklearn.metrics import hamming_loss, make_scorer, jaccard_score, multilabel_confusion_matrix, zero_one_loss, roc_auc_score
pd.options.mode.chained_assignment = None  # default='warn'
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
import pickle

# Genre rating columns (written in lower case, matching the lower-cased column names of
# the loaded survey data) that are selected from the data and turned into the labels.
MUSIC_CHOICES = ['classical music', 'pop', 'metal or hardrock', 'hiphop, rap', 'latino', 'alternative']

In [None]:
# Move to the parent directory so that the relative './resources/...' paths used below resolve.
# NOTE: this cell is not idempotent - every re-run moves one more directory level up.
%cd ..

In [None]:
# Load the survey responses, lower-case the column names, discard the respondents
# without a gender and replace every remaining missing value with 0.
raw_data = (
    pd.read_csv('./resources/responses.csv')
    .rename(columns=str.lower)
    .dropna(subset=['gender'])
    .fillna(0)
)

In [None]:
def only_strong_correlations(x, threshold=0.1):
    """
    Zeroes out a weak correlation value.

    :param float x: a single correlation coefficient
    :param float threshold: the smallest absolute value that is kept as is

    :return: 0 if the absolute value of x is strictly below the threshold, otherwise x
    """
    return 0 if abs(x) < threshold else x

def convert_to_binary(col):
    """
    Dummy-encodes a categorical series, leaving out the indicator of its first category.

    For a series with exactly two distinct values the result holds a single
    indicator column that marks the second category.

    :param Series col: categorical series (expected to hold two distinct values)

    :return: the DataFrame of indicator columns built by pd.get_dummies
    """
    indicator_columns = pd.get_dummies(col, drop_first=True)
    return indicator_columns


In [None]:
# Getting all the categorical (non-numeric) columns except 'gender'.
# select_dtypes keeps the columns in their original order (the previous set difference
# made the column order vary between kernel runs) and returns a new frame, so dropping
# 'gender' no longer mutates a slice of raw_data in place.
categorical_data = (
    raw_data
    .select_dtypes(exclude=['number', 'bool'])
    .drop(columns=['gender'])
)

In [None]:
# Converting the two binary categorical columns to indicator columns before the
# general one hot encoding; this is done to avoid linearly dependent columns.
# NOTE(review): convert_to_binary() returns the frame built by pd.get_dummies, whose
# column is named after the kept category rather than after `col` - confirm that this
# .loc assignment fills the column as intended on the pandas version in use.
binary_data = categorical_data[['left - right handed', 'only child']]
for col in binary_data:
    categorical_data.loc[:, col] = convert_to_binary(binary_data.loc[:, col])


In [None]:
# One hot encoding of the categorical columns; drop_first leaves out the indicator
# of the first level of every encoded column.
categorical_data = pd.get_dummies(categorical_data, prefix_sep='_', drop_first=True)

In [None]:
def generate_heatmap(df, music_choices=MUSIC_CHOICES, threshold=0.04, only_strong_corr=True):
    """
    Given a dataframe a heatmap of the correlations between the music genres and the
    other numeric features is returned, with only the rows whose average absolute
    correlation reaches the threshold being displayed

    :param DataFrame df: A dataframe; only its numeric columns are correlated
    :param list music_choices: The genres of music that one would like displayed
    :param float threshold: The average absolute value required for a row to be displayed
    :param bool only_strong_corr: If True, correlations whose absolute value is below 0.1
        are shown as 0

    :return: a sns heatmap
    """
    # corr is a square correlation matrix (n * n), where n is the number of numeric features
    corr = df._get_numeric_data().corr()
    if only_strong_corr:
        # only_strong_correlations() makes cell values 0 if their correlation is less than 0.1
        corr = corr.applymap(only_strong_correlations)

    # picking only the music columns
    corr = corr[music_choices]
    # excluding all music rows; a list indexer keeps the row order stable between runs
    # and also works on pandas versions that reject a set passed to .loc
    corr = corr.loc[[feature for feature in corr.index if feature not in music_choices]]

    # only rows above a certain threshold are kept
    # 0.04 was chosen as the threshold since it
    # is slightly higher than the avg of the row avgs, which was 0.039

    # average absolute correlation of every row (NaN if the row contains a NaN)
    row_means = corr.abs().mean(axis=1, skipna=False)
    # the average of all the row averages, computed before any row is filtered out
    print(float(row_means.mean(skipna=False)) if len(row_means) else float('nan'))
    # drop the rows strictly below the threshold; rows with a NaN average are kept
    corr = corr.loc[~(row_means < threshold)]

    plt.figure(figsize=(25, 20))
    return sns.heatmap(corr, cmap=sns.color_palette("RdBu_r", 7), annot=True, linewidth=0.5)


In [None]:
# Read the names of the survey questions to keep (one per line, lower-cased) and select them.
# The file is opened in a `with` block so that the handle is always closed, and blank
# lines are skipped so that they are not looked up as column names.
with open('./resources/reduced_correlations.txt') as questions_file:
    questions = [line.lower().rstrip() for line in questions_file if line.strip()]
# .copy() makes question_data an independent frame, so the column assignments made on it
# further down do not target a slice of raw_data.
question_data = raw_data[questions].copy()

# generate_heatmap(question_data.join(music_data), only_strong_corr=True)

In [None]:
def create_music_labels(col, threshold=4):
    """
    Concludes an individual likes a genre of music if they rated it greater than or
    equal to the threshold (four by default).

    :param Series col: a series of numeric ratings
    :param int threshold: the lowest rating that counts as liking the genre
    :return: a series of bools with the same index as col

    """
    # vectorized comparison: evaluated by pandas in one pass instead of one Python call
    # per row, and it yields a bool series for empty input as well
    return col >= threshold


In [None]:
# Build the label frame: every genre rating column is replaced by its boolean
# "likes this genre" series.
music_data = raw_data[MUSIC_CHOICES].apply(create_music_labels)


In [None]:
# gender is binary, so it is encoded by hand (1 when the answer contains 'female',
# 0 otherwise) prior to OHE (One Hot Encoding)
question_data['gender'] = question_data['gender'].map(lambda answer: int('female' in answer))

In [None]:
# OHE: dummy-encode the categorical question columns as integers; a column that comes
# out of the encoding as 'gender_1' is named 'gender' again
question_data = (
    pd.get_dummies(question_data, drop_first=True)
    .astype(int)
    .rename(columns={'gender_1': 'gender'})
)

In [None]:
def create_mlknn_model(question_data, music_data, grid_search, k=None, s=None):
    """
    Either tunes an MLkNN classifier with a grid search or fits a single MLkNN classifier.

    :param DataFrame question_data: the features
    :param DataFrame music_data: the multi-label targets
    :param bool grid_search: if True a 5-fold grid search over k and s is run
    :param int k: the k passed to MLkNN when grid_search is False; MLkNN's own default
        is used when it is None
    :param float s: the s passed to MLkNN when grid_search is False; MLkNN's own default
        is used when it is None

    :return: (best_params, best_hamming_loss) when grid_search is True,
        otherwise the fitted MLkNN classifier
    """
    if grid_search:
        parameters = {'k': range(2, 17, 2), 's': [0.5, 0.7, 1.0]}
        # hamming loss is an error measure: greater_is_better=False makes the search
        # minimize it (with the default, the parameters with the HIGHEST loss were chosen)
        score = make_scorer(hamming_loss, greater_is_better=False)
        clf = GridSearchCV(MLkNN(), parameters, scoring=score, n_jobs=-1, cv=5)
        clf.fit(question_data.values, music_data.values)
        # the scorer negates the loss, so the sign is flipped back: callers keep
        # receiving a plain hamming loss for which lower is better
        return clf.best_params_, -clf.best_score_

    # forward only the hyperparameters that were actually given
    given_params = {name: value for name, value in (('k', k), ('s', s)) if value is not None}
    clf = MLkNN(**given_params)
    clf.fit(question_data.values, music_data.values)
    return clf
    

In [None]:
# distribution of choices of music people like:
# every label combination (a tuple with one flag per genre) is mapped to the number
# of respondents that have it
d = {}
for flags in map(tuple, music_data.values.tolist()):
    d[flags] = d.get(flags, 0) + 1

# showing only the top 10 multi-label combinations
highest_counts = sorted(d.values(), reverse=True)[:10]

for category, value in d.items():
    if value not in highest_counts:
        continue
    liked_genres = list(compress(list(music_data.columns), list(category)))
    print(liked_genres, value)
        

In [None]:
def find_best_features(question_data, music_data):
    """
    Finds the best ten features to use for a given dataset: the gender column plus
    the best combination of nine of the remaining columns.
    Utilizes GridSearchCV (through create_mlknn_model) to figure out the best
    hyperparameter values for every combination.

    :param DataFrame question_data: the candidate features; must contain the gender column
    :param DataFrame music_data: the multi-label targets

    :return: (best_score, best_params, best_features), where the score is the one
        returned by create_mlknn_model and a lower score is better
    :raises KeyError: if question_data has neither a 'gender' nor a 'gender_1' column
    """

    best_score = float('inf')
    best_params = None
    best_features = None

    # 5% of the rows are held out; only the training part takes part in the search
    X_train, _, y_train, _ = train_test_split(question_data, music_data, test_size=0.05, random_state=42)

    # the encoded gender column is called 'gender' once the notebook has renamed it;
    # 'gender_1' is still accepted for frames that were not renamed
    gender_col = next((name for name in ('gender', 'gender_1') if name in X_train.columns), None)
    if gender_col is None:
        raise KeyError("question_data must contain a 'gender' (or 'gender_1') column")

    # a list (not a set) keeps the order of the combinations reproducible between runs
    candidates = [name for name in X_train.columns if name != gender_col]

    # combinations() is consumed lazily: building the full list of 9-feature
    # combinations first can exhaust memory
    for index, comb in enumerate(combinations(candidates, 9)):
        comb = list(comb + (gender_col,))
        params, score = create_mlknn_model(X_train[comb], y_train, True)
        # since we are using hamming loss, a lower score is better
        if score < best_score:
            best_score = score
            best_params = params
            best_features = comb
        print(index)
    return best_score, best_params, best_features


In [None]:
# Uncomment and run this cell only if you want to call find_best_features(), which is a
# time-consuming function call

# score, params, features = find_best_features(question_data[reduced_features], music_data)
# score, params, features

In [None]:
# creating the actual model to be used:
# load the stored values of the final MLkNN model (feature names and hyperparameters)
with open('./resources/final_mlknn_model_values.json') as file:
    json_file = json.load(file)
# names of the question_data columns the models below are trained on
features = json_file['features']

In [None]:
# Hold out 10% of the rows for evaluation; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(question_data[features], music_data, test_size=0.10, random_state=42)


In [None]:
# Fit the final MLkNN model on the training split with the stored k and s (no grid search)
hyperparamters = json_file['hyperparameters']
mlknn_clf = create_mlknn_model(X_train, y_train, False, hyperparamters['k'], hyperparamters['s'])

In [None]:
"""
Note: The Grid Search was done on a few parameters at a time
1. Increased the learning rate to 0.15 and kept number_estimators = [100, 200, 500, 1000]
2. Tune max_depth and min_child_weight
3. Tune gamma
4. Tune subsample and colsample_bytree
5. Tuning Regularization Parameters
6. Tune learning rate and number of estimators 
"""


# Base XGBoost configuration used as the estimator of the grid search.
# NOTE(review): the classifier is wrapped in OneVsRestClassifier below, which presumably
# fits one copy per genre column, yet objective/num_class describe a 6-class problem -
# confirm that this is intended.
model = XGBClassifier(silent=False,
                      objective='multi:softmax',
                      num_class=6,
                      eval_metric = "auc", 
                      seed=27, 
                      early_stopping_rounds=50, 
                      max_depth=2, 
                      gamma=0.3, 
                      min_child_weight=5, 
                      subsample=0.7, 
                      colsample_bytree=0.80, 
                      reg_alpha=0.005,
                      learning_rate=0.1, 
                      n_estimators=200,
                      n_jobs=-1)

# multi-label wrapper around the base model
clf = OneVsRestClassifier(model)

# NOTE(review): fit_params is not passed to any fit() call in the cells of this notebook
fit_params={
    "early_stopping_rounds": 42, 
    "eval_metric" : ["auc","error"], 
    "eval_set" : [[X_train, y_train]]
}

# the "estimator__" prefix addresses the parameters of the XGBClassifier wrapped by clf
param_grid = {
    "estimator__n_estimators" : [100, 200, 500, 1000, 2000, 5000, 7000],
    "estimator__learning_rate": [0.01, 0.03, 0.05, 0.07, 0.1]
}



In [None]:
# Grid search over the number of estimators and the learning rate (4-fold CV).
# NOTE(review): the search is fitted on all rows, including the ones held out in X_test,
# and make_scorer(roc_auc_score) scores the output of predict() - confirm both are intended.
gridsearch = GridSearchCV(clf, param_grid, verbose=True, cv=4, n_jobs=-1, scoring=make_scorer(roc_auc_score))
gs_model = gridsearch.fit(question_data[features], music_data)
# a bare expression in the middle of a cell is silently discarded, so the winning
# estimator and its score are printed explicitly
print(gs_model.best_estimator_, gs_model.best_score_)
results = pd.DataFrame(gs_model.cv_results_)
results.sort_values(by=['rank_test_score'])

In [None]:
# Final XGBoost configuration (learning_rate=0.03, n_estimators=500).
final_xgb_model = XGBClassifier(silent=False,
                                objective='multi:softmax',
                                num_class=6,
                                eval_metric = "auc", 
                                seed=27, 
                                early_stopping_rounds=50, 
                                max_depth=2, 
                                gamma=0.3, 
                                min_child_weight=5, 
                                subsample=0.7, 
                                colsample_bytree=0.80, 
                                reg_alpha=0.005,
                                learning_rate=0.03, 
                                n_estimators=500, 
                                n_jobs=-1)
# Wrap the final configuration defined right above: the previous code wrapped the
# grid-search base estimator `model`, so final_xgb_model was never actually used.
xgb_clf = OneVsRestClassifier(final_xgb_model, n_jobs=-1)

In [None]:
# Train the one-vs-rest XGBoost model on the training split only
xgb_clf.fit(X_train, y_train)

In [None]:
final_vals = {}
final_vals['features'] = features

# Keep only the hyperparameters of the wrapped XGBClassifier (the 'estimator__*' entries).
# Selecting them by name, instead of popping the last two keys, does not depend on how
# many parameters OneVsRestClassifier itself exposes, and it always leaves out the
# 'estimator' object, which json.dump cannot serialize.
xgb_params = xgb_clf.get_params()
keys = [key for key in xgb_params if key.startswith('estimator__')]

In [None]:
# Hyperparameter name -> value for the selected keys; displayed for inspection
xgb_hyperparameters = {key:xgb_params[key] for key in keys}
xgb_hyperparameters    

In [None]:
# Persist the features, hyperparameters and genre names of the XGBoost model
final_vals['hyperparameters'] = xgb_hyperparameters
final_vals['genres'] = list(music_data.columns)
# ensure_ascii=False writes non-ASCII characters as they are, so the file is opened as
# UTF-8 explicitly instead of relying on the platform's default encoding
with open('./resources/final_xgboost_model_values.json', 'w', encoding='utf-8') as f:
    json.dump(final_vals, f, ensure_ascii=False, indent=4)

In [None]:
# pickle.dump(clf, open('./model/xgboost_model.sav', 'wb'))

In [None]:
# ROC AUC of the MLkNN model on the held-out split; the output of predict() is converted
# to a dense array with .toarray() before scoring
roc_auc_score(y_test, mlknn_clf.predict(X_test.values).toarray())

In [None]:
# ROC AUC of the XGBoost one-vs-rest model on the held-out split, computed from predict() output
roc_auc_score(y_test, xgb_clf.predict(X_test))

In [None]:
# Logistic regression wrapped in a one-vs-rest classifier, for comparison with the models above
log_reg = OneVsRestClassifier(LogisticRegression(max_iter=500), n_jobs=-1)

In [None]:
# Train the logistic regression model on the training split
log_reg.fit(X_train, y_train)

In [None]:
# ROC AUC of the logistic regression model on the held-out split, computed from predict() output
roc_auc_score(y_test, log_reg.predict(X_test))

In [None]:
# Predict the genres for one hand-written set of answers. The answers are wrapped in a
# frame labelled with `features` (the columns the model was fitted on), so every value is
# tied to its feature by name instead of being passed as an unlabelled positional list.
sample_answers = pd.DataFrame([[2, 4, 3, 4, 3, 2, 2, 4, 4, 0]], columns=features)
log_reg.predict(sample_answers)

In [None]:
# Display the feature names, i.e. the column order the models were trained with
features

In [None]:
# Coefficients of the logistic regression model: one row per fitted per-label estimator.
# They are read from the fitted estimators directly because OneVsRestClassifier.coef_
# is not available on recent scikit-learn releases.
np.vstack([estimator.coef_ for estimator in log_reg.estimators_])