In [None]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from skmultilearn.adapt import MLkNN
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
import statistics 
import scipy
from itertools import combinations 


# music genre column names (lower-case, matching raw_data's lower-cased columns);
# used as the default genres of generate_heatmap and to select music_data
MUSIC_CHOICES = ['classical music', 'pop', 'metal or hardrock', 'hiphop, rap', 'latino', 'alternative']

In [None]:
# step up to the parent directory so that the relative './resources/...' paths used below resolve
# NOTE(review): this cell is not idempotent — every re-run moves up one more directory
%cd ..

In [None]:
# load the responses file, replace missing values with 0 and lower-case every column name
raw_data = (
    pd.read_csv('./resources/responses.csv')
    .fillna(0)
    .rename(columns=str.lower)
)

In [None]:
def only_strong_correlations(x):
    """
    Given a single correlation value, weak correlations are replaced by 0

    :param float x: a correlation value

    :return: 0 if the magnitude of x is below 0.1, otherwise x unchanged
    """
    is_weak = abs(x) < 0.1
    return 0 if is_weak else x

def convert_to_binary(col):
    """
    Given a two value categorical series, it is converted to its binary representation

    The first value in the series is encoded as 0 and every value that differs
    from it is encoded as 1. The input series is not modified.

    :param Series col: two value categorical series

    :return: an integer series of 0s and 1s with the same index and name as col
    """
    # an empty series has no first value to compare against
    if col.empty:
        return col.astype(int)

    first_value = col.iloc[0]
    # the comparison is vectorised and aligned on col's own index, so it is
    # correct for any index and not only for the default 0..n-1 one
    return (col != first_value).astype(int)


In [None]:
# getting all the categorical (non-numeric) columns, kept in raw_data's column order
numeric_columns = raw_data._get_numeric_data().columns
# drop() returns a new frame each time, so raw_data itself is left untouched;
# gender is excluded from this frame
categorical_data = raw_data.drop(columns=numeric_columns).drop(columns=['gender'])

In [None]:
# the two-valued categorical columns are turned into 0s and 1s up front;
# this is done to avoid linearly dependent columns
binary_data = categorical_data[['left - right handed', 'only child']]
for column_name, column in binary_data.items():
    categorical_data.loc[:, column_name] = convert_to_binary(column)

In [None]:
# one hot encoding of the categorical columns; drop_first=True leaves out the
# first level of every encoded column
categorical_data = pd.get_dummies(categorical_data, prefix_sep='_', drop_first=True)


In [None]:
def generate_heatmap(df, music_choices=MUSIC_CHOICES, threshold=0.04, apply_map=True):
    """
    Given a dataframe, a heatmap of the correlations between its features and the
    music genres is returned, with only rows at or above the threshold being displayed

    The mean of the row averages (computed before filtering) is printed.

    :param DataFrame df: A dataframe; only its numeric columns are correlated
    :param list music_choices: The genres of music that one would like displayed
    :param float threshold: The mean absolute correlation required for a row to be displayed
    :param bool apply_map: If True, every correlation is first passed through
        only_strong_correlations()

    :return: a sns heatmap
    """
    # corr is a square correlation matrix (n * n), where n is the number of features
    corr = df._get_numeric_data().corr()
    if apply_map:
        # only_strong_correlations() makes cell values 0 if their correlation is less than 0.1;
        # it is mapped column by column because DataFrame.applymap is deprecated in newer pandas
        corr = corr.apply(lambda column: column.map(only_strong_correlations))

    # picking only the music columns
    corr = corr[music_choices]
    # excluding all music rows; a boolean mask keeps the remaining rows in their
    # original order, whereas indexing with a set is unordered and rejected by newer pandas
    corr = corr.loc[~corr.index.isin(music_choices)]

    # only rows at or above the threshold are kept
    # 0.04 was chosen as the default threshold since it
    # is slightly higher than the avg of the row avgs, which was 0.039

    # mean absolute correlation of every row; skipna=False so that a row holding a
    # missing correlation gets a NaN mean, which never compares below the threshold
    row_means = corr.abs().mean(axis=1, skipna=False)
    # avg of the row avgs, taken before any row is filtered out
    print(row_means.mean(skipna=False))

    # filtering builds a new frame instead of dropping rows while iterating over them
    corr = corr.loc[~(row_means < threshold)]

    plt.figure(figsize=(25,20))
    return sns.heatmap(corr, cmap= sns.color_palette("RdBu_r", 7), annot=True, linewidth=0.5)


In [None]:
# the music genre columns; passed to create_model below as its music_data argument
music_data = raw_data[MUSIC_CHOICES]

In [None]:
# generate_heatmap(music_data.join(categorical_data), threshold=0.04, apply_map=False)

In [None]:
# read the question (column) names, one per line, lower-cased to match
# raw_data's lower-cased column names; the file is closed once it has been read
with open('./resources/correlations.txt') as questions_file:
    questions = [line.lower().rstrip() for line in questions_file]
# blank lines are not column names and would raise a KeyError in the selection below
questions = [question for question in questions if question]
# explicit copy so that the assignment below acts on an independent frame
# rather than on a selection of raw_data
question_data = raw_data[questions].copy()
print(len(questions))
# gender is binary so we convert that prior to OHE (One Hot Encoding)
question_data.loc[:,['gender']] = convert_to_binary(question_data['gender'])
question_data = pd.get_dummies(question_data, drop_first=True).astype(int)

In [None]:
def create_model(question_data, music_data):
    """
    Given features and multi-label targets, an MLkNN classifier is tuned with a
    grid search that is fitted on all of the supplied rows

    :param DataFrame question_data: the feature columns
    :param DataFrame music_data: the label columns

    :return: a tuple of (best parameters found, best score found), taken from the
        grid search's best_params_ and best_score_
    """
    # candidate values for MLkNN's k and s arguments
    parameters = {'k': range(2,30,2), 's': [0.5, 0.7, 1.0]}
    score = 'f1_micro'

    # every row is handed to the search; GridSearchCV evaluates each candidate on
    # its own cross-validation folds, so no separate train/test split is made here
    clf = GridSearchCV(MLkNN(), parameters, scoring=score, n_jobs=-1)
    # the labels are passed as a sparse matrix built from the frame's values
    clf.fit(question_data, scipy.sparse.csr_matrix(music_data.values))
    return clf.best_params_, clf.best_score_

In [None]:
# exhaustive search over every SUBSET_SIZE-column subset of question_data,
# keeping the subset for which create_model reports the highest score
SUBSET_SIZE = 10
best_score = 0
best_params = None
best_columns = None
# combinations() is consumed lazily; wrapping it in list() would hold every
# subset in memory at once
for index, comb in enumerate(combinations(question_data.columns, SUBSET_SIZE)):
    params, score = create_model(question_data[list(comb)], music_data)
    if score > best_score:
        best_score = score
        best_params = params
        # remember which columns produced the best score, not only the parameters
        best_columns = list(comb)
    # progress indicator: number of the subset that was just evaluated
    print(index)

In [None]:


# fit a single MLkNN classifier with fixed k and s, predict on X_test and
# show the accuracy of those predictions against y_test
# NOTE(review): `data`, `music`, `X_test` and `y_test` are not defined at notebook
# level in any of the cells visible here — confirm where they come from,
# otherwise this cell fails on a fresh kernel
clf = MLkNN(k=2, s=0.5)
clf.fit(data, scipy.sparse.csr_matrix(music.values))
predictions = clf.predict(X_test)
accuracy_score(y_test, predictions)

In [None]:
# preview the first rows of `music`
# NOTE(review): `music` is not defined in the cells visible here — confirm its origin
music.head()

In [None]:
# preview the first rows of `data`
# NOTE(review): `data` is not defined in the cells visible here — confirm its origin
data.head()

In [None]:
# run the fitted classifier on one hand-written sample of nine feature values and print the prediction
# NOTE(review): presumably the nine values follow the column order of the frame clf was fitted on — confirm
print(clf.predict(np.asarray([[2,3,4,3,0,4,2,20,1]])))