In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset

# Music genre columns that are picked out of the correlation matrix further down.
MUSIC_CHOICES = ['Classical music', 'Pop', 'Metal or Hardrock', 'Hiphop, Rap', 'Latino', 'Alternative', 'Rock']

In [None]:
# Step up one directory, presumably so that the relative path './resources/responses.csv' resolves.
# NOTE(review): not idempotent — re-running this cell moves up another level.
%cd ..

In [None]:
# Read the survey responses and replace every missing value with 0 in one chained step.
raw_data = pd.read_csv('./resources/responses.csv').fillna(0)

In [None]:
def only_correlations(x, threshold=0.1):
    """Zero out a correlation coefficient that is too weak to be of interest.

    Args:
        x: A single correlation value (one cell of a correlation matrix).
        threshold: Minimum absolute value for `x` to be kept. Defaults to 0.1,
            the cut-off this function originally hard-coded.

    Returns:
        0 when ``abs(x)`` is strictly below `threshold`; otherwise `x` unchanged.
        NaN is passed through, because the comparison is False for NaN.
    """
    if abs(x) < threshold:
        return 0
    return x

In [None]:
# Summary statistics, restricted to the first 19 columns of the describe() table.
raw_data.describe().iloc[:,:19]

In [None]:
# Collect all the categorical (non-numeric) columns.
# select_dtypes is the public API and keeps the columns in their original order;
# the previous set difference returned them in arbitrary order.
categorical_data = raw_data.select_dtypes(exclude=['number', 'bool'])

In [None]:
# LabelEncoder only accepts a 1-D array, so every categorical column gets its own
# encoder (kept in label_encoders so the integer codes can be mapped back later).
label_encoders = {column: LabelEncoder() for column in categorical_data.columns}

# integer encode
# Values are cast to str first: fillna(0) left integer zeros among the string labels,
# and an encoder needs uniformly typed input.
int_encoded = pd.DataFrame(
    {
        column: encoder.fit_transform(categorical_data[column].astype(str))
        for column, encoder in label_encoders.items()
    },
    index=categorical_data.index,
)
int_encoded.head()

In [None]:
# only_correlations() turns a cell into 0 when its absolute correlation is below 0.1.
# corr is a square correlation matrix (n * n), where n is the number of numeric features.
numeric_data = raw_data.select_dtypes(include=['number', 'bool'])
corr = numeric_data.corr().apply(lambda column: column.map(only_correlations))

# picking only the music columns
corr = corr[MUSIC_CHOICES]
# excluding all music rows (the first 19 rows of the matrix)
corr = corr.iloc[19:, :]

# Only rows whose mean absolute correlation reaches the threshold are kept.
# 0.04 was chosen by the original author as slightly above the average of the
# row averages (noted as 0.039).
# NOTE(review): re-check that figure against the value printed below.
row_threshold = 0.04

# mean absolute correlation of each remaining feature with the chosen genres
row_avgs = corr.abs().mean(axis=1)

# avg is the average of all the row averages; it is computed before any row is
# removed so that it is divided by the full row count, not the filtered one
avg = row_avgs.mean()
print(avg)

# build a new frame instead of dropping rows in place while iterating over them
corr = corr[row_avgs >= row_threshold]

In [None]:
# Heat map of the thresholded correlations: remaining features as rows,
# selected music genres as columns.
fig, ax = plt.subplots(figsize=(25, 20))
sns.heatmap(
    corr,
    cmap=sns.color_palette("RdBu_r", 7),
    annot=True,
    linewidths=0.5,  # documented seaborn keyword for the lines between cells
    ax=ax,
)
ax.set_title("Feature vs. music genre correlations");


In [None]:
# Split the survey into groups of columns by position.
# NOTE(review): the positional boundaries assume the column layout of responses.csv — confirm.
music = raw_data.iloc[:, :19]
# NOTE(review): this list uses 'Rock n roll' whereas MUSIC_CHOICES uses 'Rock' — confirm which is intended.
music = music[['Classical music', 'Pop', 'Metal or Hardrock', 'Rock n roll', 'Hiphop, Rap', 'Alternative', 'Latino']].astype(int)
movie = raw_data.iloc[:, 19:31].astype(int)
hobbies = raw_data.iloc[:, 31:63].astype(int)
phobias = raw_data.iloc[:, 63:73].astype(int)
# These groups go through get_dummies with drop_first=True before the integer cast.
health = pd.get_dummies(raw_data.iloc[:, 73:76], drop_first=True).astype(int)
traits = pd.get_dummies(raw_data.iloc[:, 76:133], drop_first=True).astype(int)
# The slice previously read 133:120, which is always empty because start > stop.
# NOTE(review): 133:140 assumes seven habit columns directly follow the traits — confirm.
habits = pd.get_dummies(raw_data.iloc[:, 133:140], drop_first=True).astype(int)


In [None]:
# Summary statistics of the selected music columns.
music.describe()

In [None]:
# Display every column from position 19 onwards (all rows).
raw_data.iloc[:,19:]

In [None]:
# Display the whole frame (all rows, all columns).
raw_data.iloc[:,]

In [None]:
# Seven survey items, cast to integers; they are later joined with age and gender
# to form the feature table.
questions = raw_data.loc[
    :,
    [
        'Criminal damage',
        'Adrenaline sports',
        'Passive sport',
        'Waiting',
        'Eating to survive',
        'Friends versus money',
        'Cheating in school',
    ],
].astype(int)



In [None]:
# Age and gender, both as integers. Gender becomes 0 for 'female' and 1 for anything
# else, which includes the 0 that fillna() put in place of a missing answer.
# assign() builds a new frame, so no column is written onto a slice of raw_data
# (the previous in-place assignment triggered SettingWithCopyWarning).
general = (
    raw_data[['Age', 'Gender']]
    .assign(Gender=lambda frame: (frame['Gender'] != 'female').astype(int))
    .astype(int)
)

In [None]:
# Targets: the chosen columns as integers, then turned into 0/1 flags per column
# (1 when the value is strictly above that column's median, otherwise 0).
music = raw_data[['Country', 'Classical music', 'Pop', 'Rock', 'Hiphop, Rap', 'Alternative', 'Loneliness', 'God']].astype(int)
music = music.gt(music.median(), axis='columns').astype(int)

# Features: the selected survey questions joined with age and gender.
data = questions.join(general)

In [None]:

# Hold out 33% of the rows for evaluation; random_state=42 pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(data, music, test_size=0.33, random_state=42)

In [None]:
# Inspect the training features.
X_train

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not by itself
# guarantee that `scipy.sparse` is loaded.
import scipy.sparse

# Shape of the training labels once converted to a sparse CSR matrix.
print(scipy.sparse.csr_matrix(y_train.values).shape)


In [None]:
# View the feature table as a plain NumPy array.
np.asarray(data)

In [None]:
from skmultilearn.adapt import MLkNN
from sklearn.model_selection import GridSearchCV


# Hyper-parameter grid and scoring metric for the GridSearchCV run that is currently
# commented out below; k and s are MLkNN constructor arguments.
parameters = {'k': range(2,30,2), 's': [0.5, 0.7, 1.0]}
score = 'f1_micro'

In [None]:


# Optional hyper-parameter search, left disabled:
# clf = GridSearchCV(MLkNN(), parameters, scoring=score, n_jobs=-1)
# clf.fit(X_train.values, scipy.sparse.csr_matrix(y_train.values))
# print(clf.best_params_, clf.best_score_)

# Fit on the training split only. Fitting on the full data set (as before) lets the
# model see the test rows, which inflates the accuracy reported below.
# Plain arrays are passed so that fitting matches the array-based predict call used later.
clf = MLkNN(k=2, s=0.5)
clf.fit(X_train.values, scipy.sparse.csr_matrix(y_train.values))
predictions = clf.predict(X_test.values)
accuracy_score(y_test, predictions)

In [None]:
# Preview the first rows of the 0/1 target table.
music.head()

In [None]:
# Preview the first rows of the feature table.
data.head()

In [None]:
# Predict the labels for one hand-written sample: nine values, matching the nine
# columns of `data` (seven question answers followed by Age and Gender).
print(clf.predict(np.asarray([[2,3,4,3,0,4,2,20,1]])))

In [None]:
def train_clfs(X_train, y_train, X_test, y_test):
    """Fit several scikit-learn classifiers and report each one's test accuracy.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features to predict on.
        y_test: Held-out labels that the predictions are scored against.

    Returns:
        dict mapping each classifier's name to its ``accuracy_score`` on the
        held-out data. The same scores are printed as they are computed.
    """
    clfs = {
        "Knn": KNeighborsClassifier(n_neighbors=10),
        "RandomForest": RandomForestClassifier(n_estimators=50),
        "ID3": DecisionTreeClassifier(criterion='entropy'),
        "CART": DecisionTreeClassifier(),
    }

    scores = {}
    for clf_name, clf in clfs.items():
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_test)
        scores[clf_name] = accuracy_score(y_test, predicted)
        # the classifier's name is printed; the original referenced an undefined `i`
        print(clf_name, " Accuracy Score: ", scores[clf_name])
    return scores

In [None]:
# Arguments follow train_clfs's signature: (X_train, y_train, X_test, y_test).
train_clfs(X_train, y_train, X_test, y_test)