# Predicting personality and morality 

                               Yash Tusharbhai Desai

## Importing required packages and libraries

In [1]:
import pandas as pd
import numpy as np
import xgboost
from sklearn import metrics
from xgboost import XGBRegressor 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


# Loading Datasets

## Loading Features

In [2]:
features_np = np.load('./all_features.npy',allow_pickle=True)

### Merging all the 229 feature values into a single array

In [3]:
one_array = []
track_ids = []
for i in range(len(features_np)):
    one_feature = np.concatenate((features_np[i][3], features_np[i][1], features_np[i][2], features_np[i][4], features_np[i][5], features_np[i][6], features_np[i][7]))
    one_array.append(one_feature)
    track_ids.append(features_np[i][0])
one_array = np.array(one_array).astype('float32')
print('The shape of our feature array is: -',one_array.shape)
one_array[0]

The shape of our feature array is: - (11180, 229)


array([ 1.29199219e+02,  1.00000000e+00,  7.41237998e-01,  4.72716004e-01,
        4.01114434e-01,  3.96107048e-01,  4.32125270e-01,  5.12008190e-01,
        5.48045576e-01,  4.96326447e-01,  4.48788851e-01,  4.30853158e-01,
        4.47439492e-01,  4.91128862e-01,  4.94462460e-01,  4.29428935e-01,
        3.61250460e-01,  3.38929445e-01,  3.62722397e-01,  4.41248506e-01,
        5.38648367e-01,  5.34732640e-01,  4.23297167e-01,  3.24935228e-01,
        2.89269835e-01,  2.83819914e-01,  3.09512287e-01,  3.36682856e-01,
        3.24578524e-01,  2.76548982e-01,  2.45640635e-01,  2.44614661e-01,
        2.66858786e-01,  2.75683612e-01,  2.34920904e-01,  1.87942147e-01,
        1.64719462e-01,  1.60023630e-01,  1.80145219e-01,  2.23499790e-01,
        2.39709392e-01,  2.01214164e-01,  1.45149335e-01,  1.12376437e-01,
        1.01751119e-01,  1.03743002e-01,  1.10554419e-01,  1.08710438e-01,
        9.30104777e-02,  8.05889890e-02,  7.25800544e-02,  7.05595836e-02,
        7.11078346e-02,  

### Creating a DataFrame for our feature vector 

In [4]:
track_df = pd.DataFrame({'track_id':track_ids})
features_one = df = pd.DataFrame(((x,) for x in one_array), columns=['feature_vector'])
feature_vector_df = pd.concat([track_df,features_one],axis=1)
feature_vector_df

Unnamed: 0,track_id,feature_vector
0,003iCjGOKi7p4eqjfLB4oe,"[129.19922, 1.0, 0.741238, 0.472716, 0.4011144..."
1,008amsyxwFSwzy1mGg1OUo,"[135.99918, 1.0, 0.7883907, 0.6127495, 0.62591..."
2,009V0srjrA7XxggtXWCUVj,"[112.347145, 1.0, 0.85913557, 0.73470086, 0.71..."
3,00C5llfInrmXgjBrT40L1J,"[95.703125, 1.0, 0.78303474, 0.54539025, 0.472..."
4,00InLesTnEYlgbiCRtQizL,"[117.453835, 1.0, 0.866055, 0.80381376, 0.8214..."
...,...,...
11175,7zrWPRYubwn8k8Id80LDCv,"[92.28516, 1.0, 0.7989711, 0.59106886, 0.54705..."
11176,7zrxGPR1UVK2iSK793vLPl,"[161.49902, 1.0, 0.73596185, 0.47786754, 0.423..."
11177,7zsiCG0hk9G7iQ5KnSPG7T,"[129.19922, 1.0, 0.7602166, 0.5001286, 0.41274..."
11178,7zv2ojb0PAuMalnaOJd0Qv,"[117.453835, 1.0, 0.81133294, 0.67105514, 0.65..."


##  Loading labels and track data

In [5]:
big5_mft_tracks = pd.read_csv('./big5_mft_tracks.csv',index_col=False)
personality_data = pd.read_csv('./filtered_big5.csv',index_col=False)
morality_data = pd.read_csv('./filtered_mft.csv',index_col=False)

## Merging our data so that we have a DataFrame with all the artists that users (pollsters) like and top 5 tracks of those artists.

In [6]:

'''
This dataframe is of the same size as our feature set (11180) and is merged on the column:-'track_id'. This will
give us the 'processed_artist_name' corresponding to all the tracks and its features. The 'processed_artist_name'
will be further used to extract artists and their songs for every pollster id in morality and personality data.

'''
features_data = pd.merge(big5_mft_tracks,feature_vector_df, on='track_id')
features_data = features_data.drop(['Unnamed: 0'],axis=1)
features_data.head(5) 

Unnamed: 0,artist_name,song_title,preview_url,track_id,artist_id,track_popularity,processed_artist_name,feature_vector
0,12th Planet,Follow,https://p.scdn.co/mp3-preview/ef07dc598b117870...,1HJW2w61lh3XcsnfnOR9p4,3V1h3kAdiVDBiwlY2i6dJz,40.0,12th planet,"[151.99908, 1.0, 0.7784917, 0.61665344, 0.6178..."
1,12th Planet,Swamplex Terrestrial,https://p.scdn.co/mp3-preview/bfe1fffd48350137...,35Qf9mP6LoopjOVwgPAKoY,3V1h3kAdiVDBiwlY2i6dJz,26.0,12th planet,"[151.99908, 1.0, 0.7149835, 0.51125443, 0.5474..."
2,12th Planet,Supernova,https://p.scdn.co/mp3-preview/a492095a74937ed7...,5Yu2XAETE0p1dLzbgoX3WN,3V1h3kAdiVDBiwlY2i6dJz,24.0,12th planet,"[151.99908, 1.0, 0.69585574, 0.4721279, 0.4757..."
3,12th Planet,Let It Bang,https://p.scdn.co/mp3-preview/5cb107204eaa46ec...,2A7C2pIp1N0sgOi0aM6z3g,3V1h3kAdiVDBiwlY2i6dJz,20.0,12th planet,"[151.99908, 1.0, 0.7084564, 0.49900118, 0.5179..."
4,12th Planet,Let Us Prey,https://p.scdn.co/mp3-preview/15a9234eff6e1126...,5TMYoiWElQzD4UsTvtX8kx,3V1h3kAdiVDBiwlY2i6dJz,15.0,12th planet,"[151.99908, 1.0, 0.7706448, 0.60469663, 0.6148..."


### Renaming features_data, personality_data and morality data so that we can perform a merge using the same column name

In [7]:
features_data.rename(columns = {'processed_artist_name':'processed_page/artist_name'}, inplace = True)
personality_data.rename(columns = {'processed_page_name':'processed_page/artist_name'}, inplace = True)
morality_data.rename(columns = {'processed_page_name':'processed_page/artist_name'}, inplace = True)

### Merging our features_data with the personality data using the column name 'processed_page/artist_name'. This will give us 81520 rows, which will have which will have  liked artists of pollster/user_ids and the top 5 songs of that artists.

In [8]:
features_personality =  pd.merge(personality_data,features_data, on=['processed_page/artist_name'])
features_personality = features_personality.drop(['artist_name','preview_url','track_id','artist_id','track_popularity','Unnamed: 0','Music/Artist_Page_name','category','song_title','processed_page/artist_name'],axis=1)
print('The number of rows entries in the features_personality dataframe are:- ',len(features_personality))
print('The number of unique users in the personality data are:-', len(features_personality['pollster_user_id'].unique()))
features_personality.head(5)

The number of rows entries in the features_personality dataframe are:-  81520
The number of unique users in the personality data are:- 481


Unnamed: 0,pollster_user_id,Extraversion,Agreeableness,Conscientiousness,Openness,Neurotisism,feature_vector
0,65,9.0,3.0,10.0,7.0,6.0,"[123.046875, 1.0, 0.92557836, 0.86233497, 0.84..."
1,65,9.0,3.0,10.0,7.0,6.0,"[123.046875, 1.0, 0.90434706, 0.853212, 0.8619..."
2,65,9.0,3.0,10.0,7.0,6.0,"[123.046875, 1.0, 0.6948316, 0.45649955, 0.445..."
3,65,9.0,3.0,10.0,7.0,6.0,"[123.046875, 1.0, 0.76202863, 0.5560844, 0.552..."
4,65,9.0,3.0,10.0,7.0,6.0,"[129.19922, 1.0, 0.9181301, 0.8782712, 0.87862..."


### Merging our features_data with the morality data using the column name 'processed_page/artist_name'. This will give us 97160 rows, which will have  liked artists of pollster/user_ids and the top 5 songs of that artists.

In [9]:
features_morality = pd.merge(morality_data,features_data, on=['processed_page/artist_name'])
features_morality = features_morality.drop(['artist_name','preview_url','track_id','artist_id','track_popularity','Unnamed: 0','Music/Artist_Page_name','category','song_title','processed_page/artist_name'],axis=1)
print('The number of rows entries in the features_morality dataframe are:- ',len(features_morality))
print('The number of unique users in the morality data are:-', len(features_morality['pollster_user_id'].unique()))
features_morality.head(5)

The number of rows entries in the features_morality dataframe are:-  97160
The number of unique users in the morality data are:- 577


Unnamed: 0,pollster_user_id,Care,Fairness,Loyalty,Authority,Purity,Individualism,SocialBinding,feature_vector
0,9,22.0,22.0,10.0,13.0,16.0,22.0,13.0,"[103.359375, 1.0, 0.62312037, 0.2714267, 0.202..."
1,9,22.0,22.0,10.0,13.0,16.0,22.0,13.0,"[117.453835, 1.0, 0.7135329, 0.41768476, 0.333..."
2,9,22.0,22.0,10.0,13.0,16.0,22.0,13.0,"[123.046875, 1.0, 0.7847495, 0.5645804, 0.4997..."
3,9,22.0,22.0,10.0,13.0,16.0,22.0,13.0,"[123.046875, 1.0, 0.8232021, 0.63342595, 0.574..."
4,9,22.0,22.0,10.0,13.0,16.0,22.0,13.0,"[123.046875, 1.0, 0.75795406, 0.4903437, 0.394..."


## Now that we have data for every user, we will take the average/mean of all the songs and all the artists so that we have exactly one feature vector (length 229) which will represent all the 7 featrues for each user.
i.e :- <i>[tempo, mean_tempogram,std_tempogram,mean_chromagram,std_chromagram,mean_tonnetz,std_tonnetz]<i>

## This means we will now have the personality features and labels for 481 unique users while morality features and labels for 577 unique users 

In [10]:
avg_personality_features = features_personality.groupby(by='pollster_user_id')['feature_vector'].mean()
avg_personality_labels = features_personality.groupby(by='pollster_user_id')['Extraversion', 'Agreeableness',
       'Conscientiousness', 'Openness', 'Neurotisism'].mean()

avg_personality_features = avg_personality_features.reset_index()
avg_personality_labels = avg_personality_labels.reset_index()
print('Computed average of features and labels (personality) for each pollster_user_id')

Computed average of features and labels (personality) for each pollster_user_id


  avg_personality_labels = features_personality.groupby(by='pollster_user_id')['Extraversion', 'Agreeableness',


In [11]:
avg_morality_features = features_morality.groupby(by='pollster_user_id')['feature_vector'].mean()
avg_morality_labels = features_morality.groupby(by='pollster_user_id')['Care', 'Fairness', 'Loyalty', 'Authority',
       'Purity', 'Individualism', 'SocialBinding'].mean()

avg_morality_features = avg_morality_features.reset_index()
avg_morality_labels = avg_morality_labels.reset_index()
# avg_personality_features = avg_personality_features.drop(['pollster_user_id'],index=1)
print('Computed average of features and labels (morality) for each pollster_user_id')

Computed average of features and labels (morality) for each pollster_user_id


  avg_morality_labels = features_morality.groupby(by='pollster_user_id')['Care', 'Fairness', 'Loyalty', 'Authority',


### Creating the avg_personality_features and avg_morality_features arrays further getting the input data to our models

In [12]:
avg_personality_features = avg_personality_features['feature_vector']
avg_morality_features = avg_morality_features['feature_vector']

### Copying the avg_personality and avg_morality featurs into a new array to change the dtype of the array from 'object' to 'float32' and converting both the lists to numpy arrays.

In [13]:
avg_personality_features_arr = []
avg_morality_features_arr = []

In [14]:
for i in range(len(avg_personality_features)):
    avg_personality_features_arr.append(avg_personality_features[i])

for j in range(len(avg_morality_features)):
    avg_morality_features_arr.append(avg_morality_features[j])  
    
avg_personality_features_arr, avg_morality_features_arr = np.array(avg_personality_features_arr), np.array(avg_morality_features_arr)

## Create inputs and outputs for our prediction models

In [15]:
# Personality
X_personality = avg_personality_features_arr


y_Extraversion = avg_personality_labels['Extraversion'].astype('float32')
y_Agreeableness = avg_personality_labels['Agreeableness'].astype('float32')
y_Conscientiousness = avg_personality_labels['Conscientiousness'].astype('float32')
y_Openness = avg_personality_labels['Openness'].astype('float32')
y_Neurotisism = avg_personality_labels['Neurotisism'].astype('float32')

#  Morality
X_morality = avg_morality_features_arr

y_Care = avg_morality_labels['Care'].astype('float32')
y_Fairness = avg_morality_labels['Fairness'].astype('float32')
y_Loyalty = avg_morality_labels['Loyalty'].astype('float32')
y_Authority = avg_morality_labels['Authority'].astype('float32')
y_Purity = avg_morality_labels['Purity'].astype('float32')
y_Individualism = avg_morality_labels['Individualism'].astype('float32')
y_SocialBinding = avg_morality_labels['SocialBinding'].astype('float32')

In [16]:
X_personality.shape, y_Extraversion.shape

((481, 229), (481,))

# Predicting Extraversion


## Random Forest Extraversion

In [17]:
X_train_Extraversion, X_test_Extraversion, y_train_Extraversion, y_test_Extraversion = train_test_split(X_personality, np.array(y_Extraversion), test_size=0.2, random_state = 42)

In [18]:
rf_Extraversion = RandomForestRegressor(n_estimators=100)
rf_Extraversion.fit(X_train_Extraversion, y_train_Extraversion)
y_rf_pred_Extraversion = rf_Extraversion.predict(X_test_Extraversion)
rf_Extraversion_score = rf_Extraversion.score(X_train_Extraversion, y_train_Extraversion)

print('The coefficient of determination of the prediction R^2 is:- ', rf_Extraversion_score)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_Extraversion, y_rf_pred_Extraversion))
print('Mean Squared Error:', metrics.mean_squared_error(y_test_Extraversion, y_rf_pred_Extraversion))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test_Extraversion, y_rf_pred_Extraversion)))

The coefficient of determination of the prediction R^2 is:-  0.8489305574517626
Mean Absolute Error: 1.2267010309278352
Mean Squared Error: 2.457003092783505
Root Mean Squared Error: 1.5674830438583713


## Linear Regressor Extraversion

In [19]:
lr_Extraversion = LinearRegression()
lr_Extraversion.fit(X_train_Extraversion, y_train_Extraversion)
y_lr_pred_Extraversion = lr_Extraversion.predict(X_test_Extraversion)
lr_Extraversion_score = lr_Extraversion.score(X_train_Extraversion, y_train_Extraversion)

print('The coefficient of determination of the prediction R^2 is:- ', lr_Extraversion_score)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_Extraversion, y_lr_pred_Extraversion))
print('Mean Squared Error:', metrics.mean_squared_error(y_test_Extraversion, y_lr_pred_Extraversion))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test_Extraversion, y_lr_pred_Extraversion)))

The coefficient of determination of the prediction R^2 is:-  0.5143668843717586
Mean Absolute Error: 2.2156188
Mean Squared Error: 8.135601
Root Mean Squared Error: 2.8522975


## XGBoost Regressor Extraversion

In [24]:
xgb_Extraversion = XGBRegressor()
cv_Extraversion = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores_Extraversion = cross_val_score(xgb_Extraversion, X_train_Extraversion, y_train_Extraversion, scoring='neg_mean_absolute_error', cv=cv_Extraversion, n_jobs=-1)
# force scores to be positive
scores_Extraversion = np.absolute(scores_Extraversion)
print('Mean MAE: %.3f (%.3f)' % (scores_Extraversion.mean(), scores_Extraversion.std()) )

Mean MAE: 1.407 (0.187)


# Support Vector Regressor Extraversion

In [None]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
svr_Extraversion = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
svr_Extraversion.fit(X_train_Extraversion, y_train_Extraversion)
y_svr_pred_Extraversion = svr_Extraversion.predict(X_test_Extraversion)
svr_Extraversion_score = lr_Extraversion.score(X_train_Extraversion, y_train_Extraversion)



print('The coefficient of determination of the prediction R^2 is:- ', svr_Extraversion_score)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_Extraversion, y_lr_pred_Extraversion))
print('Mean Squared Error:', metrics.mean_squared_error(y_test_Extraversion, y_lr_pred_Extraversion))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test_Extraversion, y_lr_pred_Extraversion)))


In [None]:
# vanilla linear regressor

In [None]:
# results = evaluate_model(X, y)

In [27]:
# 	return results