In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import FactorAnalysis

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Read the (lightly-wrangled) survey responses from disk.
data = pd.read_csv('responses2.csv')

# Pull weight, height and age out of the frame (kept as standalone Series)
# so they do not take part in the analysis below.
weight, height, age = (data.pop(name) for name in ('weight', 'height', 'age'))

# Drop leftover column(s): every column whose name contains "unnamed",
# compared case-insensitively.
unnamed_cols = [name for name in data.columns if 'unnamed' in name.lower()]
data = data.drop(unnamed_cols, axis=1)

data.head()

Unnamed: 0,music,slow_songs_or_fast_songs,dance,folk,country,classical_music,musical,pop,rock,metal_or_hardrock,...,number_of_siblings,gender,left__right_handed,education,only_child,village__town,house__block_of_flats,i_am_always_on_time,i_lie_to_others,i_spend_a_lot_of_time_online
0,5.0,3.0,2.0,1.0,2.0,2.0,1.0,5.0,5.0,1.0,...,1.0,female,right handed,college/bachelor degree,no,village,block of flats,5,1,3
1,4.0,4.0,2.0,1.0,1.0,1.0,2.0,3.0,5.0,4.0,...,2.0,female,right handed,college/bachelor degree,no,city,block of flats,3,3,3
2,5.0,5.0,2.0,2.0,3.0,4.0,5.0,3.0,5.0,3.0,...,2.0,female,right handed,secondary school,no,city,block of flats,1,3,3
3,5.0,3.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,...,1.0,female,right handed,college/bachelor degree,yes,city,house/bungalow,3,2,5
4,5.0,3.0,4.0,3.0,2.0,4.0,3.0,5.0,3.0,1.0,...,1.0,female,right handed,secondary school,no,village,house/bungalow,5,5,3


In [3]:
# One-hot encode every object-dtype (categorical) column.
categorical_cols = [name for name in data.columns if data[name].dtype == 'O']
dummy_frames = [
    pd.get_dummies(data[name], prefix=name + '_') for name in categorical_cols
]

# The remaining columns keep their order; each categorical column's dummies
# are appended after them, in the order the categorical columns appeared.
data = pd.concat([data.drop(categorical_cols, axis=1)] + dummy_frames, axis=1)

# Everything is numerical now, so fill missing values with each column's
# median.
data = data.fillna(data.median())

data.dtypes.value_counts()

float64    131
uint8       26
int64        8
dtype: int64

In [5]:
from sklearn.decomposition import FactorAnalysis
# Scan models with 1..20 factors. For each fitted model, take every factor's
# four largest absolute loadings, keep the smallest of those four, and print
# the minimum of that value across all factors of the model.
print('Minimum factor loadings for n factors:\n')

for n_comp in range(1, 21):
    print(n_comp, end=': ')

    # Only the loadings are inspected here, so the transformed scores are
    # not computed during the scan.
    fa = FactorAnalysis(n_components=n_comp).fit(data)

    # One row per input column, one column per fitted factor.
    factors = pd.DataFrame(fa.components_, columns=data.columns).T

    # Weakest of each factor's top-4 absolute loadings. nlargest avoids
    # sorting the whole frame and adding/removing a scratch column per factor.
    mins = [factors[i].abs().nlargest(4).min() for i in factors.columns]

    print(np.min(np.array(mins)))

Minimum factor loadings for n factors:

1: 0.5602341221357283
2: 0.560463461131604
3: 0.5609574210742054
4: 0.45260815447720626
5: 0.40342127447605636
6: 0.294939713278729
7: 0.29502709319550374
8: 0.2748624015308651
9: 0.2748486391519195
10: 0.19773762195784178
11: 0.1977399387115115
12: 0.19238189322998026
13: 0.19206553949447674
14: 0.19251423633329182
15: 0.19168970185477138
16: 0.19200738290256628
17: 0.19259667730020377
18: 0.1927836156254579
19: 0.15274986149834205
20: 0.17110742889338112


In [7]:
from sklearn.decomposition import FactorAnalysis
n_comp = 5
fa = FactorAnalysis(n_components=n_comp)
fa.fit(data)
fa1 = fa.transform(data)

# Loadings table: one row per input column, one column per factor.
factors = pd.DataFrame(fa.components_, columns=data.columns).transpose()

# For each factor, rank the items by absolute loading, keep the top four and
# record the smallest of those four. After the loop `factors_sorted` holds
# the top-4 table of the last factor, and the scratch 'absol' column stays on
# `factors` (it is not removed).
mins = []
for factor_id in factors.columns:
    factors['absol'] = factors[factor_id].abs()
    ranked = factors.sort_values('absol', ascending=False)
    factors_sorted = ranked.head(4)
    mins.append(factors_sorted['absol'].min())

In [9]:
# Top four items, ranked by absolute loading, for the last factor handled by
# the preceding cell's loop ('absol' is that factor's absolute loading).
factors_sorted

Unnamed: 0,0,1,2,3,4,absol
i_wish_i_could_change_the_past_because_of_the_things_i_have_done,-0.011855,-0.165029,0.084233,-0.220877,0.440735,0.440735
i_am_always_on_time,0.14742,0.055658,-0.059057,-0.152574,-0.416822,0.416822
i_feel_lonely_in_life,-0.078342,-0.085253,0.30624,-0.220828,0.414363,0.414363
phobia_aging,-0.234151,-0.112988,-0.180848,-0.237557,0.403421,0.403421


In [9]:
from sklearn.decomposition import FactorAnalysis, PCA
from sklearn.preprocessing import MinMaxScaler

n_comp = 7

# NOTE: despite the `fa` / `factors` names (kept because later cells read
# `factors`), this cell fits a PCA model and `factors` ends up holding the
# per-row component scores.
# random_state is pinned so that re-running the notebook reproduces the same
# components whenever PCA selects its randomized solver.
fa = PCA(n_components=n_comp, random_state=0).fit(data)
factors = fa.transform(data)

# Rescale the scores column-by-column with MinMaxScaler (default settings).
mms = MinMaxScaler().fit(factors)
factors = mms.transform(factors)

# Label the components '1' .. str(n_comp) so they can be looked up by number.
component_labels = [str(k) for k in range(1, n_comp + 1)]
factors = pd.DataFrame(factors, columns=component_labels)

In [10]:
def show_items(factor, n=5):
    '''Return the items most strongly correlated with one component.

    Correlates every column of the global ``data`` frame with the scores in
    ``factors[str(factor)]`` and ranks the columns by absolute correlation.

    Parameters
    ----------
    factor : int or str
        Component label; it is converted with ``str`` and used as a column
        name of the global ``factors`` frame.
    n : int, default 5
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        A single column named ``0`` holding the signed correlations, ordered
        by descending absolute value and limited to the first ``n`` rows.
    '''
    corr = pd.DataFrame(data.corrwith(factors[str(factor)]))
    # Rank on a temporary absolute-value column, then drop it so only the
    # signed correlation is returned.
    ranked = corr.assign(absol=corr[0].abs()).sort_values('absol', ascending=False)
    return ranked.drop('absol', axis=1).head(n)

# NOTE(review): this component was originally described as "a mixture of
# extraversion and assertiveness", but in the recorded output its strongest
# correlates are the gender dummies, crying when feeling down, `romantic`
# and `theatre` -- revisit the label.
show_items(1)

Unnamed: 0,0
gender__male,0.692339
gender__female,-0.687821
i_cry_when_i_feel_down_or_things_dont_go_the_right_way,-0.629372
romantic,-0.553265
theatre,-0.49953


In [11]:
# Component 2: in the recorded output the strongest correlates are
# classical_music, opera, swing_jazz, physics and history (all positive).
show_items(2)

Unnamed: 0,0
classical_music,0.568923
opera,0.515489
swing_jazz,0.491005
physics,0.466391
history,0.458684


In [12]:
# Component 3: in the recorded output the strongest correlates are spending
# on appearance, hiphop_rap, large shopping centres, shopping and being full
# of life and energy (all negative).
show_items(3)

Unnamed: 0,0
i_spend_a_lot_of_money_on_my_appearance,-0.569359
hiphop_rap,-0.531976
i_enjoy_going_to_large_shopping_centres,-0.508881
shopping,-0.507832
i_am_always_full_of_life_and_energy,-0.47738


In [13]:
# Component 4: in the recorded output, alternative, rock and difficulty
# getting up in the morning correlate negatively; doing tasks as soon as
# possible and saving money correlate positively.
show_items(4)

Unnamed: 0,0
alternative,-0.403094
rock,-0.375889
i_try_to_do_tasks_as_soon_as_possible_and_not_leave_them_until_last_minute,0.368314
i_find_it_very_difficult_to_get_up_in_the_morning,-0.353164
i_save_all_the_money_i_can,0.351801


In [14]:
# Component 5: in the recorded output the strongest correlates are
# phobia_public_speaking, pc, needing to be well prepared before public
# speaking, scifi and science_and_technology (all negative).
show_items(5)

Unnamed: 0,0
phobia_public_speaking,-0.421207
pc,-0.393301
i_have_to_be_well_prepared_before_public_speaking,-0.373207
scifi,-0.335202
science_and_technology,-0.334016


In [15]:
# Component 6: in the recorded output, politics, economy_management and
# taking notice of surroundings correlate positively; biology and chemistry
# correlate negatively.
show_items(6)

Unnamed: 0,0
politics,0.485821
biology,-0.466184
chemistry,-0.422166
economy_management,0.369639
i_take_notice_of_what_goes_on_around_me,0.364542


In [16]:
# Component 7: in the recorded output, i_believe_in_god and folk correlate
# negatively; chemistry, biology and medicine correlate positively.
show_items(7)

Unnamed: 0,0
i_believe_in_god,-0.387371
folk,-0.34537
chemistry,0.328458
biology,0.325554
medicine,0.312781
