In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

# Format every float in pandas display output with three decimal places.
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))

# NOTE(review): %pylab populates the interactive namespace with numpy and
# matplotlib names (see the message it prints below), which hides where names
# come from. `%matplotlib inline` plus explicit imports would be narrower —
# confirm nothing relies on the injected names before switching.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [37]:
# Read the response table and keep only the columns whose name does not
# contain "unnamed" (case-insensitive) — presumably stray index columns from
# an earlier export; confirm against the CSV.
data = pd.read_csv('ab5c_factor_analysis.csv').pipe(
    lambda frame: frame[[name for name in frame.columns if 'unnamed' not in name.lower()]]
)
data.shape

(87, 513)

See http://imaging.mrc-cbu.cam.ac.uk/statswiki/FAQ/thresholds for suggested factor-loading thresholds.

In [64]:
from sklearn.decomposition import FactorAnalysis

def get_factor(remaining=data.columns, threshold=0.65):
    """Fit a one-component factor model and split the items by weight size.

    A ``FactorAnalysis`` with a single component (fixed ``random_state=200``)
    is fitted on ``data[remaining]``. Each column's weight on that component
    is compared, in absolute value, against ``threshold``.

    Parameters
    ----------
    remaining : column labels of the global ``data`` to include in the fit.
        The default is ``data.columns`` as it was when this ``def`` ran.
    threshold : minimum absolute weight for an item to be assigned to the
        factor.

    Returns
    -------
    (factor, remaining)
        ``factor`` is a one-column DataFrame (column label ``1``) holding the
        signed weights of the items with ``|weight| >= threshold``, ordered by
        descending absolute weight. ``remaining`` is the index of the items
        with ``|weight| < threshold``, ready to pass to the next call.

    NOTE(review): the columns are handed to FactorAnalysis as-is. The recorded
    outputs show weights beyond +/-1, so thresholds meant for standardized
    loadings may not be directly comparable — confirm whether the data should
    be standardized first.
    """
    subset = data[remaining]
    model = FactorAnalysis(n_components=1, random_state=200).fit(subset)

    # components_ has one row per component; transpose so each item is a row
    # and relabel the single column from 0 to 1.
    loadings = pd.DataFrame(model.components_, columns=subset.columns).T
    loadings.columns = [label + 1 for label in loadings.columns]
    loadings['absol'] = loadings[1].abs()

    is_weak = loadings[1].abs() < threshold
    is_strong = loadings['absol'] >= threshold

    leftover = loadings[is_weak].index
    selected = loadings[is_strong].sort_values('absol', ascending=False)

    return pd.DataFrame(selected[1]), leftover

# First extraction over all columns (the default `remaining`): items with
# |weight| >= 0.84 are kept in `factor`, the rest are returned in `remaining`.
factor, remaining = get_factor(threshold=.84)

# Assertiveness
factor

Unnamed: 0,1
iamnotsurewheremylifeisgoing,-1.04
idoalotinmysparetime,1.023
italktoalotofdifferentpeopleatparties,0.945
itakecharge,0.943
ihavedifficultyexpressingmyfeelings,-0.921
iknowwhatiwant,0.892
icaneasilypushmyselfforward,0.88
iexpressmyselfeasily,0.871
igetthingsdonequickly,0.848
iseemyselfasagoodleader,0.847


I'm still experimenting with the methodology, but if we're measuring psychological constructs, it's also important for the intercorrelations within each factor's items to be high (ideally 0.70 or greater).

I'm not achieving that with the current dataset. I may need to collect more responses or tinker with the methodology.

In [65]:
def intercorrelations(factor=factor):
    """Mean absolute correlation of each factor item with the factor's other items.

    For every label in ``factor.index``, the matching column of the global
    ``data`` is correlated (``DataFrame.corrwith``, default method) with each
    of the other listed columns; the absolute values are then averaged.

    Parameters
    ----------
    factor : DataFrame whose index holds the item (column) names to compare.
        The default is the ``factor`` object that existed when this ``def``
        was executed, not whatever ``factor`` refers to at call time.

    Returns
    -------
    DataFrame indexed by item name with a single column ``0`` holding the
    mean absolute correlation, sorted from highest to lowest.
    """
    items = factor.index
    item_frame = data[items]

    def _mean_abs_corr(item):
        # Exclude the item itself so its self-correlation does not inflate the mean.
        others = item_frame.drop(item, axis=1)
        return others.corrwith(data[item]).abs().mean()

    mean_corrs = {item: _mean_abs_corr(item) for item in items}

    return pd.DataFrame([mean_corrs]).T.sort_values(0, ascending=False)
    
# No argument: uses the default, i.e. the `factor` frame that existed when the
# function was defined.
intercorrelations()

Unnamed: 0,0
iexpressmyselfeasily,0.403
iamnotsurewheremylifeisgoing,0.385
itakecharge,0.378
ihavedifficultyexpressingmyfeelings,0.377
icaneasilypushmyselfforward,0.362
idoalotinmysparetime,0.356
italktoalotofdifferentpeopleatparties,0.338
iknowwhatiwant,0.326
igetthingsdonequickly,0.305
iseemyselfasagoodleader,0.275


In [66]:
# Reliability
# Second extraction: fit only on the columns left over from the previous
# get_factor call. NOTE(review): `remaining` is both the input and an output
# here, so re-running this cell on its own shrinks it again and changes the
# result.
reliability, remaining = get_factor(remaining=remaining, threshold=.778)
reliability

Unnamed: 0,1
iknowhowtocomfortothers,0.858
iaccomplishmyworkontime,0.824
iwastemytime,-0.816
ihandletaskssmoothly,0.789
iturnplansintoactions,0.788


In [67]:
# Mean absolute inter-item correlation for the items of the `reliability` factor.
intercorrelations(reliability)

Unnamed: 0,0
ihandletaskssmoothly,0.423
iturnplansintoactions,0.423
iaccomplishmyworkontime,0.423
iwastemytime,0.406
iknowhowtocomfortothers,0.35


Again, there's a problem here: the item with the highest factor loading also has the weakest mean correlation with the remaining items. So I'm still working on solving this mystery.

In [None]:
# Further testing below

In [25]:
# Same check as the intercorrelations() helper defined earlier, applied to the
# current `factor`. Calling the helper instead of re-inlining its loop avoids
# rebinding the name `intercorrelations` to a dict, which shadowed the function
# and made any later intercorrelations(...) call fail.
intercorrelations(factor)

Unnamed: 0,0
iturnplansintoactions,0.397
itakecharge,0.376
icaneasilypushmyselfforward,0.366
icomeupwithboldplans,0.362
ihandletaskssmoothly,0.36
italktoalotofdifferentpeopleatparties,0.342
iamthefirsttoact,0.341
ithinkquickly,0.338
iexpressmyselfeasily,0.334
iadapteasilytonewsituations,0.326


In [7]:
# Conscientiousness
# Extract from the columns still left in `remaining`, keeping |weight| >= 0.6.
# NOTE(review): this rebinds `factor`, replacing the frame produced by the
# first get_factor call; a distinct name per factor would keep both available.
factor, remaining = get_factor(remaining=remaining, threshold=.6)
factor

Unnamed: 0,1
iradiatejoy,0.732
iseebeautyinthingsthatothersmightnotnotice,0.727
icarryoutmyplans,0.72
idonotplanahead,-0.715
igetaheadstartonothers,0.715
ikeepthingstidy,0.715
iamthelifeoftheparty,0.714
ifeelothersemotions,0.711
idislikemyself,-0.702
icheckovermywork,0.702


In [8]:
# NOTE(review): this cell reuses the "Conscientiousness" label from the previous
# cell, but it extracts a different factor (from the columns left over after
# that cell, with a 0.5 cutoff). The recorded output lists emotion-related
# items, so the label looks copy-pasted — confirm the intended construct name.
# Conscientiousness
factor, remaining = get_factor(remaining=remaining, threshold=.5)
factor

Unnamed: 0,1
igetstressedouteasily,-1.294
iameasilyhurt,-1.233
igetoverwhelmedbyemotions,-1.156
icryeasily,-1.110
iworryaboutthings,-1.092
iexperiencemyemotionsintensely,-1.006
icanbestirredupupseteasily,-0.981
itakeoffenseeasily,-0.978
iamguidedbymymoods,-0.977
iamswayedbymyemotions,-0.956
