In [584]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy.stats import ttest_ind
from numpy import random

### Handling the Imagability data

In [692]:
# Load the MTurk imageability ratings and keep only the rows whose
# imagability_complete flag equals 2.
df_img = pd.read_csv('Stimuli/mturk_img_ratings.csv', header=0)
df_img = df_img[df_img.imagability_complete == 2]

# Transpose so every CSV column becomes a row and every kept response becomes
# a column; reset_index moves the old column labels into a regular 'index'
# column (renamed to 'Word' below).
df_img = df_img.transpose().reset_index(level=0)


def _to_numeric_or_keep(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed.

    Explicit replacement for the deprecated pd.to_numeric(..., errors='ignore').
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


df_img_just_words = df_img[3:348]  # we just care about the words for now
# Need to make all rating columns numeric (the word labels stay as strings).
df_img_just_words = df_img_just_words.apply(_to_numeric_or_keep)

# numeric_only=True keeps the string 'index' column out of the row sum; recent
# pandas raises on the mixed str/float sum instead of silently skipping it.
# NOTE(review): 50 is presumably the number of completed responses - confirm,
# or switch to .mean(axis=1, numeric_only=True) if blanks should be ignored.
df_img_just_words['Imagability'] = df_img_just_words.sum(axis=1, numeric_only=True).div(50)
df_img_just_words = df_img_just_words.rename(columns={'index': 'Word'})
df_img_just_words.tail()

Unnamed: 0,Word,0,1,2,3,5,7,8,9,11,...,51,52,53,54,55,57,58,59,60,Imagability
343,upbeat,1.0,5.0,3.0,2.0,4.0,4.0,5.0,7.0,4.0,...,5.0,5.0,5.0,2.0,4.0,3.0,6.0,2.0,6.0,3.84
344,recognition,3.0,5.0,2.0,3.0,4.0,3.0,1.0,7.0,3.0,...,5.0,7.0,4.0,1.0,2.0,3.0,2.0,2.0,5.0,3.36
345,vermin,7.0,7.0,6.0,6.0,3.0,7.0,7.0,7.0,1.0,...,6.0,7.0,4.0,4.0,3.0,5.0,5.0,5.0,7.0,4.78
346,waste,7.0,6.0,6.0,5.0,5.0,7.0,7.0,7.0,5.0,...,6.0,7.0,5.0,3.0,4.0,5.0,2.0,4.0,5.0,4.42
347,miserable,2.0,6.0,5.0,4.0,4.0,4.0,6.0,7.0,5.0,...,5.0,7.0,4.0,2.0,2.0,5.0,6.0,2.0,2.0,3.66


### Balancing the list

In [1012]:
# Load the candidate word list with its per-word norms (valence, arousal,
# type, letters, frequency, concreteness, part of speech).
# The file also carries several leftover 'Unnamed: ...' columns; they are
# discarded when the columns of interest are selected after the merge.
df = pd.read_csv("Stimuli/WordsListF.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Word,MeanValence,ValenceStand,MeanArousal,type,letters,frequency,Concreteness,Img,pos,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,0,abandonment,2.63,2.63,4.95,0,11,49,2.54,3.48,noun,,,
1,1,ache,3.27,3.27,4.3,0,4,127,3.43,443.0,noun,,,
2,2,alone,3.85,3.85,4.0,0,5,15735,2.86,480.0,noun,,,
3,3,angst,3.5,3.5,5.76,0,5,47,1.96,,adj,,,
4,4,annoyance,2.95,2.95,4.1,0,9,25,2.14,,noun,,,


In [1013]:
# Attach the MTurk imageability means to the word list. The join key is named
# explicitly so the merge cannot silently pick up extra shared columns.
df = df.merge(df_img_just_words, how='outer', on='Word')
df = df[['Word', 'MeanValence', 'ValenceStand', 'MeanArousal', 'type', 'letters',
         'frequency', 'Concreteness', 'pos', 'Imagability']]
df = df.drop_duplicates('Word')  # we had a few repeats

# Rows must have both an imageability score and a valence type.
# We didn't include "focus" in the MTurk study, so its img is blank.
df = df.dropna(subset=['Imagability', 'type'])

# The LSA tool doesn't know these words, so they can't be used.
lsa_unknown_words = [
    'bummer', 'clingy', 'comatose', 'deadbeat', 'faker', 'hag',
    'lowlife', 'mousy', 'scumbag', 'sleaze', 'wannabe', 'kisser',
    'longevity', 'pizzazz', 'upbeat', 'angst', 'nutcase', 'newlywed',
]
df = df[~df['Word'].isin(lsa_unknown_words)]
df.tail()

Unnamed: 0,Word,MeanValence,ValenceStand,MeanArousal,type,letters,frequency,Concreteness,pos,Imagability
340,willingness,6.43,3.57,4.25,1.0,11.0,68.0,1.81,noun,3.16
341,winner,7.86,2.14,6.53,1.0,6.0,1592.0,3.21,noun,4.64
342,wise,7.42,,4.46,1.0,4.0,1452.0,1.97,adj,3.46
343,witty,7.25,2.75,5.65,1.0,5.0,163.0,2.21,adj,3.3
344,zest,6.76,3.24,5.41,1.0,4.0,35.0,2.27,noun,2.78


In [705]:
# draws a new random list of negative words
def getnewneglist(n=100, random_state=None, pool=None):
    """Return a random sample of rows from the negative-word pool.

    Parameters
    ----------
    n : int, default 100
        Number of rows to draw (without replacement).
    random_state : optional
        Passed straight to ``DataFrame.sample``; set it to make the draw
        reproducible.
    pool : DataFrame, optional
        Frame to sample from. Defaults to the notebook-level ``dfneg``.

    Returns
    -------
    DataFrame
        The ``n`` sampled rows.
    """
    if pool is None:
        # Fall back to the notebook global so existing zero-argument calls still work.
        pool = dfneg
    return pool.sample(n, random_state=random_state)

# creating a random sample of positive words (100 by default)
def getnewposlist(n=100, random_state=None, pool=None):
    """Return a random sample of rows from the positive-word pool.

    Parameters
    ----------
    n : int, default 100
        Number of rows to draw (without replacement).
    random_state : optional
        Passed straight to ``DataFrame.sample``; set it to make the draw
        reproducible.
    pool : DataFrame, optional
        Frame to sample from. Defaults to the notebook-level ``dfpos``.

    Returns
    -------
    DataFrame
        The ``n`` sampled rows.
    """
    if pool is None:
        # Fall back to the notebook global so existing zero-argument calls still work.
        pool = dfpos
    return pool.sample(n, random_state=random_state)


In [706]:
# checks the ttest for the new lists given a column name
def checknewsig(column_name, neg=None, pos=None):
    """Run an independent two-sample t-test on one column of the two word lists.

    Parameters
    ----------
    column_name : str
        Column to compare (e.g. 'letters', 'frequency').
    neg : DataFrame, optional
        Negative-word list. Defaults to the notebook-level ``a``.
    pos : DataFrame, optional
        Positive-word list. Defaults to the notebook-level ``b``.

    Returns
    -------
    The ``ttest_ind`` result (exposes ``.statistic`` and ``.pvalue``). The
    positive list is the first sample, so a positive statistic means the
    positive list has the larger mean.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if neg is None:
        neg = a
    if pos is None:
        pos = b
    return ttest_ind(pos[column_name], neg[column_name])

In [711]:
# makes a df with the p's and t's of the words
def makepdf(columns=('letters', 'MeanArousal', 'frequency', 'Concreteness', 'Imagability')):
    """Build a table of t-test results, one row per balancing variable.

    Parameters
    ----------
    columns : sequence of str, optional
        Columns to test. Defaults to the five variables the lists are
        balanced on.

    Returns
    -------
    DataFrame
        Indexed by column name, with 'p' (p-value) and 't' (t statistic)
        columns taken from ``checknewsig``.
    """
    # Run each t-test once and reuse the result for both the p and t columns.
    results = [checknewsig(column) for column in columns]
    pchart = pd.DataFrame(
        {'p': [result.pvalue for result in results],
         't': [result.statistic for result in results]},
        index=list(columns),
    )
    return pchart
    

In [712]:
# checks the given p-value table to see if every variable clears its cutoff
def checkpdf(x, alpha=.05, imagability_alpha=.001):
    """Return True when every balancing variable's p-value exceeds its cutoff.

    Parameters
    ----------
    x : DataFrame
        Table with a 'p' column, indexed by 'frequency', 'MeanArousal',
        'letters', 'Imagability' and 'Concreteness' (the shape makepdf returns).
    alpha : float, default .05
        Cutoff for every variable except Imagability.
    imagability_alpha : float, default .001
        Cutoff for Imagability. It is looser than ``alpha``.
        NOTE(review): confirm the looser Imagability cutoff is intentional.

    Returns
    -------
    bool
        True only if all p-values are strictly greater than their cutoff.
    """
    # Checked in this order, stopping at the first variable that fails.
    cutoffs = {
        'frequency': alpha,
        'MeanArousal': alpha,
        'letters': alpha,
        'Imagability': imagability_alpha,
        'Concreteness': alpha,
    }
    return all(x.loc[measure, 'p'] > cutoff for measure, cutoff in cutoffs.items())
    

In [713]:
# not using this anymore but keep for reference

#def main():
    #a = getnewneglist()
    #b = getnewposlist()
    #if checkpdf(makepdf()) == True:
        #makepdf().to_csv('p_values.csv')
        #a.append(b).to_csv('balanced_words.csv')
    #else:
        #a = getnewneglist()
        #b = getnewposlist()
        #return main()

In [1201]:
# Load the working candidate list and split it by the 'type' flag:
# type 0 rows are the negative words (a), type 1 rows the positive words (b).
ab = pd.read_csv('Stimuli/Word_options/balanced_words_option8_WORKING.csv')
a = ab[ab.type == 0]
b = ab[ab.type == 1]
# numeric_only=True: the frame also holds string columns (Word, pos), which
# recent pandas refuses to average instead of silently skipping.
print ('Negative Words:')
print (a.mean(numeric_only=True)) # negatives have a few more nouns
print ('--------------------')
print ('Positive Words:')
print (b.mean(numeric_only=True))

Negative Words:
Unnamed: 0       86.450000
MeanValence       2.759400
ValenceStand      2.825176
MeanArousal       4.672900
type              0.000000
letters           7.280000
frequency       739.640000
Concreteness      2.583900
Imagability       3.496400
dtype: float64
--------------------
Positive Words:
Unnamed: 0       257.480000
MeanValence        7.162500
ValenceStand       2.948101
MeanArousal        4.655100
type               1.000000
letters            7.420000
frequency       1233.210000
Concreteness       2.453000
Imagability        3.645600
dtype: float64


In [1202]:
# Part-of-speech summary (count / unique / top / freq) for each list.
# needed to add more adj to the negative list, balanced now
print(
    'Negative Words:',
    a['pos'].describe(),
    '--------------------',
    'Positive Words:',
    b['pos'].describe(),
    sep='\n',
)

Negative Words:
count      100
unique       2
top       noun
freq        58
Name: pos, dtype: object
--------------------
Positive Words:
count      100
unique       2
top       noun
freq        54
Name: pos, dtype: object


In [1203]:
# Display the p and t values for each balancing variable, computed on the
# current a (negative) and b (positive) lists.
makepdf() # look at significance

Unnamed: 0,p,t
letters,0.65918,0.441712
MeanArousal,0.88156,-0.149184
frequency,0.174274,1.363496
Concreteness,0.218651,-1.234039
Imagability,0.121848,1.553721


### Looking at LSA

http://lsa.colorado.edu/ 

In [1150]:
# Read the LSA matrix for the negative words.
lsa = pd.read_csv('Stimuli/Word_options/option8_lsa_neg_WORKING.csv')
# Pick the columns named in 'Document', in row order, as a plain array.
values = lsa[lsa['Document'].tolist()].values
# k=-1 takes the entries strictly below the diagonal, so each pair of rows
# contributes one value and the self-comparisons are left out.
lower_triangular = values[np.tril_indices(len(values), k=-1)]
lsa = pd.DataFrame(data={'Neg': lower_triangular})

In [1196]:
# Read the LSA matrix for the positive words.
pos = pd.read_csv('Stimuli/Word_options/option8_lsa_pos_WORKING.csv')
# Pick the columns named in 'Document', in row order, as a plain array.
values = pos[pos['Document'].tolist()].values
# k=-1 takes the entries strictly below the diagonal (no self-comparisons).
lower_triangular = values[np.tril_indices(len(values), k=-1)]
# Stored next to the 'Neg' column, so both lists must yield the same number of pairs.
lsa['Pos'] = lower_triangular
lsa.count()

Neg    4950
Pos    4950
dtype: int64

In [1197]:
# Summary statistics (count, mean, std, quartiles, min/max) of the pairwise
# LSA values for the negative and positive lists.
lsa.describe()

Unnamed: 0,Neg,Pos
count,4950.0,4950.0
mean,0.10519,0.10818
std,0.100247,0.097665
min,-0.15,-0.14
25%,0.03,0.04
50%,0.09,0.09
75%,0.16,0.16
max,0.62,0.74


In [1198]:
# Independent two-sample t-test on the pairwise LSA values of the two lists.
# NOTE(review): pairs within a list share words, so the values are not
# independent observations - treat the p-value as a rough guide.
ttest_ind(lsa['Neg'], lsa['Pos']) # are they different?

Ttest_indResult(statistic=-1.5030186968158052, pvalue=0.13286612570152892)