In [7]:
import pandas as pd 
import numpy as np 

data = pd.read_pickle('data/pitt_full_interview.pickle')
print(data.head())
print(len(data))

      id    label                                               text
0  138-1  Control  [\n, okay, \n, there, s, a, cookie, jar, on, t...
1  631-0  Control  [\n, the, kids, are, in, the, cookies, \n, the...
2  182-3  Control  [\n, well, Johnny, s, fallin, g, off, the, sto...
3  121-0  Control  [\n, alright, \n, the, boy, is, taking, a, coo...
4  142-3  Control  [\n, mhm, \n, well, the, water, s, running, ov...
1292


In [3]:
anagraphic_data = pd.read_pickle('data/anagraphic_dataframe.pickle')
print(anagraphic_data.head())

   age  education     id  race  sex
0   57         14  001-0     1    1
1   58         14  001-1     1    1
2   59         14  001-2     1    1
3   60         14  001-3     1    1
4   58         16  002-0     1    0


In [9]:
merged_dataframe = pd.merge(data, anagraphic_data, on='id')
print(merged_dataframe)

         id     label                                               text  age  \
0     138-1   Control  [\n, okay, \n, there, s, a, cookie, jar, on, t...   68   
1     631-0   Control  [\n, the, kids, are, in, the, cookies, \n, the...   74   
2     121-0   Control  [\n, alright, \n, the, boy, is, taking, a, coo...   70   
3     142-3   Control  [\n, mhm, \n, well, the, water, s, running, ov...   61   
4     267-2   Control  [\n, clears, throat, mother, is, um, drying, t...   54   
5     105-1   Control  [\n, okay, \n, well, the, mother, is, drying, ...   58   
6     105-0   Control  [\n, alrightie, \n, uh, the, action, I, see, i...   56   
7     140-0   Control  [\n, doesn, t, matter, where, you, start, then...   58   
8     298-1   Control  [\n, okay, \n, the, boy, is, standing, up, try...   66   
9     121-1   Control  [\n, there, s, the, action, \n, there, s, the,...   71   
10    243-0   Control  [\n, okay, the, picture, s, in, a, kitchen, uh...   66   
11    142-0   Control  [\n, 

In [3]:
print(len(data))
print(data.head())

1292
     label                                               text
0  Control  [\n, okay, \n, there, s, a, cookie, jar, on, t...
1  Control  [\n, the, kids, are, in, the, cookies, \n, the...
2  Control  [\n, well, Johnny, s, fallin, g, off, the, sto...
3  Control  [\n, alright, \n, the, boy, is, taking, a, coo...
4  Control  [\n, mhm, \n, well, the, water, s, running, ov...


In [11]:
# noinspection PyUnresolvedReferences
from feature_sets.psycholinguistic import get_psycholinguistic_features
import nltk 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm

sid = SentimentIntensityAnalyzer()

new_dataframe = []
for index, row in tqdm(merged_dataframe.iterrows()): 
    single_sentence_list = []
    
    string = ''
    for token in row.text:
        if token == '\n':
            single_sentence_list.append(string)
            string = ''
        else: 
            string += ' ' + token 
        
    counter = 0 
    comp_sentiment_sum = 0
    for sentence in single_sentence_list:
        ss = sid.polarity_scores(sentence)
        comp_sentiment_sum += ss['compound']
        counter += 1 
    
    if counter != 0:
        average_sentiment = comp_sentiment_sum/counter
    else: 
        average_sentiment = 0 
        
    ## for each interview in the dataset. 
    interview = nltk.pos_tag(row.text, lang='eng')
    
    final_interview = []
    for uttr in interview:
        final_interview.append({'token': uttr[0],'pos':uttr[1]})
    
    dict = get_psycholinguistic_features(final_interview)
    
    dict['average_sentiment'] = average_sentiment
    
    additional_features = []
    
    for  key,value in dict.items():
        additional_features.append(value)
    
    ##Here we take in consideration anagraphic features. 
    
    anagraphic_features = [row.age,row.education,row.race,row.sex]
    
    dict['features'] = additional_features + anagraphic_features
    dict['label'] = row.label 
    dict['text'] = row.text
    dict['mmse']=row.mmse
    
    new_dataframe.append(dict)




0it [00:00, ?it/s]

3it [00:00, 27.18it/s]

9it [00:00, 32.35it/s]

12it [00:00, 31.52it/s]

15it [00:00, 29.41it/s]

19it [00:00, 31.52it/s]

25it [00:00, 36.51it/s]

30it [00:00, 39.54it/s]

36it [00:00, 43.21it/s]

41it [00:00, 44.46it/s]

47it [00:01, 48.17it/s]

53it [00:01, 50.17it/s]

59it [00:01, 52.57it/s]

65it [00:01, 50.16it/s]

71it [00:01, 50.26it/s]

77it [00:01, 51.19it/s]

84it [00:01, 54.06it/s]

90it [00:01, 47.48it/s]

96it [00:02, 48.96it/s]

103it [00:02, 52.68it/s]

110it [00:02, 52.62it/s]

116it [00:02, 31.85it/s]

123it [00:02, 37.77it/s]

128it [00:02, 38.90it/s]

133it [00:03, 21.16it/s]

139it [00:03, 25.67it/s]

147it [00:03, 31.73it/s]

155it [00:03, 38.64it/s]

163it [00:03, 45.41it/s]

170it [00:03, 50.33it/s]

179it [00:03, 56.49it/s]

186it [00:04, 53.08it/s]

193it [00:04, 47.26it/s]

199it [00:04, 48.67it/s]

205it [00:04, 50.94it/s]

211it [00:05, 24.55it/s]

222it [00:05, 31.70it/s]

230it [00:05, 38.57it/s]

238it [00:05, 33.44it/s]

247it [00:05, 40.54it/s]

255it [00:05, 46.97it/s]

262it [00:05, 50.66it/s]

272it [00:06, 58.61it/s]

281it [00:06, 63.54it/s]

289it [00:06, 42.00it/s]

295it [00:06, 40.93it/s]

306it [00:06, 50.09it/s]

316it [00:06, 58.55it/s]

324it [00:07, 57.33it/s]

333it [00:07, 63.30it/s]

344it [00:07, 72.35it/s]

353it [00:07, 76.30it/s]

363it [00:07, 81.31it/s]

372it [00:07, 67.01it/s]

380it [00:08, 35.23it/s]

389it [00:08, 43.00it/s]

396it [00:08, 34.86it/s]

402it [00:08, 25.83it/s]

411it [00:08, 32.79it/s]

417it [00:09, 35.27it/s]

424it [00:09, 41.25it/s]

433it [00:09, 47.18it/s]

443it [00:09, 55.78it/s]

453it [00:09, 62.72it/s]

461it [00:09, 63.97it/s]

469it [00:09, 68.06it/s]

477it [00:09, 68.24it/s]

486it [00:10, 69.94it/s]

495it [00:10, 72.33it/s]

503it [00:10, 41.90it/s]

509it [00:10, 36.20it/s]

514it [00:10, 29.29it/s]

524it [00:11, 37.10it/s]

534it [00:11, 34.53it/s]

541it [00:11, 40.70it/s]

549it [00:11, 47.60it/s]

558it [00:11, 55.04it/s]

565it [00:11, 58.29it/s]

574it [00:11, 64.76it/s]

582it [00:12, 67.36it/s]

597it [00:12, 79.10it/s]

607it [00:12, 79.67it/s]

616it [00:12, 65.82it/s]

624it [00:12, 41.59it/s]

631it [00:13, 30.15it/s]

641it [00:13, 37.79it/s]

652it [00:13, 47.05it/s]

660it [00:14, 23.52it/s]

667it [00:14, 29.07it/s]

673it [00:14, 34.39it/s]

681it [00:14, 40.96it/s]

689it [00:14, 47.90it/s]

697it [00:14, 53.10it/s]

704it [00:14, 47.53it/s]

710it [00:15, 44.58it/s]

716it [00:15, 32.34it/s]

725it [00:15, 39.89it/s]

732it [00:15, 45.60it/s]

741it [00:15, 52.09it/s]

751it [00:15, 59.32it/s]

762it [00:15, 68.35it/s]

773it [00:15, 74.69it/s]

782it [00:16, 63.65it/s]

791it [00:16, 69.21it/s]

800it [00:16, 73.94it/s]

809it [00:16, 77.10it/s]

818it [00:16, 69.49it/s]

826it [00:16, 69.04it/s]

834it [00:16, 66.12it/s]

842it [00:16, 69.39it/s]

850it [00:17, 69.24it/s]

858it [00:17, 69.30it/s]

867it [00:17, 72.83it/s]

878it [00:17, 80.68it/s]

887it [00:17, 79.61it/s]

898it [00:17, 85.88it/s]

908it [00:17, 86.61it/s]

918it [00:17, 89.68it/s]

928it [00:17, 91.62it/s]

938it [00:18, 90.43it/s]

948it [00:18, 83.31it/s]

962it [00:18, 91.07it/s]

972it [00:18, 86.07it/s]

981it [00:18, 51.27it/s]

990it [00:18, 56.59it/s]

998it [00:19, 59.21it/s]

1006it [00:19, 62.73it/s]

1014it [00:19, 65.64it/s]

1022it [00:23,  5.73it/s]

1031it [00:23,  7.95it/s]

1038it [00:23, 10.82it/s]

1045it [00:24, 13.01it/s]

1053it [00:24, 17.33it/s]

1059it [00:24, 21.75it/s]

1065it [00:24, 23.76it/s]

1071it [00:24, 28.34it/s]

1079it [00:24, 35.07it/s]

1087it [00:24, 42.14it/s]

1097it [00:24, 50.92it/s]

1105it [00:25, 57.02it/s]

1115it [00:25, 63.06it/s]

1122it [00:25, 44.35it/s]




In [12]:
### Word correctness 
final_dataframe = pd.DataFrame(new_dataframe)
print(final_dataframe.head())

   average_sentiment                                           features  \
0           0.057656  [314.0, 0.0, 232.5, 1.395, 0.0, 0.05765625, 68...   
1          -0.019138  [471.0, 0.0, 348.75, 2.0925000000000002, 0.0, ...   
2           0.171547  [502.4, 0.0, 372.0, 2.232, 0.0, 0.171547368421...   
3           0.094989  [314.0, 0.0, 232.5, 1.395, 0.0, 0.094988888888...   
4          -0.044973  [0.0, 0.0, 0.0, 0.0, 0.0, -0.04497272727272727...   

   getAoaScore  getConcretenessScore  getFamiliarityScore  \
0       1.3950                   0.0                314.0   
1       2.0925                   0.0                471.0   
2       2.2320                   0.0                502.4   
3       1.3950                   0.0                314.0   
4       0.0000                   0.0                  0.0   

   getImagabilityScore  getSUBTLWordScores    label  \
0               232.50                 0.0  Control   
1               348.75                 0.0  Control   
2               3

In [14]:
import pickle
with open('data/pitt_full_interview_features.pickle', 'wb') as f:
    pickle.dump(final_dataframe, f)