In [1]:
from __future__ import division
import pandas as pd
import os 
import sys
import numpy as np
import math
from collections import Counter
from glob import glob

## Overview
This notebook goes over reading in experiment events and recall data of some subjects. Pandas is a useful package in Python that helps you manage dataframes effectively (To read more, see https://pandas.pydata.org). Mainly using Pandas, we will go over reading in some csv data files and getting the most liked and disliked of the stimuli that were presented to the subjects. 

In order to count the likes and dislikes, we need the recall csv file and the events csv file for each subject. Before we start on the Python code, let's make sure we have all the data in one place. We will copy the necessary files from the server into the cdcatmr directory for this part.

---

### 1. Let's scp the data
Let's create a `test_data` folder inside `cdcatmr/data/` in which we will store the rec.csv and events.csv files of every subject from the server.

1. First create `test_data`. You can open your data folder and create it. Or use terminal. cd into the cdcatmr/data directory and type: `mkdir test_data`. Then, make `events` and `rec` folders inside `test_data`. We will store all rec.csv files in `rec` and all events.csv files in `events`.
<br>

2. Now open terminal, cd into the `test_data` directory you just created. Then type: 
```
scp yourVUNet@memory.psy.vanderbilt.edu:'/data/beh/cdcatbeh/*/*rec.csv' ./rec
```
Notice the `./rec` at the end: the leading dot stands for your current location (`test_data`), so the files end up in `test_data/rec`. If it works, it should copy all the subject rec files from the server.
<br>

3. Now let's get all the events files. In the same folder location, Type: 
```
scp yourVUNet@memory.psy.vanderbilt.edu:'/data/beh/cdcatbeh/*/*events.csv' ./events
```
<br>

4. Now let's remove some corrupted data. In both the `rec` and `events` folders, manually remove the files for subjects 4, 27, 29, 37, and 63.

---

### 2. Set paths and read files
Now that we have all the files in one place, let's set the paths so the program knows where to find certain files. 

In [2]:
def _read_lines(path):
    """Return the lines of the text file at `path`, without line endings."""
    with open(path, 'r') as f:
        return f.read().splitlines()


# Climb from the current working directory (os.getcwd() is equivalent to pwd)
# until we reach the project root, i.e. the directory named 'cdcatmr'.
rootDir = os.getcwd()
while os.path.basename(rootDir) != 'cdcatmr':
    parentDir = os.path.dirname(rootDir)
    if parentDir == rootDir:
        # We hit the filesystem root: going "up" no longer changes anything,
        # so without this check the loop would never terminate.
        raise RuntimeError("could not find a 'cdcatmr' directory above " + os.getcwd())
    rootDir = parentDir

dataDir = rootDir + '/data'  # from cdcatmr directory, add /data
stimDir = rootDir + '/stimuli'  # from cdcatmr directory, add /stimuli directory to access all the list of stimuli

# Kept so that the working directory after this cell is the stimuli folder.
os.chdir(stimDir)

# call in cdcatmr stim pool
cel_list = _read_lines(os.path.join(stimDir, 'cel_names.txt'))  # all the celeb names
loc_list = _read_lines(os.path.join(stimDir, 'loc_names.txt'))  # all the location names
obj_list = _read_lines(os.path.join(stimDir, 'obj_names.txt'))  # all the object names

total_list = cel_list + loc_list + obj_list

recDir = dataDir + '/test_data/rec'
eventsDir = dataDir + '/test_data/events'

---
### 3. Locate all csv files
Now we locate all the csv files we want:

In [12]:
def _find_csvs(top, pattern):
    """Collect, in sorted order, every file under `top` (at any depth) whose name matches `pattern`."""
    found = []
    for dirpath, _dirnames, _filenames in os.walk(top):
        found.extend(glob(os.path.join(dirpath, pattern)))
    found.sort()
    return found


all_recalls = _find_csvs(recDir, '*rec.csv')
all_events = _find_csvs(eventsDir, '*cat_all_events.csv')

# Both lists are sorted, so position k in one is paired with position k in the other.
assert len(all_recalls) == len(all_events), "detected events and recalls csvs do not match"
print('Found ' + str(len(all_events)) + ' subjects.')

Found 81 subjects.


---
### 4. Loop over to count likes and dislikes
Now we can loop through each event file to find likes and dislikes.

In [6]:
# we first initialize empty lists
liked_items = []
disliked_items = []

# e here stands for each event file (all_events is a list of all the events.rec files)
for e in all_events:
    events = pd.read_csv(e)  # we read in the first events.csv file
    stims = []  
    for i in range (0, len(events)):
        if events.iloc[i]['types'] == 'stim_pres':  # now if the event type is a stimuli presentation
            stims.append(events.iloc[i])  # add to the unique stim list
    # at this point, stims will have all the events with just stim_pres

    # now we check individual stim_pres items to see if like/dislike was pressed
    for i in range(0, len(stims)):
        if stims[i]['resp'] == 'period':
            liked_items.append(stims[i]['item'])
        elif stims[i]['resp'] == 'slash':
            disliked_items.append(stims[i]['item'])

---
### 5. Get like/dislike percentage
Now at this point, `liked_items` will have all the liked images, and `disliked_items` will have all the dislikes. Typing `len(liked_items)` will show you the total likes. `liked_items[0]` will show you the first item that was liked. Knowing the total counts of likes and dislikes, we will get the proportion of each (a fraction between 0 and 1).

In [7]:
# show proportion of liked vs disliked
liked_percent = len(liked_items) / (len(liked_items) + len(disliked_items))
disliked_percent = len(disliked_items) / (len(liked_items) + len(disliked_items))

print('likes %: ' + str(liked_percent))
print('dislikes %: ' + str(disliked_percent))

0.777108433735
0.222891566265


### 6. Finding most likes/dislikes items
Now let's find the most liked and the most disliked items. To do this, we use `collections.Counter`, which really simplifies the process: it counts how many times each unique item appears in a given list.

In [8]:
# let's see the most liked/disliked items
likes = Counter(liked_items)

# get the first ten items
likes.most_common()[0:10]

[('Niagara Falls', 74),
 ('Great Wall of China', 73),
 ('Barack Obama', 72),
 ('dollar', 70),
 ('Florence', 70),
 ('bicycle', 70),
 ('Swiss Alps', 70),
 ('Amazon River', 69),
 ('Times Square', 69),
 ('book', 68)]

In [9]:
# Tally how many times each item was disliked.
dislikes = Counter()
dislikes.update(disliked_items)

# the ten items with the highest dislike counts, as (item, count) pairs
top_n = 10
dislikes.most_common(top_n)

[('hurdle', 39),
 ('cigar', 37),
 ('trash can', 36),
 ('Alec Baldwin', 35),
 ('Ricky Martin', 35),
 ('brick', 35),
 ('cradle', 34),
 ('Indianapolis Speedway', 33),
 ('Julianne Moore', 33),
 ('satellite dish', 33)]

---
---
### Finding if liking leads to recalling better:
Warning: it takes a while to run the code. Run at your own risk :)

In [None]:
# Peek at the first recall file and at the name left once the trailing
# 'rec.csv' (7 characters) is cut off its file name.
first_recall_csv = all_recalls[0]
recalls = pd.read_csv(first_recall_csv)
recalls.head()  # not the last expression of the cell, so nothing is displayed for it
os.path.basename(first_recall_csv)[:-len('rec.csv')]

In [None]:
# Sanity check: after dropping the 'rec.csv' (7 chars) and 'cat_all_events.csv'
# (18 chars) endings, the recall and events files at the same position should
# carry the same name.
for idx, recall_path in enumerate(all_recalls):
    recall_name = os.path.basename(recall_path)[:-7]
    events_name = os.path.basename(all_events[idx])[:-18]
    if recall_name != events_name:
        print("file don't match: " + recall_name + ' | ' + events_name)

In [None]:
# For every correctly recalled item (intrusion == 0), look up how the subject
# rated it and store the matching events row in liked_recall / disliked_recall.
liked_recall = []
disliked_recall = []

for s in range(0, len(all_recalls)):
    recall = pd.read_csv(all_recalls[s])
    events = pd.read_csv(all_events[s])
    print('subject ' + str(s))

    # Only rows with a like ('period') or dislike ('slash') response can be
    # counted, so filter them once per subject instead of rescanning all
    # events for every recalled item.
    rated = events[events['resp'].isin(['period', 'slash'])]

    # Iterate over THIS subject's recall file. The earlier version looped over
    # the leftover `recalls` frame (loaded from all_recalls[0] in a previous
    # cell), so every subject was scored against the same recall data.
    for recalled_item in recall.loc[recall['intrusion'] == 0, 'item']:
        matches = rated[rated['item'] == recalled_item]
        if matches.empty:
            continue  # the item never received a like/dislike response
        first_match = matches.iloc[0]  # first rated event for this item, as before
        if first_match['resp'] == 'period':
            liked_recall.append(first_match)
        else:
            disliked_recall.append(first_match)

# NOTE: the totals previously noted next to these prints (6714 and 2001) came
# from the version that reused one recall file for all subjects; expect
# different values now.
print(len(liked_recall))
print(len(disliked_recall))

In [None]:
# Total number of liked / disliked stimulus presentations over all subjects,
# followed by how many liked / disliked presentations were recalled.
count_likes = 0
count_dislikes = 0
for e in range(0, len(all_events)):
    events = pd.read_csv(all_events[e])

    # responses given on stimulus-presentation events only
    stim_resp = events.loc[events['types'] == 'stim_pres', 'resp']
    count_likes += int((stim_resp == 'period').sum())
    count_dislikes += int((stim_resp == 'slash').sum())

print(count_likes)  # 12642 in the run noted by the original author
print(count_dislikes)  # 3626 in the run noted by the original author
# The line below used to be the unterminated `print(6714`, a SyntaxError that
# kept the whole cell from running.
print(len(liked_recall))
print(len(disliked_recall))

In [None]:
# Recalled / presented, using the totals noted in the cells above:
# liked items first, then disliked items.
recalled_liked, presented_liked = 6714, 12642
recalled_disliked, presented_disliked = 2001, 3626
print(recalled_liked / presented_liked)
print(recalled_disliked / presented_disliked)

In [None]:
# Keep only the first two subjects so the slow cells below finish quickly.
# This overwrites the full file lists built earlier.
all_recalls, all_events = all_recalls[:2], all_events[:2]
len(all_recalls)

In [None]:
# Sort every rated stimulus presentation into one of four buckets:
# liked/disliked crossed with recalled/not recalled. Each presentation lands
# in exactly one bucket, stored as its events row.
liked_recall = []
disliked_recall = []
liked_not_recall = []
disliked_not_recall = []

for s in range(0, len(all_recalls)):
    recall = pd.read_csv(all_recalls[s])
    events = pd.read_csv(all_events[s])
    print('subject ' + str(s))

    # (trialN, item) pairs this subject produced at recall. An item counts as
    # recalled only if it was recalled on the trial it was presented in.
    # Built from THIS subject's file; the earlier version read the stale
    # `recalls` frame left over from a previous cell.
    recalled_pairs = set(zip(recall['trialN'], recall['item']))

    is_stim = events['types'] == 'stim_pres'
    is_rated = events['resp'].isin(['period', 'slash'])
    rated_stims = events[is_stim & is_rated]

    for k in range(0, len(rated_stims)):
        stim = rated_stims.iloc[k]
        was_recalled = (stim['trialN'], stim['item']) in recalled_pairs
        was_liked = stim['resp'] == 'period'  # otherwise the response is 'slash'

        # One append per presentation. The earlier version appended to the
        # "not recalled" lists once for every recall row from another trial.
        if was_recalled and was_liked:
            liked_recall.append(stim)
        elif was_recalled:
            disliked_recall.append(stim)
        elif was_liked:
            liked_not_recall.append(stim)
        else:
            disliked_not_recall.append(stim)

print(len(liked_recall))
print(len(disliked_recall))
print(len(liked_not_recall))
print(len(disliked_not_recall))

In [None]:
# Size of each bucket, in the order: liked+recalled, disliked+recalled,
# liked+not recalled, disliked+not recalled.
for bucket in (liked_recall, disliked_recall, liked_not_recall, disliked_not_recall):
    print(len(bucket))

In [None]:
# (unfinished loop stub: the bare `for` that stood here was a SyntaxError)

In [None]:
# Collect, as a list of row Series, every event of the currently loaded
# `events` frame whose type is a stimulus presentation.
is_stim_pres = (events['types'] == 'stim_pres').tolist()
stims = [events.iloc[pos] for pos, flag in enumerate(is_stim_pres) if flag]

In [None]:
# Running tally of 'period' (like) responses; the cells below add to it.
count = 0

In [None]:
# Add the number of 'period' (like) responses in `stims` to the running tally.
# This adds to whatever `count` already holds, so re-running the cell counts again.
count = count + sum(1 for stim in stims if stim['resp'] == 'period')

In [None]:
# Display the current tally of 'period' (like) responses.
count

In [None]:
# Start the 'slash' (dislike) tally at zero and empty the disliked-items list.
countSlash, disliked_items = 0, []

In [None]:
# Count like ('period') and dislike ('slash') responses in `stims`.
# Both tallies start from zero here so the cell gives the same answer however
# often it is run. Previously `count` kept the total from the earlier cell,
# so likes were counted twice.
count = 0
countSlash = 0
for stim in stims:
    if stim['resp'] == 'period':
        count += 1
    elif stim['resp'] == 'slash':
        countSlash += 1

In [None]:
# Display the current tally of 'slash' (dislike) responses.
countSlash

In [None]:
# Tally of stimulus presentations whose response is NaN; starts at zero.
countNan = 0

In [None]:
def _resp_is_nan(value):
    """True when `value` is NaN; values math.isnan() rejects with TypeError count as not-NaN."""
    try:
        return math.isnan(value)
    except TypeError:
        return False


# Number of stimulus presentations whose response is NaN.
countNan = sum(1 for stim in stims if _resp_is_nan(stim['resp']))
countNan

In [None]:
# Items of the presentations in `stims` that were liked ('period') and
# disliked ('slash'). This replaces any earlier contents of both lists.
liked_items = [stim['item'] for stim in stims if stim['resp'] == 'period']
disliked_items = [stim['item'] for stim in stims if stim['resp'] == 'slash']
print(len(liked_items))
print(len(disliked_items))

In [None]:
# Load the recall file of one specific subject (file cdcatbeh078rec.csv).
# NOTE(review): `subjDir` is not assigned anywhere in the visible notebook, so
# this raises NameError on a fresh kernel. Confirm the intended folder.
recalls = pd.read_csv(subjDir + '/cdcatbeh078rec.csv')

In [None]:
# Display the full list of disliked items.
disliked_items