In [1]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.preprocessing import LabelEncoder

In [2]:
# Constants 
extent_of_agree_disagree = ['Strongly disagree', 'Disagree', 'Somewhat disagree', 'Neither agree nor disagree', 'Somewhat agree', 'Agree', 'Strongly agree']
well_not_well = ['Not well at all', 'Slightly well', 'Moderately well', 'Very well', 'Extremely well']
extent_of_tired_not_tired = ['Extremely tired', 'Very tired', 'Moderately tired', 'Slightly tired', 'Not tired at all']
extent_of_stress_not_stress = ['Not stressed at all', 'A little stressed', 'Somewhat stressed', 'Stressed', 'Very stressed', 'Extremely stressed']
midterms = ["No.", "Yes - 1 midterm", "Yes - 2 midterms", "Yes - 3 or more midterms"]
activities_before_sleep = ["Read (on an electronic device)", "Alcohol", "A large meal", "Tea / Coffee (with caffeine)", 
    "Played video games", "Skyped / called a friend or family member back home", "Studied","I exercized", "Watched TV", "Spent time on my phone.", "I had/did none of these in the 2 hours before I went to sleep."]
agree_or_not_agree_qs = ['Today I felt emotionally drained.', 'Today I felt stressed.', 'I found it difficult to concentrate on my work today.', 'I felt competent today.', 'I felt happy today.', 'I enjoyed my work today.']

# Encoding
extent_of_agree_disagree_encoder = LabelEncoder()
extent_of_agree_disagree_encoder.fit(extent_of_agree_disagree)
well_not_well_encoder = LabelEncoder()
well_not_well_encoder.fit(well_not_well)
extent_of_tired_not_tired_encoder = LabelEncoder()
extent_of_tired_not_tired_encoder.fit(extent_of_tired_not_tired)
extent_of_stress_not_stress_encoder = LabelEncoder()
extent_of_stress_not_stress_encoder.fit(extent_of_stress_not_stress)
midterms_encoder = LabelEncoder()
midterms_encoder.fit(midterms)

LabelEncoder()

In [3]:
def split_on_empty_lines(s):

    # greedily match 2 or more new-lines
    blank_line_regex = r"(?:\r?\n){2,}"

    return re.split(blank_line_regex, s.strip())

# Get file names ended with txt in ./data
files = [f for f in os.listdir('./Assignment_2/data_banghua/') if f.endswith('.txt')]
# Remove 10-17_17:08.txt from the list
# files.remove('10-17_17:08.txt')
# files.remove('10-17_12:59.txt')
# Read in data
data = []
for file in files:
    print("Reading in file: " + file)
    with open('./Assignment_2/data_banghua/' + file) as f:
        file_data = split_on_empty_lines(f.read())
        data.append(file_data)

Reading in file: 10-19_10:35.txt
Reading in file: 10-20_12:44.txt
Reading in file: 10-17_17:08.txt
Reading in file: 10-27_10:32.txt
Reading in file: 10-23_11:32.txt
Reading in file: 10-22_17:19.txt
Reading in file: 10-19_02:24.txt
Reading in file: 10-26_10:27.txt
Reading in file: 10-25_10:28.txt
Reading in file: 10-24_11:00.txt
Reading in file: 10-21_12:10.txt


In [4]:
del_qs = ['Response Summary: ', 'Timing', 'Which section are you in?', 'Have you entered data for this survey before?',
    'How many hours of sleep do you need to feel well-rested?']
df = pd.DataFrame()

for file in data:
    temp = {}
    for i in range(len(file)):
        q_a = file[i].split('\n   ')
        if q_a[0] not in del_qs:
            if q_a[0] == 'How many hours sleep did you get last night?':
                temp['How many hours sleep did you get last night?'] = float(q_a[1].split('   ')[1])
            elif q_a[0] == 'How well did you sleep last night?':
                temp['How well did you sleep last night?'] = well_not_well_encoder.transform([q_a[1]])[0]
            elif q_a[0] == 'Did you have or do any of the following 2 hours before you went to sleep \nlast night? Select all t...':
                for activity in activities_before_sleep:
                    temp[activity] = 1 if activity in q_a[1] else 0
            elif q_a[0] == 'Before going to sleep last night, for how many hours did you stop working? Please fill in a number:':
                temp['Before going to sleep last night, for how many hours did you stop working? \nPlease fill in a number:'] = float(q_a[1])
            elif q_a[0] == 'How tired did you feel today?':
                temp['How tired did you feel today?'] = extent_of_tired_not_tired_encoder.transform([q_a[1]])[0]
            elif q_a[0] == 'Did you have a midterm today?':
                temp['Did you have a midterm today?'] = midterms_encoder.transform([q_a[1]])[0]
            elif q_a[0] == 'How long did it take you to complete the task (in minutes and seconds, \ne.g., 1:32 = 1 minute and...':
                try:
                    temp['How long did it take you to complete the task (seconds)'] = float(q_a[1].split(':')[0])*60+float(q_a[1].split(':')[1])
                except:
                    temp['How long did it take you to complete the task (seconds)'] = float(q_a[1].split(' ')[0])*60+float(q_a[1].split(' ')[3])
            elif q_a[0] == 'How many card turns did it take you to complete the task?':
                temp['How many card turns did it take you to complete the task?'] = float(q_a[1])
            elif q_a[0] == 'How stressed did you feel today?':
                temp['How stressed did you feel today?'] = extent_of_stress_not_stress_encoder.transform([q_a[1]])[0]
            elif q_a[0] == 'How well did you feel you were able to cope with your stress today?':
                temp['How well did you feel you were able to cope with your stress today?'] = well_not_well_encoder.transform([q_a[1]])[0]
            elif q_a[0] == 'Please indicate the extent to which you agree / disagree with the \nfollowing statements':
                for i in range(1, len(q_a), 2):
                    temp[q_a[i]] = extent_of_agree_disagree_encoder.transform([q_a[i+1].strip()])[0]
            else:
                temp[q_a[0]] = q_a[1]
    # Convert temp, which is a dict, to a dataframe
    temp_df = pd.DataFrame(temp, index=[0])
    # Concat datasets together
    df = pd.concat([df, temp_df], ignore_index=True)

In [5]:
df.shape

(11, 29)

In [6]:
df.to_csv('data_banghua.csv', index=False)