In [1]:
import numpy as np
import os
import pandas as pd
import psycopg2

from urllib.parse import urlparse
from urllib import parse
from lib.build_export_features import MixedFeatureData


%matplotlib inline
# Database connection URL, read from the environment; raises KeyError if the
# FOLLOWER_URL variable is not set. It is parsed by make_connection() below.
FOLLOWER_URL = os.environ['FOLLOWER_URL']
                          
def make_connection():
    """Open a psycopg2 connection to the database described by FOLLOWER_URL.

    The URL is split with urlparse: the path (minus its leading "/") is used
    as the database name, and the user, password, host and port come from the
    network-location part of the URL.

    Returns:
        A new psycopg2 connection. The caller is responsible for closing it.
    """
    url = urlparse(FOLLOWER_URL)
    connect_kwargs = {
        "database": url.path[1:],
        "user": url.username,
        "password": url.password,
        "host": url.hostname,
    }
    # Forward an explicit port so databases on a non-default port are reachable;
    # when the URL has none, leave it out and let the driver use its default.
    if url.port is not None:
        connect_kwargs["port"] = url.port
    return psycopg2.connect(**connect_kwargs)

# START HERE
## The cell below contains all the mission-specific data. Here you will specify the mission id, the independent variables, the dependent variable, and the definitions of positive and negative outcomes.

In [2]:
# ---------------------------------------------------------------------------
# Mission settings -- change these values, then re-run the notebook.
# ---------------------------------------------------------------------------
mission_id = "25614"

# Questions are referenced as 'PART_NUM-QUESTION_NUM'.
# Only single questions may be used as dependent or independent variables
# here; multiple questions need to go in the categorical list.
# Full set for this mission:
#   "4-4", "4-5", "4-6", "4-10", "4-11", "4-12", "4-13",
#   "4-14", "4-15", "4-17", "4-18", "4-19", "4-21"
dependent_variable = "4-2"
continuous_independent_variables = [
    "4-4",
    "4-5",
    "4-6",
    "4-10",
    "4-11",
    "4-12",
    "4-13",
    "4-14",
    "4-15",
    "4-17",
    "4-18",
    "4-19",
    "4-21",
]
categorical_independent_variables = []
binary_independent_variables = []

# Answers to the dependent variable that count as the positive / negative outcome.
positive_outcomes = ["Extremely satisfied"]
negative_outcomes = [
    "Moderately satisfied",
    "Slightly satisfied",
    "Neither satisfied nor dissatisfied",
    "Slightly dissatisfied",
    "Moderately dissatisfied",
    "Extremely dissatisfied",
]

# Additional independent variables taken from demographics, tags and scout groups.
demo_independent_variables = []
tag_independent_variables = []
# Mutually exclusive scout groups only at this time.
scout_group_independent_variables = []

# Previously the data could be loaded from a pickle instead of the database:
# response_data = pd.read_pickle("mission_" + mission_id + "_data.pkl")

# Filters and grouping.
question_response_filtering = {}
grouping = 'user_id'
ethnicity_filters = []
education_filters = []
tag_filters = []
scout_group_filters = ["Pixel", "Apple", "Samsung", "Bose"]

# After entering the information above, go to Cell -> Run All to see your regression results

In [3]:
# Pull everything needed for the mission from the database, then join it into
# `user_snippets` (one row per snippet/tag with user and scout-group columns)
# and `response_data` (responses joined to their question and to user_snippets).
#
# NOTE: mission_id is concatenated straight into the SQL text. It comes from
# the configuration cell above, so keep it a plain numeric id.


def _sql_id_list(ids):
    """Render a Series of ids as a parenthesised SQL list for an IN clause.

    Missing values and duplicates are dropped. An empty input yields "(NULL)":
    "IN ()" is a syntax error in PostgreSQL, whereas "IN (NULL)" is valid and
    simply matches no rows.
    """
    unique_ids = ids.dropna().unique()
    if len(unique_ids) == 0:
        return "(NULL)"
    return "(" + ",".join(str(value) for value in unique_ids) + ")"


cnc = make_connection()
try:
    questions = pd.read_sql_query(
        """with m_questions as (select id as question_id, label, type, position as question_position,
                            part_id, structure from questions where type in 
                            ('SingleQuestion', 'MultipleQuestion', 'NumberQuestion')
                            and mission_id = """
        + mission_id
        + """),
       m_parts as (select id as part_id, position as part_position from parts where
                   mission_id = """
        + mission_id
        + """)
    select question_id, label, type, question_position, m_parts.part_id, part_position,
    structure from m_questions join m_parts on m_questions.part_id = m_parts.part_id""",
        cnc,
    )
    print("collected questions")

    # Not used by any query in this cell; kept because it is a notebook global.
    question_list = _sql_id_list(questions["question_id"])

    snippets = pd.read_sql_query(
        """select id as snippet_id, user_id, assignment_id from snippets where mission_id = """
        + mission_id,
        cnc,
    )
    print("collected snippets")

    # Not used by any query in this cell; kept because it is a notebook global.
    scout_group_names = (
        "("
        + ",".join(["'" + str(sg) + "'" for sg in scout_group_independent_variables])
        + ")"
    )

    assignments = pd.read_sql_query(
        """select id as assignment_id, user_id from assignments where 
          mission_id = """
        + mission_id,
        cnc,
    )
    assignment_list = _sql_id_list(assignments["assignment_id"])
    assignment_groups = pd.read_sql_query(
        """select assignment_id, scout_group_id from assignments_scout_groups
    where assignment_id in """
        + assignment_list,
        cnc,
    )
    groups = pd.read_sql_query(
        """select id as scout_group_id, name as scout_group from scout_groups
    """,
        cnc,
    )
    # Keep only the scout groups named in scout_group_filters.
    scout_groups = pd.merge(
        assignment_groups,
        groups[groups["scout_group"].isin(scout_group_filters)],
        on="scout_group_id",
    )
    print("collected scout groups")

    snippet_list = _sql_id_list(snippets["snippet_id"])
    responses = pd.read_sql_query(
        """select  id as response_id, snippet_id, question_id, 
    answers from responses where snippet_id in """
        + snippet_list,
        cnc,
    )
    print("collected responses")

    snippets_tags = pd.read_sql_query(
        """Select tag_id, snippet_id from snippets_tags where snippet_id in """
        + snippet_list,
        cnc,
    )
    tag_list = _sql_id_list(snippets_tags["tag_id"])
    tags = pd.read_sql_query(
        """select id as tag_id, name as tag from tags where id in """ + tag_list, cnc
    )

    user_list = _sql_id_list(assignments["user_id"])
    users = pd.read_sql_query(
        """select id as user_id, ethnicity, education, household_income, birthday, gender from users where
                                 id in """
        + user_list,
        cnc,
    )
finally:
    # Actually call close(): a bare `cnc.close` only references the method and
    # leaves the connection open. The finally clause also covers failed queries.
    cnc.close()

# All joins below are in-memory and no longer need the database connection.
snippets_tags = pd.merge(snippets_tags, tags, on="tag_id")
tagged_snippets = pd.merge(snippets, snippets_tags, on="snippet_id", how="left")
user_snippets = pd.merge(tagged_snippets, users, on="user_id", how="left")
user_snippets = pd.merge(user_snippets, scout_groups, on="assignment_id", how="left")
response_data = pd.merge(responses, questions, on="question_id")
response_data = pd.merge(response_data, user_snippets, on="snippet_id", how="left")

collected questions
collected snippets
collected scout groups
collected responses


<function connection.close>

In [4]:
# Cast the birthday column to pandas datetimes (nanosecond resolution) and
# store it back on response_data.
birthday_as_datetime = response_data['birthday'].astype('datetime64[ns]')
response_data['birthday'] = birthday_as_datetime

In [5]:
# Build the feature/export object from the merged responses and the settings
# chosen in the configuration cell. MixedFeatureData is imported from
# lib.build_export_features and is not shown here; the arguments are passed
# positionally, so their order must match its signature.
# NOTE(review): the "processing" line in the recorded output is presumably
# printed by the constructor -- confirm in lib.build_export_features.
fd = MixedFeatureData(
    response_data,
    dependent_variable,
    continuous_independent_variables,
    binary_independent_variables,
    categorical_independent_variables,
    positive_outcomes,
    negative_outcomes,
    demo_independent_variables,
    tag_independent_variables,
    scout_group_independent_variables,
    grouping,
)

processing


In [9]:
# Write one CSV per entry of fd.export_list (files are numbered from 1), after
# joining the user/snippet columns onto each entry by snippet_id.
for part_number, part_frame in enumerate(fd.export_list, start=1):
    csv_name = 'mission_' + mission_id + '_part_' + str(part_number) + '_data.csv'
    part_with_users = pd.merge(part_frame, user_snippets, on='snippet_id')
    part_with_users.to_csv(csv_name)

In [8]:
# Display the third entry (index 2) of fd.export_list.
fd.export_list[2]

Unnamed: 0,snippet_id,3-11,3-12,3-13,3-2,3-3,3-4,3-5,3-6,3-7,...,3-2_Relaxing,3-2_Traveling longer distances,3-2_Watching TV or movies,3-2_other,3-4_Companion app for the earbud,"3-4_Gesture/touch controls (e.g., swipe, tap, etc.)",3-4_None of these,"3-4_Removable/adjustable features designed to help fit the earbuds (e.g., ear tips, stabilizers, cord loops, etc.)","3-4_Standalone media playing (i.e., storing music directly on the earbuds)","3-4_Voice assistant (e.g., Siri, Bixby, Google Assistant, etc.)"
0,1415044,No,,,Getting work done,Listened to music/podcasts,None of these,60,Extremely satisfied,No,...,0,0,0,0,0,0,1,0,0,0
1,1415142,No,,,Relaxing,Listened to music/podcasts,"Standalone media playing (i.e., storing music ...",18,Extremely satisfied,No,...,1,0,0,0,0,0,0,0,1,0
2,1415253,Yes,3,3,Exercising,Listened to music/podcasts,"Voice assistant (e.g., Siri, Bixby, Google Ass...",90,Extremely satisfied,No,...,1,1,1,0,0,1,0,1,0,1
3,1415276,No,,,Getting work done,Listened to music/podcasts,"Voice assistant (e.g., Siri, Bixby, Google Ass...",6,Extremely satisfied,No,...,0,0,0,0,0,1,0,0,0,1
4,1415293,Yes,1,3,Relaxing,Listened to music/podcasts,"Voice assistant (e.g., Siri, Bixby, Google Ass...",24,Moderately satisfied,Yes,...,1,0,1,0,0,0,0,0,0,1
5,1415312,No,,,Relaxing,Listened to music/podcasts,None of these,15,Moderately satisfied,Yes,...,1,0,0,0,0,0,1,0,0,0
6,1415326,No,,,Getting work done,Listened to music/podcasts,Removable/adjustable features designed to help...,12,Extremely satisfied,No,...,0,0,0,1,0,0,0,1,0,0
7,1415334,No,,,Exercising,Listened to music/podcasts,"Standalone media playing (i.e., storing music ...",2,Moderately satisfied,No,...,0,0,0,0,1,0,0,0,1,1
8,1415355,No,,,Getting work done,Listened to music/podcasts,Removable/adjustable features designed to help...,12,Extremely satisfied,No,...,0,0,0,1,0,0,0,1,0,0
9,1415356,No,,,Relaxing,Listened to music/podcasts,Companion app for the earbud,60,Extremely satisfied,No,...,1,0,1,0,1,0,0,0,0,0
