In [1]:
import numpy as np
import os
import pandas as pd
import psycopg2
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn import preprocessing
from urllib.parse import urlparse
from urllib import parse
from lib.build_regressions_features_demo import MixedFeatureData
from lib.explore_data import ExploratoryAnalysis
from lib.regressions import MixedClassificationModel

%matplotlib inline
FOLLOWER_URL = os.environ['FOLLOWER_URL']
                          
def make_connection():
    url_output = FOLLOWER_URL
    url = urlparse(url_output)
    conn = psycopg2.connect(
        database=url.path[1:],
        user=url.username,
        password=url.password,
        host=url.hostname
    )
    return conn

# START HERE
## The cell below contains all the mission specific data. Here you will specifity the mission id, the independent variables, the dependent variable, and the definitions of positive and negative outcome.

In [2]:
# CHANGE THESE VALUES
mission_id = "24970"
# ONLY USE SINGLE QUESTIONS FOR DEPENDENT AND INDEPENDENT VARIABLES IN FORMAT 'PART_NUM-QUESTION_NUM'
continuous_independent_variables = ["2-7", "2-9"]
# NOTE MULTIPLE QUESTIONS NEED TO BE CATEGORICAL
categorical_independent_variables = ["2-3"]
binary_independent_variables = ["2-8", "2-10", "2-16"]
dependent_variable = "2-14"
negative_outcomes = [
    "Slightly satisfied",
    "Neither satisfied nor dissatisfied",
    "Slightly dissatisfied",
    "Moderately dissatisfied",
    "Extremely dissatisfied",
]
positive_outcomes = ["Extremely satisfied", "Moderately satisfied"]
demo_independent_variables = ['age','household_income']
tag_independent_variables = ['Pixel','AirPods']
response_data = pd.read_pickle("mission_" + mission_id + "_data.pkl")
question_response_filtering = {'2-4':'Listening to music/podcasts'}
grouping = 'snippet_id'
ethnicity_filters = []
education_filters = []
tag_filters = []

# After entering the information above, go to Cell -> Run All to see your regression results

In [3]:
cnc = make_connection()
questions = pd.read_sql_query(
    """with m_questions as (select id as question_id, label, type, position as question_position,
                                 part_id, structure from questions where type in ('SingleQuestion', 'MultipleQuestion', 'NumberQuestion')
                                 and mission_id = """
    + mission_id
    + """),
                                 m_parts as (select id as part_id, position as part_position from parts where
                                 mission_id = """
    + mission_id
    + """)
                                 select question_id, label, type, question_position, m_parts.part_id, part_position,
                                 structure from m_questions join m_parts on m_questions.part_id = m_parts.part_id""",
    cnc,
)

question_list = (
    "(" + ",".join([str(a) for a in questions["question_id"].tolist()]) + ")"
)
responses = pd.read_sql_query(
    """select id as response_id, snippet_id, question_id, answers from responses where 
                                 question_id in """
    + question_list,
    cnc,
)
snippet_list = "(" + ",".join([str(a) for a in responses["snippet_id"].tolist()]) + ")"
snippets = pd.read_sql_query(
    """select id as snippet_id, user_id from snippets where id in """ + snippet_list,
    cnc,
)
snippets_tags = pd.read_sql_query("""Select tag_id, snippet_id from snippets_tags where snippet_id in """+ snippet_list,cnc)
tag_list = "(" + ",".join([str(a) for a in snippets_tags["tag_id"].tolist()]) + ")"
tags = pd.read_sql_query("""select id as tag_id, name as tag from tags where id in """ + tag_list,cnc)

snippets_tags = pd.merge(snippets_tags, tags, on='tag_id')
snippets = pd.merge(snippets, snippets_tags, on='snippet_id')
user_list = "(" + ",".join([str(a) for a in snippets["user_id"].tolist()]) + ")"
users = pd.read_sql_query(
    """select id as user_id, ethnicity, education, household_income, birthday, gender from users where
                                 id in """
    + user_list,
    cnc,
)
user_snippets = pd.merge(snippets, users, on='user_id', how='left')
response_data = pd.merge(responses, questions, on="question_id")
response_data = pd.merge(response_data, user_snippets, on="snippet_id")
cnc.close

<function connection.close>

In [4]:
response_data.to_pickle('mission_'+mission_id+'_data.pkl')

In [None]:
id_sample = response_data.groupby(['snippet_id']).count().reset_index().sample(frac=1)['snippet_id'].tolist()

In [None]:
if ethnicity_filters:
    ethnicities = ethnicity_filters
else:
    ethnicites = response_data['ethnicity'].unique()

if education_filters:
    educations = education_filters
else:
    educations = response_data['education'].unique()
if tag_filters:
    tags = tag_filters
else:
    tags = response_data['tag'].unique()

filtered = response_data[(response_data['ethnicity'].isin(ethnicites)) & 
                        (response_data['education'].isin(educations)) &
                        (response_data['tag'].isin(tags))].copy()
filtered_id_list = []
for question in question_response_filtering:
    part, num = int(question.split('-')[0])-1, int(question.split('-')[1])-1
    response = question_response_filtering[question]
    ids = filtered [(filtered['part_position'] == part) &
                        (filtered['question_position'] == num) &
                        (any([response in x for x in filtered['answers']]))][grouping].unique()
    filtered_id_list = filtered_id_list + list(ids)        

In [None]:
fd = MixedFeatureData(
    filtered[filtered["snippet_id"].isin(filtered_id_list)].to_json(),
    dependent_variable,
    continuous_independent_variables,
    binary_independent_variables,
    categorical_independent_variables,
    positive_outcomes,
    negative_outcomes,
    demo_independent_variables,
    tag_independent_variables,
    grouping,
)
eda = ExploratoryAnalysis(
    fd.encoded_features,
    fd.independent_variables,
    fd.dependent_variable,
    fd.question_choices(),
)
logistic_regression = MixedClassificationModel(fd)

## Correlations of Independent Variables
### After viewing these you may want to rethink your independent variable choices

In [None]:
eda.correlation_matrix_plots()

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x118db4748>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11bb952e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11bbb20b8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11bc82208>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11bc8d518>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11bc98828>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11bca3978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11bcf7b38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11bcf7ba8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11bd0e048>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11bd19358>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11bd24668>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11bd2f978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11bd3ac88>,
      

In [None]:
eda.correlation_matrix()

# Relative Variance

In [None]:
explained_variances = eda.pca_explained_variances()
for i, q_id in enumerate(fd.independent_variables):
    print(q_id, round(explained_variances[i],2))

# Single Value Decomposition
## Data Projected onto a 2D space, and colored by Outcome

In [None]:
eda.plot_outcome_clusters()

# Logistic Regression
## Histogram of Predicted Probability of Positive Outcome, colored by Actual Outcome


In [None]:
logistic_regression.visualize_goodness()

#
## Results Summary

In [None]:
logistic_regression.print_results()

# 
## Simulated Probability of Positive Outcome controlling for all but one variable

In [None]:
for independent_variable in fd.continuous_independent_variables:
    if independent_variable not in fd.binary_independent_variables:
        logistic_regression.simulate_continuous_outcomes(independent_variable)

In [None]:
for independent_variable in fd.independent_variables:
    if independent_variable in fd.binary_independent_variables or\
       independent_variable in fd.dummies:
        logistic_regression.simulate_binary_outcomes(independent_variable)