In [1]:
import os
from typing import (
    Callable, Any, List, Dict, Type, Tuple, Optional
)
import pandas as pd
import numpy as np
from processing import clean_qualtrics_data

In [2]:
# Input CSV paths (relative to the notebook's working directory). All three
# are handed to clean_qualtrics_data in the next cell.
raw_path = 'E29_CHECKIN03_RAW_TEXT.csv'
roster_path = 'E29_Qualtrics_Roster_EOS.csv'
question_dict_path = 'E29_QUESTION_DICTIONARY.csv'

In [3]:
# Build the cleaned response table from the three input files.
# NOTE(review): the recorded output of this cell is a pandas FutureWarning about
# `full_cleaned[col].fillna('No Response', inplace=True)` -- presumably raised
# inside processing.clean_qualtrics_data; that call should be rewritten there
# (e.g. `full_cleaned[col] = full_cleaned[col].fillna('No Response')`).
full_cleaned = clean_qualtrics_data(raw_path, roster_path, question_dict_path)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  full_cleaned[col].fillna('No Response', inplace=True)


In [4]:
# Preview the cleaned table (pandas truncates the display to the first/last rows).
# NOTE(review): the saved output contains participant e-mail addresses --
# consider clearing outputs before sharing this notebook.
full_cleaned

Unnamed: 0,Email,TeamNumber,TeammateNumber,RecipientFirstName,RecipientEmail,33,39.1,39.2,39.3,2.1,...,19.2,19.3,19.4,19.5,19.6,55,51,52,53,54
0,allanzhong@berkeley.edu,Team 1,1,Allan,allanzhong@berkeley.edu,Yes,4,3.5,3.5,2.0,...,2.0,2.0,2.0,Agree,Agree,No Response,No Response,No Response,No Response,No Response
1,eclopez19@berkeley.edu,Team 1,2,No Response,No Response,No Response,No Response,No Response,No Response,,...,,,,No Response,No Response,No Response,No Response,No Response,No Response,No Response
2,jerrick@berkeley.edu,Team 1,3,Jerrick,jerrick@berkeley.edu,Yes,4.5,4.5,4.5,3.0,...,3.0,3.0,3.0,No Response,No Response,5,How to work in a team better,How to do more manufacturing,How to combine technical and interpersonal skills,I could do more technical or manufacturing
3,simrankk@berkeley.edu,Team 1,4,Simran,simrankk@berkeley.edu,Yes,3.5,4,3,2.0,...,-1.0,0.0,1.0,No Response,No Response,No Response,technical skills,more ways to improve,final result,had more time
4,miroygarrett@berkeley.edu,Team 10,1,Miro,miroygarrett@berkeley.edu,Yes,4,5,4,2.0,...,3.0,3.0,3.0,No Response,No Response,5,about applications of GD&T and about my teammates,if we will consider continuing to pursue this ...,how well received our product and prototype were,we came up with the idea for the product earlier.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,siddhant.vasudevan@berkeley.edu,Team 8,4,No Response,No Response,No Response,No Response,No Response,No Response,,...,,,,No Response,No Response,No Response,No Response,No Response,No Response,No Response
129,allan.sram16@berkeley.edu,Team 9,1,No Response,No Response,No Response,No Response,No Response,No Response,,...,,,,No Response,No Response,No Response,No Response,No Response,No Response,No Response
130,arnavp45@berkeley.edu,Team 9,2,Arnav,arnavp45@berkeley.edu,Yes,4,4,4,2.0,...,3.0,3.0,3.0,No Response,No Response,No Response,about the effort it takes do develop a product...,how this process will be different in the indu...,how easy it was to work when everybody was on ...,we did not have as many tedious assignments
131,owenlachs@berkeley.edu,Team 9,3,Owen,owenlachs@berkeley.edu,Yes,4,4.5,4,2.0,...,3.0,3.0,2.0,No Response,No Response,5,my natural role on a team of engineers.,How I would perform in a different role on the...,the fact that there was no struggle for power ...,I got more time to develop the project to make...


In [5]:
# Load the roster and the question dictionary from the same paths that were
# passed to clean_qualtrics_data, instead of repeating the file names here.
roster = pd.read_csv(roster_path, index_col=0)
dictionary = pd.read_csv(question_dict_path, index_col=0)
# The dictionary index is used below to select columns of full_cleaned, whose
# labels are strings such as '2.1', so force the index to str.
dictionary.index = dictionary.index.astype(str)

In [6]:
# Question ids that the dictionary marks as quantitative.
quant = dictionary.query('`type` == "quantitative"').index.to_list()

# Team / teammate identifiers plus the quantitative answers, indexed by e-mail.
numerical_data = full_cleaned[['TeamNumber', 'TeammateNumber']].join(full_cleaned[quant])
numerical_data.index = full_cleaned.Email

# Convert TeamNumber labels such as 'Team 10' to the float 10.0.
# The dtype guard keeps this cell re-runnable: roster and full_cleaned are
# modified in place, and `.str` raises on a column that is already numeric.
for df in (roster, full_cleaned, numerical_data):
    if not pd.api.types.is_numeric_dtype(df['TeamNumber']):
        df['TeamNumber'] = (
            df['TeamNumber'].str.replace('Team ', '', regex=False).astype(float)
        )

roster['TeammateNumber'] = roster['TeammateNumber'].astype(float)

In [64]:
def _keep_numeric_cols(df, question):
    relevant_columns = df.loc[:, df.columns.str.startswith(question)]
    # get only numeric columns
    keep_cols = []
    for idx, row in pd.DataFrame(relevant_columns.dtypes).iterrows():
        if row[0] in ['int', 'float']:
            keep_cols.append(idx)
            
    relevant_columns = df[keep_cols]
    
    return relevant_columns

def get_team_average(df, question):
    """Return the mean of ``question`` for every team.

    The result is a one-column DataFrame (column ``question``) indexed by
    ``TeamNumber``.
    """
    team_and_answer = df.loc[:, ['TeamNumber', question]]
    grouped_by_team = team_and_answer.groupby('TeamNumber')
    return grouped_by_team.mean()

def get_teammates_average(df, question):
    """For each row of ``df``, average ``question`` over the row's team.

    For every row, the rows with the same ``TeamNumber`` are selected, the
    column ``f'{question}.{TeammateNumber}'`` of that row is dropped, and the
    NaN-ignoring mean of all remaining numeric ``question`` columns is
    returned.  The per-row results are produced by ``df.apply(..., axis=1)``.
    """
    def apply_helper(row, apply_df):
        team_number = row['TeamNumber']
        teammate_number = int(row['TeammateNumber'])
        own_column = f'{question}.{teammate_number}'

        same_team_rows = apply_df['TeamNumber'] == team_number
        other_columns = apply_df.columns != own_column
        team_view = apply_df.loc[same_team_rows, other_columns]

        return np.nanmean(_keep_numeric_cols(team_view, question))

    return df.apply(apply_helper, axis=1, apply_df=df)
    
def get_teammates_std(df: pd.DataFrame, question: str):
    """For each row of ``df``, compute the spread of ``question`` over the row's team.

    Mirrors ``get_teammates_average`` but reduces with ``np.nanstd`` (NaN values
    are ignored; numpy's default ``ddof=0`` applies).

    For every row, the rows with the same ``TeamNumber`` are selected, the
    column ``f'{question}.{TeammateNumber}'`` of that row is dropped, and the
    standard deviation of all remaining numeric ``question`` columns is
    returned.  The per-row results are produced by ``df.apply(..., axis=1)``.
    """
    def apply_helper(row, apply_df):
        team_number, teammate_number = row['TeamNumber'], int(row['TeammateNumber'])
        apply_df = apply_df.loc[
            apply_df['TeamNumber'] == team_number,
            apply_df.columns != f'{question}.{teammate_number}'
        ]
        apply_df = _keep_numeric_cols(apply_df, question)
        return np.nanstd(apply_df)

    # DataFrame.apply forwards extra keyword arguments to the helper, so the
    # keyword must match the helper's parameter name (`apply_df`); passing
    # `df=df` raised "unexpected keyword argument 'df'" on every call.
    return df.apply(apply_helper, axis=1, apply_df=df)

def get_response_by_teammate_number(df, teammate_number, question):
    """Return the ``question`` answers given by teammate ``teammate_number``.

    Rows whose ``TeammateNumber`` equals ``teammate_number`` are kept; the
    result is a one-column DataFrame (column ``question``) indexed by
    ``TeamNumber``.
    """
    is_requested_member = df['TeammateNumber'] == teammate_number
    member_answers = df.loc[is_requested_member, ['TeamNumber', question]]
    return member_answers.set_index('TeamNumber')

def get_my_response(df: pd.DataFrame, question: str) -> pd.Series:
    """Return, for each row, the value in that row's own ``question`` column.

    The column read is ``f'{question}.{n}'`` where ``n`` is the row's
    ``TeammateNumber`` converted to ``int``.
    """
    def own_answer(row):
        own_column = f"{question}.{int(row['TeammateNumber'])}"
        return row[own_column]

    return df.apply(own_answer, axis=1)

def get_class_average(df: pd.DataFrame, question: str) -> float:
    """Return the mean of column ``question`` over all rows, as a Python float."""
    answers = df[question]
    class_mean = np.mean(answers)
    return float(class_mean)

def get_class_average_all_teammates(df: pd.DataFrame, question: str) -> float:
    """Return the NaN-ignoring mean over every numeric ``question`` column of ``df``."""
    question_columns = _keep_numeric_cols(df, question)
    return np.nanmean(question_columns)

def get_class_stdev(df: pd.DataFrame, question: str) -> float:
    """Return ``np.std`` of column ``question`` over all rows, as a Python float."""
    answers = df[question]
    class_spread = np.std(answers)
    return float(class_spread)

def get_class_stdev_all_teammates(df: pd.DataFrame, question: str) -> float:
    """Return the NaN-ignoring std over every numeric ``question`` column of ``df``."""
    question_columns = _keep_numeric_cols(df, question)
    return np.nanstd(question_columns)

IndentationError: expected an indented block after function definition on line 29 (3883893641.py, line 31)

In [56]:
# Scratch check of the column-mask pattern used by the teammate helpers:
# display every column of full_cleaned except '16.1'.
full_cleaned.loc[:, full_cleaned.columns != '16.1']

Unnamed: 0,Email,TeamNumber,TeammateNumber,RecipientFirstName,RecipientEmail,33,39.1,39.2,39.3,2.1,...,19.2,19.3,19.4,19.5,19.6,55,51,52,53,54
0,allanzhong@berkeley.edu,1.0,1,Allan,allanzhong@berkeley.edu,Yes,4,3.5,3.5,2.0,...,2.0,2.0,2.0,Agree,Agree,No Response,No Response,No Response,No Response,No Response
1,eclopez19@berkeley.edu,1.0,2,No Response,No Response,No Response,No Response,No Response,No Response,,...,,,,No Response,No Response,No Response,No Response,No Response,No Response,No Response
2,jerrick@berkeley.edu,1.0,3,Jerrick,jerrick@berkeley.edu,Yes,4.5,4.5,4.5,3.0,...,3.0,3.0,3.0,No Response,No Response,5,How to work in a team better,How to do more manufacturing,How to combine technical and interpersonal skills,I could do more technical or manufacturing
3,simrankk@berkeley.edu,1.0,4,Simran,simrankk@berkeley.edu,Yes,3.5,4,3,2.0,...,-1.0,0.0,1.0,No Response,No Response,No Response,technical skills,more ways to improve,final result,had more time
4,miroygarrett@berkeley.edu,10.0,1,Miro,miroygarrett@berkeley.edu,Yes,4,5,4,2.0,...,3.0,3.0,3.0,No Response,No Response,5,about applications of GD&T and about my teammates,if we will consider continuing to pursue this ...,how well received our product and prototype were,we came up with the idea for the product earlier.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,siddhant.vasudevan@berkeley.edu,8.0,4,No Response,No Response,No Response,No Response,No Response,No Response,,...,,,,No Response,No Response,No Response,No Response,No Response,No Response,No Response
129,allan.sram16@berkeley.edu,9.0,1,No Response,No Response,No Response,No Response,No Response,No Response,,...,,,,No Response,No Response,No Response,No Response,No Response,No Response,No Response
130,arnavp45@berkeley.edu,9.0,2,Arnav,arnavp45@berkeley.edu,Yes,4,4,4,2.0,...,3.0,3.0,3.0,No Response,No Response,No Response,about the effort it takes do develop a product...,how this process will be different in the indu...,how easy it was to work when everybody was on ...,we did not have as many tedious assignments
131,owenlachs@berkeley.edu,9.0,3,Owen,owenlachs@berkeley.edu,Yes,4,4.5,4,2.0,...,3.0,3.0,2.0,No Response,No Response,5,my natural role on a team of engineers.,How I would perform in a different role on the...,the fact that there was no struggle for power ...,I got more time to develop the project to make...


In [57]:
# Independent copy of the roster's identifying columns.
# NOTE(review): analysis_v2 is not referenced by any later cell shown here --
# confirm it is still needed.
analysis_v2 = roster[["FullName", "Email", "TeamNumber", "TeammateNumber"]].copy()

In [58]:
class ColumnBuilder:
    def __init__(
        self,
        col_prefix: str,
        col_suffix: str,
        pd_function: Callable[..., pd.DataFrame | pd.Series | int | float],
        pd_function_args: List[Any]
    ):
        self.col_prefix = col_prefix
        self.col_suffix = col_suffix
        self.pd_function = pd_function
        self.pd_function_args = pd_function_args

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        new_col_name = f'{self.col_prefix}{self.col_suffix}'
        pd_result = self.pd_function(df, *self.pd_function_args)
        # if unequal length of series, need to perform merge on index of result
        if type(pd_result) == pd.DataFrame and len(pd_result != len(df)):
            index_name = pd_result.index.name
            pd_result.columns = pd.Index([new_col_name])

            return pd.merge(df, pd_result, how='left', on=index_name)

        # singular value will be broadcasted on whole column
        # or same length of series and dataframe
        else:
            return df.assign(**{
                new_col_name: self.pd_function(df, *self.pd_function_args)
            })

class Identity(ColumnBuilder):
    """Builder that copies column ``question`` into ``<col_prefix><col_suffix>``."""

    def __init__(self, col_prefix: str, col_suffix: str, question: str):
        def pick_column(df, q):
            return df[q]

        super().__init__(col_prefix, col_suffix, pick_column, [question])

class ClassAvg(ColumnBuilder):
    """Builder for ``<col_prefix>ClassAvg``, computed by ``get_class_average``."""

    def __init__(self, col_prefix: str, question: str):
        super().__init__(
            col_prefix=col_prefix,
            col_suffix='ClassAvg',
            pd_function=get_class_average,
            pd_function_args=[question],
        )

class ClassStDev(ColumnBuilder):
    """Builder for ``<col_prefix>ClassStDev``, computed by ``get_class_stdev``."""

    def __init__(self, col_prefix: str, question: str):
        super().__init__(
            col_prefix=col_prefix,
            col_suffix='ClassStDev',
            pd_function=get_class_stdev,
            pd_function_args=[question],
        )

class TeammateAvg(ColumnBuilder):
    """Builder for ``<col_prefix>TeammateAvg``, computed by ``get_teammates_average``."""

    def __init__(self, col_prefix: str, question: str):
        super().__init__(
            col_prefix=col_prefix,
            col_suffix='TeammateAvg',
            pd_function=get_teammates_average,
            pd_function_args=[question],
        )

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        # Pass-through hook: no processing beyond the base builder.
        return super().__call__(df)

class TeamAvg(ColumnBuilder):
    """Builder for ``<col_prefix>TeamAvg``, computed by ``get_team_average``."""

    def __init__(self, col_prefix: str, question: str):
        super().__init__(
            col_prefix=col_prefix,
            col_suffix='TeamAvg',
            pd_function=get_team_average,
            pd_function_args=[question],
        )

class AcrossTeammates(ColumnBuilder):
    """Placeholder subclass that adds no behaviour of its own.

    Only use this for the class average across all teammate responses.
    """

class PerMemberFunc(ColumnBuilder):
    """Builder that adds one column per teammate number.

    For every ``i`` from 1 to the largest ``TeammateNumber`` in ``df`` it adds
    a column ``f'{col_prefix}{i}'`` via
    ``get_response_by_teammate_number(df, i, *pd_function_args)``.

    ``__init__`` is inherited unchanged.  This override does not use
    ``self.pd_function`` or ``self.col_suffix``.
    """

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` with the per-teammate columns appended."""
        # Series.max() ignores NaN, and int() lets range() accept a
        # float-typed TeammateNumber column (range(1, 4.0 + 1) raises TypeError).
        team_size = int(df['TeammateNumber'].max())

        for member in range(1, team_size + 1):
            df = ColumnBuilder(
                self.col_prefix,
                str(member),
                get_response_by_teammate_number,
                [member, *self.pd_function_args]
            )(df)

        return df


In [59]:
# Column-name suffix -> builder class that column_builder_pipe uses when a
# parameter tuple supplies no explicit function.
suffix_to_columnbuilder: Dict[str, Type[ColumnBuilder]] = {
    'TeamAvg': TeamAvg,
    'TeammateAvg': TeammateAvg,
    '': Identity,
    'Me': Identity,
    'ClassAvg': ClassAvg,
    'ClassStDev': ClassStDev,
    'PerMember': PerMemberFunc,
}

# Question id -> prefix of the generated column names.
# NOTE(review): '4.1' and '17' both map to 'Communicate', so builders with the
# same suffix for those two questions produce the same column name
# (e.g. 'CommunicateClassAvg') -- confirm whether one of them should be renamed.
col_to_prefix: Dict[str, str] = {
    '2.1': 'SharedGoal',
    '2.2': 'IndvGoal',
    '2.3': 'Support',
    '4.1': 'Communicate',
    '4.2': 'WorkAlloc',
    '4.3': 'Role',
    '4.4': 'Enjoy',
    '6': 'DidWell',
    '7': 'DoDiff',
    '14': 'Contribute',
    '15': 'Improve',
    '16': 'Initiative',
    '17': 'Communicate',
    '18': 'Expertise',
    '19': 'Respect',
}

In [60]:
def column_builder_pipe(
        df: pd.DataFrame,
        column_builder_params: List[Tuple]
) -> pd.DataFrame:
    """Apply a sequence of column builders to ``df`` and return the result.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified, each builder returns a new frame.
    column_builder_params : list of tuple
        ``(question, suffix, func, func_args)`` entries, applied in order:

        * ``question`` -- key of ``col_to_prefix``; selects the column prefix.
        * ``suffix`` -- appended to the prefix; also selects the builder class
          from ``suffix_to_columnbuilder`` when ``func`` is ``None``.
        * ``func`` -- optional explicit function; when given (and the suffix is
          not ``'PerMember'``) a plain ``ColumnBuilder`` is used with it.
        * ``func_args`` -- positional arguments for ``func``.  ``None`` means no
          extra arguments, or ``[question]`` for ``'PerMember'``.

    Returns
    -------
    pd.DataFrame
        ``df`` with all requested columns added.

    Raises
    ------
    KeyError
        If ``question`` is not in ``col_to_prefix``.
    TypeError
        If ``suffix`` has no registered builder and no ``func`` is given.
    """
    import warnings  # local import: not provided by the notebook's import cell

    for (question, suffix, func, func_args) in column_builder_params:
        prefix = col_to_prefix[question]
        builder_type = suffix_to_columnbuilder.get(suffix)

        if builder_type is PerMemberFunc:
            cb = PerMemberFunc(
                prefix, suffix, func,
                [question] if func_args is None else func_args
            )
        elif func is not None:
            # An explicit function overrides the registered builder.
            cb = ColumnBuilder(
                prefix, suffix, func,
                [] if func_args is None else func_args
            )
        elif builder_type is None:
            # Previously this failed later with an opaque TypeError when the
            # builder tried to call `None`.
            raise TypeError(
                f'No builder is registered for suffix {suffix!r} '
                f'(question {question!r}) and no func was given'
            )
        elif builder_type is Identity:
            cb = Identity(prefix, suffix, question)
        else:
            cb = builder_type(prefix, question)

        # Two questions can share a prefix in col_to_prefix; surface the clash
        # instead of silently replacing an earlier column.
        new_col_name = f'{prefix}{suffix}'
        if builder_type is not PerMemberFunc and new_col_name in df.columns:
            warnings.warn(
                f'Column {new_col_name!r} (question {question!r}) already '
                'exists and will be overwritten or duplicated',
                stacklevel=2,
            )

        df = df.pipe(cb)

    return df

In [61]:
# List the available column labels (question ids are strings such as '16.1').
full_cleaned.columns

Index(['Email', 'TeamNumber', 'TeammateNumber', 'RecipientFirstName',
       'RecipientEmail', '33', '39.1', '39.2', '39.3', '2.1', '2.2', '2.3',
       '3', '4.1', '4.2', '4.3', '4.4', '5.1', '5.2', '5.3', '5.4', '5.5',
       '5.7', '6', '7', '38.1', '38.2', '38.3', '14.1', '14.2', '14.3', '14.4',
       '14.5', '14.6', '15.1', '15.2', '15.3', '15.4', '15.5', '15.6', '16.1',
       '16.2', '16.3', '16.4', '16.5', '16.6', '17.1', '17.2', '17.3', '17.4',
       '17.5', '17.6', '18.1', '18.2', '18.3', '18.4', '18.5', '18.6', '19.1',
       '19.2', '19.3', '19.4', '19.5', '19.6', '55', '51', '52', '53', '54'],
      dtype='object')

In [88]:
# Each entry is (question, suffix, func, func_args), as unpacked by
# column_builder_pipe; func=None selects the builder registered for the suffix.
# NOTE(review): '4.1' and '17' share the 'Communicate' prefix in col_to_prefix,
# so the ('17', 'ClassAvg') and ('17', 'ClassStDev') entries write to the same
# column names as the '4.1' entries and replace their values.
sample_params = [
    ('2.1', 'TeamAvg', None, None),
    ('2.1', 'ClassAvg', None, None),
    ('2.1', 'ClassStDev', None, None),
    ('2.2', 'TeamAvg', None, None),
    ('2.3', 'TeamAvg', None, None),
    ('4.1', 'TeamAvg', None, None),
    ('4.1', 'ClassAvg', None, None),
    ('4.1', 'ClassStDev', None, None),
    ('4.2', 'TeamAvg', None, None),
    ('4.2', 'ClassAvg', None, None),
    ('4.2', 'ClassStDev', None, None),
    ('4.3', 'TeamAvg', None, None),
    ('4.3', 'ClassAvg', None, None),
    ('4.3', 'ClassStDev', None, None),
    ('4.4', 'TeamAvg', None, None),
    ('4.4', 'ClassAvg', None, None),
    ('4.4', 'ClassStDev', None, None),
    # Free-text questions: own answer, then one column per teammate number.
    ('6', 'Me', None, None),
    ('6', 'PerMember', None, ['6']),
    ('7', 'PerMember', None, ['7']),
    # Per-teammate questions ('16.1', '16.2', ...): explicit functions are
    # passed because the default ClassAvg/ClassStDev builders read one column.
    ('16', 'Me', get_my_response, ['16']),
    ('16', 'TeammateAvg', None, None),
    ('16', 'ClassAvg', get_class_average_all_teammates, ['16']),
    ('16', 'ClassStDev', get_class_stdev_all_teammates, ['16']),
    ('17', 'Me', get_my_response, ['17']),
    ('17', 'TeammateAvg', None, None),
    ('17', 'ClassAvg', get_class_average_all_teammates, ['17']),
    ('17', 'ClassStDev', get_class_stdev_all_teammates, ['17']),
]
# full_cleaned itself is left untouched; the pipeline returns a new frame.
result = column_builder_pipe(full_cleaned, sample_params)

In [102]:
# Show the identifying columns plus everything the pipeline added, hiding the
# raw question columns (the labels of full_cleaned that contain a digit).
# Selecting by label does not depend on the new columns being appended after
# the original ones, unlike a positionally padded boolean mask.
raw_question_cols = set(
    full_cleaned.columns[full_cleaned.columns.str.contains(r'[0-9]', regex=True)]
)
keep_cols = [col for col in result.columns if col not in raw_question_cols]
result.loc[:, keep_cols]

Unnamed: 0,Email,TeamNumber,TeammateNumber,RecipientFirstName,RecipientEmail,SharedGoalTeamAvg,SharedGoalClassAvg,SharedGoalClassStDev,IndvGoalTeamAvg,SupportTeamAvg,...,DoDiff1,DoDiff2,DoDiff3,DoDiff4,InitiativeMe,InitiativeTeammateAvg,InitiativeClassAvg,InitiativeClassStDev,CommunicateMe,CommunicateTeammateAvg
0,allanzhong@berkeley.edu,1.0,1,Allan,allanzhong@berkeley.edu,2.333333,2.408696,0.853592,2.333333,2.000000,...,No Response,No Response,Work on deadlines sooner,Work Allocation,2.0,1.888889,2.362812,0.875401,2.0,2.000000
1,eclopez19@berkeley.edu,1.0,2,No Response,No Response,2.333333,2.408696,0.853592,2.333333,2.000000,...,No Response,No Response,Work on deadlines sooner,Work Allocation,,2.333333,2.362812,0.875401,,2.000000
2,jerrick@berkeley.edu,1.0,3,Jerrick,jerrick@berkeley.edu,2.333333,2.408696,0.853592,2.333333,2.000000,...,No Response,No Response,Work on deadlines sooner,Work Allocation,3.0,2.111111,2.362812,0.875401,3.0,2.000000
3,simrankk@berkeley.edu,1.0,4,Simran,simrankk@berkeley.edu,2.333333,2.408696,0.853592,2.333333,2.000000,...,No Response,No Response,Work on deadlines sooner,Work Allocation,2.0,2.000000,2.362812,0.875401,1.0,2.000000
4,miroygarrett@berkeley.edu,10.0,1,Miro,miroygarrett@berkeley.edu,2.500000,2.408696,0.853592,2.250000,3.000000,...,Deadline and work management,We need to finish the class strong by making s...,We could potentially meet in person more often...,We could have possibly started making our prot...,2.0,2.500000,2.362812,0.875401,2.0,2.416667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,siddhant.vasudevan@berkeley.edu,8.0,4,No Response,No Response,3.000000,2.408696,0.853592,2.666667,2.666667,...,Delegating work fairly amongst our group.,Something our team could do better is plan to ...,Our team could be more proactive with hitting ...,No Response,,3.000000,2.362812,0.875401,,2.777778
129,allan.sram16@berkeley.edu,9.0,1,No Response,No Response,1.666667,2.408696,0.853592,1.666667,2.333333,...,No Response,I would like some members on the team to respo...,Be much more responsive to communication,Communicate needs,,2.444444,2.362812,0.875401,,2.555556
130,arnavp45@berkeley.edu,9.0,2,Arnav,arnavp45@berkeley.edu,1.666667,2.408696,0.853592,1.666667,2.333333,...,No Response,I would like some members on the team to respo...,Be much more responsive to communication,Communicate needs,3.0,2.333333,2.362812,0.875401,3.0,2.444444
131,owenlachs@berkeley.edu,9.0,3,Owen,owenlachs@berkeley.edu,1.666667,2.408696,0.853592,1.666667,2.333333,...,No Response,I would like some members on the team to respo...,Be much more responsive to communication,Communicate needs,3.0,2.222222,2.362812,0.875401,3.0,2.222222
