In [2]:
import os
from typing import (
    Callable, Any, List, Dict, Type, Tuple, Optional
)
import pandas as pd
import numpy as np
from processing import cleanQualtricsData

In [3]:
raw_path = 'E29_CHECKIN03_RAW_TEXT.csv'
roster_path = 'E29_Qualtrics_Roster_EOS.csv'
question_dict_path = 'E29_QUESTION_DICTIONARY.csv'

In [4]:
full_cleaned = cleanQualtricsData(raw_path, roster_path, question_dict_path)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  full_cleaned[col].fillna('No Response', inplace=True)


In [5]:
full_cleaned

Unnamed: 0,Email,TeamNumber,TeammateNumber,RecipientFirstName,RecipientEmail,33,39.1,39.2,39.3,2.1,...,19.2,19.3,19.4,19.5,19.6,55,51,52,53,54
0,allanzhong@berkeley.edu,Team 1,1,Allan,allanzhong@berkeley.edu,Yes,4,3.5,3.5,2.0,...,2.0,2.0,2.0,Agree,Agree,No Response,No Response,No Response,No Response,No Response
1,eclopez19@berkeley.edu,Team 1,2,No Response,No Response,No Response,No Response,No Response,No Response,,...,,,,No Response,No Response,No Response,No Response,No Response,No Response,No Response
2,jerrick@berkeley.edu,Team 1,3,Jerrick,jerrick@berkeley.edu,Yes,4.5,4.5,4.5,3.0,...,3.0,3.0,3.0,No Response,No Response,5,How to work in a team better,How to do more manufacturing,How to combine technical and interpersonal skills,I could do more technical or manufacturing
3,simrankk@berkeley.edu,Team 1,4,Simran,simrankk@berkeley.edu,Yes,3.5,4,3,2.0,...,-1.0,0.0,1.0,No Response,No Response,No Response,technical skills,more ways to improve,final result,had more time
4,miroygarrett@berkeley.edu,Team 10,1,Miro,miroygarrett@berkeley.edu,Yes,4,5,4,2.0,...,3.0,3.0,3.0,No Response,No Response,5,about applications of GD&T and about my teammates,if we will consider continuing to pursue this ...,how well received our product and prototype were,we came up with the idea for the product earlier.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,siddhant.vasudevan@berkeley.edu,Team 8,4,No Response,No Response,No Response,No Response,No Response,No Response,,...,,,,No Response,No Response,No Response,No Response,No Response,No Response,No Response
129,allan.sram16@berkeley.edu,Team 9,1,No Response,No Response,No Response,No Response,No Response,No Response,,...,,,,No Response,No Response,No Response,No Response,No Response,No Response,No Response
130,arnavp45@berkeley.edu,Team 9,2,Arnav,arnavp45@berkeley.edu,Yes,4,4,4,2.0,...,3.0,3.0,3.0,No Response,No Response,No Response,about the effort it takes do develop a product...,how this process will be different in the indu...,how easy it was to work when everybody was on ...,we did not have as many tedious assignments
131,owenlachs@berkeley.edu,Team 9,3,Owen,owenlachs@berkeley.edu,Yes,4,4.5,4,2.0,...,3.0,3.0,2.0,No Response,No Response,5,my natural role on a team of engineers.,How I would perform in a different role on the...,the fact that there was no struggle for power ...,I got more time to develop the project to make...


In [41]:
full_cleaned[['19.1', '19.2', '19.3', '19.4']]

Unnamed: 0,19.1,19.2,19.3,19.4
0,2.0,2.0,2.0,2.0
1,,,,
2,3.0,3.0,3.0,3.0
3,-1.0,-1.0,0.0,1.0
4,1.0,3.0,3.0,3.0
...,...,...,...,...
128,,,,
129,,,,
130,3.0,3.0,3.0,3.0
131,3.0,3.0,3.0,2.0


In [6]:
roster = pd.read_csv("E29_Qualtrics_Roster_EOS.csv", index_col=0)
dictionary = pd.read_csv("E29_QUESTION_DICTIONARY.csv", index_col=0)
dictionary.index = dictionary.index.astype(str)

In [7]:
quant = dictionary.query('`type` == "quantitative"').index.to_list()
numerical_data = full_cleaned[['TeamNumber', 'TeammateNumber']].join(full_cleaned[quant])
numerical_data.index = full_cleaned.Email
# convert TeamNumber to int
for df in (roster, full_cleaned, numerical_data):
    df['TeamNumber'] = df['TeamNumber'].str.replace('Team ', '', regex=False).astype(float)

roster['TeammateNumber'] = roster['TeammateNumber'].astype(float)

In [28]:
def get_team_average(df, question):
    # Gets average on specified question for each team in df
    print(df.columns)
    print(question)
    return df[['TeamNumber', question]].groupby('TeamNumber').mean()

def get_teammates_average(df, question):
    def apply_helper(row, df):
        team_number, teammate_number = row['TeamNumber'], row['TeammateNumber']
        return np.nanmean(df.query("TeamNumber == @team_number & TeammateNumber != @teammate_number")[question])

    return df.apply(apply_helper, axis=1, df=df)

def get_teammates_std(df, question):
    def apply_helper(row, df):
        team_number, teammate_number = row['TeamNumber'], row['TeammateNumber']
        return np.nanstd(df.query("TeamNumber == @team_number & TeammateNumber != @teammate_number")[question])

    return df.apply(apply_helper, axis=1, df=df)

def get_response_by_teammate_number(df, teammate_number, question):
    return df[df['TeammateNumber'] == teammate_number][['TeamNumber', question]]

def get_class_average(df: pd.DataFrame, question: str) -> float:
    # Returns the class average for QUESTION
    return float(np.mean(df[question]))

def get_class_average_all_teammates(df: pd.DataFrame, question: str) -> float:
    return float(np.mean(df[df.columns.str.startswith(question)].to_numpy().flatten()))

def get_class_stdev(df: pd.DataFrame, question: str) -> float:
    # Returns the class std dev for QUESTION
    return float(np.std(df[question]))

def get_class_stdev_all_teammates(df: pd.DataFrame, question: str) -> float:
    return float(np.std(df[df.columns.str.startswith(question)].to_numpy().flatten()))

In [9]:
analysis_v2 = roster[["FullName", "Email", "TeamNumber", "TeammateNumber"]].copy()

In [15]:
class ColumnBuilder:
    def __init__(
        self,
        col_prefix: str,
        col_suffix: str,
        pd_function: Callable[..., pd.DataFrame | pd.Series | int | float],
        pd_function_args: List[Any]
    ):
        self.col_prefix = col_prefix
        self.col_suffix = col_suffix
        self.pd_function = pd_function
        self.pd_function_args = pd_function_args

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        new_col_name = f'{self.col_prefix}{self.col_suffix}'
        pd_result = self.pd_function(df, *self.pd_function_args)
        # if unequal length of series, need to perform merge on index of result
        if type(pd_result) == pd.DataFrame and len(pd_result != len(df)):
            index_name = pd_result.index.name
            pd_result.columns = pd.Index([new_col_name])

            return pd.merge(df, pd_result, how='left', on=index_name)

        # singular value will be broadcasted on whole column
        # or same length of series and dataframe
        else:
            return df.assign(**{
                new_col_name: self.pd_function(df, *self.pd_function_args)
            })

class Identity(ColumnBuilder):
    def __init__(self, col_prefix: str, col_suffix: str, question: str):
        super().__init__(col_prefix, col_suffix, lambda df, q: df[q], [question])

class ClassAvg(ColumnBuilder):
    def __init__(self, col_prefix: str, question: str):
        super().__init__(col_prefix, 'ClassAvg', get_class_average, [question])

class ClassStDev(ColumnBuilder):
    def __init__(self, col_prefix: str, question: str):
        super().__init__(col_prefix, 'ClassStDev', get_class_stdev, [question])

class TeammateAvg(ColumnBuilder):
    def __init__(self, col_prefix: str, question: str):
        super().__init__(col_prefix, 'TeammateAvg', get_teammates_average, [question])

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        self.pd_function_args.append(df)
        result = super().__call__(df)
        self.pd_function_args.pop()
        return result

class TeamAvg(ColumnBuilder):
    def __init__(self, col_prefix: str, question: str):
        super().__init__(col_prefix, 'TeamAvg', get_team_average, [question])

class AcrossTeammates(ColumnBuilder):
    # only use this for class avg across teammate responses

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        columns_to_use = df.columns[df.columns.str.startswith(self.col_prefix)]

        df = self.pd_function(df, )

class PerMemberFunc(ColumnBuilder):
    # no need to override __init__

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        team_size = max(df['TeammateNumber'].unique())

        for i in range(1, team_size+1):
            df = ColumnBuilder(
                self.col_prefix,
                str(i),
                get_response_by_teammate_number,
                [i] + self.pd_function_args
            )(df)

        return df


In [17]:
suffix_to_columnbuilder: Dict[str, Type[ColumnBuilder]] = {
    'TeamAvg': TeamAvg,
    'TeammateAvg': TeammateAvg,
    '': Identity,
    'Me': Identity,
    'ClassAvg': ClassAvg,
    'ClassStDev': ClassStDev,
    'PerMember': PerMemberFunc
}

col_to_prefix: Dict[str, str] = {
    '2.1': 'SharedGoal',
    '2.2': 'IndvGoal',
    '2.3': 'Support'
}

In [32]:
def column_builder_pipe(
        df: pd.DataFrame,
        column_builder_params: List[Tuple]
) -> pd.DataFrame:
    for (question, suffix, func, func_args) in column_builder_params:
        cb = None
        if suffix in suffix_to_columnbuilder:
            prefix = col_to_prefix[question]
            builder_type = suffix_to_columnbuilder[suffix]


            if builder_type == Identity:
                cb = builder_type(prefix, suffix, question)
            elif builder_type == PerMemberFunc:
                cb = builder_type(prefix, suffix, func, func_args)
            else:
                cb = builder_type(prefix, question)

        else:
            cb = ColumnBuilder(prefix, suffix, func, func_args)

        print(cb.pd_function_args)
        df = df.pipe(cb)

    return df

In [33]:
sample_params = [
    ('2.1', 'TeamAvg', None, None),
    ('2.1', 'ClassAvg', None, None),
    ('2.1', 'ClassStDev', None, None),
]
column_builder_pipe(numerical_data, sample_params)

['2.1']
Index(['TeamNumber', 'TeammateNumber', '2.1', '2.2', '2.3', '4.1', '4.2',
       '4.3', '4.4', '5.1', '5.2', '5.3', '5.4', '16.1', '16.2', '16.3',
       '16.4', '17.1', '17.2', '17.3', '17.4', '18.1', '18.2', '18.3', '18.4',
       '19.1', '19.2', '19.3', '19.4', '38.1', '38.2', '38.3'],
      dtype='object')
2.1
['2.1']
['2.1']


Unnamed: 0,TeamNumber,TeammateNumber,2.1,2.2,2.3,4.1,4.2,4.3,4.4,5.1,...,19.1,19.2,19.3,19.4,38.1,38.2,38.3,SharedGoalTeamAvg,SharedGoalClassAvg,SharedGoalClassStDev
0,1.0,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,27.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.333333,2.408696,0.853592
1,1.0,2,,,,,,,,,...,,,,,,,,2.333333,2.408696,0.853592
2,1.0,3,3.0,3.0,3.0,3.0,3.0,3.0,3.0,25.0,...,3.0,3.0,3.0,3.0,3.0,2.0,2.0,2.333333,2.408696,0.853592
3,1.0,4,2.0,2.0,1.0,2.0,1.0,1.0,0.0,40.0,...,-1.0,-1.0,0.0,1.0,0.0,2.0,1.0,2.333333,2.408696,0.853592
4,10.0,1,2.0,1.0,3.0,2.0,1.0,2.0,3.0,25.0,...,1.0,3.0,3.0,3.0,3.0,3.0,2.0,2.500000,2.408696,0.853592
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,8.0,4,,,,,,,,,...,,,,,,,,3.000000,2.408696,0.853592
129,9.0,1,,,,,,,,,...,,,,,,,,1.666667,2.408696,0.853592
130,9.0,2,2.0,2.0,2.0,2.0,3.0,2.0,2.0,20.0,...,3.0,3.0,3.0,3.0,1.0,1.0,1.0,1.666667,2.408696,0.853592
131,9.0,3,2.0,1.0,3.0,1.0,2.0,3.0,2.0,22.0,...,3.0,3.0,3.0,2.0,2.0,3.0,2.0,1.666667,2.408696,0.853592


In [None]:
# need to write some tests for more general stuff