In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [2]:
#Read file 
def importData(csv):
    """Load a CSV into a pandas DataFrame.

    csv is handed to ``pd.read_csv`` unchanged, and the parsed frame is
    returned without any further cleaning.
    """
    return pd.read_csv(csv)

#Break down goal setting and team process into: goals, support, communication, work allocation, team role, and enjoyment
#Then, find the magnitude of such difficulties for each team
def getCategoryDifferences(df, teamNumberCol):
    """Return one row per team with its deviation from the class average.

    For each teaming category the result holds ``team average - class
    average``; ``goalsDiff`` combines the shared-goal and individual-goal
    questions. ``overallDiff`` is the sum of the six category differences,
    rounded to 2 decimals. Rows are sorted by ``teamNumberCol`` ascending and
    the index is reset.
    """
    # (output column, stem of the "<stem>TeamAvg" / "<stem>ClassAvg" columns)
    singleQuestionCategories = [
        ('supportDiff', "Support"),
        ('communicateDiff', "Communicate"),
        ('workAllocationDiff', "WorkAlloc"),
        ('teamRoleDiff', "Role"),
        ('enjoyDiff', "Enjoy"),
    ]

    #Only the team number and the team/class average columns are needed
    neededCols = [teamNumberCol]
    for stem in ["SharedGoal", "IndvGoal"] + [stem for _, stem in singleQuestionCategories]:
        neededCols += [stem + "TeamAvg", stem + "ClassAvg"]

    #The averages repeat on every student row, so de-duplicating leaves 1 row per team
    teams = df[neededCols].drop_duplicates()

    #Goals is the only category built from two survey questions
    teamGoals = teams["SharedGoalTeamAvg"] + teams["IndvGoalTeamAvg"]
    classGoals = teams["SharedGoalClassAvg"] + teams["IndvGoalClassAvg"]
    teams['goalsDiff'] = teamGoals - classGoals
    for diffName, stem in singleQuestionCategories:
        teams[diffName] = teams[stem + "TeamAvg"] - teams[stem + "ClassAvg"]

    #Overall deviation = category deviations added left to right
    diffNames = ['goalsDiff'] + [diffName for diffName, _ in singleQuestionCategories]
    runningTotal = teams[diffNames[0]]
    for diffName in diffNames[1:]:
        runningTotal = runningTotal + teams[diffName]
    teams['overallDiff'] = runningTotal.round(2)

    ordered = teams.sort_values(teamNumberCol, ascending=True).reset_index(drop=True)
    return ordered[[teamNumberCol, 'overallDiff'] + diffNames]

#Add pctImpact placeholder columns based on the list of column names
def addPctColumns(df, pctColumnNamesList):
    """Add one empty-string placeholder column per name, in place.

    Each name in pctColumnNamesList becomes (or overwrites) a column of df
    filled with ''. The same df object is returned.
    """
    rowCount = len(df)
    for columnName in pctColumnNamesList:
        df[columnName] = ['' for _ in range(rowCount)]
    return df

#For each team, identify % breakdown of total struggles relative to 6 categories of teaming:
#goal setting, support, communication, work allocation, team roles, and enjoyment
def getTotalBreakdown(df, teamNumberCol, categoryDiffsList, pctColsList):
    """Fill the pct columns with each category's share of a team's shortfall.

    For every row of df, only the negative category differences (team below
    class average) count as "struggles". A negative category receives
    ``abs(diff) / sum(abs(all negative diffs))``; a category at or above the
    class average receives 0. A missing (NaN) difference yields NaN for that
    category and is left out of the total, so the remaining categories are
    still computed instead of the whole call failing.

    Parameters:
        df: team-level frame, one row per team; modified in place.
        teamNumberCol: kept for interface compatibility; rows are processed
            positionally, so it is not read here.
        categoryDiffsList: names of the category difference columns. Each
            maps to a pct name by removing 'Diff' and appending 'Pct'.
        pctColsList: pct column names to write. Each must be derivable from
            categoryDiffsList, otherwise a KeyError is raised.

    Returns:
        The same df object with the pctColsList columns (over)written.
    """
    pctNames = [category.replace('Diff', '') + 'Pct' for category in categoryDiffsList]

    allContributions = []
    #name=None yields plain tuples, so arbitrary column names are safe
    for rawDiffs in df[categoryDiffsList].itertuples(index=False, name=None):
        diffs = [float(value) for value in rawDiffs]

        #Total size of the team's shortfall; only used when at least one diff is negative
        shortfall = sum(-diff for diff in diffs if diff < 0)

        teamContributions = {}
        for pctName, diff in zip(pctNames, diffs):
            if pd.isna(diff):
                #Unknown difference -> unknown share (compares False against any threshold)
                teamContributions[pctName] = float('nan')
            elif diff < 0:
                teamContributions[pctName] = -diff / shortfall
            else:
                teamContributions[pctName] = 0

        #Add team contributions to all class contributions list
        allContributions.append(teamContributions)

    for pctCol in pctColsList:
        df[pctCol] = [teamContributions[pctCol] for teamContributions in allContributions]

    return df

#Assign each team a percentile (score) based on their overall difference from the class average. A difference of 0 designates a 50 (median)
def getTeamPercentiles(df, teamNumberCol):
    """Add a 'percentile' score column derived from 'overallDiff', in place.

    The score is linear in overallDiff (10 points per unit, centred on 50)
    and pinned to 0 at overallDiff <= -5 and to 100 at overallDiff >= 5.
    teamNumberCol is accepted but not read. The same df object is returned.
    """
    def scoreFor(diff):
        #Saturate at both ends of the scale first
        if diff >= 5:
            return 100
        if diff <= -5:
            return 0
        #Exactly on the class average -> median score
        if diff == 0:
            return 50
        return 50 + 10 * diff

    df['percentile'] = [scoreFor(diff) for diff in df['overallDiff']]
    return df

#Using impact pct columns, add a column to the dataset that has top category difficulties. Each highlighted category must have an impact of at least the threshold.
def getTopDifficulties(df, teamNumberCol, pctColsList, percentileCol, threshold=.25, includeAboveMedianTeams=True):
    """Add a 'topDifficulties' column listing each team's dominant categories.

    A category is listed when its pct value is strictly greater than the
    cutoff. The cutoff is ``threshold`` for most teams. Teams whose
    percentile, truncated to an integer, is 20 or lower use
    ``threshold * .4`` (40% of the normal cutoff, i.e. a 60% reduction) so
    that more of their categories are surfaced.

    A missing (NaN) percentile is treated as "not in the bottom group" and
    uses the normal threshold. NaN pct values never exceed the cutoff, so
    they are never listed.

    Parameters:
        df: team-level frame, one row per team; modified in place.
        teamNumberCol: kept for interface compatibility; rows are processed
            positionally, so it is not read here.
        pctColsList: pct column names; the listed category name is the
            column name with 'Pct' removed.
        percentileCol: name of the column holding each team's score.
        threshold: minimum share for a category to be listed.
        includeAboveMedianTeams: currently unused; kept so existing calls
            keep working.

    Returns:
        The same df object with 'topDifficulties' set to a list per team.
    """
    relaxedThreshold = threshold * .4

    classDifficulties = []
    for _, teamRow in df.iterrows():
        percentile = teamRow[percentileCol]
        #int() truncation is kept so that e.g. 20.9 still counts as bottom-20
        inBottomGroup = (not pd.isna(percentile)) and int(percentile) <= 20
        cutoff = relaxedThreshold if inBottomGroup else threshold

        classDifficulties.append([
            pctCol.replace('Pct', '')
            for pctCol in pctColsList
            if float(teamRow[pctCol]) > cutoff
        ])

    df["topDifficulties"] = classDifficulties
    return df
#For each difficulty in 'topDifficulties', find students who are contributing most to that team difficulty
#Student-level difference column -> (student's own answer column, average column it is compared against)
_STUDENT_DIFF_SOURCES = {
    'sharedGoalsDiff': ('Our team is clear about the shared goals for our work together', 'SharedGoalTeamAvg'),
    'individualGoalsDiff': ("We each know about one another's individual goals for our work together", 'IndvGoalTeamAvg'),
    'workAllocationDiff': ('We fairly distribute work amongst ourselves', 'WorkAllocTeamAvg'),
    'supportDiff': ('I feel supported by my teammates in the pursuit of my individual goals for this project', 'SupportTeamAvg'),
    'rolesDiff': ('I am clear about the roles I play on my team', "RoleTeamAvg"),
    'initiativeDiff': ("InitiativeMe", "InitiativeTeammateAvg"),
    'communicateComparisonDiff': ("CommunicateMe", "CommunicateTeammateAvg"),
    'communicateOverallDiff': ("We have good communication amongst ourselves", "CommunicateTeamAvg"),
    'expertiseDiff': ("ExpertiseMe", "ExpertiseTeammateAvg"),
    'respectDiff': ("RespectMe", "RespectTeammateAvg"),
    'enjoyDiff': ("I enjoy working with my teammates", 'EnjoyTeamAvg'),
}

#Team-level difficulty -> [(student-level diff column, text placed between the difficulty name and the student)]
#The label text is reproduced exactly as it was emitted before (including the
#missing space in e.g. "workAllocation(Overall): ") so report output is unchanged.
_DIFFICULTY_BREAKDOWN = {
    'goals': [('sharedGoalsDiff', " (Shared): "), ('individualGoalsDiff', " (Individual): ")],
    'support': [('supportDiff', " (Overall): "), ('respectDiff', " (Respect): ")],
    'communicate': [('communicateOverallDiff', " (Overall): "), ('communicateComparisonDiff', " (Teammate Comparison): ")],
    'workAllocation': [('workAllocationDiff', "(Overall): "), ('expertiseDiff', "(Contribute Expertise): ")],
    'enjoy': [('enjoyDiff', ": ")],
    'teamRole': [('rolesDiff', "(Overall): "), ('initiativeDiff', "(Initiative): ")],
}


#List "<label><name>, <diff>" for every student on the team whose diffCol is below cutoff, most negative first
def _flagStudents(teamStudents, fullNameCol, diffCol, label, cutoff):
    ranked = teamStudents[[fullNameCol, diffCol]].sort_values([diffCol])
    flagged = ranked[ranked[diffCol] < cutoff]
    return [
        label + str(stuName) + ', ' + str(round(stuDiff, 2))
        for stuName, stuDiff in zip(flagged[fullNameCol], flagged[diffCol])
    ]


#For each difficulty in 'topDifficulties', find students who are contributing most to that team difficulty
def getStudentsWithDifficulties(df, macroanalysisData, fullNameCol, teamNumberCol, topDifficultiesCol, negativeSkewThreshold=-0.5):
    """Add a 'studentsWithDifficulties' column naming students behind each team difficulty.

    For every student, the difference between their own answer and the
    matching average column is computed for each survey item. Then, for each
    team and each entry of its top-difficulties list, students whose
    difference on the related items is strictly below negativeSkewThreshold
    are reported as "<difficulty><label><name>, <difference rounded to 2>".
    Within one item, students are ordered from most to least negative.
    Difficulty names not present in the breakdown table contribute nothing.

    Parameters:
        df: team-level frame, one row per team; modified in place.
        macroanalysisData: student-level frame holding the name, team and
            survey columns; it is not modified.
        fullNameCol: student name column in macroanalysisData.
        teamNumberCol: team column, present in both frames.
        topDifficultiesCol: df column holding a list of difficulty names
            per team.
        negativeSkewThreshold: a student is flagged when their difference is
            below this value (default -0.5, the previous fixed cutoff).

    Returns:
        The same df object with 'studentsWithDifficulties' set to a list of
        strings per team (empty list when nothing is flagged).
    """
    #Copy so the new diff columns are never written onto a slice of the caller's data
    students = macroanalysisData[[fullNameCol, teamNumberCol]].copy()

    #Calculate student level differences
    for diffCol, (ownCol, avgCol) in _STUDENT_DIFF_SOURCES.items():
        ownAnswer = macroanalysisData[ownCol].astype(float)
        comparisonAvg = macroanalysisData[avgCol].astype(float)
        students[diffCol] = ownAnswer - comparisonAvg

    #Now, using the top difficulties of each team, locate the students pulling that category down
    studentsWithDifficultiesTeamLevel = []
    for team, teamLevelDifficulties in zip(df[teamNumberCol], df[topDifficultiesCol]):
        teamStudents = students[students[teamNumberCol] == team]

        teamDifficulties = []
        for difficulty in teamLevelDifficulties:
            for diffCol, labelSuffix in _DIFFICULTY_BREAKDOWN.get(difficulty, ()):
                teamDifficulties.extend(
                    _flagStudents(teamStudents, fullNameCol, diffCol, difficulty + labelSuffix, negativeSkewThreshold)
                )
        studentsWithDifficultiesTeamLevel.append(teamDifficulties)

    df['studentsWithDifficulties'] = studentsWithDifficultiesTeamLevel
    return df
                                    
# Add teammates to each team in a new column
def addRosterNames(df, teamNumberCol, rosterCSV, rosterTeamNumberCol, rosterFullNameCol):
    """Add a 'Teammates' column listing the roster names of each team.

    The roster is loaded from rosterCSV with importData. Its team labels are
    normalised to floats: the text "Team" is removed, so "Team 1" becomes
    1.0. Labels that are already numeric are accepted as well. Teams in df
    are then matched by equality against those floats, so df[teamNumberCol]
    must hold comparable numeric values.

    Parameters:
        df: team-level frame, one row per team; modified in place.
        teamNumberCol: team column in df.
        rosterCSV: roster CSV source passed to importData.
        rosterTeamNumberCol: team column in the roster.
        rosterFullNameCol: student name column in the roster.

    Returns:
        The same df object with 'Teammates' set to a list of names per team
        (empty list when the roster has no match).
    """
    roster = importData(rosterCSV)

    #Roster team numbers look like "Team 1"; str() first so numeric labels do not break .replace
    roster[rosterTeamNumberCol] = [
        float(str(team).replace("Team", '').strip()) for team in roster[rosterTeamNumberCol]
    ]

    df["Teammates"] = [
        list(roster[roster[rosterTeamNumberCol] == team][rosterFullNameCol])
        for team in df[teamNumberCol]
    ]

    return df

#Same as function above but adds emails as a new column (in order of teammate 1, teammate 2, etc.)
def addRosterEmails(df, teamNumberCol, rosterCSV, rosterTeamNumberCol, rosterEmailCol):
    """Add an 'Emails' column listing the roster emails of each team.

    The roster is loaded from rosterCSV with importData. Its team labels are
    normalised to floats: the text "Team" is removed, so "Team 1" becomes
    1.0. Labels that are already numeric are accepted as well. Teams in df
    are then matched by equality against those floats, so df[teamNumberCol]
    must hold comparable numeric values. Emails appear in roster row order.

    Parameters:
        df: team-level frame, one row per team; modified in place.
        teamNumberCol: team column in df.
        rosterCSV: roster CSV source passed to importData.
        rosterTeamNumberCol: team column in the roster.
        rosterEmailCol: email column in the roster.

    Returns:
        The same df object with 'Emails' set to a list of emails per team
        (empty list when the roster has no match).
    """
    roster = importData(rosterCSV)

    #Roster team numbers look like "Team 1"; str() first so numeric labels do not break .replace
    roster[rosterTeamNumberCol] = [
        float(str(team).replace("Team", '').strip()) for team in roster[rosterTeamNumberCol]
    ]

    df["Emails"] = [
        list(roster[roster[rosterTeamNumberCol] == team][rosterEmailCol])
        for team in df[teamNumberCol]
    ]

    return df

In [3]:
# A function that takes in the path to the cleaned macroanalysis CSV and returns a DataFrame of teams, their overall scores,
# and a breakdown of their teaming difficulties (the caller is responsible for writing it out as a .csv file)
def getFacultyInsights(macroanalysis_data, teamNumberCol, rosterCSV='FINAL_E29_ROSTER.CSV'):
    """Build the team-level faculty report from the macroanalysis CSV.

    Runs the pipeline end to end: load the data, compute each team's
    deviation from the class average, break the shortfall down by category,
    score the teams, pick top difficulties, name the students behind them,
    and attach roster names and emails.

    Parameters:
        macroanalysis_data: CSV source passed to importData. Cells equal to
            "No Response" are treated as missing.
        teamNumberCol: name of the team column in the macroanalysis data.
        rosterCSV: roster CSV source with 'TeamNumber', 'FullName' and
            'Email' columns. Defaults to the previously hard-coded file.

    Returns:
        A DataFrame with one row per team.
    """
    #Variables needed later
    pctCols = ['goalsPct', 'supportPct', 'communicatePct', 'workAllocationPct', 'teamRolePct', 'enjoyPct']
    diffCols = ['goalsDiff', 'supportDiff', 'communicateDiff', 'workAllocationDiff', 'teamRoleDiff', 'enjoyDiff']

    #Import data (csv)
    raw = importData(macroanalysis_data).replace("No Response", np.nan)

    #Calculate team average difference from class average relative to 6 key categories:
    #goal setting, support, communication, work allocation, team roles, and enjoyment
    teamDifferences = getCategoryDifferences(raw, teamNumberCol)

    #For each team, identify % of difficulties attributed to each teaming category
    teamsRankedWithBreakdown = addPctColumns(teamDifferences, pctCols)
    teamsRankedWithBreakdown = getTotalBreakdown(teamsRankedWithBreakdown, teamNumberCol, diffCols, pctCols)
    teamsRankedWithBreakdown = teamsRankedWithBreakdown.round(2)

    #Add overall score based on 'overallDiff'
    teamsRanked = getTeamPercentiles(teamsRankedWithBreakdown, teamNumberCol)

    #Add Top Difficulties for each team
    teamsRanked = getTopDifficulties(teamsRanked, teamNumberCol, pctCols, 'percentile', .40, True)

    #For each difficulty, identify students contributing to said difficulty
    teamsRanked = getStudentsWithDifficulties(teamsRanked, raw, 'FullName', teamNumberCol, 'topDifficulties')

    #Add teammate names from roster
    final = addRosterNames(teamsRanked, teamNumberCol, rosterCSV, 'TeamNumber', 'FullName')

    #Add teammate emails from roster, chaining on the previous result rather than relying on in-place mutation
    final = addRosterEmails(final, teamNumberCol, rosterCSV, 'TeamNumber', 'Email')

    return final
    

In [4]:
#raw = importData('E29_CHECKIN1_MACROANALYSIS_V1.csv').replace("No Response", np.nan)

In [5]:
# Run the full pipeline on the macroanalysis CSV named below (team column 'TeamNumber')
# and write the resulting team-level DataFrame to the output CSV.
getFacultyInsights('E29_CHECKIN1_MACROANALYSIS_V2.csv', 'TeamNumber').to_csv("E29_CHECKIN01_Faculty_Insights_v2.csv")