# Start Here

December 16, 2024

This notebook is my starting point to loading, describing, and manipulating the raw data.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

from src.respondent import Respondent
from src.analysis_orchestrator import AnalysisOrchestrator

import os
from IPython import get_ipython

current_folder = os.path.basename(os.getcwd())
if current_folder == 'notebooks':
    root_path = os.path.dirname(os.path.abspath(''))
    os.chdir(root_path)

print(f'Directory set to: {os.getcwd()}')

# Automatically reload all modules every time a line of code is executed
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


Directory set to: /Users/aweng/code/battery-talent-census
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load

In [212]:
df_gsh['Token'].unique()

array(['fkqh2mecyeh4ille74ccdfkqh2d4hld5',
       'tv843y13jvcl3xtv843y1axjxswa7sv9',
       'rze601cdmwu0r2crze6086e9mnxhr45d', ...,
       'rzqhmr9v6qtkeyn9yrbrzqhmrvesv26e',
       '1zx68vutnhfo9erfrl1zx68vijd4jg3g',
       'xgiqw1z6r37pu305hiipxgiqw11r00jc'], shape=(1008,), dtype=object)

In [178]:
df_gsh = pd.read_csv('data/talent_census_data_20241216_gsheet_export.csv')
df_typ = pd.read_csv('data/talent_census_data_20241216_typeform_export.csv')

The two different data formats differ considerably in the number of columns, largely owing to how they treat multi-select responses differently

In [179]:
print(f'Size of Google Sheet export: {df_gsh.shape}')
print(f'Size of Typeform export: {df_typ.shape}')

Size of Google Sheet export: (1008, 89)
Size of Typeform export: (1008, 256)


In [210]:
resp = Respondent('xgiqw1z6r37pu305hiipxgiqw11r00jc')

resp.set_properties_from_google_sheet(df_gsh)
resp.set_properties_from_typeform(df_typ)

In [211]:
resp.metadata

{'submit_time': Timestamp('2024-12-16 16:05:31'), 'duration_mins': 17.95}

In [181]:
resp.census['skills_value_chain']

['Refining', 'Component/precursor production', 'Equipment manufacturing']

In [169]:
resp.census

{'sentiment': {'keys': ["I feel good about what I'm working on",
   'I feel good about my career path',
   'I feel good about my work-life balance',
   'I feel valued by those around me',
   'I see opportunities for career growth'],
  'values': array([5., 5., 4., 5., 5.])},
 'skills_demand': ['Chemistry', 'process knowledge', 'experience'],
 'skills_value_chain': ['Refining',
  'Component/precursor production',
  'Equipment manufacturing'],
 'education': 'Doctorate',
 'study': 'Chemistry',
 'country': 'Sweden',
 'zip': nan,
 'income': np.float64(nan),
 'hours_worked': np.float64(40.0),
 'age': np.float64(64.0),
 'ethnicity': ['White'],
 'gender': 'Male',
 'citizenship': 'Citizen (native-born)',
 'military': 'Yes',
 'employment': "I'm working professionally (e.g., at a company, national lab)",
 'to_complete_industry_questions': True,
 'to_complete_student_questions': nan,
 'to_complete_unemployed_questions': nan,
 'why_leave': nan}

In [209]:
df_typ

Unnamed: 0,#,I feel good about what I'm working on,I feel good about my career path,I feel good about my work-life balance,I feel valued by those around me,I see opportunities for career growth,"In your opinion, what are the top three skills most in demand in the battery industry?",Mining,Refining,Component/precursor production,...,"During your previous internship, were there skills you wish you had learned but didn't? If yes, what were they?","During your previous internship, were there skills that you felt unprepared for? If yes, what were they?","During your previous internship, what was your hourly pay?","During your previous internship, how many hours per week did you work, on average?",Response Type,Start Date (UTC),Stage Date (UTC),Submit Date (UTC),Network ID,Tags
0,1zx68vutnhfo9erfrl1zx68vijd4jg3g,4.0,3.0,4.0,3.0,3.0,"Process Engineering, Material Science, Program...",,,Component/precursor production,...,,,,,completed,2024-12-16 16:00:01,,2024-12-16 16:02:03,5fd3ef73f5,
1,xgiqw1z6r37pu305hiipxgiqw11r00jc,5.0,5.0,4.0,5.0,5.0,"Chemistry, process knowledge, experience",,Refining,Component/precursor production,...,,,,,completed,2024-12-16 15:47:34,,2024-12-16 16:05:31,3289645d27,
2,rzqhmr9v6qtkeyn9yrbrzqhmrvesv26e,4.0,4.0,4.0,4.0,4.0,"Technical Understanding, Software skills, Chem...",,,,...,,,,,completed,2024-12-16 15:32:52,,2024-12-16 15:40:44,493ae9c939,
3,4l3byv02h9r7dwfzvpty4l3byv02821v,4.0,4.0,4.0,3.0,2.0,Cell level performance and analytics; supply c...,,,,...,,,,,completed,2024-12-16 15:11:09,,2024-12-16 15:17:30,9bc9fda83d,
4,6467ly994y72gs646tp3pqb8h5culj8r,5.0,3.0,4.0,5.0,4.0,Adaptability; synthesis; innovation,,,Component/precursor production,...,,,,,completed,2024-12-16 15:06:21,,2024-12-16 15:28:42,16a405a710,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,xw0ysq3f58s1exw0ysoqb3r5hef794jo,5.0,5.0,4.0,5.0,5.0,Material Characterization; Programming; Hardwa...,,,,...,Electrochemistry; Coding; Material Science,,15.0,20.0,completed,2024-10-04 15:35:08,,2024-10-04 16:53:27,40c8e28754,
1004,5lfxzx2cx6opuqjqvop2z5lfxzx2cqxy,3.0,4.0,4.0,4.0,4.0,project management; critical thinking; communi...,,,,...,,,,,completed,2024-10-04 15:34:50,,2024-10-04 16:16:53,93d7f69a49,
1005,bsozfzu6410ytmmaebsoxknfn5ws0qui,2.0,2.0,3.0,1.0,1.0,"Manufacturing, Electrochemistry, Team building",,,,...,,,,,completed,2024-10-04 15:32:50,,2024-10-04 15:41:34,b00f834ff9,
1006,ncaxhm18wr8hssi6rt4dnbncaxhm1llr,5.0,4.0,5.0,4.0,4.0,manufacturing; software; reliability,,,,...,,,,,completed,2024-10-04 15:31:16,,2024-10-04 15:43:03,13959e4cea,


In [208]:
import pandas as pd

# Example datetime values
end_time = pd.to_datetime(df_typ['Submit Date (UTC)'].values[0])
start_time = pd.to_datetime(df_typ['Start Date (UTC)'].values[0])

# Calculate the difference
time_difference = end_time - start_time

# Convert to seconds
seconds = time_difference.total_seconds()
print(f'Time difference in seconds: {seconds}')

Time difference in seconds: 122.0


In [165]:
resp.company

{'company_satisfaction': {'keys': ['I am satisfied with my compensation',
   'I am being underpaid compared to similar roles',
   'I am satisfied with the raises and/or bonuses I have been receiving'],
  'values': array([4., 1., 4.])},
 'salary_base': np.float64(nan),
 'salary_comp_types': [],
 'salary_num_raises': '1',
 'salary_num_bonuses': '0',
 'company_years_with': np.float64(15.0),
 'company_value_chain': ['Recycling'],
 'company_stage': 'Established (significant market presence)',
 'company_country': 'Sweden',
 'company_state': nan,
 'company_days_in_office': np.float64(5.0),
 'company_headcount': '251-500',
 'company_team_count': '21-50',
 'role_title': 'Senior technical expert',
 'role_role': ['Engineering and design',
  'Investor',
  'Quality',
  'Research and development'],
 'role_level': 'Senior',
 'role_why_choose': ['Desirable work location',
  'Job security',
  'Challenge and innovation',
  'Competitive salary and benefits'],
 'role_prev_industries': 'Yes, in a different

In [170]:
resp.student

{'student_sentiment': {'keys': ['After graduating, I know what role(s) to apply to',
   'After graduating, I will find a job',
   'By the time I graduate, I will have learned the skills needed to find a job',
   'I am optimistic about the future of the battery industry'],
  'values': array([nan, nan, nan, nan])},
 'ideal_job_title': nan,
 'ideal_value_chain': [],
 'ideal_job_aspects': [],
 'ideal_salary': np.float64(nan),
 'num_internships': nan,
 'internship_value_chain': [],
 'internship_role': [],
 'internship_top_skills': nan,
 'internship_skills_wish_learned': nan,
 'internship_skills_unprepared': nan,
 'internship_hourly_pay': np.float64(nan),
 'internship_hours_per_week': np.float64(nan)}

In [175]:
resp.student

{'student_sentiment': {'keys': ['After graduating, I know what role(s) to apply to',
   'After graduating, I will find a job',
   'By the time I graduate, I will have learned the skills needed to find a job',
   'I am optimistic about the future of the battery industry'],
  'values': array([nan, nan, nan, nan])},
 'ideal_job_title': nan,
 'ideal_value_chain': [],
 'ideal_job_aspects': [],
 'ideal_salary': np.float64(nan),
 'num_internships': nan,
 'internship_value_chain': [],
 'internship_role': [],
 'internship_top_skills': nan,
 'internship_skills_wish_learned': nan,
 'internship_skills_unprepared': nan,
 'internship_hourly_pay': np.float64(nan),
 'internship_hours_per_week': np.float64(nan)}

In [22]:
df_typ[df_typ['#'] == 'xgiqw1z6r37pu305hiipxgiqw11r00jc']

Unnamed: 0,#,I feel good about what I'm working on,I feel good about my career path,I feel good about my work-life balance,I feel valued by those around me,I see opportunities for career growth,"In your opinion, what are the top three skills most in demand in the battery industry?",Mining,Refining,Component/precursor production,...,"During your previous internship, were there skills you wish you had learned but didn't? If yes, what were they?","During your previous internship, were there skills that you felt unprepared for? If yes, what were they?","During your previous internship, what was your hourly pay?","During your previous internship, how many hours per week did you work, on average?",Response Type,Start Date (UTC),Stage Date (UTC),Submit Date (UTC),Network ID,Tags
1,xgiqw1z6r37pu305hiipxgiqw11r00jc,5.0,5.0,4.0,5.0,5.0,"Chemistry, process knowledge, experience",,Refining,Component/precursor production,...,,,,,completed,2024-12-16 15:47:34,,2024-12-16 16:05:31,3289645d27,


In [115]:
df_gsh

Unnamed: 0,I feel good about what I'm working on,I feel good about my career path,I feel good about my work-life balance,I feel valued by those around me,I see opportunities for career growth,"In your opinion, what are the top three skills most in demand in the battery industry?","In opinion, which part(s) of the battery value chain are most in need of more skilled workers?",What is your highest level of education?,What did you study in school?,What country do you live in?,...,How many internships have you completed so far?,"During your previous internship, where did your employer fall on the battery value chain?","During your previous internship, what did your role involve?","During your previous internship, what are the top three skills that contributed to your success?","During your previous internship, were there skills you wish you had learned but didn't? If yes, what were they?","During your previous internship, were there skills that you felt unprepared for? If yes, what were they?","During your previous internship, what was your hourly pay?","During your previous internship, how many hours per week did you work, on average?",Submitted At,Token
0,4.0,4.0,4.0,4.0,4.0,"Scale up, the ability to keep up with and fore...","Equipment manufacturing, Recycling, Cell produ...",Doctorate,Chemistry,United States,...,,,,,,,,,10/4/2024 20:36:11,fkqh2mecyeh4ille74ccdfkqh2d4hld5
1,5.0,4.0,3.0,3.0,4.0,"Design for Reliability, Test","Cell production, Module/pack production, Produ...",Bachelors,Math,United States,...,,,,,,,,,10/4/2024 20:35:28,tv843y13jvcl3xtv843y1axjxswa7sv9
2,4.0,4.0,3.0,4.0,5.0,"Agile, Innovative, Communication","Energy infrastructure, Cell production, R&D",Bachelors,Chemical Engineering,United States,...,,,,,,,,,10/4/2024 20:34:34,rze601cdmwu0r2crze6086e9mnxhr45d
3,4.0,4.0,4.0,3.0,4.0,"design, manufacturing, and testing","Product integration (vehicles, mobility), Prod...",Masters,Mechanical Engineering,United States,...,,,,,,,,,10/4/2024 20:34:32,vy5b5ilvywropxm49vy56x9n9wsp3qpa
4,5.0,5.0,5.0,4.0,5.0,"Electrochemistry, Process, Data Analysis","Cell production, Component/precursor production",Doctorate,Materials Science and Engineering,United States,...,,,,,,,,,10/4/2024 20:31:17,f0scj5ewcwop12bqhp805raf0scj5ewc
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,4.0,4.0,4.0,3.0,2.0,Cell level performance and analytics; supply c...,"Cell production, Module/pack production",Bachelors,Electrical Engineering,United States,...,,,,,,,,,12/16/2024 15:17:30,4l3byv02h9r7dwfzvpty4l3byv02821v
1004,5.0,3.0,4.0,5.0,4.0,Adaptability; synthesis; innovation,"Product integration (stationary storage), Cell...",Masters,Environmental Science,United Kingdom,...,,,,,,,,,12/16/2024 15:28:42,6467ly994y72gs646tp3pqb8h5culj8r
1005,4.0,4.0,4.0,4.0,4.0,"Technical Understanding, Software skills, Chem...","Recycling, Energy infrastructure, Software",Doctorate,Mechanical Engineering,United States,...,,,,,,,,,12/16/2024 15:40:44,rzqhmr9v6qtkeyn9yrbrzqhmrvesv26e
1006,4.0,3.0,4.0,3.0,3.0,"Process Engineering, Material Science, Program...","Component/precursor production, Cell productio...",Bachelors,Economics,United States,...,,,,,,,,,12/16/2024 16:02:03,1zx68vutnhfo9erfrl1zx68vijd4jg3g


# Clean

# Explore