# Analyzing Recently Accepted CS Professionals

Goal: Extract characteristics of people who recently got accepted into CS jobs to understand what makes a successful candidate in today's market.

In [26]:
import pandas as pd
import os

# From notebook, go up one level to project root, then into data
df = pd.read_csv('../data/stack-overflow-developer-survey-2025/survey_results_public.csv')

# Or using absolute path
# df = pd.read_csv('d:/McGill/Comp551/jobfull/data/survey_results_public.csv')

# View first few rows
df.head()

  df = pd.read_csv('../data/stack-overflow-developer-survey-2025/survey_results_public.csv')


Unnamed: 0,ResponseId,MainBranch,Age,EdLevel,Employment,EmploymentAddl,WorkExp,LearnCodeChoose,LearnCode,LearnCodeAI,...,AIAgentOrchestration,AIAgentOrchWrite,AIAgentObserveSecure,AIAgentObsWrite,AIAgentExternal,AIAgentExtWrite,AIHuman,AIOpen,ConvertedCompYearly,JobSat
0,1,I am a developer by profession,25-34 years old,"Masterâ€™s degree (M.A., M.S., M.Eng., MBA, etc.)",Employed,"Caring for dependents (children, elderly, etc.)",8.0,"Yes, I am not new to coding but am learning ne...",Online Courses or Certification (includes all ...,"Yes, I learned how to use AI-enabled tools for...",...,Vertex AI,,,,ChatGPT,,When I donâ€™t trust AIâ€™s answers,"Troubleshooting, profiling, debugging",61256.0,10.0
1,2,I am a developer by profession,25-34 years old,"Associate degree (A.A., A.S., etc.)",Employed,,2.0,"Yes, I am not new to coding but am learning ne...",Online Courses or Certification (includes all ...,"Yes, I learned how to use AI-enabled tools for...",...,,,,,,,When I donâ€™t trust AIâ€™s answers;When I want to...,All skills. AI is a flop.,104413.0,9.0
2,3,I am a developer by profession,35-44 years old,"Bachelorâ€™s degree (B.A., B.S., B.Eng., etc.)","Independent contractor, freelancer, or self-em...",None of the above,10.0,"Yes, I am not new to coding but am learning ne...",Online Courses or Certification (includes all ...,"Yes, I learned how to use AI-enabled tools for...",...,,,,,ChatGPT;Claude Code;GitHub Copilot;Google Gemini,,When I donâ€™t trust AIâ€™s answers;When I want to...,"Understand how things actually work, problem s...",53061.0,8.0
3,4,I am a developer by profession,35-44 years old,"Bachelorâ€™s degree (B.A., B.S., B.Eng., etc.)",Employed,None of the above,4.0,"Yes, I am not new to coding but am learning ne...","Other online resources (e.g. standard search, ...","Yes, I learned how to use AI-enabled tools for...",...,,,,,ChatGPT;Claude Code,,When I donâ€™t trust AIâ€™s answers;When I want to...,,36197.0,6.0
4,5,I am a developer by profession,35-44 years old,"Masterâ€™s degree (M.A., M.S., M.Eng., MBA, etc.)","Independent contractor, freelancer, or self-em...","Caring for dependents (children, elderly, etc.)",21.0,"No, I am not new to coding and did not learn n...",,"Yes, I learned how to use AI-enabled tools for...",...,,,,,,,When I donâ€™t trust AIâ€™s answers,"critical thinking, the skill to define the tas...",60000.0,7.0


In [27]:
# Explore dataset structure
print(f"Dataset shape: {df.shape}")
print(f"\nTotal columns: {len(df.columns)}")
print(f"\nSample column names (first 20):")
for i, col in enumerate(df.columns[:20], 1):
    print(f"  {i}. {col}")

Dataset shape: (49191, 172)

Total columns: 172

Sample column names (first 20):
  1. ResponseId
  2. MainBranch
  3. Age
  4. EdLevel
  5. Employment
  6. EmploymentAddl
  7. WorkExp
  8. LearnCodeChoose
  9. LearnCode
  10. LearnCodeAI
  11. AILearnHow
  12. YearsCode
  13. DevType
  14. OrgSize
  15. ICorPM
  16. RemoteWork
  17. PurchaseInfluence
  18. TechEndorseIntro
  19. TechEndorse_1
  20. TechEndorse_2


In [28]:
# Look for key columns related to employment, experience, and job search
key_columns = [col for col in df.columns if any(keyword in col.lower() 
    for keyword in ['employ', 'job', 'work', 'year', 'experience', 'edu', 'code'])]

print("Relevant columns for our analysis:")
for col in key_columns:
    print(f"\nâ€¢ {col}")
    unique_vals = df[col].dropna().unique()
    if len(unique_vals) <= 10:
        print(f"  Values: {unique_vals}")
    else:
        print(f"  Sample values: {unique_vals[:5]}")

Relevant columns for our analysis:

â€¢ Employment
  Values: <StringArray>
[                                            'Employed',
 'Independent contractor, freelancer, or self-employed',
                                              'Student',
                                              'Retired',
                                         'Not employed',
                                  'I prefer not to say']
Length: 6, dtype: str

â€¢ EmploymentAddl
  Sample values: <StringArray>
[                                                        'Caring for dependents (children, elderly, etc.)',
                                                                                       'None of the above',
                                'Caring for dependents (children, elderly, etc.);Volunteering (regularly)',
 'Engaged in paid work (20-29 hours per week);Transitioning to retirement (gradually reducing work hours)',
   'Attending school (full-time);Attending school (part-time);Engaged in paid 

## Step 1: Identify Column Names

In [29]:
# Check if key columns exist
employment_col = 'Employment' if 'Employment' in df.columns else None
experience_col = 'YearsCodePro' if 'YearsCodePro' in df.columns else 'YearsCode' if 'YearsCode' in df.columns else None
country_col = 'Country' if 'Country' in df.columns else None
dev_type_col = 'DevType' if 'DevType' in df.columns else None

print(f"Employment column: {employment_col}")
print(f"Experience column: {experience_col}")
print(f"Country column: {country_col}")
print(f"Developer type column: {dev_type_col}")

if employment_col:
    print(f"\nEmployment statuses available:")
    for status in df[employment_col].unique()[:10]:
        print(f"  - {status}")
        
if experience_col:
    print(f"\nExperience levels available:")
    for exp in df[experience_col].unique()[:10]:
        print(f"  - {exp}")

Employment column: Employment
Experience column: YearsCode
Country column: Country
Developer type column: DevType

Employment statuses available:
  - Employed
  - Independent contractor, freelancer, or self-employed
  - Student
  - Retired
  - Not employed
  - I prefer not to say
  - nan

Experience levels available:
  - 14.0
  - 10.0
  - 12.0
  - 5.0
  - 22.0
  - 20.0
  - 13.0
  - 30.0
  - 15.0
  - 9.0


## Step 2: Filter for Recent CS Hires (0-2 years experience)

In [30]:
# Create filter for recent CS hires
from collections import Counter

print("Creating filters for recently hired CS professionals...")
print("="*60)

recent_hires = df.copy()
print(f"Starting with: {len(recent_hires)} total respondents")

# Filter 1: Employment status (employed full-time)
if employment_col:
    recent_hires = recent_hires[recent_hires[employment_col].str.contains('Employed', na=False, case=False)]
    print(f"After employment filter: {len(recent_hires)} respondents")

# Filter 2: Experience (0-2 years for recent hires)
if experience_col:
    # Handle different formats
    recent_hires = recent_hires[
        recent_hires[experience_col].isin(['Less than 1 year', '1-2 years', '0-2 years']) |
        recent_hires[experience_col].astype(str).str.contains('Less than 1|^1|^2', na=False, regex=True)
    ]
    print(f"After experience filter (0-2 years): {len(recent_hires)} respondents")

# Create Canada subset
canada_hires = pd.DataFrame()
if country_col:
    canada_hires = recent_hires[recent_hires[country_col] == 'Canada'].copy()
    print(f"Recent hires in Canada: {len(canada_hires)} respondents")
    
print(f"\nâœ“ Total recent CS hires to analyze: {len(recent_hires)}")
print(f"âœ“ Recent CS hires in Canada: {len(canada_hires)}")

Creating filters for recently hired CS professionals...
Starting with: 49191 total respondents
After employment filter: 42685 respondents
After experience filter (0-2 years): 22259 respondents
Recent hires in Canada: 680 respondents

âœ“ Total recent CS hires to analyze: 22259
âœ“ Recent CS hires in Canada: 680


## Step 3: Analyze Successful Candidate Profiles

In [31]:
# Analyze education level
edu_col = 'EdLevel' if 'EdLevel' in df.columns else None

if edu_col and edu_col in recent_hires.columns:
    print("ðŸ“š EDUCATION LEVEL of Recent Hires:")
    print("="*60)
    edu_dist = recent_hires[edu_col].value_counts()
    for edu, count in edu_dist.items():
        pct = (count / len(recent_hires)) * 100
        print(f"{edu}: {count} ({pct:.1f}%)")
    print()
    
    # Compare with Canada specifically
    if not canada_hires.empty:
        print("\nðŸ“š EDUCATION in Canada specifically:")
        print("-"*60)
        canada_edu = canada_hires[edu_col].value_counts()
        for edu, count in canada_edu.items():
            pct = (count / len(canada_hires)) * 100
            print(f"{edu}: {count} ({pct:.1f}%)")
else:
    print(f"Education column not found or empty")

ðŸ“š EDUCATION LEVEL of Recent Hires:
Bachelorâ€™s degree (B.A., B.S., B.Eng., etc.): 9501 (42.7%)
Masterâ€™s degree (M.A., M.S., M.Eng., MBA, etc.): 6946 (31.2%)
Some college/university study without earning a degree: 2427 (10.9%)
Professional degree (JD, MD, Ph.D, Ed.D, etc.): 1314 (5.9%)
Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.): 1010 (4.5%)
Associate degree (A.A., A.S., etc.): 686 (3.1%)
Other (please specify):: 198 (0.9%)
Primary/elementary school: 151 (0.7%)


ðŸ“š EDUCATION in Canada specifically:
------------------------------------------------------------
Bachelorâ€™s degree (B.A., B.S., B.Eng., etc.): 343 (50.4%)
Masterâ€™s degree (M.A., M.S., M.Eng., MBA, etc.): 131 (19.3%)
Some college/university study without earning a degree: 86 (12.6%)
Associate degree (A.A., A.S., etc.): 49 (7.2%)
Professional degree (JD, MD, Ph.D, Ed.D, etc.): 31 (4.6%)
Other (please specify):: 21 (3.1%)
Secondary school (e.g. American high school, German Realsc

In [32]:
# Analyze programming languages and technologies
lang_col = 'LanguageHaveWorkedWith' if 'LanguageHaveWorkedWith' in df.columns else None

if lang_col and lang_col in recent_hires.columns:
    print("ðŸ’» TOP PROGRAMMING LANGUAGES for Recent Hires:")
    print("="*60)
    
    # Split semicolon-separated values and count
    all_langs = []
    for langs in recent_hires[lang_col].dropna():
        all_langs.extend([l.strip() for l in str(langs).split(';')])
    
    lang_counts = Counter(all_langs)
    
    print(f"Total respondents with language data: {recent_hires[lang_col].notna().sum()}")
    print("\nTop 15 Languages/Technologies:")
    for lang, count in lang_counts.most_common(15):
        pct = (count / recent_hires[lang_col].notna().sum()) * 100
        print(f"  {lang}: {count} ({pct:.1f}%)")
else:
    print("Language column not found")

ðŸ’» TOP PROGRAMMING LANGUAGES for Recent Hires:
Total respondents with language data: 16545

Top 15 Languages/Technologies:
  JavaScript: 11136 (67.3%)
  HTML/CSS: 10209 (61.7%)
  SQL: 9904 (59.9%)
  Python: 9265 (56.0%)
  Bash/Shell (all shells): 8244 (49.8%)
  TypeScript: 7826 (47.3%)
  C#: 4644 (28.1%)
  Java: 4590 (27.7%)
  PowerShell: 3876 (23.4%)
  PHP: 3261 (19.7%)
  C++: 3237 (19.6%)
  Go: 2881 (17.4%)
  C: 2789 (16.9%)
  Rust: 2440 (14.7%)
  Kotlin: 1892 (11.4%)


In [33]:
# Analyze developer roles
if dev_type_col and dev_type_col in recent_hires.columns:
    print("ðŸ‘” DEVELOPER ROLES of Recent Hires:")
    print("="*60)
    
    all_roles = []
    for roles in recent_hires[dev_type_col].dropna():
        all_roles.extend([r.strip() for r in str(roles).split(';')])
    
    role_counts = Counter(all_roles)
    
    print(f"Total respondents with role data: {recent_hires[dev_type_col].notna().sum()}")
    print("\nTop 10 Developer Roles:")
    for role, count in role_counts.most_common(10):
        pct = (count / recent_hires[dev_type_col].notna().sum()) * 100
        print(f"  {role}: {count} ({pct:.1f}%)")
else:
    print("Developer type column not found")

ðŸ‘” DEVELOPER ROLES of Recent Hires:
Total respondents with role data: 22089

Top 10 Developer Roles:
  Developer, full-stack: 6823 (30.9%)
  Developer, back-end: 3762 (17.0%)
  Architect, software or solutions: 1562 (7.1%)
  Developer, front-end: 1010 (4.6%)
  Developer, desktop or enterprise applications: 940 (4.3%)
  Other (please specify):: 875 (4.0%)
  Developer, mobile: 753 (3.4%)
  Engineering manager: 738 (3.3%)
  Developer, embedded applications or devices: 665 (3.0%)
  DevOps engineer or professional: 596 (2.7%)


## Summary: Profile of Successfully Hired CS Professional

In [34]:
# Create a summary profile
print("="*70)
print("   PROFILE OF RECENTLY ACCEPTED CS PROFESSIONALS")
print("="*70)
print(f"\nðŸ“Š Sample Size: {len(recent_hires)} respondents (globally)")
if 'canada_hires' in globals() and not canada_hires.empty:
    print(f"ðŸ“Š Canada Sample: {len(canada_hires)} respondents")

print("\nâœ… Analysis Complete!")
print("\nThis data shows what successful recent CS hires have in common:")
print("  â€¢ Most common education levels")
print("  â€¢ Top programming languages and technologies")
print("  â€¢ Most common developer roles")
print("  â€¢ Regional differences (Canada vs Global)")
print("\nðŸŽ¯ Use this data to:")
print("  1. Calibrate your Bayesian job acceptance model")
print("  2. Understand competitive skill requirements")
print("  3. Estimate how many applications you'll need")

   PROFILE OF RECENTLY ACCEPTED CS PROFESSIONALS

ðŸ“Š Sample Size: 22259 respondents (globally)
ðŸ“Š Canada Sample: 680 respondents

âœ… Analysis Complete!

This data shows what successful recent CS hires have in common:
  â€¢ Most common education levels
  â€¢ Top programming languages and technologies
  â€¢ Most common developer roles
  â€¢ Regional differences (Canada vs Global)

ðŸŽ¯ Use this data to:
  1. Calibrate your Bayesian job acceptance model
  2. Understand competitive skill requirements
  3. Estimate how many applications you'll need


## Modeling: Bayesian Naive Bayes (Laplace Smoothing)

We train a Naive Bayes model using processed Montreal CS jobs to estimate acceptance likelihoods based on features (company type, seniority, education, experience, skills).

In [35]:
# Ensure dependencies
try:
    import sklearn
except ModuleNotFoundError:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn"])

In [None]:
# Load processed CS jobs (Canada-wide preferred)
import pandas as pd
from pathlib import Path

candidates = [Path('..')/ 'data' / 'processed' / 'canada_cs_jobs.csv',
              Path('..')/ 'data' / 'processed' / 'montreal_cs_jobs.csv']
processed_path = None
for p in candidates:
    if p.exists():
        processed_path = p
        break
if processed_path is None:
    raise FileNotFoundError("Processed data not found. Run: python src/preprocess_jobs.py")

print(f"Using processed dataset: {processed_path}")
jobs = pd.read_csv(processed_path)
print(jobs.shape)
jobs.head()

FileNotFoundError: Processed data not found at ..\data\processed\montreal_cs_jobs.csv.
Run: python src/preprocess_jobs.py

In [None]:
# Feature engineering: one-hot + skills bag-of-words + experience buckets
import numpy as np

# Clean columns that may be missing
for col in ['company_type','seniority_level','education','experience_years','skills']:
    if col not in jobs.columns:
        jobs[col] = np.nan

# Experience buckets
def bucket_experience(x):
    try:
        x = int(x)
    except Exception:
        return 'unknown'
    if x <= 1: return '0-1'
    if x <= 3: return '2-3'
    if x <= 5: return '4-5'
    if x <= 8: return '6-8'
    return '9+'

jobs['exp_bucket'] = jobs['experience_years'].apply(bucket_experience)

# Skills unpack (skills is a string like "a;b" or list); normalize to list
import ast

def to_list_safe(val):
    if isinstance(val, list):
        return [str(v).strip().lower() for v in val]
    if pd.isna(val):
        return []
    s = str(val)
    try:
        parsed = ast.literal_eval(s)
        if isinstance(parsed, list):
            return [str(v).strip().lower() for v in parsed]
    except Exception:
        pass
    # fallback: split by comma/semicolon
    return [t.strip().lower() for t in s.replace(';',',').split(',') if t.strip()]

jobs['skills_list'] = jobs['skills'].apply(to_list_safe)

# Choose top-K skills to one-hot
from collections import Counter
skill_counts = Counter([sk for row in jobs['skills_list'] for sk in row])
TOP_K = 30
top_skills = [sk for sk, _ in skill_counts.most_common(TOP_K)]
for sk in top_skills:
    jobs[f'skill__{sk}'] = jobs['skills_list'].apply(lambda lst: int(sk in lst))

# One-hot for categorical columns
cat_cols = ['company_type','seniority_level','education','exp_bucket']
jobs_cat = pd.get_dummies(jobs[cat_cols].fillna('unknown'), prefix=cat_cols, drop_first=False)

# Build X and y
y = None
if 'accepted' in jobs.columns:
    y = jobs['accepted'].astype(int)
elif 'likely_accepted' in jobs.columns:
    y = jobs['likely_accepted'].astype(int)
elif 'overall_success_rate' in jobs.columns:
    y = (jobs['overall_success_rate'] >= jobs['overall_success_rate'].median()).astype(int)
else:
    raise ValueError("No target column found. Expected 'accepted', 'likely_accepted' or 'overall_success_rate'.")

skill_cols = [c for c in jobs.columns if c.startswith('skill__')]
X = pd.concat([jobs_cat.reset_index(drop=True), jobs[skill_cols].reset_index(drop=True)], axis=1)
print(X.shape, y.shape)
X.head()

In [None]:
# Train/Test split and Naive Bayes training with Laplace smoothing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# MultinomialNB with alpha=1 (Laplace)
nb = MultinomialNB(alpha=1.0)
nb.fit(X_train, y_train)

# Predict
y_pred = nb.predict(X_test)
y_prob = nb.predict_proba(X_test)[:,1]

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, zero_division=0))
print("Recall:", recall_score(y_test, y_pred, zero_division=0))
print("F1:", f1_score(y_test, y_pred, zero_division=0))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

## CLT-Based Estimates for Application Campaigns

Using predicted probabilities from the model, estimate expected acceptances and the probability of at least one offer when applying to N jobs.

In [None]:
# Compute CLT-based summary for a set of application probabilities
import numpy as np

# Use the model probabilities from X_test as an example campaign
p_list = y_prob  # probabilities for each application-like row

mu = float(np.mean(p_list))
sigma2 = float(np.var(p_list, ddof=0))
N = len(p_list)

# Expected acceptances
expected_acceptances = float(np.sum(p_list))

# Probability of at least one acceptance (exact under independence)
p_at_least_one = float(1 - np.prod(1 - p_list))

# CLT 95% CI for total acceptances ~ Normal(sum p_i, sum p_i(1-p_i))
var_sum = float(np.sum(p_list * (1 - p_list)))
std_sum = np.sqrt(var_sum)
ci_low = expected_acceptances - 1.96 * std_sum
ci_high = expected_acceptances + 1.96 * std_sum

print(f"Campaign size (N): {N}")
print(f"Mean single-app acceptance prob: {mu:.3f}")
print(f"Expected total acceptances: {expected_acceptances:.2f}")
print(f"P(at least one acceptance): {p_at_least_one:.2%}")
print(f"95% CI for total acceptances: [{ci_low:.2f}, {ci_high:.2f}]")

## Predict for Your Profile

Fill in your profile (skills, education, experience, target company type/seniority) to estimate per-application probability and expected applications needed.

In [None]:
# Define your profile
user_profile = {
    'company_type': 'Startup',  # FAANG, Big Tech, Startup, Mid-size/Other
    'seniority_level': 'Entry', # Entry, Mid, Senior
    'education': "Bachelor's", # PhD, Master's, Bachelor's, Diploma, Not Specified
    'experience_years': 1,
    'skills': ['python','sql','aws','docker']
}

# Build a single-row DataFrame matching training schema
prof_df = pd.DataFrame([{
    'company_type': user_profile['company_type'],
    'seniority_level': user_profile['seniority_level'],
    'education': user_profile['education'],
    'exp_bucket': bucket_experience(user_profile['experience_years'])
}])

prof_X = pd.get_dummies(prof_df[cat_cols].fillna('unknown'), prefix=cat_cols, drop_first=False)
# Align columns with training
for col in jobs_cat.columns:
    if col not in prof_X.columns:
        prof_X[col] = 0
prof_X = prof_X[jobs_cat.columns]

# Add skill columns
prof_skill_vec = {}
user_skills_norm = [s.lower() for s in user_profile['skills']]
for sk in top_skills:
    prof_skill_vec[f'skill__{sk}'] = int(sk in user_skills_norm)
prof_skill_df = pd.DataFrame([prof_skill_vec])

# Final feature row aligned with X
prof_X_full = pd.concat([prof_X.reset_index(drop=True), prof_skill_df], axis=1)
for col in X.columns:
    if col not in prof_X_full.columns:
        prof_X_full[col] = 0
prof_X_full = prof_X_full[X.columns]

# Predict
prob_accept = float(nb.predict_proba(prof_X_full)[0,1])
expected_apps_for_one_offer = int(np.ceil(np.log(1-0.95)/np.log(1-prob_accept))) if prob_accept>0 else 999

print(f"Predicted acceptance probability per application: {prob_accept:.2%}")
print(f"Applications for 95% chance of â‰¥1 offer: {expected_apps_for_one_offer}")