## Data Exploration

In [1]:
import pandas as pd 
import openpyxl as px

In [2]:
# Read the task statements and the occupation-level metadata workbook.
# The workbook is also mirrored to CSV next to the other converted files.
tasks_df = pd.read_csv("../original_data/onet_task_statements.csv")
occ_metadata_df = pd.read_excel(
    "../new_onet_data/excel/occupation_level_metadata.xlsx",
    engine="openpyxl",
)
occ_metadata_df.to_csv("../new_onet_data/csv/occupation_level_metadata.csv", index=False)

# Distinct (Title, Domain Source) pairs, restricted to the two sources compared below.
from_compared_source = tasks_df["Domain Source"].isin(["Incumbent", "Occupational Expert"])
relevant_tasks = (
    tasks_df.loc[from_compared_source, ["Title", "Domain Source"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Keep only the metadata rows for the experience item associated with each group.
metadata_item = occ_metadata_df["Item"]
occ_metadata_exp_incumbant = occ_metadata_df[metadata_item == "How Long at Current Job"]
occ_metadata_exp_expert = occ_metadata_df[
    metadata_item == "How Much Experience Performing Work in this Occupation"
]



def _pivot_experience(item_rows):
    """Reshape long-format experience rows into one row per occupation.

    Parameters
    ----------
    item_rows : pandas.DataFrame
        Metadata rows for a single survey item. Must contain the columns
        'O*NET-SOC Code', 'Title', 'Response' and 'Percent'.

    Returns
    -------
    pandas.DataFrame
        One row per ('O*NET-SOC Code', 'Title') pair, with one column per
        distinct 'Response' value holding the matching 'Percent'.
    """
    return item_rows.pivot_table(
        index=['O*NET-SOC Code', 'Title'],
        columns='Response',   # one output column per response label
        values='Percent',     # swap for 'N' to pivot counts instead
        aggfunc='first',      # if a combination repeats, the first value wins
    ).reset_index()


# Same reshape for both groups; only the input rows differ.
occ_metadata_exp_incumbant_pivot = _pivot_experience(occ_metadata_exp_incumbant)
occ_metadata_exp_expert_pivot = _pivot_experience(occ_metadata_exp_expert)



# Job Zone lookup: read the workbook and mirror it to CSV.
occ_level_df = pd.read_excel("../new_onet_data/excel/job_zones.xlsx", engine="openpyxl")
occ_level_df.to_csv("../new_onet_data/csv/job_zones.csv", index=False)

# Attach the task domain source, then the job-zone columns, to each pivoted table.
# NOTE(review): both joins key on 'Title' alone. relevant_tasks is unique per
# (Title, Domain Source), so a title listed under both sources would yield two
# rows after the left join -- confirm that is intended.
merged_df_incumbant = (
    occ_metadata_exp_incumbant_pivot
    .merge(relevant_tasks, on='Title', how='left')
    .merge(occ_level_df, on='Title', how='left')
)
merged_df_expert = (
    occ_metadata_exp_expert_pivot
    .merge(relevant_tasks, on='Title', how='left')
    .merge(occ_level_df, on='Title', how='left')
)

# Order each table by its top experience bin, descending. The two survey items
# label that bin differently, hence the different sort columns.
sorted_df_incumbant = merged_df_incumbant.sort_values(by='10 Years or More', ascending=False)
sorted_df_incumbant.to_csv("../new_data/domain_source_breakdown_incumbant.csv", index=False)

sorted_df_expert = merged_df_expert.sort_values(by='10+ Years', ascending=False)
sorted_df_expert.to_csv("../new_data/domain_source_breakdown_expert.csv", index=False)

In [3]:
# Job Zone column of each group, keyed by the label used in the printed headings.
job_zone_by_group = {
    "Expert": sorted_df_expert["Job Zone"],
    "Incumbent": sorted_df_incumbant["Job Zone"],
}

# Mean Job Zone per group.
expert_level_avg = job_zone_by_group["Expert"].mean()
incumbant_level_avg = job_zone_by_group["Incumbent"].mean()
print(f"Average Job Zone for Expert Tasks: {expert_level_avg}")
print(f"Average Job Zone for Incumbant Tasks: {incumbant_level_avg}")

# Absolute counts per Job Zone, expert group first.
for group_label, group_zones in job_zone_by_group.items():
    print(f"{group_label} Job Zone counts:")
    print(group_zones.value_counts())

# Same breakdown expressed as percentages rounded to two decimals.
for group_label, group_zones in job_zone_by_group.items():
    print(f"{group_label} Job Zone percentage breakdown:")
    print((group_zones.value_counts(normalize=True) * 100).round(2))

Average Job Zone for Expert Tasks: 4.01025641025641
Average Job Zone for Incumbant Tasks: 2.9306184012066363
Expert Job Zone counts:
Job Zone
4    93
5    58
3    33
2    10
1     1
Name: count, dtype: int64
Incumbent Job Zone counts:
Job Zone
2    272
3    170
4    100
5     92
1     29
Name: count, dtype: int64
Expert Job Zone percentage breakdown:
Job Zone
4    47.69
5    29.74
3    16.92
2     5.13
1     0.51
Name: proportion, dtype: float64
Incumbent Job Zone percentage breakdown:
Job Zone
2    41.03
3    25.64
4    15.08
5    13.88
1     4.37
Name: proportion, dtype: float64


In [4]:

from scipy.stats import chi2_contingency, ttest_ind

# Job Zone values per group with missing entries removed.
expert_zones = sorted_df_expert["Job Zone"].dropna()
incumbent_zones = sorted_df_incumbant["Job Zone"].dropna()

# Welch's t-test (equal_var=False): compares the group means without assuming
# equal variances.
t_stat, p_val = ttest_ind(expert_zones, incumbent_zones, equal_var=False)

print(f"T-statistic: {t_stat:.3f}, p-value: {p_val:.5f}")

# Contingency table: rows are Job Zones, columns are the two groups.
# The counts are aligned on the union of Job Zones seen in either group, so a
# zone that occurs in only one group is kept with a 0 in the other column
# rather than being dropped.
job_zone_dist = (
    pd.concat(
        [
            expert_zones.value_counts().rename("Expert"),
            incumbent_zones.value_counts().rename("Incumbent"),
        ],
        axis=1,
    )
    .fillna(0)
    .astype(int)
    .sort_index()
)

# Chi-square test of independence between Job Zone and group.
chi2, p, dof, expected = chi2_contingency(job_zone_dist)

print(f"Chi2: {chi2:.3f}, p-value: {p:.5f}")

T-statistic: 14.372, p-value: 0.00000
Chi2: 163.276, p-value: 0.00000


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np
import statsmodels.api as sm

# Stack both groups into one frame, tagging each row with the group it came from.
df_expert = sorted_df_expert[["Title", "Job Zone"]].assign(Domain_Source="Expert")
df_incumbent = sorted_df_incumbant[["Title", "Job Zone"]].assign(Domain_Source="Incumbent")

# Drop rows without a Job Zone and binary-encode the target (Expert = 1, Incumbent = 0).
combined_df = (
    pd.concat([df_expert, df_incumbent], ignore_index=True)
    .dropna(subset=["Job Zone"])
    .assign(Source_Binary=lambda frame: (frame["Domain_Source"] == "Expert").astype(int))
)

y = combined_df["Source_Binary"]
job_zone_features = combined_df[["Job Zone"]]

# statsmodels does not add an intercept on its own, so the constant column must
# be in the design matrix *before* fitting. Without it the model is forced
# through the origin and the summary has no `const` row.
X = sm.add_constant(job_zone_features)

logit_result = sm.Logit(y, X).fit()
print(logit_result.summary())

# scikit-learn fits its own intercept, so it gets the raw feature without the
# constant column. Its default settings apply L2 regularisation, so the
# coefficients are not expected to match the statsmodels estimates exactly.
model = LogisticRegression()
model.fit(job_zone_features, y)

print(f"Coefficient: {model.coef_[0][0]:.4f}")
print(f"Intercept: {model.intercept_[0]:.4f}")

# Optional: evaluate on the same rows the model was fitted on (in-sample only).
preds = model.predict(job_zone_features)
print(classification_report(y, preds))


Optimization terminated successfully.
         Current function value: 0.457489
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:          Source_Binary   No. Observations:                  858
Model:                          Logit   Df Residuals:                      856
Method:                           MLE   Df Model:                            1
Date:                Thu, 31 Jul 2025   Pseudo R-squ.:                  0.1464
Time:                        16:32:07   Log-Likelihood:                -392.53
converged:                       True   LL-Null:                       -459.85
Covariance Type:            nonrobust   LLR p-value:                 3.926e-31
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -4.2915      0.327    -13.141      0.000      -4.932      -3.651
Job Zone       0.8811      0.