# Exploration & Preprocessing: Mental Health in Tech 2016

## 1. Data Loading

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

RUN = 'run_03'

df = pd.read_csv('../../data/mental-heath-in-tech-2016_20161114.csv')

print(f"Participants (rows): {df.shape[0]}")
print(f"Variables (columns): {df.shape[1]}")

Participants (rows): 1433
Variables (columns): 63


## 2. Outlier Removal

In [2]:
# Remove unrealistic age values
df = df[~df['What is your age?'].isin([3, 99, 323])]
print(f"Removed 3 rows with unrealistic ages. Remaining: {len(df)}")

Removed 3 rows with unrealistic ages. Remaining: 1430


## 3. Gender Normalization

In [3]:
# Normalize gender: 70 variants -> 3 categories (Male / Female / Other)
gender_col = 'What is your gender?'

male_variants = [
    'male', 'm', 'man', 'male ', 'male.', 'malr', 'mail', 'm|', 'dude', 
    'cis male', 'cis man', 'cisdude', 'male (cis)', 'sex is male', 
    'i\'m a man why didn\'t you make this a drop down question. you should of asked sex? and i would of answered yes please. seriously how much text can this take?'
]

female_variants = [
    'female', 'f', 'woman', 'female ', ' female', 'fem', 'fm',
    'female/woman', 'cis female ', 'cis female', 'cisgender female', 
    'cis-woman', 'female assigned at birth ', 'i identify as female.',
    'female (props for making this a freeform field, though)', 'afab'
]

def normalize_gender(val):
    if pd.isna(val):
        return np.nan
    val_lower = str(val).lower().strip()
    if val_lower in male_variants:
        return 'Male'
    elif val_lower in female_variants:
        return 'Female'
    else:
        return 'Other'

df[gender_col] = df[gender_col].apply(normalize_gender)

print("Gender normalized: 70 -> 3 categories")
print(df[gender_col].value_counts())

Gender normalized: 70 -> 3 categories
What is your gender?
Male      1055
Female     338
Other       34
Name: count, dtype: int64


## 4. Missing Values & Column Removal

In [4]:
# Remove columns with >70% missing values
threshold = 0.70
missing_pct = df.isnull().sum() / len(df)
cols_to_drop = missing_pct[missing_pct > threshold].index.tolist()
df = df.drop(columns=cols_to_drop)
print(f"Removed {len(cols_to_drop)} columns with >70% missing values")

Removed 10 columns with >70% missing values


In [5]:
# Exclude self-employed participants (HR measures only apply to employees)
n_before = len(df)
df = df[df['Are you self-employed?'] == 0]
n_after = len(df)
print(f"Removed {n_before - n_after} self-employed participants. Remaining: {n_after}")

Removed 287 self-employed participants. Remaining: 1143


In [6]:
# Remove US State columns (too many categories, only relevant for US participants)
us_state_cols = [
    'What US state or territory do you live in?',
    'What US state or territory do you work in?'
]
df = df.drop(columns=[col for col in us_state_cols if col in df.columns])
print("Removed US State columns")

Removed US State columns


In [7]:
# Remove Previous Employer columns (dominated clustering in run_01)
previous_employer_cols = [
    'Have your previous employers provided mental health benefits?',
    'Were you aware of the options for mental health care provided by your previous employers?',
    'Did your previous employers ever formally discuss mental health (as part of a wellness campaign or other official communication)?',
    'Did your previous employers provide resources to learn more about mental health issues and how to seek help?',
    'Was your anonymity protected if you chose to take advantage of mental health or substance abuse treatment resources with previous employers?',
    'Do you think that discussing a mental health disorder with previous employers would have negative consequences?',
    'Do you think that discussing a physical health issue with previous employers would have negative consequences?',
    'Would you have been willing to discuss a mental health issue with your direct supervisor(s)?',
    'Would you have been willing to discuss a mental health issue with your previous co-workers?',
    'Did you hear of or observe negative consequences for co-workers with mental health issues in your previous workplaces?',
    'Did you feel that your previous employers took mental health as seriously as physical health?',
    'Do you have previous employers?'
]

cols_to_drop = [col for col in previous_employer_cols if col in df.columns]
df = df.drop(columns=cols_to_drop)
print(f"Removed {len(cols_to_drop)} Previous Employer columns")

Removed 12 Previous Employer columns


## 5. Multi-Value Columns

In [8]:
# Define category groups for mental health conditions
condition_groups = {
    'anxiety_related': [
        'Anxiety Disorder (Generalized, Social, Phobia, etc)',
        'Obsessive-Compulsive Disorder',
        'Post-traumatic Stress Disorder',
        'Stress Response Syndromes'
    ],
    'mood_related': [
        'Mood Disorder (Depression, Bipolar Disorder, etc)'
    ],
    'neurodevelopmental': [
        'Attention Deficit Hyperactivity Disorder'
    ],
    'other_conditions': [
        'Substance Use Disorder',
        'Addictive Disorder',
        'Eating Disorder (Anorexia, Bulimia, etc)',
        'Personality Disorder (Borderline, Antisocial, Paranoid, etc)',
        'Psychotic Disorder (Schizophrenia, Schizoaffective, etc)'
    ]
}

# Define category groups for work positions
position_groups = {
    'role_developer': ['Back-end Developer', 'Front-end Developer'],
    'role_ops': ['DevOps/SysAdmin', 'Support'],
    'role_leadership': ['Supervisor/Team Lead', 'Executive Leadership'],
    'role_other': ['Designer', 'Dev Evangelist/Advocate', 'One-person shop', 'Sales', 'Other']
}

In [9]:
def grouped_one_hot_encode(df, column, groups, prefix):
    """
    One-hot encode a multi-value column using predefined groups.
    """
    for group_name, categories in groups.items():
        col_name = f"{prefix}_{group_name}"
        pattern = '|'.join([cat.replace('(', r'\(').replace(')', r'\)') for cat in categories])
        df[col_name] = df[column].str.contains(pattern, na=False, regex=True).astype(int)
    df = df.drop(columns=[column])
    return df

# Mental health diagnosis columns
mental_health_cols = [
    ('If yes, what condition(s) have you been diagnosed with?', 'diagnosed_yes'),
    ('If so, what condition(s) were you diagnosed with?', 'diagnosed_pro')
]

for col, prefix in mental_health_cols:
    if col in df.columns:
        df = grouped_one_hot_encode(df, col, condition_groups, prefix)

# Apply to work position column
if 'Which of the following best describes your work position?' in df.columns:
    df = grouped_one_hot_encode(df, 'Which of the following best describes your work position?', position_groups, 'position')

print("Multi-value columns encoded")

Multi-value columns encoded


## 6. Ordinal Encoding

In [10]:
# Ordinal Encoding: Convert ordered categories to numbers

# 1. Company size
df['How many employees does your company or organization have?'] = df['How many employees does your company or organization have?'].map({
    '1-5': 1, '6-25': 2, '26-100': 3, '100-500': 4, '500-1000': 5, 'More than 1000': 6, 'Not applicable': 0
})

# 2. Medical leave difficulty  
df['If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:'] = df['If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:'].map({
    'Very easy': 1, 'Somewhat easy': 2, 'Neither easy nor difficult': 3, 'Somewhat difficult': 4, 'Very difficult': 5, "I don't know": 3, 'Not applicable': 0
})

# 3. Openness to share
df['How willing would you be to share with friends and family that you have a mental illness?'] = df['How willing would you be to share with friends and family that you have a mental illness?'].map({
    'Not open at all': 1, 'Somewhat not open': 2, 'Neutral': 3, 'Somewhat open': 4, 'Very open': 5,
    'Not applicable to me (I do not have a mental illness)': 0
})

# 4. Remote work
df['Do you work remotely?'] = df['Do you work remotely?'].map({
    'Never': 0, 'Sometimes': 1, 'Always': 2
})

# 5. Work interference when treated
df['If you have a mental health issue, do you feel that it interferes with your work when being treated effectively?'] = df['If you have a mental health issue, do you feel that it interferes with your work when being treated effectively?'].map({
    'Never': 0, 'Rarely': 1, 'Sometimes': 2, 'Often': 3, 'Not applicable to me': 0
})

# 6. Work interference when NOT treated
df['If you have a mental health issue, do you feel that it interferes with your work when NOT being treated effectively?'] = df['If you have a mental health issue, do you feel that it interferes with your work when NOT being treated effectively?'].map({
    'Never': 0, 'Rarely': 1, 'Sometimes': 2, 'Often': 3, 'Not applicable to me': 0
})

print("Ordinal encoding applied to 6 columns")

Ordinal encoding applied to 6 columns


## 7. Nominal Encoding

In [11]:
# Country columns: Binary encoding (USA vs. Non-USA)
df['country_live_usa'] = (df['What country do you live in?'] == 'United States of America').astype(int)
df['country_work_usa'] = (df['What country do you work in?'] == 'United States of America').astype(int)

# Drop original columns
df = df.drop(columns=['What country do you live in?', 'What country do you work in?'])
print("Country columns encoded as binary (USA vs Non-USA)")

Country columns encoded as binary (USA vs Non-USA)


In [12]:
# Remove free text columns (not used for clustering)
free_text_cols = ['Why or why not?', 'Why or why not?.1']
df = df.drop(columns=[col for col in free_text_cols if col in df.columns])

print(f"Shape before imputation: {df.shape}")
print(f"Missing values: {df.isnull().sum().sum()}")

Shape before imputation: (1143, 46)
Missing values: 822


## 8. Imputation (NEW in run_03)

In [13]:
from sklearn.impute import KNNImputer

missing_cols = df.columns[df.isnull().any()].tolist()

numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

for col in categorical_cols:
    if df[col].isnull().any():
        mode_val = df[col].mode()[0]
        df[col] = df[col].fillna(mode_val)

numeric_missing = [col for col in numeric_cols if df[col].isnull().any()]
if numeric_missing:
    imputer = KNNImputer(n_neighbors=5, weights='distance')
    df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

## 9. One-Hot Encoding

In [14]:
# One-Hot Encoding for all remaining categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
df = pd.get_dummies(df, columns=categorical_cols, drop_first=False, dtype=int)

print(f"One-hot encoding applied. Shape: {df.shape}")

One-hot encoding applied. Shape: (1143, 94)


## 10. Composite Indices (NEW in run_03)

Create composite indices that combine related features to capture higher-level concepts.

In [15]:
# 1. Mental Health Burden Index (0-1 scale)
mh_burden_cols = [
    col for col in df.columns if any(x in col for x in [
        'currently have a mental health disorder?_Yes',
        'diagnosed_yes_', 'diagnosed_pro_',
        'interferes with your work when NOT being treated'
    ])
]
if mh_burden_cols:
    df['idx_mental_health_burden'] = df[mh_burden_cols].mean(axis=1)
    print(f"Mental Health Burden Index created from {len(mh_burden_cols)} features")

# 2. Employer Support Index (0-1 scale)
support_keywords = [
    'provide mental health benefits',
    'offer resources',
    'formally discussed mental health',
    'anonymity protected',
    'takes mental health as seriously'
]
support_cols = [col for col in df.columns if any(k in col and '_Yes' in col for k in support_keywords)]
if support_cols:
    df['idx_employer_support'] = df[support_cols].mean(axis=1)
    print(f"Employer Support Index created from {len(support_cols)} features")

# 3. Stigma/Fear Index (0-1 scale)
stigma_cols = [col for col in df.columns if any(x in col for x in [
    'hurt your career?_Yes',
    'negative consequences?_Yes',
    'negative consequences?_Maybe'
])]
if stigma_cols:
    df['idx_stigma_fear'] = df[stigma_cols].mean(axis=1)
    print(f"Stigma/Fear Index created from {len(stigma_cols)} features")

# 4. Openness Index (0-1 scale)
openness_cols = [col for col in df.columns if any(x in col for x in [
    'comfortable discussing a mental health disorder with your coworkers?_Yes',
    'comfortable discussing a mental health disorder with your direct supervisor?_Yes',
    'willing would you be to share'
])]
if openness_cols:
    df['idx_openness'] = df[openness_cols].mean(axis=1)
    print(f"Openness Index created from {len(openness_cols)} features")

Mental Health Burden Index created from 10 features
Employer Support Index created from 5 features
Stigma/Fear Index created from 6 features
Openness Index created from 2 features


## 11. Feature Selection (NEW in run_03)

In [16]:
variance = df.var().sort_values()
low_var_threshold = 0.01
low_var_features = variance[variance < low_var_threshold].index.tolist()

if low_var_features:
    df = df.drop(columns=low_var_features)

In [17]:
corr_matrix = df.corr().abs()

high_corr_pairs = []
for i in range(len(corr_matrix.columns)):
    for j in range(i+1, len(corr_matrix.columns)):
        if corr_matrix.iloc[i, j] > 0.9:
            high_corr_pairs.append((
                corr_matrix.columns[i],
                corr_matrix.columns[j],
                corr_matrix.iloc[i, j]
            ))

In [18]:
cols_to_remove = set()
for col1, col2, corr in high_corr_pairs:
    if col1 not in cols_to_remove:
        cols_to_remove.add(col2)

if cols_to_remove:
    df = df.drop(columns=list(cols_to_remove))

## 12. Save Preprocessed Data

In [19]:
output_dir = f'../../data/{RUN}/processed'
os.makedirs(output_dir, exist_ok=True)
output_path = f'{output_dir}/mental_health_preprocessed.csv'
df.to_csv(output_path, index=False)