# Exploration & Preprocessing: Mental Health in Tech 2016

## 1. Data Exploration

In [4]:
import pandas as pd

# Load data
df = pd.read_csv('../data/mental-heath-in-tech-2016_20161114.csv')

# Basic information
print(f"Participants (rows): {df.shape[0]}")
print(f"Variables (columns): {df.shape[1]}")

Participants (rows): 1433
Variables (columns): 63


## 2. Data Type Analysis

In [5]:
# Display data types of all columns as DataFrame
dtype_df = pd.DataFrame({
    'Data Type': df.dtypes.values,
    'Column Name': df.columns
})

display(dtype_df)

Unnamed: 0,Data Type,Column Name
0,int64,Are you self-employed?
1,object,How many employees does your company or organi...
2,float64,Is your employer primarily a tech company/orga...
3,float64,Is your primary role within your company relat...
4,object,Does your employer provide mental health benef...
...,...,...
58,object,What US state or territory do you live in?
59,object,What country do you work in?
60,object,What US state or territory do you work in?
61,object,Which of the following best describes your wor...


## 3. Outlier Analysis

In [6]:
# Age column analysis
age_col = 'What is your age?'
ages = df[age_col]

# Find suspicious values
print("\nSuspicious values (age < 18 or age > 70):")
suspicious = df[(ages < 18) | (ages > 70)][age_col]
print(suspicious.values)


Suspicious values (age < 18 or age > 70):
[ 17  99 323   3  15  74]


In [7]:
# Remove unrealistic age values
df = df[~df['What is your age?'].isin([3, 99, 323])]

## 4. Data Consistency

In [8]:
# Gender: Show all unique values
gender_col = 'What is your gender?'
print(f"Gender - {df[gender_col].nunique()} unique values:\n")
print(df[gender_col].value_counts())

Gender - 69 unique values:

What is your gender?
Male                                       608
male                                       249
Female                                     153
female                                      95
M                                           86
                                          ... 
female-bodied; no feelings about gender      1
cis man                                      1
AFAB                                         1
Transgender woman                            1
MALE                                         1
Name: count, Length: 69, dtype: int64


In [9]:
import numpy as np

# Normalize gender: 70 variants -> 3 categories (Male / Female / Other)
male_variants = [
    'male', 'm', 'man', 'male ', 'male.', 'malr', 'mail', 'm|', 'dude', 
    'cis male', 'cis man', 'cisdude', 'male (cis)', 'sex is male', 
    'i\'m a man why didn\'t you make this a drop down question. you should of asked sex? and i would of answered yes please. seriously how much text can this take?'
]

female_variants = [
    'female', 'f', 'woman', 'female ', ' female', 'fem', 'fm',
    'female/woman', 'cis female ', 'cis female', 'cisgender female', 
    'cis-woman', 'female assigned at birth ', 'i identify as female.',
    'female (props for making this a freeform field, though)', 'afab'
]

def normalize_gender(val):
    if pd.isna(val):
        return np.nan
    val_lower = str(val).lower().strip()
    if val_lower in male_variants:
        return 'Male'
    elif val_lower in female_variants:
        return 'Female'
    else:
        return 'Other'

df[gender_col] = df[gender_col].apply(normalize_gender)

print("Gender normalized: 70 -> 3 categories\n")
print(df[gender_col].value_counts())

Gender normalized: 70 -> 3 categories

What is your gender?
Male      1055
Female     338
Other       34
Name: count, dtype: int64


## 5. Missing Values

In [10]:
# Missing values per column
missing = df.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)
missing_pct = (missing / len(df) * 100).round(1)

print(f"Columns with missing values: {len(missing)}\n")
for col in missing.index:
    print(f"{missing[col]:4} {missing_pct[col]:5.1f}% - {col}")

Columns with missing values: 44

1286  89.9% - If you have revealed a mental health issue to a client or business contact, do you believe this has impacted you negatively?
1226  85.7% - If yes, what percentage of your work time (time performing primary or secondary job functions) is affected by a mental health issue?
1167  81.6% - Is your primary role within your company related to tech/IT?
1143  79.9% - Do you know local or online resources to seek help for a mental health disorder?
1143  79.9% - If you have been diagnosed or treated for a mental health disorder, do you ever reveal this to clients or business contacts?
1143  79.9% - If you have revealed a mental health issue to a coworker or employee, do you believe this has impacted you negatively?
1143  79.9% - Do you believe your productivity is ever affected by a mental health issue?
1143  79.9% - If you have been diagnosed or treated for a mental health disorder, do you ever reveal this to coworkers or employees?
1143  79.9% - Do

In [11]:
# Remove columns with >70% missing values
threshold = 0.70
missing_pct = df.isnull().sum() / len(df)
cols_to_drop = missing_pct[missing_pct > threshold].index.tolist()

df = df.drop(columns=cols_to_drop)

## 6. Multi-Value Columns

In [12]:
# Define category groups for mental health conditions
condition_groups = {
    'anxiety_related': [
        'Anxiety Disorder (Generalized, Social, Phobia, etc)',
        'Obsessive-Compulsive Disorder',
        'Post-traumatic Stress Disorder',
        'Stress Response Syndromes'
    ],
    'mood_related': [
        'Mood Disorder (Depression, Bipolar Disorder, etc)'
    ],
    'neurodevelopmental': [
        'Attention Deficit Hyperactivity Disorder'
    ],
    'other_conditions': [
        'Substance Use Disorder',
        'Addictive Disorder',
        'Eating Disorder (Anorexia, Bulimia, etc)',
        'Personality Disorder (Borderline, Antisocial, Paranoid, etc)',
        'Psychotic Disorder (Schizophrenia, Schizoaffective, etc)'
    ]
}

# Define category groups for work positions
position_groups = {
    'role_developer': [
        'Back-end Developer',
        'Front-end Developer'
    ],
    'role_ops': [
        'DevOps/SysAdmin',
        'Support'
    ],
    'role_leadership': [
        'Supervisor/Team Lead',
        'Executive Leadership'
    ],
    'role_other': [
        'Designer',
        'Dev Evangelist/Advocate',
        'One-person shop',
        'Sales',
        'Other'
    ]
}


In [13]:
def grouped_one_hot_encode(df, column, groups, prefix):
    """
    One-hot encode a multi-value column using predefined groups.
    Uses vectorized string operations for better performance.
    """
    # Create binary column for each group
    for group_name, categories in groups.items():
        col_name = f"{prefix}_{group_name}"
        
        # Use vectorized str.contains with regex pattern
        pattern = '|'.join([cat.replace('(', r'\(').replace(')', r'\)') for cat in categories])
        df[col_name] = df[column].str.contains(pattern, na=False, regex=True).astype(int)
    
    # Drop original column
    df = df.drop(columns=[column])
    
    return df

# Mental health diagnosis columns (2 remaining after missing value cleanup)
mental_health_cols = [
    ('If yes, what condition(s) have you been diagnosed with?', 'diagnosed_yes'),
    ('If so, what condition(s) were you diagnosed with?', 'diagnosed_pro')
]
# Note: "If maybe..." was removed due to >70% missing values

for col, prefix in mental_health_cols:
    df = grouped_one_hot_encode(df, col, condition_groups, prefix)

# Apply to work position column
df = grouped_one_hot_encode(
    df, 
    'Which of the following best describes your work position?',
    position_groups,
    'position'
)

## 7. Free Text Handling

## 8. Save Data