# STA130 Course Project - Analysis 1 Project Proposal
***
#### **Author**: David Daniliuc<br>**Created**: Sat Nov. 2, 2024

*Jupyter Python Notebook for Analysis 1 testing and experimenting for the STA130 Course Project.* 

In [46]:

# Import statements
import pandas as pd
import numpy as np
import plotly.express as px
import scipy.stats as stats
import statsmodels.formula.api as smf

# Read and import dataset
cols = pd.read_csv("var_names.csv")
data = pd.read_csv("CSCS_data_anon.csv", na_values=["9999", "", " ", "Presented but no response", "NA"])

# Cleaning the data
empty = (data.isna().sum() == data.shape[0])
data = data[empty.index[~empty]] # keep non empty columns only

data = data[data.REMOVE_case=='No'].copy()


Columns (129,408,630,671,689,978,1001,1002,1006,1007,1008,1080,1113,1115,1116,1117,1118,1119,1120,1121,1124,1125,1126,1127,1128,1213,1214,1215,1216,1217,1218,1263,1266,1342,1343,1344,1345,1346,1347,1348,1349,1390,1391,1393,1439,1442,1463,1546,1549,1552,1555,1558,1561) have mixed types. Specify dtype option on import or set low_memory=False.



In [47]:
important_columns = [
	'WELLNESS_life_satisfaction', 'DEMO_household_income',
	'DEMO_age',
	'DEMO_occupation', # no education columns?
	'DEMO_identity_disability',
	'DEMO_relationship_status',
	'GEO_housing_live_with_children',
]
indicator_columns = important_columns[2:]

# LONELY_ucla_loneliness_scale_score
# WELLNESS_self_rated_mental_health
# WELLNESS_self_rated_physical_health
# WELLNESS_malach_pines_burnout_measure_score

# WORK_employment_impact_no_change
# COVID_vaccinated
# COVID_prevention_distancing

In [48]:
### The Correlation Between Socioeconomic Status (SES) and Mental Health During the 2021 Lockdown of the COVID-19 Pandemic

data_cross_2021 = data[data.DATASET=='2021 Cross-Sectional'][important_columns].copy().dropna()
# print(data[data.DATASET=='2022 Cross-Sectional'][['WELLNESS_life_satisfaction','DEMO_household_income','DEMO_age','DEMO_employment','DEMO_disability_none','DEMO_relationship_status','GEO_housing_live_with_children']].isna().sum())

data_cross_2021

Unnamed: 0,WELLNESS_life_satisfaction,DEMO_household_income,DEMO_age,DEMO_occupation,DEMO_identity_disability,DEMO_relationship_status,GEO_housing_live_with_children
4,8.0,"$80,000 to $89,999",30.0,"Business, finance and administration occupations",Not Selected,Single and dating,9.0
8,1.0,"$20,000 to $24,999",66.0,Sales and service occupations,Not Selected,Single and not dating,0.0
10,7.0,"$90,000 to $99,999",27.0,Sales and service occupations,Not Selected,Single and dating,0.0
30,1.0,"$5,000 to $9,999",32.0,"Business, finance and administration occupations",Not Selected,In a relationship,1.0
31,5.0,"$10,000 to $14,999",51.0,"Not applicable, I do not have an occupation",People with chronic health problems or disabil...,Single and not dating,1.0
...,...,...,...,...,...,...,...
11379,7.0,"$10,000 to $14,999",56.0,"Occupations in art, culture, recreation and sport",People with chronic health problems or disabil...,Single and not dating,0.0
11393,4.0,"Under $5,000",21.0,"Business, finance and administration occupations",Not Selected,Single and dating,6.0
11396,3.0,"$15,000 to $19,999",62.0,Health occupations,People with chronic health problems or disabil...,Single and not dating,0.0
11398,7.0,"$35,000 to $39,999",65.0,"Occupations in education, law and social, comm...",Not Selected,Single and not dating,0.0


In [49]:
income_mapping = {
    'Under $5,000': 0,
    '$5,000 to $9,999': 1,
    '$10,000 to $14,999': 2,
    '$15,000 to $19,999': 3,
    '$20,000 to $24,999': 4,
    '$25,000 to $29,999': 5,
    '$30,000 to $34,999': 6,
    '$35,000 to $39,999': 7,
    '$40,000 to $44,999': 8,
    '$45,000 to $49,999': 9,
    '$50,000 to $59,999': 10,
    '$60,000 to $69,999': 11,
    '$70,000 to $79,999': 12,
    '$80,000 to $89,999': 13,
    '$90,000 to $99,999': 14,
    '$100,000 to $109,999': 15,
    '$110,000 to $119,999': 16,
    '$120,000 to $129,999': 17,
    '$130,000 to $139,999': 18,
    '$140,000 to $149,999': 19,
    '$150,000 to $159,999': 20,
    '$160,000 to $169,999': 21,
    '$170,000 to $179,999': 22,
    '$180,000 to $189,999': 23,
    '$190,000 to $199,999': 24,
    '$200,000 or more': 25,
}

# Ensure 'DEMO_household_income' column is mapped to numeric values using income_mapping
data_cross_2021['DEMO_household_income_index'] = data_cross_2021['DEMO_household_income'].map(income_mapping)

income_categories = list(income_mapping.keys())
data_cross_2021['DEMO_household_income'] = pd.Categorical(
    data_cross_2021['DEMO_household_income'], 
    categories=income_categories, 
    ordered=True
)

In [None]:
df_count = data_cross_2021.groupby(['DEMO_household_income', 'WELLNESS_life_satisfaction']).size().reset_index(name='Count')

# Unique sort to preserve the order of life satisfaction while also sorting the household income
for value in income_mapping.keys():
	df_count = pd.concat([df_count, df_count.loc[df_count['DEMO_household_income'] == value]], ignore_index=True)
df_count = df_count.iloc[len(df_count) // 2 :]

# Calculate the percentage directly in the DataFrame and scale it to 0-1
df_count['Percentage'] = df_count.groupby('DEMO_household_income')['Count'].transform(lambda x: x / x.sum())

# Create a stacked bar graph
fig = px.bar(df_count, x='DEMO_household_income', y='Percentage', color='WELLNESS_life_satisfaction',
             title='100% Stacked Bar Graph of Categories by Score',
             labels={'Percentage': 'Percentage (%)'},
             color_continuous_scale=px.colors.sequential.Plasma,
			)

# Update layout
fig.update_layout(
    barmode='stack',
    yaxis=dict(
        tickvals=[i / 100 for i in range(0, 101, 10)],
        ticktext=[f"{i}%" for i in range(0, 101, 10)],
        dtick=0.1,  # Add grid lines every 10%
        gridcolor='rgba(0, 0, 0, 0.3)'
    )
)

# Show the figure
fig.show()







In [51]:
satisfaction_per_income = data_cross_2021.groupby('DEMO_household_income')['WELLNESS_life_satisfaction'].agg(
    Sample_Size='count',
    Mean='mean',
    Standard_Deviation='std',
    Median='median'
).reset_index()

# Add 'DEMO_household_income_index' and sort.
satisfaction_per_income.insert(1, 'DEMO_household_income_index', satisfaction_per_income['DEMO_household_income'].map(income_mapping))

satisfaction_per_income = satisfaction_per_income.sort_values('DEMO_household_income_index')

# Display the sorted data
# satisfaction_per_income





In [52]:
# Create a contingency table
contingency_table = pd.crosstab(data_cross_2021['DEMO_household_income'], 
                                 data_cross_2021['WELLNESS_life_satisfaction'])

# Run the Chi-Squared Test
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

# Print the results
print(f"Chi-Squared Statistic: {chi2}, P-value = {p}")

Chi-Squared Statistic: 287.3022512737428, P-value = 0.0031337256106074417


In [58]:
simple_model = smf.ols('WELLNESS_life_satisfaction ~ C(DEMO_household_income)', data=data_cross_2021).fit()
print("Simple Model Summary:\n", simple_model.summary())

Simple Model Summary:
                                 OLS Regression Results                                
Dep. Variable:     WELLNESS_life_satisfaction   R-squared:                       0.038
Model:                                    OLS   Adj. R-squared:                  0.024
Method:                         Least Squares   F-statistic:                     2.677
Date:                        Wed, 27 Nov 2024   Prob (F-statistic):           1.45e-05
Time:                                13:38:43   Log-Likelihood:                -3793.6
No. Observations:                        1701   AIC:                             7639.
Df Residuals:                            1675   BIC:                             7781.
Df Model:                                  25                                         
Covariance Type:                    nonrobust                                         
                                                       coef    std err          t      P>|t|      [0.025   

In [84]:
p_values = simple_model.pvalues.drop('Intercept')

plot_data = pd.DataFrame({'Income Category': income_categories[1:], 'P-value': p_values.values})

fig = px.scatter(
    plot_data,
    x='Income Category',
    y='P-value',
    title='P-values for Household Income Categories',
    template='plotly_dark',
)

fig.add_scatter(
    x=plot_data['Income Category'],
    y=plot_data['P-value'],
    mode='lines',
    line=dict(color='skyblue', width=2),
    showlegend=False,
)

fig.add_hline(
    y=0.05,
    line_dash="dash",
    line_color="red",
    annotation_text="Significance Threshold (0.05)",
    annotation_position="bottom right"
)

# Render the plot
fig.show()
