What is the difference between the time people want to spend with friends and the time they actually get? How does that gap affect their loneliness scores?

Variables:
- CONNECTION_preference_time_friends (wanted per week)
- CONNECTION_social_time_friends_p7d (actual per week)

In [51]:
import pandas as pd
# Variable dictionary: one row per (variable, dataset) pair; the helper
# functions below read its 'new_var', 'text' and 'dataset' columns.
cols = pd.read_csv("../var_names.csv")

# Survey responses. The listed placeholder strings are parsed as missing.
data = pd.read_csv(
    "../CSCS_data_anon.csv",
    low_memory=False,
    na_values=["9999", "", " ", "Presented but no response", "NA"],
)

# Flag the columns in which every single value is missing, then drop them.
empty = data.isna().all(axis=0)
data = data.loc[:, ~empty]

# Keep only the rows whose REMOVE_case flag is 'No'; copy so that later
# column assignments act on an independent frame.
data = data.loc[data["REMOVE_case"] == "No"].copy()

In [64]:
def get_matching_keywords(keywords: list[str], group_restriction: list[str]=None, year_restriction: list[int]=None, strict: bool=False) -> list[str]:
    """
    Returns the names of the columns of `data` whose name contains any of the
    keywords (case-insensitive substring match).

    group_restriction: optional list drawn from "cohort" / "cross" (any case).
    year_restriction: optional list drawn from 2021 / 2022 / 2023 (int or str).
    Either restriction may be passed on its own; a restriction left as None
    places no constraint on the result.

    Strict means that a variable must be found in all years and all groups
    listed, non strict means it only needs to satisfy one year and one group.
    A year/group counts as "found" when it occurs as a substring of any dataset
    name that get_var_details reports for the column.

    When at least one restriction is given, columns for which get_var_details
    returns nothing are left out of the result.
    """
    global data
    assert group_restriction is None or all(str(res).lower() in ("cohort", "cross") for res in group_restriction)
    assert year_restriction is None or all(str(res) in ("2021", "2022", "2023") for res in year_restriction)

    # Lower-case the keywords once instead of once per column.
    lowered_keywords = [keyword.lower() for keyword in keywords]
    matching = [col for col in data.columns if any(keyword in col.lower() for keyword in lowered_keywords)]
    if group_restriction is None and year_restriction is None: return matching

    # Normalise each restriction to strings; None is preserved and means "unrestricted".
    years = None if year_restriction is None else [str(year) for year in year_restriction]
    groups = None if group_restriction is None else [str(res).lower() for res in group_restriction]
    combine = all if strict else any

    def satisfied(tokens, dataset_names):
        """True when the dataset names meet one restriction (None always passes)."""
        if tokens is None: return True
        return combine(any(token in name for name in dataset_names) for token in tokens)

    output = []
    for col in matching:
        details = get_var_details(col)
        if not details: continue
        # str() keeps the substring test from failing on a non-string (e.g. NaN) dataset cell.
        dataset_names = [str(d) for q, d in details]
        if satisfied(years, dataset_names) and satisfied(groups, dataset_names):
            output.append(col)

    return output

def get_matching_keywords_question(keywords: list[str], group_restriction: list[str]=None, year_restriction: list[int]=None, strict: bool=False) -> list[str]:
    """
    Returns the names of the columns of `data` whose question text contains any
    of the keywords (case-insensitive substring match). The question text of a
    column is the concatenation of every question get_var_details reports for it.

    group_restriction: optional list drawn from "cohort" / "cross" (any case).
    year_restriction: optional list drawn from 2021 / 2022 / 2023 (int or str).
    Either restriction may be passed on its own; a restriction left as None
    places no constraint on the result.

    Strict means that a variable must be found in all years and all groups
    listed, non strict means it only needs to satisfy one year and one group.
    A year/group counts as "found" when it occurs as a substring of any dataset
    name that get_var_details reports for the column.
    """
    global data
    assert group_restriction is None or all(str(res).lower() in ("cohort", "cross") for res in group_restriction)
    assert year_restriction is None or all(str(res) in ("2021", "2022", "2023") for res in year_restriction)

    lowered_keywords = [keyword.lower() for keyword in keywords]

    # Look each column up once and remember its details for the restriction
    # filter below, instead of querying the dictionary again per keyword.
    matching = []
    details_by_col = {}
    for col in data.columns:
        details = get_var_details(col)
        # str() because a question cell may be non-string (e.g. NaN).
        question_text = "".join(str(q) for q, d in details).lower()
        if any(keyword in question_text for keyword in lowered_keywords):
            matching.append(col)
            details_by_col[col] = details
    if group_restriction is None and year_restriction is None: return matching

    # Normalise each restriction to strings; None is preserved and means "unrestricted".
    years = None if year_restriction is None else [str(year) for year in year_restriction]
    groups = None if group_restriction is None else [str(res).lower() for res in group_restriction]
    combine = all if strict else any

    def satisfied(tokens, dataset_names):
        """True when the dataset names meet one restriction (None always passes)."""
        if tokens is None: return True
        return combine(any(token in name for name in dataset_names) for token in tokens)

    output = []
    for col in matching:
        details = details_by_col[col]
        if not details: continue
        dataset_names = [str(d) for q, d in details]
        if satisfied(years, dataset_names) and satisfied(groups, dataset_names):
            output.append(col)

    return output

def get_from_dataset(datasets: list[str], strict: bool=False) -> list[str]:
    """
    Get the variables who appear in the list of given datasets

    If strict, the variable must appear in all of the listed datasets;
    otherwise appearing in at least one of them is enough.
    Dataset names are compared for exact equality with the names that
    get_var_details reports for the column. Columns for which
    get_var_details returns nothing are skipped.

    Note: with an empty `datasets` list, strict mode keeps every documented
    column and non-strict mode keeps none.
    """
    global data
    # `strict` selects the quantifier applied over the requested datasets.
    combine = all if strict else any
    output = []
    for col in data.columns:
        details = get_var_details(col)
        if not details: continue
        # Build the set of dataset names once per column for the membership tests.
        found = {ds for q, ds in details}
        if combine(dd in found for dd in datasets): output.append(col)

    return output


def get_var_details(var_name: str) -> list[tuple[str, str]]:
    """
    Look up a variable in the `cols` table and return one
    (question text, dataset) pair for every row whose 'new_var'
    equals the given name. The list is empty when nothing matches.
    """
    global cols
    # Select the matching rows and only the two columns that are reported.
    matches = cols.loc[cols['new_var'] == var_name, ['text', 'dataset']]
    questions = matches['text'].tolist()
    dataset_names = matches['dataset'].tolist()
    return list(zip(questions, dataset_names))

def get_column_info(column_name: str) -> None:
    """
    Prints information about a given column of `data`:
    a banner with the column name, then one dataset/question banner per entry
    returned by get_var_details (or a notice when there is none), then the
    number of missing values and the value counts of the column.

    Nothing is returned.
    """
    global data
    column_info: pd.Series = data[column_name]
    print(f"{f' Information about: {column_name} ':-^70}")
    details = get_var_details(column_name)
    if not details: print(f"{' No associated question found ':-^70}")
    for question, dataset in details:
        # Format through the f-string rather than `+` so that a non-string
        # cell (e.g. NaN) is printed instead of raising TypeError.
        print(f"{f' Dataset: {dataset} ':-^70}")
        print(f"{f' Question: {question} ':-^70}")
        print(f"{'':-^70}\n")
    print(f"Number of NA: {column_info.isna().sum()}")
    print(column_info.value_counts())
    print("")

def get_datasets(column_name: str) -> list[str]:
    """
    List the dataset of every entry get_var_details reports for the
    given variable name, in the order they are reported.
    """
    return [dataset for _question, dataset in get_var_details(column_name)]

In [69]:
# Find every column whose question text mentions "fear" (no year or group
# restriction) and print a summary of each one.
keywords = ["fear"]
to_examine = get_matching_keywords_question(keywords)
for col in to_examine:
    get_column_info(col)

 Information about: PSYCH_relational_needs_satisfaction_scale_7pt_true_self_without_rejection 
------------------------ Dataset: 2023_cohort ------------------------
 Question: Please rate your level of agreement with each of the following statements: - I can show my true self to people who are important to me without fear of rejection. 
----------------------------------------------------------------------

Number of NA: 10018
Series([], Name: count, dtype: int64)

 Information about: PSYCH_relational_needs_satisfaction_scale_5pt_true_self_without_rejection 
------------------------ Dataset: 2023_cohort ------------------------
 Question: For each of the statements below, indicate the degree to which you agree or disagree with the statement: - I can show my true self to people who are important to me without fear of rejection. 
----------------------------------------------------------------------

Number of NA: 10018
Series([], Name: count, dtype: int64)

-------- Information about: 

In [4]:
# Using every column name as a keyword makes each column match itself, so the
# result is decided by the restrictions alone. With strict=True a column is
# kept only if its dataset names mention 2022 and mention both "cross" and
# "cohort" (not necessarily within the same dataset name).
keywords = data.columns
to_examine = get_matching_keywords(keywords, group_restriction=["cross", "cohort"], year_restriction=["2022"], strict=True)
for col in to_examine:
    get_column_info(col)


---------------- Information about: ELIGIBLE_consent -----------------
------------------------ Dataset: 2021_cross -------------------------
 Question: Do you acknowledge and agree to the conditions outlined above? 
----------------------------------------------------------------------

------------------------ Dataset: 2022_cohort ------------------------
 Question: Do you acknowledge and agree to the conditions outlined above? 
----------------------------------------------------------------------

------------------------ Dataset: 2022_cross -------------------------
 Question: Do you acknowledge and agree to the conditions outlined above? 
----------------------------------------------------------------------

------------------------ Dataset: 2023_cohort ------------------------
 Question: Do you acknowledge and agree to the conditions outlined above? 
----------------------------------------------------------------------

------------------------ Dataset: 2023_cross ------------

In [13]:
# Select columns by the datasets they are listed under (strict is left at its
# default) and print a summary of each.
to_examine = get_from_dataset(["2022_cohort", "2022_cross"])
for col in to_examine:
    get_column_info(col)

# Restrict the survey data to the selected columns.
filtered_data = data[to_examine]

---------------- Information about: ELIGIBLE_consent -----------------
------------------------ Dataset: 2021_cross -------------------------
 Question: Do you acknowledge and agree to the conditions outlined above? 
----------------------------------------------------------------------

------------------------ Dataset: 2022_cohort ------------------------
 Question: Do you acknowledge and agree to the conditions outlined above? 
----------------------------------------------------------------------

------------------------ Dataset: 2022_cross -------------------------
 Question: Do you acknowledge and agree to the conditions outlined above? 
----------------------------------------------------------------------

------------------------ Dataset: 2023_cohort ------------------------
 Question: Do you acknowledge and agree to the conditions outlined above? 
----------------------------------------------------------------------

------------------------ Dataset: 2023_cross ------------

In [14]:
# Display the current filtered_data frame.
filtered_data

Unnamed: 0,ELIGIBLE_consent,COVID_prevention_distancing,COVID_prevention_masks,COVID_prevention_hand_washing,COVID_prevention_reduce_people,COVID_prevention_avoid_trips,COVID_prevention_household,COVID_vaccinated,WELLNESS_life_satisfaction,WELLNESS_malach_pines_burnout_measure_tired,...,CONNECTION_social_barriers_shy,CONNECTION_social_barriers_nervous,CONNECTION_social_barriers_work_too_much,CONNECTION_social_barriers_family_responsibilities,CONNECTION_social_barriers_nothing_to_do,CONNECTION_social_barriers_no_people,CONNECTION_social_barriers_small_living_space,CONNECTION_social_barriers_covid,CONNECTION_social_barriers_alienated,CONNECTION_social_barriers_other_text
0,Yes,Somewhat closely,Somewhat closely,Somewhat closely,Not at all,Not at all,Not at all,"Yes, three or more doses",2.0,,...,,,,,,,,,,
1,Yes,Very closely,Not at all,Very closely,Very closely,Very closely,Very closely,"Yes, three or more doses",7.0,,...,,,,,,,,,,
2,Yes,Somewhat closely,Not at all,Somewhat closely,Not at all,Not at all,Somewhat closely,"Yes, three or more doses",5.0,,...,,,,,,,,,,
3,Yes,Not at all,Not at all,Somewhat closely,Not at all,Not at all,Not at all,"Yes, three or more doses",4.0,,...,,,,,,,,,,
4,Yes,Very closely,Very closely,Very closely,Very closely,Very closely,Somewhat closely,"Yes, one dose",8.0,Rarely,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11425,Yes,Not at all,Not at all,Somewhat closely,Not at all,Not at all,Not at all,"Yes, two doses",5.0,,...,,,,,,,,,,
11426,Yes,Somewhat closely,Not at all,Somewhat closely,Somewhat closely,Very closely,Not at all,"Yes, three or more doses",10.0,,...,,,,,,,,,,
11427,Yes,Not at all,Not at all,Very closely,Not at all,Not at all,Not at all,"Yes, three or more doses",8.0,,...,,,,,,,,,,
11428,Yes,Somewhat closely,Somewhat closely,Very closely,Somewhat closely,Somewhat closely,Somewhat closely,"Yes, two doses",4.0,Often,...,Not Selected,Not Selected,Not Selected,I have family responsibilities,Not Selected,I don't have people to hang out with,Not Selected,Not Selected,Not Selected,


In [49]:
# Find every column whose name contains "future" (no year or group
# restriction) and print a summary of each one.
keywords = ["future"]
to_examine = get_matching_keywords(keywords)
for col in to_examine:
    get_column_info(col)


In [None]:
# Lookup from Likert wording to a signed score between -3 and 3.
# The capitalised duplicates cover alternative spellings of the same answers.
response_mapping = {
    "Disagree strongly": -3,
    "Disagree moderately": -2,
    "Disagree a little": -1,
    "Neither agree nor disagree": 0,
    "Agree a little": 1,
    "Agree moderately": 2,
    "Agree strongly": 3,
    "Disagree Strongly": -3,
    "Agree Strongly": 3,
    "Disagree A little": -1,
}

# Give each questionnaire item a "<column>_numeric" twin holding its score;
# answers that are not in the lookup become NaN.
# NOTE(review): the "_2r" suffix presumably marks a reverse-scored item -
# confirm it is keyed in the same direction as "organized" before averaging.
for source_column in (
    'PSYCH_ten_item_personality_inventory_conscientiousness_disorganized_2r',
    'PSYCH_big_five_inventory_organized',
):
    data[source_column + '_numeric'] = data[source_column].map(response_mapping)

# Row-wise mean of the two scores, ignoring missing ones: the average when
# both are present, the single available score otherwise, NaN when neither is.
data['aggregated_organized_scale'] = data[
    [
        'PSYCH_ten_item_personality_inventory_conscientiousness_disorganized_2r_numeric',
        'PSYCH_big_five_inventory_organized_numeric',
    ]
].mean(axis=1, skipna=True)

# One-column frame holding just the combined scale.
organized_data = data[["aggregated_organized_scale"]]

0        1.0
1       -3.0
2       -2.0
3       -1.0
4       -3.0
        ... 
11425    1.0
11426    1.0
11427   -0.5
11428    NaN
11430    2.0
Name: aggregated_organized_scale, Length: 10018, dtype: float64

In [45]:
# Summarise every column whose name contains one of these substrings. The
# trailing comment on the next line keeps an alternative, narrower keyword list.
keywords = ["aggregate", "LONELY_ucla_loneliness_scale_score", "scale_score"] # ["LONELY_ucla_loneliness_scale_score", "LONELY_dejong_emotional_social_loneliness_scale_score"]
to_examine = get_matching_keywords(keywords)
for col in to_examine:
    get_column_info(col)


 Information about: PSYCH_cope_60_positive_reinterpretation_and_growth_subscale_score 
-------------------- No associated question found --------------------
Number of NA: 9715
PSYCH_cope_60_positive_reinterpretation_and_growth_subscale_score
10.0    43
11.0    42
9.0     40
14.0    31
12.0    30
16.0    25
7.0     21
15.0    20
8.0     18
13.0    18
6.0      8
4.0      4
5.0      3
Name: count, dtype: int64

 Information about: PSYCH_cope_60_mental_disengagement_subscale_score 
-------------------- No associated question found --------------------
Number of NA: 9713
PSYCH_cope_60_mental_disengagement_subscale_score
9.0     58
8.0     48
10.0    40
7.0     35
12.0    30
11.0    28
13.0    22
6.0     13
5.0     10
14.0     9
15.0     7
4.0      3
16.0     2
Name: count, dtype: int64

 Information about: PSYCH_cope_60_focus_on_and_venting_of_emotions_subscale_score 
-------------------- No associated question found --------------------
Number of NA: 9713
PSYCH_cope_60_focus_on_and_ventin

In [None]:
import pandas as pd
import statsmodels.formula.api as smf

# Convert LONELY_ucla_loneliness_scale_score_y_n to binary format
# ("Yes (6-9)" -> 1, "No (3-5)" -> 0; any other value becomes NaN)
data['LONELY_ucla_loneliness_scale_score_y_n_numeric'] = \
    data['LONELY_ucla_loneliness_scale_score_y_n'].map({"Yes (6-9)": 1, "No (3-5)": 0})

# Drop rows with missing values in relevant columns
filtered_data = data.dropna(subset=['LONELY_ucla_loneliness_scale_score_y_n_numeric', 'aggregated_organized_scale'])

# Fit the logistic regression model.
# The outcome must be the 0/1 column built above: a logit model needs a
# dependent variable in [0, 1], and this is the column the rows were
# filtered on. The raw LONELY_ucla_loneliness_scale_score is not binary.
model = smf.logit(
    formula='LONELY_ucla_loneliness_scale_score_y_n_numeric ~ aggregated_organized_scale',
    data=filtered_data
).fit()

# Display the model summary
print(model.summary())


Optimization terminated successfully.
         Current function value: 0.690648
         Iterations 4
                                         Logit Regression Results                                         
Dep. Variable:     LONELY_ucla_loneliness_scale_score_y_n_numeric   No. Observations:                 1174
Model:                                                      Logit   Df Residuals:                     1172
Method:                                                       MLE   Df Model:                            1
Date:                                            Tue, 26 Nov 2024   Pseudo R-squ.:                0.002770
Time:                                                    19:37:48   Log-Likelihood:                -810.82
converged:                                                   True   LL-Null:                       -813.07
Covariance Type:                                        nonrobust   LLR p-value:                   0.03380
                                 coef    s

In [43]:
import statsmodels.formula.api as smf

# Keep only the rows in which both the UCLA loneliness score and the
# combined organisation scale are present.
filtered_data = data[
    data[['LONELY_ucla_loneliness_scale_score', 'aggregated_organized_scale']]
    .notna()
    .all(axis=1)
]

# Ordinary least squares: loneliness score regressed on the organisation scale.
linear_model = smf.ols(
    'LONELY_ucla_loneliness_scale_score ~ aggregated_organized_scale',
    filtered_data,
).fit()

# Print the regression table.
print(linear_model.summary())


                                    OLS Regression Results                                    
Dep. Variable:     LONELY_ucla_loneliness_scale_score   R-squared:                       0.003
Model:                                            OLS   Adj. R-squared:                  0.002
Method:                                 Least Squares   F-statistic:                     3.007
Date:                                Tue, 26 Nov 2024   Prob (F-statistic):             0.0832
Time:                                        19:45:16   Log-Likelihood:                -2323.9
No. Observations:                                1174   AIC:                             4652.
Df Residuals:                                    1172   BIC:                             4662.
Df Model:                                           1                                         
Covariance Type:                            nonrobust                                         
                                 coef    std err  

In [None]:
import statsmodels.formula.api as smf

# Keep only the rows in which both the De Jong loneliness score and the
# combined organisation scale are present.
filtered_data = data[
    data[['LONELY_dejong_emotional_social_loneliness_scale_score', 'aggregated_organized_scale']]
    .notna()
    .all(axis=1)
]

# Ordinary least squares: loneliness score regressed on the organisation scale.
linear_model = smf.ols(
    'LONELY_dejong_emotional_social_loneliness_scale_score ~ aggregated_organized_scale',
    filtered_data,
).fit()

# Print the regression table.
print(linear_model.summary())


                                              OLS Regression Results                                             
Dep. Variable:     LONELY_dejong_emotional_social_loneliness_scale_score   R-squared:                       0.032
Model:                                                               OLS   Adj. R-squared:                  0.032
Method:                                                    Least Squares   F-statistic:                     178.0
Date:                                                   Tue, 26 Nov 2024   Prob (F-statistic):           5.71e-40
Time:                                                           19:46:52   Log-Likelihood:                -10958.
No. Observations:                                                   5423   AIC:                         2.192e+04
Df Residuals:                                                       5421   BIC:                         2.193e+04
Df Model:                                                              1                