In [5]:
import pandas as pd
# Variable look-up table; get_var_details() queries its 'new_var', 'text' and 'dataset' columns.
cols = pd.read_csv("var_names.csv")

# Survey export. Every string listed in na_values is parsed as a missing value.
data = pd.read_csv(
    "CSCS_data_anon.csv",
    low_memory=False,
    na_values=["9999", "", " ", "Presented but no response", "NA"],
)

# One boolean per column: True when every entry in that column is missing.
empty = data.isna().all(axis=0)
data = data.loc[:, ~empty]  # retain only the columns holding at least one value

# Independent working copy restricted to rows whose REMOVE_case equals 'No'.
data_use = data.loc[data["REMOVE_case"] == "No"].copy()

In [6]:
def get_matching_keywords(keywords: list[str], df: "pd.DataFrame | None" = None) -> list[str]:
    """
    Returns the names of the columns whose name contains any of the keywords.

    Matching is a case-insensitive substring test and the result keeps the
    column order of the frame being searched.

    Parameters
    ----------
    keywords : list[str]
        Substrings to look for. An empty list matches nothing; an empty-string
        keyword matches every column.
    df : pd.DataFrame, optional
        Frame whose column names are searched. When omitted, the notebook-level
        ``data`` frame is used, so existing one-argument calls are unaffected.

    Returns
    -------
    list[str]
        The matching column names.
    """
    if df is None:
        df = data  # fall back to the notebook-wide frame (read-only access)

    # Lower-case each keyword once rather than once per (column, keyword) pair.
    # Materialising the list also keeps a one-shot iterable usable for every column.
    lowered = [keyword.lower() for keyword in keywords]

    # str() lets frames with non-string column labels be searched without raising.
    return [col for col in df.columns if any(kw in str(col).lower() for kw in lowered)]

def get_var_details(var_name: str, var_table: "pd.DataFrame | None" = None) -> list[tuple[str, str]]:
    """
    Returns a list of (question text, dataset) pairs for the given variable name.

    Parameters
    ----------
    var_name : str
        Value to look for in the table's 'new_var' column (exact match).
    var_table : pd.DataFrame, optional
        Look-up table with 'new_var', 'text' and 'dataset' columns. When
        omitted, the notebook-level ``cols`` frame is used, so existing
        one-argument calls are unaffected.

    Returns
    -------
    list[tuple[str, str]]
        One (text, dataset) tuple per matching row, in table order; an empty
        list when no row matches.
    """
    if var_table is None:
        var_table = cols  # fall back to the notebook-wide look-up table

    # Select matching rows and only the two reported columns in one step.
    matches = var_table.loc[var_table["new_var"] == var_name, ["text", "dataset"]]

    # name=None makes itertuples yield plain tuples; index=False drops the row label.
    return list(matches.itertuples(index=False, name=None))

def get_column_info(column_name: str, df: "pd.DataFrame | None" = None) -> None:
    """
    Prints a summary of a given column; nothing is returned.

    The printout contains a header with the column name, one banner block per
    (question, dataset) pair reported by ``get_var_details``, the number of
    missing values, and the value counts of the column.

    Parameters
    ----------
    column_name : str
        Name of the column to summarise.
    df : pd.DataFrame, optional
        Frame the column is read from. When omitted, the notebook-level
        ``data`` frame is used, so existing one-argument calls are unaffected.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of the frame.
    """
    if df is None:
        df = data  # fall back to the notebook-wide frame

    # Centre text in a 70-character line padded with dashes.
    banner = "{:-^70}".format

    column_info: pd.Series = df[column_name]
    print(banner(f" Information about: {column_name} "))

    details = get_var_details(column_name)
    if not details:
        print(banner(" No associated question found "))

    for question, dataset in details:
        # Interpolating (rather than concatenating with '+') keeps this from
        # raising TypeError when the look-up table holds NaN or other
        # non-string values for the question text or dataset.
        print(banner(f" Dataset: {dataset} "))
        print(banner(f" Question: {question} "))
        print(banner("") + "\n")

    print(f"Number of NA: {column_info.isna().sum()}")
    print(column_info.value_counts())
    print("")

In [7]:
# Print details for every column of `data` whose name contains "sad" (case-insensitive).
words = ["sad"]
to_examine = get_matching_keywords(words)

for col in to_examine:
    get_column_info(col)

-- Information about: PSYCH_body_self_image_questionnaire_feel_sad ---
------------------------ Dataset: 2022_cross -------------------------
 Question: The statements below are all related to you and your body. Read each statement carefully, and decide how true the statement is for you. Use the rating scale below to indicate how true the statement is for you. - My naked body makes me feel sad. 
----------------------------------------------------------------------

Number of NA: 10661
PSYCH_body_self_image_questionnaire_feel_sad
Not at all true of myself       189
Slightly true of myself         172
About halfway true of myself    157
Completely true of myself       132
Mostly true of myself           120
Name: count, dtype: int64

---------- Information about: PSYCH_big_five_inventory_sad -----------
------------------------ Dataset: 2022_cross -------------------------
----- Question: I see myself as someone who - is sad, depressed ------
--------------------------------------------

In [9]:
# Response frequencies for PSYCH_big_five_inventory_sad within data_use
# (value_counts leaves out missing answers by default).
data_use.PSYCH_big_five_inventory_sad.value_counts()

PSYCH_big_five_inventory_sad
Agree a little                597
Disagree Strongly             576
Agree Strongly                363
Neither agree nor disagree    354
Disagree A little             350
Name: count, dtype: int64

In [10]:
# Print details for every column of `data` whose name contains "burnout" (case-insensitive).
words = ["burnout"]
to_examine = get_matching_keywords(words)

for col in to_examine:
    get_column_info(col)

--- Information about: WELLNESS_malach_pines_burnout_measure_tired ---
------------------------ Dataset: 2021_cross -------------------------
 Question: When you think about your life overall, how often do you feel the following? - Tired 
----------------------------------------------------------------------

------------------------ Dataset: 2022_cohort ------------------------
 Question: When you think about your life overall, how often do you feel the following? - Tired 
----------------------------------------------------------------------

------------------------ Dataset: 2022_cross -------------------------
 Question: When you think about your work overall, how often do you feel the following? - Tired 
----------------------------------------------------------------------

------------------------ Dataset: 2023_cohort ------------------------
 Question: When you think about your life overall, how often do you feel the following? - Tired 
-----------------------------------------

In [11]:
# Response frequencies for WELLNESS_maslach_burnout_burned_out within data_use
# (value_counts leaves out missing answers by default).
data_use.WELLNESS_maslach_burnout_burned_out.value_counts()

WELLNESS_maslach_burnout_burned_out
Never                         138
A few times a year or less    106
Once a month or less           72
Every day                      52
A few times a month            51
A few times a week             45
Once a week                    40
Name: count, dtype: int64

In [12]:
# Print details for every column of `data` whose name contains "loneliness" (case-insensitive).
words = ["loneliness"]
to_examine = get_matching_keywords(words)

for col in to_examine:
    get_column_info(col)

--- Information about: LONELY_ucla_loneliness_scale_companionship ----
------------------------ Dataset: 2021_cross -------------------------
 Question: Indicate how often each of the statements below is descriptive of you. - How often do you feel that you lack companionship? 
----------------------------------------------------------------------

------------------------ Dataset: 2022_cohort ------------------------
 Question: Indicate how often each of the statements below is descriptive of you. - How often do you feel that you lack companionship? 
----------------------------------------------------------------------

------------------------ Dataset: 2023_cohort ------------------------
 Question: Indicate how often each of the statements below is descriptive of you. - How often do you feel that you lack companionship? 
----------------------------------------------------------------------

Number of NA: 7531
LONELY_ucla_loneliness_scale_companionship
Some of the time    1962
Hardl

In [14]:
# Frequency of each LONELY_existential_loneliness_scale_score value within data_use.
# value_counts orders the result by count (most frequent first), not by score.
data_use.LONELY_existential_loneliness_scale_score.value_counts()

LONELY_existential_loneliness_scale_score
30.0    136
32.0    108
28.0     97
31.0     94
29.0     91
34.0     90
33.0     83
27.0     71
36.0     61
35.0     58
26.0     51
25.0     51
24.0     50
38.0     49
23.0     44
22.0     42
37.0     34
40.0     31
21.0     29
20.0     28
39.0     24
19.0     24
42.0     23
18.0     23
16.0     22
17.0     22
45.0     17
15.0     16
41.0     15
14.0     15
44.0     13
12.0     12
54.0     12
46.0     10
13.0      9
43.0      8
11.0      7
6.0       6
48.0      5
47.0      4
50.0      3
52.0      3
7.0       3
51.0      3
49.0      2
10.0      2
9.0       2
8.0       2
Name: count, dtype: int64