# Data cleaner + Variable Searcher
### Add to the beginning of your notebook
**Imports the data and does the recommended cleaning. Adds 3 functions modified from variable searcher to return detailed information about variables.**
Variable names:
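- cols: the variable lookup table loaded from `var_names.csv` (used to find the question text and dataset for each variable)
- data: the cleaned dataframe loaded from `CSCS_data_anon.csv` (all-empty columns dropped; only rows where `REMOVE_case` is `No` are kept)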

Functions:
- get_matching_keywords: takes a list of strings and returns the names of all variables that contain at least one keyword (case-insensitive).
- get_var_details: takes a variable name and returns a list of (question, dataset) pairs, one for each entry in `cols` that matches the variable.
- get_column_info: takes a variable name and prints every question associated with the variable, followed by the number of missing responses and the count of each response value.

In [1]:
import pandas as pd
# Lookup table of variable names; queried by the helper functions below.
cols = pd.read_csv("var_names.csv")

# Survey responses. The listed placeholder strings are parsed as missing values.
data = pd.read_csv(
    "CSCS_data_anon.csv",
    low_memory=False,
    na_values=["9999", "", " ", "Presented but no response", "NA"],
)

# Boolean Series indexed by column name: True where every value is missing.
empty = data.isna().all()
data = data.loc[:, ~empty]  # keep non empty columns only

# Keep only the rows whose REMOVE_case value is 'No'; copy so that `data`
# is an independent frame rather than a selection of the unfiltered one.
data = data[data["REMOVE_case"] == "No"].copy()

In [2]:
def get_matching_keywords(keywords: list[str]) -> list[str]:
    """
    Return the names of the columns of ``data`` that contain any keyword.

    Matching is a case-insensitive substring test against each column name.

    Parameters:
        keywords: the substrings to look for. A single bare string is
            treated as a one-element list instead of being searched
            character by character.

    Returns:
        The matching column names, in the order they appear in ``data``.
        An empty ``keywords`` list matches nothing.
    """
    # Guard against the easy mistake of passing "lonely" instead of
    # ["lonely"]: iterating a str would match on each single character.
    if isinstance(keywords, str):
        keywords = [keywords]

    # Lower-case every keyword once, not once per column.
    needles = [keyword.lower() for keyword in keywords]

    matches = []
    for col in data.columns:
        lowered = col.lower()
        if any(needle in lowered for needle in needles):
            matches.append(col)
    return matches

def get_var_details(var_name: str) -> list[tuple[str, str]]:
    """
    Return the (question, dataset) pairs recorded for a variable.

    Looks up every row of ``cols`` whose ``new_var`` equals ``var_name``
    and pairs that row's ``text`` with its ``dataset``.

    Parameters:
        var_name: the variable name to look up (exact match).

    Returns:
        One ``(text, dataset)`` tuple per matching row, in table order.
        The list is empty when no row matches.
    """
    matching_rows = cols[cols['new_var'] == var_name]
    # Zip the two columns directly; this avoids building a Series for
    # every row the way iterrows() does.
    return list(zip(matching_rows['text'], matching_rows['dataset']))

def get_column_info(column_name: str) -> None:
    """
    Print a summary of one column of ``data``.

    The output is a banner with the column name, then one
    dataset/question banner per entry returned by ``get_var_details``
    (or a "no associated question" banner when there are none), then the
    number of missing values and the count of each distinct value.

    Parameters:
        column_name: name of a column in ``data``.

    Returns:
        None; everything is written to standard output.

    Raises:
        KeyError: if ``column_name`` is not a column of ``data``.
    """
    column: pd.Series = data[column_name]
    print(f"{' Information about: ' + column_name + ' ':-^70}")

    details = get_var_details(column_name)
    if not details:
        print(f"{' No associated question found ':-^70}")

    for question, dataset in details:
        # Interpolate instead of concatenating: a non-string entry in the
        # lookup table (e.g. NaN from a blank CSV cell) would make
        # `str + value` raise TypeError.
        dataset_banner = f" Dataset: {dataset} "
        question_banner = f" Question: {question} "
        print(f"{dataset_banner:-^70}")
        print(f"{question_banner:-^70}")
        print(f"{'':-^70}\n")

    print(f"Number of NA: {column.isna().sum()}")
    print(column.value_counts())
    print("")

In [3]:
# Example usage
# Find every column of `data` whose name contains "lonely"
# (the match is case-insensitive), then print a summary of each one.
words = ["lonely"]
to_examine = get_matching_keywords(words)
for col in to_examine:
    # Prints the associated questions, the NA count and the value counts.
    get_column_info(col)

--- Information about: LONELY_ucla_loneliness_scale_companionship ----
------------------------ Dataset: 2021_cross -------------------------
 Question: Indicate how often each of the statements below is descriptive of you. - How often do you feel that you lack companionship? 
----------------------------------------------------------------------

------------------------ Dataset: 2022_cohort ------------------------
 Question: Indicate how often each of the statements below is descriptive of you. - How often do you feel that you lack companionship? 
----------------------------------------------------------------------

------------------------ Dataset: 2023_cohort ------------------------
 Question: Indicate how often each of the statements below is descriptive of you. - How often do you feel that you lack companionship? 
----------------------------------------------------------------------

Number of NA: 6764
LONELY_ucla_loneliness_scale_companionship
Some of the time    1650
Hardl