In [194]:
import polars as pl
import pandas  as pd
import os

# Why use relative addresses?

In this notebook, we will illustrate 

1. That relative addresses for loading data files works, but
2. Using absolute addresses for loading data files *will not*.

## Problem 1 - Load the `lat_long_example.csv` file using a relative address.

**Tasks.**
1. Open a terminal, start `nu`, and navigate to the root menu of your first/primary data repository,
2. Use `ls **/*` to get the relative address of `lat_long_example.csv`, and
3. Use `polars` to load and inspect these data using this relative path.

In [32]:
relative_path = "./data/lat_long_examples.csv"

In [33]:
print(os.getcwd())
(lat_lng_example :=
 pl.read_csv(relative_path)
)

/mnt/c/users/tmich/Desktop/my_repos/activities/health survey/health_survey_dsci_326


City 1,Lat 1,Long 1,City 2,Lat 2,Long 2,Distance from Web (km)
str,f64,f64,str,f64,f64,f64
"""Winona, MN""",44.050556,-91.66833,"""Ames, IA""",42.018056,-93.62,276.48
"""Glagow, Scotland, UK""",55.861111,-4.25,"""Ames, IA""",42.018056,-93.62,6237.63


## Problem 2 - Load the `lat_long_example.csv` file using a absolute address.

**Tasks.**
1. Open a terminal, start `nu`, and navigate to the root menu of *one of your first/primary data repository,
2. Use `glob **/*` to get the absolute address of `lat_long_example.csv`, and
3. Use `polars` to load and inspect these data.

In [43]:
absolute_path = "/mnt/c/Users/tmich/Desktop/my_repos/activities/health_survey/health_survey_dsci_326/data/lat_long_examples.csv"

In [44]:
(lat_lng_example :=
 pl.read_csv(absolute_path)
)

City 1,Lat 1,Long 1,City 2,Lat 2,Long 2,Distance from Web (km)
str,f64,f64,str,f64,f64,f64
"""Winona, MN""",44.050556,-91.66833,"""Ames, IA""",42.018056,-93.62,276.48
"""Glagow, Scotland, UK""",55.861111,-4.25,"""Ames, IA""",42.018056,-93.62,6237.63


## Illustrating the problem with absolute addresses

While the relative address in problem 1 points to the data IN THIS COPY of the repo, the absolute address points to the data in EXACTLY one of the copies of the repository. This becomes a problem if (A) anything changes in that repository, or (B) we are working on a different machine.

**Tasks.** To illustrate why this is a problem, do the following.

1. From your first/primary repository commit and push this notebook to GitHub,
2. Fetch and pull this notebook to another local copy of the repository,
3. In your file explorer (Files or Finder), move your first/primary repository into another folder, e.g., make a new folder and drag-and-drop the repo.
4. Rerun the cells in each local copy of the repository and document your findings in the WORD document. 

In [37]:
#why  use relative references?
#answer:Absolute refgerences are miserable to work with.
#I would much rather refer within the folder that can change locations freely, than to need to constantly change all my hard coded addresses.

In [129]:
#healthcare assignment
# import data 

rv = pd.read_csv('data/ReverseCodingItems.csv')
hd = pd.read_csv('data/health_survey.csv')

In [130]:
# do the pivot

# Melt the DataFrame
hd_stack = hd.melt(id_vars=["id"], value_vars=['F1',	'F1.1',	'F1.2',	'F1.3',	'F1.4',	'F1.5',	'F1.6',	'F1.7',	'F2',	'F2.1',	'F2.10',	'F2.11',	'F2.2',	'F2.3',	'F2.4',	'F2.5',	'F2.6',	'F2.7',	'F2.8',	'F2.9',	'F3',	'F3.1',	'F3.2',	'F3.3',	'F3.4',	'F3.5',	'F4',	'F4.1',	'F4.2',	'F4.3',	'F4.4',	'F5',	'F5.1',	'F5.2',	'F5.3',	'F5.4',	'F5.5',	'F5.6',	'F5.7',	'F6',	'F6.1',	'F6.2',	'F6.3',	'F6.4'
])
print(hd_stack.head( 20))

    id variable                       value
0    1       F1              Somewhat Agree
1    2       F1              Somewhat Agree
2    3       F1              Strongly Agree
3    4       F1              Somewhat Agree
4    5       F1              Strongly Agree
5    6       F1              Strongly Agree
6    7       F1              Strongly Agree
7    8       F1              Strongly Agree
8    9       F1              Strongly Agree
9   10       F1              Strongly Agree
10  11       F1              Somewhat Agree
11  12       F1              Strongly Agree
12  13       F1              Somewhat Agree
13  14       F1              Somewhat Agree
14  15       F1              Somewhat Agree
15  16       F1  Neither Agree nor Disagree
16  17       F1              Somewhat Agree
17  18       F1              Somewhat Agree
18  19       F1              Somewhat Agree
19  20       F1  Neither Agree nor Disagree


In [131]:
# join in the reverse table. using one to many join.

inner_joined_df = pd.merge(hd_stack, rv, how='inner', left_on ='variable',right_on = 'Column Name')

#do a select to only grab the columns we need

joined_select = inner_joined_df[['id', 'variable','value','Needs Reverse Coding?' ]]

#test = joined_select[joined_select['variable'] == 'F5'] #decided to do a test to ensure that the reverse coding came in right.


print(joined_select.head(10))


   id variable           value Needs Reverse Coding?
0   1       F1  Somewhat Agree                    No
1   2       F1  Somewhat Agree                    No
2   3       F1  Strongly Agree                    No
3   4       F1  Somewhat Agree                    No
4   5       F1  Strongly Agree                    No
5   6       F1  Strongly Agree                    No
6   7       F1  Strongly Agree                    No
7   8       F1  Strongly Agree                    No
8   9       F1  Strongly Agree                    No
9  10       F1  Strongly Agree                    No


In [132]:
#  handle the reverse coding

def assign_numbers(row):
    #1
    if row['value'] == 'Strongly Disagree' and row['Needs Reverse Coding?'] == 'No':
        return 1
    elif row['value'] == 'Strongly Disagree' and row['Needs Reverse Coding?'] == 'Yes':
        return 5
    elif row['value'] == 'Strongly Disagree' and row['Needs Reverse Coding?'] == None:
        return 1

    #2    
    elif row['value'] == 'Somewhat Disagree' and   row['Needs Reverse Coding?'] == 'No':
        return 2
    elif row['value'] == 'Somewhat Disagree' and row['Needs Reverse Coding?'] == 'Yes':
        return 4
    elif row['value'] == 'Somewhat Disagree' and   row['Needs Reverse Coding?'] == None:
        return 2

    #3
    elif row['value'] == 'Neither Agree nor Disagree':
        return 3

    #4
    elif row['value'] == 'Somewhat Agree' and row['Needs Reverse Coding?'] == 'No':
        return 4
    elif row['value'] == 'Somewhat Agree' and row['Needs Reverse Coding?'] == 'Yes':
        return 2
    elif row['value'] == 'Somewhat Agree' and row['Needs Reverse Coding?'] == None:
        return 4

    #5
    elif row['value'] == 'Strongly Agree' and row['Needs Reverse Coding?'] == 'No':
        return 5
    elif row['value'] == 'Strongly Agree' and row['Needs Reverse Coding?'] == 'Yes' :
        return 1
    elif row['value'] == 'Strongly Agree' and row['Needs Reverse Coding?'] == None:
        return 5






joined_select['TempReverseValue'] = joined_select.apply(assign_numbers, axis=1)
print(joined_select)

        id variable                       value Needs Reverse Coding?  \
0        1       F1              Somewhat Agree                    No   
1        2       F1              Somewhat Agree                    No   
2        3       F1              Strongly Agree                    No   
3        4       F1              Somewhat Agree                    No   
4        5       F1              Strongly Agree                    No   
...    ...      ...                         ...                   ...   
11611  260     F6.4  Neither Agree nor Disagree                   Yes   
11612  261     F6.4                         NaN                   Yes   
11613  262     F6.4  Neither Agree nor Disagree                   Yes   
11614  263     F6.4           Somewhat Disagree                   Yes   
11615  264     F6.4  Neither Agree nor Disagree                   Yes   

       TempReverseValue  
0                   4.0  
1                   4.0  
2                   5.0  
3                  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joined_select['TempReverseValue'] = joined_select.apply(assign_numbers, axis=1)


In [133]:
def assign_numbers(row):# yes i did make the temp unreversed column here after I already did the version that included the reversals XD
     #trying to keep you on your tows ;)
    #1
    if row['value'] == 'Strongly Disagree':
        return 1

    #2    
    elif row['value'] == 'Somewhat Disagree':
        return 2

    #3
    elif row['value'] == 'Neither Agree nor Disagree':
        return 3

    #4
    elif row['value'] == 'Somewhat Agree':
        return 4

    #5
    elif row['value'] == 'Strongly Agree':
        return 5

joined_select['TempCoded'] = joined_select.apply(assign_numbers, axis=1)
print(joined_select)

        id variable                       value Needs Reverse Coding?  \
0        1       F1              Somewhat Agree                    No   
1        2       F1              Somewhat Agree                    No   
2        3       F1              Strongly Agree                    No   
3        4       F1              Somewhat Agree                    No   
4        5       F1              Strongly Agree                    No   
...    ...      ...                         ...                   ...   
11611  260     F6.4  Neither Agree nor Disagree                   Yes   
11612  261     F6.4                         NaN                   Yes   
11613  262     F6.4  Neither Agree nor Disagree                   Yes   
11614  263     F6.4           Somewhat Disagree                   Yes   
11615  264     F6.4  Neither Agree nor Disagree                   Yes   

       TempReverseValue  TempCoded  
0                   4.0        4.0  
1                   4.0        4.0  
2           

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joined_select['TempCoded'] = joined_select.apply(assign_numbers, axis=1)


In [140]:
#I've already used my logic above,  so decided for part E i'd just create the value.
joined_select['recoded_value'] = joined_select['TempReverseValue']
print(joined_select)


        id variable                       value Needs Reverse Coding?  \
0        1       F1              Somewhat Agree                    No   
1        2       F1              Somewhat Agree                    No   
2        3       F1              Strongly Agree                    No   
3        4       F1              Somewhat Agree                    No   
4        5       F1              Strongly Agree                    No   
...    ...      ...                         ...                   ...   
11611  260     F6.4  Neither Agree nor Disagree                   Yes   
11612  261     F6.4                         NaN                   Yes   
11613  262     F6.4  Neither Agree nor Disagree                   Yes   
11614  263     F6.4           Somewhat Disagree                   Yes   
11615  264     F6.4  Neither Agree nor Disagree                   Yes   

       TempReverseValue  TempCoded  recoded_value  
0                   4.0        4.0            4.0  
1                  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joined_select['recoded_value'] = joined_select['TempReverseValue']


In [183]:
# recode for question types and select.
joined_select['question_type'] = joined_select['variable'].str.slice(0,2)

joined_pre_final = joined_select [['id' , 'question_type','recoded_value' ]]


In [209]:
#conduct aggregation

grouped = joined_pre_final.groupby(["question_type", "id"])["recoded_value"].sum().reset_index(name="total_value")


In [210]:
print(grouped)


     question_type   id  total_value
0               F1    1         31.0
1               F1    2         31.0
2               F1    3         36.0
3               F1    4         32.0
4               F1    5         37.0
...            ...  ...          ...
1579            F6  260         22.0
1580            F6  261         15.0
1581            F6  262         19.0
1582            F6  263         21.0
1583            F6  264         17.0

[1584 rows x 3 columns]


In [211]:
# final pivot long.

# Pivot the DataFrame
pivoted_finale = grouped.pivot(
    index="id",
    columns = "question_type",
    values="total_value"
)

print(pivoted_finale)


question_type    F1    F2    F3    F4    F5    F6
id                                               
1              31.0  48.0  20.0  17.0  28.0  18.0
2              31.0  47.0  19.0  17.0  27.0  20.0
3              36.0  46.0  19.0  18.0  32.0  17.0
4              32.0  54.0  12.0  15.0  30.0  16.0
5              37.0  47.0  22.0  19.0  36.0  19.0
..              ...   ...   ...   ...   ...   ...
260            32.0  50.0  22.0  18.0  34.0  22.0
261            31.0  48.0  24.0  19.0  28.0  15.0
262            30.0  47.0  20.0  23.0  33.0  19.0
263            38.0  50.0  20.0  19.0  31.0  21.0
264            33.0  53.0  16.0  21.0  33.0  17.0

[264 rows x 6 columns]


In [214]:
pivoted_finale.to_csv('data/health_survey_summary.csv',index = False)