This Jupyter notebook is analyzing the survey data collected from my friends.

In [40]:
# import pandas and read in the survey data
import pandas as pd

original_data = pd.read_csv("survey_data.csv")
copy_data = original_data

In [41]:
# rename the columns
renamed_data = copy_data.rename(columns={"Timestamp":"timestamp",
                          "What is your major?":"major",
                          "What is your age?":"age",
                          "What year of college are you in?":"year_in_school",
                          "Are you (or have you) been on a sports team at Berry?":"berry_athlete",
                          "Check the boxes that reflect your traveling experience.":"travel_experience",
                          "What has been one country or state you would like to highlight from your travels?":"place_to_highlight",
                          "When you traveled, was it in the middle of semester?":"in_school",
                          "What season of the year was it when you traveled?":"season_of_year",
                          "What was the purpose of your travels?":"purpose_of_travel",
                          "Why did you travel at the time of year you did?":"why_travel",
                          "By what means did you travel?":"means_of_travel"})

Q1: What are the percentages of the demographics?

In [42]:
# get rid of extra spaces
renamed_data['major'] = copy_data['What is your major?'].str.strip()

# make all the entries lower case
renamed_data['major'] = renamed_data['major'].str.lower()

In [61]:
# make a set of all the entries
majors = set(renamed_data['major'].tolist())
len(majors)

27

In [44]:
# find 'com' entries
com_entry = renamed_data['major'].where(renamed_data['major']=='com').dropna()

# get index of 'com' entries
com_index = com_entry.index

# change value of 'com' entries to communications
renamed_data.loc[com_index,'major'] = 'communications'

In [45]:
# find 'sports admin' entries
sports_admin_entry = renamed_data['major'].where(renamed_data['major']=='sports admin').dropna()

# get index of 'sports admin' entries
sports_index = sports_admin_entry.index

# change value of 'sports admin' entries to sports administration
renamed_data.loc[sports_index,'major'] = 'sports administration'

In [46]:
# create dictionary of majors and their percentage

# get total entries
total = len(renamed_data)

# create emppty dictionary
major_dict = {}

# for a major in the set of majors
for current_major in majors:

    # set current major count to 0
    major_count = 0

    # loop through the majors in the original data
    for major_data in renamed_data['major']:

        # if the current major is equal to the major in the data
        if current_major == major_data:

            # increment major count by 1
            major_count+=1
    
    # add a new key value pair for each major in the set of majors
    major_dict[current_major] = round((major_count/total)*100,3)

# sort the dictionary based value in descending order
sorted_dict = dict(sorted(major_dict.items(),key = lambda x : x[1],reverse=True))

sorted_dict

{'exercise science': 11.364,
 'creative technologies': 6.818,
 'nursing': 6.818,
 'computer science': 6.818,
 'animal science': 6.818,
 'economics': 4.545,
 'sports administration': 4.545,
 'biology': 4.545,
 'computer science and creative technologies': 4.545,
 'accounting': 4.545,
 'secondary education and history': 2.273,
 'biochemistry': 2.273,
 'data analytics': 2.273,
 'political science': 2.273,
 'communication and environmental studies': 2.273,
 'business management': 2.273,
 'psychology': 2.273,
 'management': 2.273,
 'math and computer science': 2.273,
 'data science': 2.273,
 'faculty': 2.273,
 'finance': 2.273,
 'anthropology': 2.273,
 'elementary education': 2.273,
 'sports leadership and strategy': 2.273,
 'biology/religion': 2.273,
 'com': 0.0,
 'sports admin': 0.0}

In [47]:
# easier way to do what I did above
major_counts = round(((renamed_data['major'].value_counts())/total)*100,3)
major_counts

major
exercise science                              11.364
nursing                                        6.818
animal science                                 6.818
computer science                               6.818
creative technologies                          6.818
sports administration                          4.545
accounting                                     4.545
economics                                      4.545
computer science and creative technologies     4.545
biology                                        4.545
biochemistry                                   2.273
data science                                   2.273
data analytics                                 2.273
sports leadership and strategy                 2.273
anthropology                                   2.273
management                                     2.273
communications                                 2.273
faculty                                        2.273
communication and environmental studies 

In [48]:
# find the percentage of people of a given age
age_counts = round((renamed_data['age'].value_counts()/total)*100,3)
age_counts

age
21    43.182
20    40.909
22     6.818
19     4.545
29     2.273
18     2.273
Name: count, dtype: float64

In [60]:
# mean age of respondents
renamed_data['age'].mean()

20.681818181818183

In [49]:
# find percentage of the population in a given year of school
year_counts = round((renamed_data['year_in_school'].value_counts()/total)*100,3)
year_counts

year_in_school
Junior       72.727
Senior       13.636
Sophomore     6.818
Senior+       4.545
Freshman      2.273
Name: count, dtype: float64

In [50]:
# percentage of people who have been on an athletics team at Berry
team_counts = round((renamed_data['berry_athlete'].value_counts()/total)*100,3)
team_counts

berry_athlete
Yes    59.091
No     40.909
Name: count, dtype: float64

Q2: What percentage of users checked each reply for their travel experiences?

In [51]:
# split the entries into a list
places_traveled = renamed_data['travel_experience'].str.split(',')

# make a list that contains all the options available to the user
places = ['Asia','South America','The Caribbean','The Middle-East','Europe','Africa',
                   'Canada','Mexico','Never been outside the United States']

# create a dictionary to store the places and the number of users who selected it
places_dict = {}

# loop through the list of options
for current_place in places:

    # set current place count to 0
    place_count = 0

    # loop through all the entries from the survey
    for entry in places_traveled:

        # if an entry contains the current place
        if current_place in entry:

            # increase current place count by 1
            place_count+=1
    
    # store key,value pair in ther dictionary
    places_dict[current_place] = round((place_count/total)*100,2)

# sort the array in descending order
dict(sorted(places_dict.items(),key = lambda x : x[1],reverse=True))

{'South America': 25.0,
 'The Caribbean': 25.0,
 'Never been outside the United States': 15.91,
 'Asia': 11.36,
 'Europe': 11.36,
 'Africa': 4.55,
 'Canada': 4.55,
 'Mexico': 2.27,
 'The Middle-East': 0.0}

Q3: What is the percentage of people have been where?

In [62]:
# get rid of any extra space
renamed_data['place_to_highlight'] = renamed_data['place_to_highlight'].str.strip()

# make all entries lower case
renamed_data['place_to_highlight'] = renamed_data['place_to_highlight'].str.lower()

# get the index of weird entry
long_entry_index = renamed_data['place_to_highlight'].where(
    renamed_data['place_to_highlight']==
    'i’ve only been to dominican republic other than the us. cool place though'
).dropna().index

# change value
renamed_data.loc[(long_entry_index,'place_to_highlight')] = 'dominican republic'

canada_index = renamed_data['place_to_highlight'].where(
    renamed_data['place_to_highlight']==
    'canadá'
).dropna().index

renamed_data.loc[(canada_index,'place_to_highlight')] = 'canada'


set(renamed_data['place_to_highlight'])

place_highlighted_count = renamed_data['place_to_highlight'].value_counts()

round((place_highlighted_count/total)*100,2)

place_to_highlight
brazil                9.09
utah                  6.82
spain                 6.82
mexico                4.55
thailand              4.55
south africa          4.55
italy                 4.55
wyoming               4.55
dominican republic    4.55
norway                2.27
nigeria               2.27
jamaica               2.27
montana               2.27
switzerland           2.27
texas                 2.27
france/england        2.27
bahamas               2.27
cambodia              2.27
new york              2.27
canada                2.27
united kingdom        2.27
costa rica            2.27
peru                  2.27
california            2.27
guatemala             2.27
greece                2.27
netherlands           2.27
colorado              2.27
poland                2.27
puerto rico           2.27
honduras              2.27
Name: count, dtype: float64

In [53]:
# get count of places traveled to
len(place_highlighted_count)

31

Q4: What percentage of people traveled during the semester and what percentage of people did not?

In [54]:
# get the percentage of in school vs. out
school_counts = renamed_data['in_school'].value_counts()
round((school_counts/total)*100,2)

in_school
No     84.09
Yes    15.91
Name: count, dtype: float64

Q5: What was the percentage of people that traveled during each season?

In [55]:
# get the percentage of each season
season_counts = renamed_data['season_of_year'].value_counts()
round((season_counts/total)*100,2)

season_of_year
Summer    75.00
Winter    13.64
Spring     6.82
Fall       4.55
Name: count, dtype: float64

Q6: What was the purpose of travel?

In [56]:
# get the percentage of each purpose response
purpose_counts = renamed_data['purpose_of_travel'].value_counts()
round(purpose_counts/total*100,2)

purpose_of_travel
Vacation                                  38.64
Ministry                                  18.18
Visit Family                              13.64
Study Abroad                              11.36
Athletic Event                             6.82
Family wedding                             2.27
YoungLife                                  2.27
Wedding                                    2.27
Faculty coordinator for an abroad trip     2.27
To Go Home                                 2.27
Name: count, dtype: float64

Q7: Why did people travel when they did?

In [57]:
# get the percentage of each why response
renamed_data['why_travel'] = renamed_data['why_travel'].str.lower()
round((renamed_data['why_travel'].value_counts()/total)*100,2)

why_travel
out of school                               59.09
i had no say in when the travel occurred    18.18
better weather                               9.09
team was in-season                           4.55
class                                        2.27
that’s when the wedding was                  2.27
during covid                                 2.27
during my study abroad term                  2.27
Name: count, dtype: float64

Q7: By what means did people travel?

In [58]:
# make all entries lowercase
renamed_data['means_of_travel'] = renamed_data['means_of_travel'].str.lower()

# find 'cruise line' entry
cruise_line_index = renamed_data.where(renamed_data['means_of_travel']=='cruise line').dropna().index

# change entry to 'cruise'
renamed_data.loc[cruise_line_index,'means_of_travel'] = 'cruise'

# get the count of each means of travel
renamed_data['means_of_travel'].value_counts()

means_of_travel
airplane           39
cruise              2
automobile          2
plane and train     1
Name: count, dtype: int64