<a href="https://colab.research.google.com/github/tsnow2010/umbc_data690_fall2024/blob/main/session_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# UMBC DATA690 Fall 2024

## Session 02

## Step 1: Load the Survey Response Data

In [None]:
import pandas as pd
from random import randint

DATA_URL = 'https://raw.githubusercontent.com/wcj365/python-stats-dataviz/master/data/UMBC%20DATA%20690%20FALL%202024%20DR%20WANG%20Student%20Survey.csv'

df = pd.read_csv(DATA_URL)

df.shape
df.head(5)
df.tail(2)
df.sample(5)

# Below gives a random sample of a field from column 10
# prev_column_list = list(df['10. What do you expect to get out of this course?'])
# print(prev_column_list[randint(0,len(prev_column_list)-1)])


A good handson experience on statistics and python programming 


## Step 2: Explore Age Distribution


In [None]:
# Access a column in a panda object like its a dictionary and then convert into a list.
yob_list = list(df['8. My Birth Year'])


In [None]:
cleansed_yob_list = [] # Empty list

for yob in yob_list:
  cleansed_yob = yob[-4:]
  cleansed_yob_list.append(cleansed_yob)

cleansed_yob_list


['2000',
 '2002',
 '1997',
 '1999',
 '2000',
 '2001',
 '2000',
 '1999',
 '1978',
 '2000',
 '1999',
 '2000',
 '1998',
 '2001',
 '2001',
 '2000',
 '2002',
 '1992',
 '2001',
 '1997',
 '1999']

In [None]:
# Now let's modify the same column and convert the years of birth to ages.

age_list = []

# Iterates through years and converts into age (CAO 2024)
for yob in cleansed_yob_list:
  age = int(2024-int(yob))
  age_list.append(age)

age_list

[24,
 22,
 27,
 25,
 24,
 23,
 24,
 25,
 46,
 24,
 25,
 24,
 26,
 23,
 23,
 24,
 22,
 32,
 23,
 27,
 25]

In [None]:
# Finds minimum age.
min(age_list)

# Finds maximum age.
max(age_list)

# Finds average
total_age = 0

for age in age_list:
  total_age = total_age + age

# Calculates and rounds average age.
round(total_age/len(age_list))


26

In [None]:
revised_age_total = 0
num_skipped = 0

for age in age_list:
  if age > 35:
    num_skipped += 1
    continue
  revised_age_total += age

print('Revised average age, minus outliers, is: ' + str(round(revised_age_total/(len(age_list)-num_skipped))))

Revised average age, minus outliers, is: 25


In [None]:
# Now, lets remove some smaller numbered outliers as well.

revised_age_total_2 = 0
num_skipped_2 = 0

for age in age_list:
  if age >= 35 or age <= 22:
    print('Age', age, 'is skipped.')
    num_skipped_2 += 1
  else:
    revised_age_total_2 += age

print('Revised average age, minus small and large outliers, is:', str(round(revised_age_total_2/(len(age_list)-num_skipped_2))))

Age 22 is skipped.
Age 46 is skipped.
Age 22 is skipped.
Revised average age, minus small and large outliers, is: 25


## Step 3: Explore Educational Background

> * Analyzing our students' educational background.

In [42]:
# Gathers educational background data from responses.
edu_list = list(df['1. My primary educational background'])

# This function is a comprehensive strip() function, removing parentheses and commas, and lowercases the word.

def strip_all(word):
  strip1 = word.strip(')')
  strip2 = strip1.strip('(')
  strip3 = strip2.strip(',')
  lower_strip = strip3.lower()
  return lower_strip

# Lists of keywords for (3) different educational disciplines.
stem_kw_list = ['stem', 'mathematics', 'software', 'engineering', 'science', 'psychology']
social_sci_kw_list = ['history', 'political', 'social']
crim_just_kw_list = ['criminal', 'justice', 'policing', 'law']

# Counts number of students in multiple disciplines using keywords
stem_stud_count = 0
social_sci_stud_count = 0
crim_just_stud_count = 0
other_stud_count = 0

for student_edu in edu_list:
  # Creates a list from the response with each element being (1) word with punctuation
  kw_list = student_edu.split()

  # Removes any commas or periods or other punctuation around keywords.
  np_kw_list = [strip_all(word) for word in kw_list]

  # for loop checks to see if a student gave keyword responses to (3) different educational backgrounds.
  for kw in np_kw_list:

    # Checks for STEM words WITHOUT social science terms.
    if kw in stem_kw_list and kw != 'political' and kw != 'social':
      stem_stud_count += 1
      break

    # Checks for social science terms
    elif kw in social_sci_kw_list:
      social_sci_stud_count += 1
      break

    # Checks for criminal justice terms
    elif kw in crim_just_kw_list:
      crim_just_stud_count += 1
      break

    # Continues with next word if word is not a "keyword" and increments other_stud count if on last word.
    elif kw is np_kw_list[len(np_kw_list)-1]:
        other_stud_count += 1
        break
    continue

# Prints results
print("STEM Students: ", stem_stud_count)
print("Social Science Students: ", social_sci_stud_count)
print("Criminal Justice Students: ", crim_just_stud_count)
print("Other Students: ", other_stud_count)
print("Total Students: ", len(edu_list))


STEM Students:  19
Social Science Students:  1
Criminal Justice Students:  1
Other Students:  0
Total Students:  21


## Step 4: Explore Job Function

> * Analyzing our students' full-time employment fields.

In [43]:
# Gathers job field data from responses.
job_list = list(df['2. My primary job function'])

# Starts counting responses per (3) different responses from job-field question.
f_t_stud_count = 0
tech_engi_count = 0
busi_count = 0

for job in job_list:
  if job == 'Full-time Student':
    f_t_stud_count += 1
  elif job == 'Technology/engineering-oriented':
    tech_engi_count += 1
  elif job == 'Business/service-oriented':
    busi_count += 1

# Prints results
print('Full-time Student Count: ', f_t_stud_count)
print('Students Working In Tech/Engineering Fields: ', tech_engi_count)
print('Students Working In Business/Service Fields: ', busi_count)
print("Total Students: ", len(job_list))



Full-time Student Count:  17
Students Working In Tech/Engineering Fields:  2
Students Working In Business/Service Fields:  2
Total Students:  21


## Step 5: Explore How Many Students Wanted 'Statistics' review

> * Using similar techniques from 'Explore Educational Background,' I explore how many students suggested that they wanted to review Statistics in this course.

In [None]:
# Gathers responses regarding what students hope to get out of the course.
why_list = list(df['10. What do you expect to get out of this course?'])

# Lists of keywords suggesting review of Statistics.
stats_kw_list = ['statistics', 'statistical', 'stats', 'statistically']
review_kw_list = ['learn', 'review', 'brush', 'basics', 'hands-on', 'experience', 'knowledge', 'understanding']

# Counts number of students wanting to review Statistics.
stat_review_stud_count = 0

for stud_why in why_list:

  # Assign flags to be True if and only if they responded suggesting a review of Statistics.
  stats_flag = False
  review_flag = False

  # Creates a list of str from the student's response with each element being (1) keyword with punctuation.
  kw_list = stud_why.split()

  # Removes any commas or periods or other punctuation around keywords.
  np_kw_list = [strip_all(word) for word in kw_list]

  # for loop checks to see if student gave keyword responses for Statistics and review.
  for kw in np_kw_list:

    # Checks if Statistics was mentioned.
    if kw in stats_kw_list:
      stats_flag = True
      continue

    # Checks for keywords suggesting a "review"
    elif kw in review_kw_list:
      review_flag = True
      continue

    # Breaks inner loop if both flags are true and increments count of students.
    if stats_flag == True and review_flag == True:
      stat_review_stud_count += 1
      break

# Prints results and calculates the rounded percentage of students wanting a review of statistics
print("Number of Students Wanting Review of Statistics: ", stat_review_stud_count)
print("Total Students: ", len(why_list))
print(str(round(float(stat_review_stud_count/len(why_list))*100)) + '%', 'of students')






Number of Students Wanting Review of Statistics:  10
Total Students:  21
48% of students
