<a href="https://colab.research.google.com/github/wcj365/python-stats-dataviz/blob/master/misc/fall2022survey.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Oct 20 In-class Practice

Student survey data analysis and visualization.

In [8]:
import pandas as pd
import plotly.express as px

pd.set_option('display.max_colwidth', None)

In [3]:
DATA_SOURCE = "https://raw.githubusercontent.com/wcj365/python-stats-dataviz/master/data/DATA690_FALL2022_Survey.csv"

df = pd.read_csv(DATA_SOURCE)

df.shape

(23, 13)

In [4]:
df.head()

Unnamed: 0,Timestamp,1. My primary educational background,2. My primary job function,3. My level of computer programing knowledge,4. My level of Python progrmming knowledge,5. My level of Statistics knowledge,6. My Familarity with GitHub,7. My gender,8. My Birth Year,9. What do you enjoy doing in your leisure time?,10. What do you expect to get out of this course?,11. What is your long-term career goal?,12. Other courses taking this semester
0,2022/08/31 2:44:42 PM AST,"STEM (Science, Technology, Engineering, Math)",Technology/engineering-oriented,Intermediate,Intermediate,Intermediate,I use GitHub regularly,Female,1/11/1997,"I love doing yoga, dancing and cooking",I want to learn visualization in an efficient ...,I want to be a data scientist in companies lik...,Special Topics in Data Science | DATA 690 - Ar...
1,2022/08/31 3:10:34 PM AST,"STEM (Science, Technology, Engineering, Math)",Full-time Student,Beginner,Beginner,Beginner,I use GitHub occasionally,Female,1991,watch movie,I want to understand the way to analyze and co...,To be Data Scientist,Platforms for Big Data Processing | DATA 603;S...
2,2022/08/31 3:11:14 PM AST,"STEM (Science, Technology, Engineering, Math)",Full-time Student,Intermediate,Intermediate,Beginner,I don't use GitHub,Female,2000,Watch anime and songs,Have good command using python for data viz an...,Data engineer,"IS 603 Decision Support Systems, IS 755 Data M..."
3,2022/08/31 4:42:40 PM AST,"STEM (Science, Technology, Engineering, Math)",Technology/engineering-oriented,Beginner,Beginner,Beginner,I don't use GitHub,Male,1969,This is it - take classes.,I know a little statistics and I know a little...,My 2nd retirement!,
4,2022/08/31 4:44:08 PM AST,"STEM (Science, Technology, Engineering, Math)",Technology/engineering-oriented,Beginner,Beginner,Beginner,I don't use GitHub,Female,1999,Listening to music,To have strong grip on python programming,To see myself as a data scientist,Introduction to Data Science | DATA 601;Data M...


In [9]:
df_q12 = df[["12. Other courses taking this semester"]]
df_q12.head()

Unnamed: 0,12. Other courses taking this semester
0,"Special Topics in Data Science | DATA 690 - Artificial Intelligence Practice;ENMG 652 Management, Leadership and Communication"
1,Platforms for Big Data Processing | DATA 603;Special Topics in Data Science | DATA 690 - Data Structures & Algorithms
2,"IS 603 Decision Support Systems, IS 755 Data Mining"
3,
4,Introduction to Data Science | DATA 601;Data Management | DATA 604


## Pay attention to the difference between the two:

- df[["12. Other courses taking this semester"]]
- df["12. Other courses taking this semester"]

In [11]:
other_courses = df["12. Other courses taking this semester"].tolist()
other_courses

['Special Topics in Data Science | DATA 690 - Artificial Intelligence Practice;ENMG 652 Management, Leadership and Communication ',
 'Platforms for Big Data Processing | DATA 603;Special Topics in Data Science | DATA 690 - Data Structures & Algorithms',
 'IS 603 Decision Support Systems, IS 755 Data Mining ',
 'None',
 'Introduction to Data Science | DATA 601;Data Management | DATA 604',
 'Ethical And Legal Issues in Data Science | DATA 605;Special Topics in Data Science | DATA 690 - Financial Data Science',
 'Engineering Management 652, Cyber Security 620',
 'Introduction to Data Science | DATA 601;Data Management | DATA 604',
 'Introduction to Data Science | DATA 601;Data Management | DATA 604',
 'Introduction to Data Science | DATA 601;Ethical And Legal Issues in Data Science | DATA 605',
 'Platforms for Big Data Processing | DATA 603;Special Topics in Data Science | DATA 690 - Financial Data Science',
 'HIT 723- Public health Informatics, ENMG 650- Project management fundamentals '

In [13]:
# Initialize the course_student_count dict with empty items
# This is similar to initialize a list with empty items: a_list = []

course_student_count = {}

for course_string in other_courses:       # first loop process student one at a time

    courses = course_string.split(";")    # split the course string into a list of courses 

    for course in courses:                # second loop process courses taken by the student, one course at a time
        if course in course_student_count:  # if the course is already in the dictionary, increment the count by 1
            #course_student_count[course] = course_student_count[course] + 1
            course_student_count[course] += 1
        else:                               # if the course is not in the dictionary, add the course to the dictionary with count 1
            course_student_count[course] = 1

course_student_count


{'Special Topics in Data Science | DATA 690 - Artificial Intelligence Practice': 1,
 'ENMG 652 Management, Leadership and Communication ': 1,
 'Platforms for Big Data Processing | DATA 603': 3,
 'Special Topics in Data Science | DATA 690 - Data Structures & Algorithms': 2,
 'IS 603 Decision Support Systems, IS 755 Data Mining ': 1,
 'None': 1,
 'Introduction to Data Science | DATA 601': 9,
 'Data Management | DATA 604': 3,
 'Ethical And Legal Issues in Data Science | DATA 605': 6,
 'Special Topics in Data Science | DATA 690 - Financial Data Science': 8,
 'Engineering Management 652, Cyber Security 620': 1,
 'HIT 723- Public health Informatics, ENMG 650- Project management fundamentals ': 1,
 'ENMG - 652': 2,
 'Intro to Data Analysis and Machine Learning | DATA 602': 2,
 'DATA 690: Deep learning': 1}

In [16]:
# Create a data frame from a dictionary and specify the name for the columns
df_course_count = pd.DataFrame(course_student_count.items(), columns=["Course Name", "Students"])

df_course_count

Unnamed: 0,Course Name,Students
0,Special Topics in Data Science | DATA 690 - Artificial Intelligence Practice,1
1,"ENMG 652 Management, Leadership and Communication",1
2,Platforms for Big Data Processing | DATA 603,3
3,Special Topics in Data Science | DATA 690 - Data Structures & Algorithms,2
4,"IS 603 Decision Support Systems, IS 755 Data Mining",1
5,,1
6,Introduction to Data Science | DATA 601,9
7,Data Management | DATA 604,3
8,Ethical And Legal Issues in Data Science | DATA 605,6
9,Special Topics in Data Science | DATA 690 - Financial Data Science,8


## Data Cleansing

In [23]:
def remove_special_topic(course_name):
    if course_name.startswith("Special Topics"):
        return course_name.split("|")[1]
    else:
        return course_name

test_course = "Special Topics in Data Science | DATA 690 - Financial Data Science"
print(remove_special_topic(test_course))

test_course2 = "Intro to Data Analysis and Machine Learning | DATA 602"
print(remove_special_topic(test_course2))

 DATA 690 - Financial Data Science
Intro to Data Analysis and Machine Learning | DATA 602


In [24]:
df_course_count["cleansed_course_name"] = df_course_count["Course Name"].apply(remove_special_topic)

df_course_count

Unnamed: 0,Course Name,Students,cleansed_course_name
0,Special Topics in Data Science | DATA 690 - Artificial Intelligence Practice,1,DATA 690 - Artificial Intelligence Practice
1,"ENMG 652 Management, Leadership and Communication",1,"ENMG 652 Management, Leadership and Communication"
2,Platforms for Big Data Processing | DATA 603,3,Platforms for Big Data Processing | DATA 603
3,Special Topics in Data Science | DATA 690 - Data Structures & Algorithms,2,DATA 690 - Data Structures & Algorithms
4,"IS 603 Decision Support Systems, IS 755 Data Mining",1,"IS 603 Decision Support Systems, IS 755 Data Mining"
5,,1,
6,Introduction to Data Science | DATA 601,9,Introduction to Data Science | DATA 601
7,Data Management | DATA 604,3,Data Management | DATA 604
8,Ethical And Legal Issues in Data Science | DATA 605,6,Ethical And Legal Issues in Data Science | DATA 605
9,Special Topics in Data Science | DATA 690 - Financial Data Science,8,DATA 690 - Financial Data Science


In [27]:
df_sorted = df_course_count.sort_values(by="Students", ascending=False)

fig = px.bar(df_sorted, x="cleansed_course_name", y="Students")

fig.show()