# ECE 143 Final Project

## Preprocess the data
### This includes removing punctuation, seperating columns accordingly, and converting column types to relative data types

In [454]:
#import pandas 
import pandas as pd
import numpy as np

In [455]:
#define necessary functions for clean up
def get_course_number(course):
    course_num = course.split("-")
    return course_num[0]

def get_course_description(course):
    course_description = course.split("-")
    final_course_description = course_description[1].split("(")
    return final_course_description[0]

def grade_cleaner(grade):
    temp =  grade.split("(")
    temp_grade = temp[1]
    temp_grade_2 = temp_grade.split(")")
    final_grade = temp_grade_2[0]
    return final_grade

def percent_cleaner(percent):
    temp = percent.split(" ")
    return temp[0]

def get_department(name):
    department = name.split(" ")
    return department[0]
    

In [456]:
#check dataframe size before clean up
df = pd.read_csv("143Data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5230 entries, 0 to 5229
Data columns (total 10 columns):
Instructor            5229 non-null object
Course                5230 non-null object
Term                  5230 non-null object
Enroll                5230 non-null int64
Evals Made            5230 non-null int64
Rcmnd Class           5230 non-null object
Rcmnd Instr           5230 non-null object
Study Hrs/wk          5230 non-null float64
Avg Grade Expected    5072 non-null object
Avg Grade Received    4067 non-null object
dtypes: float64(1), int64(2), object(7)
memory usage: 408.7+ KB


In [457]:
#read the dataframe, apply cleanup to data and convert data accordingly
df.dropna(inplace = True)
df["Course Number"] = df["Course"].apply(get_course_number)
df["Course Description"] = df["Course"].apply(get_course_description)
df["Avg Grade Received"] = df["Avg Grade Received"].apply(grade_cleaner)
df["Avg Grade Expected"] = df["Avg Grade Expected"].apply(grade_cleaner)
df["Rcmnd Class"] = df["Rcmnd Class"].apply(percent_cleaner)
df["Rcmnd Instr"] = df["Rcmnd Instr"].apply(percent_cleaner)
df["Department"] = df["Course Number"].apply(get_department)
df["Rcmnd Class"] = df["Rcmnd Class"].astype(float)
df["Rcmnd Instr"] = df["Rcmnd Instr"].astype(float)
df["Avg Grade Received"] = df["Avg Grade Received"].astype(float)
df["Avg Grade Expected"] = df["Avg Grade Expected"].astype(float)
df["Term"] = df["Term"].astype("category")
df["Department"] = df["Department"].astype("category")

In [458]:
# get dataframe info after cleanup
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4045 entries, 2 to 5228
Data columns (total 13 columns):
Instructor            4045 non-null object
Course                4045 non-null object
Term                  4045 non-null category
Enroll                4045 non-null int64
Evals Made            4045 non-null int64
Rcmnd Class           4045 non-null float64
Rcmnd Instr           4045 non-null float64
Study Hrs/wk          4045 non-null float64
Avg Grade Expected    4045 non-null float64
Avg Grade Received    4045 non-null float64
Course Number         4045 non-null object
Course Description    4045 non-null object
Department            4045 non-null category
dtypes: category(2), float64(5), int64(2), object(4)
memory usage: 390.2+ KB


In [459]:
#display dataframe
df.head(2)

Unnamed: 0,Instructor,Course,Term,Enroll,Evals Made,Rcmnd Class,Rcmnd Instr,Study Hrs/wk,Avg Grade Expected,Avg Grade Received,Course Number,Course Description,Department
2,"Abi Samra, Nicholas Camill",ECE 180 - Topics in ECE (A),S219,29,16,100.0,100.0,4.23,3.8,3.48,ECE 180,Topics in ECE,ECE
3,"Abi Samra, Nicholas Camill",ECE 180 - Topics in ECE (A),S119,30,11,100.0,100.0,6.14,3.91,3.53,ECE 180,Topics in ECE,ECE


In [460]:
#rearrange the columns in the dataframe and drop unnecssary columns

In [461]:
df.drop(columns="Course", inplace=True)

In [462]:
df[["Instructor", "Course Number", "Course Description", "Term", "Enroll", "Evals Made", "Rcmnd Class", "Rcmnd Instr", "Study Hrs/wk", "Avg Grade Expected", "Avg Grade Received"]]
df.index = np.arange(0, len(df))


In [463]:
df.head(10)

Unnamed: 0,Instructor,Term,Enroll,Evals Made,Rcmnd Class,Rcmnd Instr,Study Hrs/wk,Avg Grade Expected,Avg Grade Received,Course Number,Course Description,Department
0,"Abi Samra, Nicholas Camill",S219,29,16,100.0,100.0,4.23,3.8,3.48,ECE 180,Topics in ECE,ECE
1,"Abi Samra, Nicholas Camill",S119,30,11,100.0,100.0,6.14,3.91,3.53,ECE 180,Topics in ECE,ECE
2,"Baghdadchi, Saharnaz",SP19,113,99,83.3,84.2,6.99,3.14,2.87,ECE 101,Linear Systems Fundamentals,ECE
3,"Kuzum, Duygu",SP19,53,21,94.7,100.0,6.82,3.26,2.89,ECE 103,Fundamntls/Devices & Materials,ECE
4,"Fullerton, Eric E",SP19,104,34,80.0,60.0,10.1,3.0,2.8,ECE 107,Electromagnetism,ECE
5,"Zeger, Kenneth A.",SP19,163,54,57.4,76.6,7.39,2.67,2.61,ECE 109,Engineering Probability&Stats,ECE
6,"Quest, Kevin B",SP19,59,19,94.4,94.4,3.94,3.39,3.36,ECE 120,Solar System Physics,ECE
7,"Sievenpiper, Daniel F.",SP19,37,15,100.0,100.0,6.81,3.75,3.16,ECE 123,Antenna Systems Engineering,ECE
8,"Esmaili, Gholamreza",SP19,26,10,100.0,100.0,9.83,3.22,3.22,ECE 125B,Intro to Power Electronics II,ECE
9,"Gessner, Richard K.",SP19,39,27,66.7,80.8,9.17,3.44,3.52,ECE 140B,The Art of Product Eng II,ECE


## Process the data - step1
### In this row we combine all the rows with the same professor, term, and course number to essentially combine all the sections together

In [532]:
#df.set_index(keys=["Instructor", "Course Number", "Term"], inplace=True)

In [533]:
df.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Enroll,Evals Made,Rcmnd Class,Rcmnd Instr,Study Hrs/wk,Avg Grade Expected,Avg Grade Received,Course Description,Department
Instructor,Course Number,Term,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Abi Samra, Nicholas Camill",ECE 180,S219,29,16,100.0,100.0,4.23,3.8,3.48,Topics in ECE,ECE
"Abi Samra, Nicholas Camill",ECE 180,S119,30,11,100.0,100.0,6.14,3.91,3.53,Topics in ECE,ECE
"Baghdadchi, Saharnaz",ECE 101,SP19,113,99,83.3,84.2,6.99,3.14,2.87,Linear Systems Fundamentals,ECE


In [534]:
#df.set_index(keys=["Instructor", "Course Number", "Term"], inplace=True)

In [535]:
#df[df.duplicated()]

In [536]:
selected = df.groupby(["Department", "Course Number", "Instructor", "Term"])

In [537]:
new = selected.agg({"Enroll" : "sum",
                    "Evals Made": "sum",
                    "Rcmnd Class": "mean",
                    "Rcmnd Instr" : "mean",
                    "Study Hrs/wk": "mean",
                    "Avg Grade Expected": "mean",
                    "Avg Grade Received": "mean"
                  })
new.to_csv("Preprocess_Completed.csv")