# Importing the Dataset

In [4]:
# importing the dataset as a pandas dataframe
import pandas as pd
df = pd.read_csv('task.csv')

print(df.shape)
print(df.columns)
df.head(3)

(80, 14)
Index(['Timestamp', 'Name', 'Year in School', 'Major',
       'Second Major (if applicable)', 'Minor (if applicable)',
       'Second Minor (if applicable)', 'GPA', 'Which team interests you?',
       'Why does this team interest you?',
       'How much time can you commit per week?',
       'What value will you bring to Quant?',
       'What do you hope to get out of Quant?',
       '1 - no, 2 - maybe, 3 - yes, 4 - intern'],
      dtype='object')


Unnamed: 0,Timestamp,Name,Year in School,Major,Second Major (if applicable),Minor (if applicable),Second Minor (if applicable),GPA,Which team interests you?,Why does this team interest you?,How much time can you commit per week?,What value will you bring to Quant?,What do you hope to get out of Quant?,"1 - no, 2 - maybe, 3 - yes, 4 - intern"
0,12/2/2021 17:41:15,Jessica Smith,Freshman,Computer Science,Mathematics,Urban Planning,,4.0,Quantitative Research,I am interested in both Quant Research and Sof...,"5 to 10 hours per week, but up to 20 depending...",I have extensive experience working on enginee...,It is extremely difficult to learn about quant...,3.0
1,12/2/2021 17:51:17,Teresa Aguilar,Freshman,Computer Science,,Mathematics,,4.0,Software Development,I'm interested in both the Software Developmen...,10-15 hours,The main value I'd bring to quant would be fro...,I'm hoping to gain more experience with quanti...,3.0
2,12/2/2021 18:00:50,Angela Miranda,Sophomore,Mathematics & Computer Science,,,,3.91,Software Development,The Software Development team interests me bec...,10-15 hours,Aside from taking most of the core CS classes ...,I'd like to learn more about system programmin...,4.0


# Exploratory Data Analysis


In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
df.nunique(axis=0)


Timestamp                                 80
Name                                      79
Year in School                             6
Major                                     36
Second Major (if applicable)               8
Minor (if applicable)                     15
Second Minor (if applicable)               5
GPA                                       38
Which team interests you?                 12
Why does this team interest you?          80
How much time can you commit per week?    63
What value will you bring to Quant?       80
What do you hope to get out of Quant?     80
1 - no, 2 - maybe, 3 - yes, 4 - intern     4
dtype: int64

In [6]:
df.describe()

Unnamed: 0,"1 - no, 2 - maybe, 3 - yes, 4 - intern"
count,60.0
mean,2.583333
std,0.787437
min,1.0
25%,2.0
50%,3.0
75%,3.0
max,4.0


only one column is numeric data, so we must convert the others as necessary

## GPA Column

In [13]:
# convert gpa column to floats


df['GPA'] = pd.to_numeric(df['GPA'],errors = 'coerce')

df2 = df.dropna(subset=['GPA'])
print((abs(len(df2)-len(df)))/len(df)*100) # percentage of missing values in dataset, 




18.75


In [15]:
df.corr()

Unnamed: 0,GPA,"1 - no, 2 - maybe, 3 - yes, 4 - intern"
GPA,1.0,0.553592
"1 - no, 2 - maybe, 3 - yes, 4 - intern",0.553592,1.0


Since 18.75% is a reasonable margin for error we can perform some sort of imputation to the NaN values. 


In [16]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy = 'mean')
imp = imp.fit(df[['GPA']])
df['GPA'] = imp.transform(df[['GPA']]).ravel()
df2 = df.dropna(subset= ['GPA'])
print((abs(len(df2)-len(df)))/len(df)*100) # percentage of missing values in dataset, 
# Goal is to check that all nan values have been replaced by the mean value in the datset. 


NameError: name 'np' is not defined

Note: Another possible way to do imputation is K-nearest neighbours but it seems more intuitive to use only the gpa column's data for imputation as we don't have any variables that are clearly correlated with GPA. 

## Hours Available

In [17]:
df['hours']  = df['How much time can you commit per week?'] #making column easier to work with having such a wordy name is annoying
df = df.assign(hours = lambda x: x['hours'].str.extract('(\d+)')) # using regex to scrape out first number from the range/ single number each person gave. 
print(df['hours'])


0      5
1     10
2     10
3     10
4      3
      ..
75    15
76     2
77     5
78     6
79    10
Name: hours, Length: 80, dtype: object


In [19]:
# since dtype is still object, we should convert it to numerical data again for ease of use. 
df['hours'] = pd.to_numeric(df['hours'],errors = 'coerce')
df['hours']


0      5.0
1     10.0
2     10.0
3     10.0
4      3.0
      ... 
75    15.0
76     2.0
77     5.0
78     6.0
79    10.0
Name: hours, Length: 80, dtype: float64

In [20]:
df2 = df.dropna(subset=['hours'])
print((abs(len(df2)-len(df)))/len(df)*100) # percent of hours values that are na. 

3.75


As we can see only 3.75% of the values for the hours are NaN, therefore the column is very usable and should probably be imputed. 