# Importing the Dataset

In [38]:
# importing the dataset as a pandas dataframe
import pandas as pd
df = pd.read_csv('task.csv')

print(df.shape)
print(df.columns)
df.head(3)

(80, 14)
Index(['Timestamp', 'Name', 'Year in School', 'Major',
       'Second Major (if applicable)', 'Minor (if applicable)',
       'Second Minor (if applicable)', 'GPA', 'Which team interests you?',
       'Why does this team interest you?',
       'How much time can you commit per week?',
       'What value will you bring to Quant?',
       'What do you hope to get out of Quant?',
       '1 - no, 2 - maybe, 3 - yes, 4 - intern'],
      dtype='object')


Unnamed: 0,Timestamp,Name,Year in School,Major,Second Major (if applicable),Minor (if applicable),Second Minor (if applicable),GPA,Which team interests you?,Why does this team interest you?,How much time can you commit per week?,What value will you bring to Quant?,What do you hope to get out of Quant?,"1 - no, 2 - maybe, 3 - yes, 4 - intern"
0,12/2/2021 17:41:15,Jessica Smith,Freshman,Computer Science,Mathematics,Urban Planning,,4.0,Quantitative Research,I am interested in both Quant Research and Sof...,"5 to 10 hours per week, but up to 20 depending...",I have extensive experience working on enginee...,It is extremely difficult to learn about quant...,3.0
1,12/2/2021 17:51:17,Teresa Aguilar,Freshman,Computer Science,,Mathematics,,4.0,Software Development,I'm interested in both the Software Developmen...,10-15 hours,The main value I'd bring to quant would be fro...,I'm hoping to gain more experience with quanti...,3.0
2,12/2/2021 18:00:50,Angela Miranda,Sophomore,Mathematics & Computer Science,,,,3.91,Software Development,The Software Development team interests me bec...,10-15 hours,Aside from taking most of the core CS classes ...,I'd like to learn more about system programmin...,4.0


# Exploratory Data Analysis and Data Cleaning



In [39]:
import matplotlib.pyplot as plt
import seaborn as sns
df.nunique(axis=0)


Timestamp                                 80
Name                                      79
Year in School                             6
Major                                     36
Second Major (if applicable)               8
Minor (if applicable)                     15
Second Minor (if applicable)               5
GPA                                       38
Which team interests you?                 12
Why does this team interest you?          80
How much time can you commit per week?    63
What value will you bring to Quant?       80
What do you hope to get out of Quant?     80
1 - no, 2 - maybe, 3 - yes, 4 - intern     4
dtype: int64

In [40]:
df.describe()

Unnamed: 0,"1 - no, 2 - maybe, 3 - yes, 4 - intern"
count,60.0
mean,2.583333
std,0.787437
min,1.0
25%,2.0
50%,3.0
75%,3.0
max,4.0


only one column is numeric data, so we must convert the others as necessary

## GPA Column

In [41]:
# convert gpa column to floats


df['GPA'] = pd.to_numeric(df['GPA'],errors = 'coerce')

df2 = df.dropna(subset=['GPA'])
print((abs(len(df2)-len(df)))/len(df)*100) # percentage of missing values in dataset, 




18.75


In [42]:
df.corr()

Unnamed: 0,GPA,"1 - no, 2 - maybe, 3 - yes, 4 - intern"
GPA,1.0,0.553592
"1 - no, 2 - maybe, 3 - yes, 4 - intern",0.553592,1.0


Since 18.75% is a reasonable margin for error we can perform some sort of imputation to the NaN values. 


In [43]:
from sklearn.impute import SimpleImputer
import numpy as np
imp = SimpleImputer(missing_values=np.nan, strategy = 'mean')
imp = imp.fit(df[['GPA']])
df['GPA'] = imp.transform(df[['GPA']]).ravel()
df2 = df.dropna(subset= ['GPA'])
print((abs(len(df2)-len(df)))/len(df)*100) # percentage of missing values in dataset, 
# Goal is to check that all nan values have been replaced by the mean value in the datset. 


0.0


Note: Another possible way to do imputation is K-nearest neighbours but it seems more intuitive to use only the gpa column's data for imputation as we don't have any variables that are clearly correlated with GPA. 

## Hours Available

In [44]:
df['hours']  = df['How much time can you commit per week?'] #making column easier to work with having such a wordy name is annoying
df = df.assign(hours = lambda x: x['hours'].str.extract('(\d+)')) # using regex to scrape out first number from the range/ single number each person gave. 
print(df['hours'])


0      5
1     10
2     10
3     10
4      3
      ..
75    15
76     2
77     5
78     6
79    10
Name: hours, Length: 80, dtype: object


In [45]:
# since dtype is still object, we should convert it to numerical data again for ease of use. 
df['hours'] = pd.to_numeric(df['hours'],errors = 'coerce')
df['hours']


0      5.0
1     10.0
2     10.0
3     10.0
4      3.0
      ... 
75    15.0
76     2.0
77     5.0
78     6.0
79    10.0
Name: hours, Length: 80, dtype: float64

In [46]:
df2 = df.dropna(subset=['hours'])
print((abs(len(df2)-len(df)))/len(df)*100) # percent of hours values that are na. 

3.75


As we can see only 3.75% of the values for the hours are NaN, therefore the column is very usable and should probably be imputed.

In [47]:
imp = SimpleImputer(missing_values=np.nan, strategy = 'mean')
imp = imp.fit(df[['hours']])
df['hours'] = imp.transform(df[['hours']]).ravel()
df2 = df.dropna(subset= ['hours'])
print((abs(len(df2)-len(df)))/len(df)*100) # percentage of missing values in dataset, 
# Goal is to check that all nan values have been replaced by the mean value in the datset. 

0.0


Above code is very similar to the procedure for the gpa dataset, replaced all NaN values with the average GPA in the dataset.

In [48]:
df.drop(columns = ['How much time can you commit per week?']) # irrelevant now that we have the hours column

Unnamed: 0,Timestamp,Name,Year in School,Major,Second Major (if applicable),Minor (if applicable),Second Minor (if applicable),GPA,Which team interests you?,Why does this team interest you?,What value will you bring to Quant?,What do you hope to get out of Quant?,"1 - no, 2 - maybe, 3 - yes, 4 - intern",hours
0,12/2/2021 17:41:15,Jessica Smith,Freshman,Computer Science,Mathematics,Urban Planning,,4.00,Quantitative Research,I am interested in both Quant Research and Sof...,I have extensive experience working on enginee...,It is extremely difficult to learn about quant...,3.0,5.0
1,12/2/2021 17:51:17,Teresa Aguilar,Freshman,Computer Science,,Mathematics,,4.00,Software Development,I'm interested in both the Software Developmen...,The main value I'd bring to quant would be fro...,I'm hoping to gain more experience with quanti...,3.0,10.0
2,12/2/2021 18:00:50,Angela Miranda,Sophomore,Mathematics & Computer Science,,,,3.91,Software Development,The Software Development team interests me bec...,Aside from taking most of the core CS classes ...,I'd like to learn more about system programmin...,4.0,10.0
3,12/2/2021 19:00:07,Andre Mccormick,Junior,Statistics & Computer Science,,Mathematics,,3.74,"Software Development, Strategy Implementation,...",Interested in drawing insights from markets.,My technical skills and background will allow ...,"Learn new skills and most of all, develop a ne...",1.0,10.0
4,12/2/2021 21:23:32,Marcus Gilbert,Junior,Statistics,,,,3.92,Quantitative Research,I want to pursue a career in the field of quan...,I believe my knowledge in the field of Statist...,Some project experience and collaboration skills.,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,12/20/2021 18:10:45,Jose Jones,Sophomore,Finance in Agribusiness,,,,3.10,"Strategy Implementation, Quantitative Research...",I want to be a part of something bigger than m...,"I will bring my knowledge of NFTs, the stock m...",I hope to get a lot of experience and work wit...,,15.0
76,12/20/2021 21:38:28,Elizabeth Wolfe,Sophomore,Finance,Information Systems,,,4.00,"Software Development, Strategy Implementation,...","I have been interested in tech, and investing ...",Diversity of thought is something I feel like ...,I would love to work with a team of another br...,,2.0
77,12/21/2021 0:26:20,Colleen Gordon,Master's,Information science,,,,3.90,Strategy Implementation,My experience in data analytics can be helpful...,Honesty\nDiscipline\nCommitment,Real world experience working on projects,,5.0
78,12/21/2021 1:39:35,Rita Vega,Sophomore,Computer Science,,Statistics,Hopefully Technology and management,3.86,Software Development,This team interests me because I am very inter...,I have work experience as a software developme...,I want more exposure to Quantitative finance. ...,,6.0


In [49]:
df.corr()

Unnamed: 0,GPA,"1 - no, 2 - maybe, 3 - yes, 4 - intern",hours
GPA,1.0,0.512905,-0.198078
"1 - no, 2 - maybe, 3 - yes, 4 - intern",0.512905,1.0,0.020963
hours,-0.198078,0.020963,1.0


Now that we've handled most of our numerical data and can see that there is some correlation between the GPA and the score of the candidate, we'll try and analyze some of the text based data. 


### Text Based Data

In [50]:
df['word count'] = (df['Why does this team interest you?'] + df['What value will you bring to Quant?'] + df['What do you hope to get out of Quant?']).apply(lambda x: len(str(x).split(" ")))
df.head()

Unnamed: 0,Timestamp,Name,Year in School,Major,Second Major (if applicable),Minor (if applicable),Second Minor (if applicable),GPA,Which team interests you?,Why does this team interest you?,How much time can you commit per week?,What value will you bring to Quant?,What do you hope to get out of Quant?,"1 - no, 2 - maybe, 3 - yes, 4 - intern",hours,word count
0,12/2/2021 17:41:15,Jessica Smith,Freshman,Computer Science,Mathematics,Urban Planning,,4.0,Quantitative Research,I am interested in both Quant Research and Sof...,"5 to 10 hours per week, but up to 20 depending...",I have extensive experience working on enginee...,It is extremely difficult to learn about quant...,3.0,5.0,122
1,12/2/2021 17:51:17,Teresa Aguilar,Freshman,Computer Science,,Mathematics,,4.0,Software Development,I'm interested in both the Software Developmen...,10-15 hours,The main value I'd bring to quant would be fro...,I'm hoping to gain more experience with quanti...,3.0,10.0,166
2,12/2/2021 18:00:50,Angela Miranda,Sophomore,Mathematics & Computer Science,,,,3.91,Software Development,The Software Development team interests me bec...,10-15 hours,Aside from taking most of the core CS classes ...,I'd like to learn more about system programmin...,4.0,10.0,241
3,12/2/2021 19:00:07,Andre Mccormick,Junior,Statistics & Computer Science,,Mathematics,,3.74,"Software Development, Strategy Implementation,...",Interested in drawing insights from markets.,10 hours,My technical skills and background will allow ...,"Learn new skills and most of all, develop a ne...",1.0,10.0,46
4,12/2/2021 21:23:32,Marcus Gilbert,Junior,Statistics,,,,3.92,Quantitative Research,I want to pursue a career in the field of quan...,3–5 hours,I believe my knowledge in the field of Statist...,Some project experience and collaboration skills.,2.0,3.0,68


In [51]:
df.corr()

Unnamed: 0,GPA,"1 - no, 2 - maybe, 3 - yes, 4 - intern",hours,word count
GPA,1.0,0.512905,-0.198078,0.101102
"1 - no, 2 - maybe, 3 - yes, 4 - intern",0.512905,1.0,0.020963,0.339824
hours,-0.198078,0.020963,1.0,-0.122676
word count,0.101102,0.339824,-0.122676,1.0


In [52]:
df['char count'] = (df['Why does this team interest you?'] + df['What value will you bring to Quant?'] + df['What do you hope to get out of Quant?']).str.len()
df.head()

Unnamed: 0,Timestamp,Name,Year in School,Major,Second Major (if applicable),Minor (if applicable),Second Minor (if applicable),GPA,Which team interests you?,Why does this team interest you?,How much time can you commit per week?,What value will you bring to Quant?,What do you hope to get out of Quant?,"1 - no, 2 - maybe, 3 - yes, 4 - intern",hours,word count,char count
0,12/2/2021 17:41:15,Jessica Smith,Freshman,Computer Science,Mathematics,Urban Planning,,4.0,Quantitative Research,I am interested in both Quant Research and Sof...,"5 to 10 hours per week, but up to 20 depending...",I have extensive experience working on enginee...,It is extremely difficult to learn about quant...,3.0,5.0,122,790
1,12/2/2021 17:51:17,Teresa Aguilar,Freshman,Computer Science,,Mathematics,,4.0,Software Development,I'm interested in both the Software Developmen...,10-15 hours,The main value I'd bring to quant would be fro...,I'm hoping to gain more experience with quanti...,3.0,10.0,166,1081
2,12/2/2021 18:00:50,Angela Miranda,Sophomore,Mathematics & Computer Science,,,,3.91,Software Development,The Software Development team interests me bec...,10-15 hours,Aside from taking most of the core CS classes ...,I'd like to learn more about system programmin...,4.0,10.0,241,1460
3,12/2/2021 19:00:07,Andre Mccormick,Junior,Statistics & Computer Science,,Mathematics,,3.74,"Software Development, Strategy Implementation,...",Interested in drawing insights from markets.,10 hours,My technical skills and background will allow ...,"Learn new skills and most of all, develop a ne...",1.0,10.0,46,303
4,12/2/2021 21:23:32,Marcus Gilbert,Junior,Statistics,,,,3.92,Quantitative Research,I want to pursue a career in the field of quan...,3–5 hours,I believe my knowledge in the field of Statist...,Some project experience and collaboration skills.,2.0,3.0,68,384


In [53]:
df.corr()

Unnamed: 0,GPA,"1 - no, 2 - maybe, 3 - yes, 4 - intern",hours,word count,char count
GPA,1.0,0.512905,-0.198078,0.101102,0.110002
"1 - no, 2 - maybe, 3 - yes, 4 - intern",0.512905,1.0,0.020963,0.339824,0.335874
hours,-0.198078,0.020963,1.0,-0.122676,-0.113463
word count,0.101102,0.339824,-0.122676,1.0,0.994536
char count,0.110002,0.335874,-0.113463,0.994536,1.0


In [54]:
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))
df['avg_word'] = (df['Why does this team interest you?'] + df['What value will you bring to Quant?'] + df['What do you hope to get out of Quant?']).apply(lambda x: avg_word(x))
df.head()

Unnamed: 0,Timestamp,Name,Year in School,Major,Second Major (if applicable),Minor (if applicable),Second Minor (if applicable),GPA,Which team interests you?,Why does this team interest you?,How much time can you commit per week?,What value will you bring to Quant?,What do you hope to get out of Quant?,"1 - no, 2 - maybe, 3 - yes, 4 - intern",hours,word count,char count,avg_word
0,12/2/2021 17:41:15,Jessica Smith,Freshman,Computer Science,Mathematics,Urban Planning,,4.0,Quantitative Research,I am interested in both Quant Research and Sof...,"5 to 10 hours per week, but up to 20 depending...",I have extensive experience working on enginee...,It is extremely difficult to learn about quant...,3.0,5.0,122,790,5.483607
1,12/2/2021 17:51:17,Teresa Aguilar,Freshman,Computer Science,,Mathematics,,4.0,Software Development,I'm interested in both the Software Developmen...,10-15 hours,The main value I'd bring to quant would be fro...,I'm hoping to gain more experience with quanti...,3.0,10.0,166,1081,5.518072
2,12/2/2021 18:00:50,Angela Miranda,Sophomore,Mathematics & Computer Science,,,,3.91,Software Development,The Software Development team interests me bec...,10-15 hours,Aside from taking most of the core CS classes ...,I'd like to learn more about system programmin...,4.0,10.0,241,1460,5.062241
3,12/2/2021 19:00:07,Andre Mccormick,Junior,Statistics & Computer Science,,Mathematics,,3.74,"Software Development, Strategy Implementation,...",Interested in drawing insights from markets.,10 hours,My technical skills and background will allow ...,"Learn new skills and most of all, develop a ne...",1.0,10.0,46,303,5.608696
4,12/2/2021 21:23:32,Marcus Gilbert,Junior,Statistics,,,,3.92,Quantitative Research,I want to pursue a career in the field of quan...,3–5 hours,I believe my knowledge in the field of Statist...,Some project experience and collaboration skills.,2.0,3.0,68,384,4.661765


In [55]:
df.corr()

Unnamed: 0,GPA,"1 - no, 2 - maybe, 3 - yes, 4 - intern",hours,word count,char count,avg_word
GPA,1.0,0.512905,-0.198078,0.101102,0.110002,0.134431
"1 - no, 2 - maybe, 3 - yes, 4 - intern",0.512905,1.0,0.020963,0.339824,0.335874,-0.141008
hours,-0.198078,0.020963,1.0,-0.122676,-0.113463,0.030919
word count,0.101102,0.339824,-0.122676,1.0,0.994536,-0.117322
char count,0.110002,0.335874,-0.113463,0.994536,1.0,-0.042036
avg_word,0.134431,-0.141008,0.030919,-0.117322,-0.042036,1.0


In [56]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['stopwords'] = (df['Why does this team interest you?'] + df['What value will you bring to Quant?'] + df['What do you hope to get out of Quant?']).apply(lambda x: len([x for x in x.split() if x in stop]))
df.head()

Unnamed: 0,Timestamp,Name,Year in School,Major,Second Major (if applicable),Minor (if applicable),Second Minor (if applicable),GPA,Which team interests you?,Why does this team interest you?,How much time can you commit per week?,What value will you bring to Quant?,What do you hope to get out of Quant?,"1 - no, 2 - maybe, 3 - yes, 4 - intern",hours,word count,char count,avg_word,stopwords
0,12/2/2021 17:41:15,Jessica Smith,Freshman,Computer Science,Mathematics,Urban Planning,,4.0,Quantitative Research,I am interested in both Quant Research and Sof...,"5 to 10 hours per week, but up to 20 depending...",I have extensive experience working on enginee...,It is extremely difficult to learn about quant...,3.0,5.0,122,790,5.483607,48
1,12/2/2021 17:51:17,Teresa Aguilar,Freshman,Computer Science,,Mathematics,,4.0,Software Development,I'm interested in both the Software Developmen...,10-15 hours,The main value I'd bring to quant would be fro...,I'm hoping to gain more experience with quanti...,3.0,10.0,166,1081,5.518072,68
2,12/2/2021 18:00:50,Angela Miranda,Sophomore,Mathematics & Computer Science,,,,3.91,Software Development,The Software Development team interests me bec...,10-15 hours,Aside from taking most of the core CS classes ...,I'd like to learn more about system programmin...,4.0,10.0,241,1460,5.062241,104
3,12/2/2021 19:00:07,Andre Mccormick,Junior,Statistics & Computer Science,,Mathematics,,3.74,"Software Development, Strategy Implementation,...",Interested in drawing insights from markets.,10 hours,My technical skills and background will allow ...,"Learn new skills and most of all, develop a ne...",1.0,10.0,46,303,5.608696,18
4,12/2/2021 21:23:32,Marcus Gilbert,Junior,Statistics,,,,3.92,Quantitative Research,I want to pursue a career in the field of quan...,3–5 hours,I believe my knowledge in the field of Statist...,Some project experience and collaboration skills.,2.0,3.0,68,384,4.661765,28


In [57]:
df.corr()

Unnamed: 0,GPA,"1 - no, 2 - maybe, 3 - yes, 4 - intern",hours,word count,char count,avg_word,stopwords
GPA,1.0,0.512905,-0.198078,0.101102,0.110002,0.134431,0.092039
"1 - no, 2 - maybe, 3 - yes, 4 - intern",0.512905,1.0,0.020963,0.339824,0.335874,-0.141008,0.324417
hours,-0.198078,0.020963,1.0,-0.122676,-0.113463,0.030919,-0.130189
word count,0.101102,0.339824,-0.122676,1.0,0.994536,-0.117322,0.992611
char count,0.110002,0.335874,-0.113463,0.994536,1.0,-0.042036,0.980561
avg_word,0.134431,-0.141008,0.030919,-0.117322,-0.042036,1.0,-0.163788
stopwords,0.092039,0.324417,-0.130189,0.992611,0.980561,-0.163788,1.0


In [58]:
df['Which team interests you?'] = df['Which team interests you?'].str.split(", ")

In [60]:
def job_type(value):
    l = []
    if 'Quantitative Research' in value:
        l.append(1)
    if 'Software Development' in value:
        l.append(2)
    if 'Strategy Implementation' in value:
        l.append(3)
    if 'Business' in value:
        l.append(4)
    return l


df['team numbers'] = (df['Which team interests you?']).apply(lambda x: job_type(x))
df.head()


Unnamed: 0,Timestamp,Name,Year in School,Major,Second Major (if applicable),Minor (if applicable),Second Minor (if applicable),GPA,Which team interests you?,Why does this team interest you?,How much time can you commit per week?,What value will you bring to Quant?,What do you hope to get out of Quant?,"1 - no, 2 - maybe, 3 - yes, 4 - intern",hours,word count,char count,avg_word,stopwords,team numbers
0,12/2/2021 17:41:15,Jessica Smith,Freshman,Computer Science,Mathematics,Urban Planning,,4.0,[Quantitative Research],I am interested in both Quant Research and Sof...,"5 to 10 hours per week, but up to 20 depending...",I have extensive experience working on enginee...,It is extremely difficult to learn about quant...,3.0,5.0,122,790,5.483607,48,[1]
1,12/2/2021 17:51:17,Teresa Aguilar,Freshman,Computer Science,,Mathematics,,4.0,[Software Development],I'm interested in both the Software Developmen...,10-15 hours,The main value I'd bring to quant would be fro...,I'm hoping to gain more experience with quanti...,3.0,10.0,166,1081,5.518072,68,[2]
2,12/2/2021 18:00:50,Angela Miranda,Sophomore,Mathematics & Computer Science,,,,3.91,[Software Development],The Software Development team interests me bec...,10-15 hours,Aside from taking most of the core CS classes ...,I'd like to learn more about system programmin...,4.0,10.0,241,1460,5.062241,104,[2]
3,12/2/2021 19:00:07,Andre Mccormick,Junior,Statistics & Computer Science,,Mathematics,,3.74,"[Software Development, Strategy Implementation...",Interested in drawing insights from markets.,10 hours,My technical skills and background will allow ...,"Learn new skills and most of all, develop a ne...",1.0,10.0,46,303,5.608696,18,"[1, 2, 3]"
4,12/2/2021 21:23:32,Marcus Gilbert,Junior,Statistics,,,,3.92,[Quantitative Research],I want to pursue a career in the field of quan...,3–5 hours,I believe my knowledge in the field of Statist...,Some project experience and collaboration skills.,2.0,3.0,68,384,4.661765,28,[1]


In [61]:
def class_type(value):
    if value == 'Freshman':
        return 1
    if value == 'Sophomore':
        return 2
    if value == 'Junior':
        return 3
    if value == 'Senior':
        return 4
    return 0
df['class value'] = df['Year in School'].apply(lambda x: class_type(x))
df.head()

Unnamed: 0,Timestamp,Name,Year in School,Major,Second Major (if applicable),Minor (if applicable),Second Minor (if applicable),GPA,Which team interests you?,Why does this team interest you?,...,What value will you bring to Quant?,What do you hope to get out of Quant?,"1 - no, 2 - maybe, 3 - yes, 4 - intern",hours,word count,char count,avg_word,stopwords,team numbers,class value
0,12/2/2021 17:41:15,Jessica Smith,Freshman,Computer Science,Mathematics,Urban Planning,,4.0,[Quantitative Research],I am interested in both Quant Research and Sof...,...,I have extensive experience working on enginee...,It is extremely difficult to learn about quant...,3.0,5.0,122,790,5.483607,48,[1],1
1,12/2/2021 17:51:17,Teresa Aguilar,Freshman,Computer Science,,Mathematics,,4.0,[Software Development],I'm interested in both the Software Developmen...,...,The main value I'd bring to quant would be fro...,I'm hoping to gain more experience with quanti...,3.0,10.0,166,1081,5.518072,68,[2],1
2,12/2/2021 18:00:50,Angela Miranda,Sophomore,Mathematics & Computer Science,,,,3.91,[Software Development],The Software Development team interests me bec...,...,Aside from taking most of the core CS classes ...,I'd like to learn more about system programmin...,4.0,10.0,241,1460,5.062241,104,[2],2
3,12/2/2021 19:00:07,Andre Mccormick,Junior,Statistics & Computer Science,,Mathematics,,3.74,"[Software Development, Strategy Implementation...",Interested in drawing insights from markets.,...,My technical skills and background will allow ...,"Learn new skills and most of all, develop a ne...",1.0,10.0,46,303,5.608696,18,"[1, 2, 3]",3
4,12/2/2021 21:23:32,Marcus Gilbert,Junior,Statistics,,,,3.92,[Quantitative Research],I want to pursue a career in the field of quan...,...,I believe my knowledge in the field of Statist...,Some project experience and collaboration skills.,2.0,3.0,68,384,4.661765,28,[1],3


In [62]:
df.corr()

Unnamed: 0,GPA,"1 - no, 2 - maybe, 3 - yes, 4 - intern",hours,word count,char count,avg_word,stopwords,class value
GPA,1.0,0.512905,-0.198078,0.101102,0.110002,0.134431,0.092039,-0.1568
"1 - no, 2 - maybe, 3 - yes, 4 - intern",0.512905,1.0,0.020963,0.339824,0.335874,-0.141008,0.324417,-0.169477
hours,-0.198078,0.020963,1.0,-0.122676,-0.113463,0.030919,-0.130189,-0.183342
word count,0.101102,0.339824,-0.122676,1.0,0.994536,-0.117322,0.992611,-0.183683
char count,0.110002,0.335874,-0.113463,0.994536,1.0,-0.042036,0.980561,-0.203594
avg_word,0.134431,-0.141008,0.030919,-0.117322,-0.042036,1.0,-0.163788,-0.141155
stopwords,0.092039,0.324417,-0.130189,0.992611,0.980561,-0.163788,1.0,-0.166921
class value,-0.1568,-0.169477,-0.183342,-0.183683,-0.203594,-0.141155,-0.166921,1.0
