In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy import stats

import re
import nltk
#nltk.download('stopwords')
#nltk.download('wordnet') 


from nltk.corpus import stopwords
st = set(stopwords.words('english'))

from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA

In [4]:
# Project-specific filler words removed in addition to NLTK's English stop words.
_EXTRA_STOP_WORDS = ["using", "show", "result", "large", "also", "one", "two", "new", "previously", "shown", 'math']

# Built once when the cell is run rather than on every call, so repeated calls
# to lemmatize() do not re-read the stop-word corpus or re-create the lemmatizer.
_STOP_WORDS = frozenset(stopwords.words("english")).union(_EXTRA_STOP_WORDS)
_LEMMATIZER = WordNetLemmatizer()

# A markup tag, either literal (<b>, </p>, <br/>) or entity-escaped (&lt;b&gt;).
# The first character after the opener must be a letter so that comparisons
# such as "x < 5" are not mistaken for tags.
_TAG_PATTERN = re.compile(r"(?:<|&lt;)/?[a-zA-Z][^<>]*?(?:>|&gt;)")


def lemmatize(bio):
    """Normalise a free-text bio into a space-joined string of lemmatized words.

    Steps, in order:
      1. Replace markup tags with a space. This has to happen before step 2,
         which deletes the angle brackets / entity punctuation and would
         otherwise leave the tag names behind as ordinary words.
      2. Replace every character that is not an ASCII letter with a space
         (this drops digits, punctuation and non-ASCII letters).
      3. Lower-case the text and split it on whitespace.
      4. Drop stop words (NLTK English list plus _EXTRA_STOP_WORDS) and
         lemmatize the remaining words with WordNetLemmatizer.lemmatize,
         called with its default arguments.

    Parameters
    ----------
    bio : str
        Raw text. Non-string input (e.g. NaN from a DataFrame) is not handled
        and raises TypeError from the regex call.

    Returns
    -------
    str
        The kept, lemmatized words joined by single spaces; '' if none remain.
    """
    text = _TAG_PATTERN.sub(" ", bio)
    text = re.sub('[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    return " ".join(_LEMMATIZER.lemmatize(word) for word in words if word not in _STOP_WORDS)

In [6]:
# Quick sanity check of lemmatize(): the comma is stripped and 'then' is
# filtered out as a stop word, so only 'ok' remains (see the cell output).
lemmatize('ok, then')

'ok'

In [7]:
# Phrase groups searched for in each bio. The phrases are written without stop
# words and in singular form (e.g. 'many year'), so `corpus` is presumably the
# lemmatize() output for each bio -- TODO confirm against the cell that builds it.
experience_list = [
    'experience teaching tutoring',
    'experience tutoring student',
    'experience working student',
    'many student',
    'many year',
    'year experience teaching',
    'year experience tutoring',
    'year experience working',
    'year teaching experience',
    'year tutoring experience',
]

welcoming_list = [
    'look forward hearing',
    'look forward helping',
    'look forward meeting',
    'look forward working',
    'forward working',
    'hello name',
    'hi name',
    'please contact',
    'please feel',
    'would like',
    'would love',
    'feel free contact',
    'feel free reach',
    'free contact question',
]

goal_list = [
    'goal help student',
    'achieve academic goal',
    'help student succeed',
    'helping student achieve',
]

passion_list = [
    'enjoy helping student',
    'enjoy working student',
    'look forward',
    'believe every student',
    'love help',
    'love teaching',
    'love helping student',
    'love working student',
    'would love help',
]

# Output column -> phrases whose presence switches that column's flag on.
keyword_groups = {
    'experience_kw': experience_list,
    'welcoming_kw': welcoming_list,
    'goal_kw': goal_list,
    'passion_kw': passion_list,
}

# One 0/1 flag per bio and per group, collected in a single pass over corpus.
keyword_flags = {column: [] for column in keyword_groups}
for b in corpus:
    for column, phrases in keyword_groups.items():
        # Plain substring test: 1 if any phrase occurs anywhere in the bio.
        keyword_flags[column].append(int(any(phrase in b for phrase in phrases)))

experience_array = keyword_flags['experience_kw']
welcoming_array = keyword_flags['welcoming_kw']
goal_array = keyword_flags['goal_kw']
passion_array = keyword_flags['passion_kw']

for column, flags in keyword_flags.items():
    df_eng[column] = np.array(flags)

In [11]:
# The 35 subjects treated as "popular" by the subject-count features.
popular_subjects = [
    'Prealgebra', 'Algebra 1', 'Geometry', 'Algebra 2', 'Elementary Math',
    'Trigonometry', 'Precalculus', 'SAT Math', 'ACT Math', 'Grammar',
    'Vocabulary', 'Calculus', 'Elementary Science', 'GED', 'Reading',
    'English', 'Proofreading', 'Probability', 'GRE', 'Spelling',
    'Writing', 'PSAT', 'ACT Science', 'ACT English', 'Biology',
    'Physical Science', 'Microsoft Word', 'Statistics', 'Microsoft Excel', 'SAT Reading',
    'Chemistry', 'SAT Writing', 'ACT Reading', 'Physics', 'American History',
]

# Every subject: the popular ones first (same order), followed by the rest.
# `+` builds a new list, so the two names never alias the same object.
full_subjects_list = popular_subjects + [
    'Microsoft PowerPoint', 'ASVAB', 'Literature', 'General Computer', 'ESL/ESOL',
    'Geography', 'TOEFL', 'GMAT', 'World History', 'SSAT',
    'Anatomy', 'Psychology', 'Government & Politics', 'Spanish', 'European History',
    'Physiology', 'Study Skills', 'Astronomy', 'Microeconomics', 'Ecology',
    'HTML', 'Macroeconomics', 'Philosophy', 'Java', 'Music Theory',
    'Microbiology', 'Biochemistry', 'Linear Algebra', 'Social Studies', 'Phonics',
    'Financial Accounting', 'Managerial Accounting', 'Finance', 'Differential Equations', 'Geology',
    'Computer Programming', 'SQL', 'Python', 'LSAT', 'SPSS',
    'French', 'Handwriting', 'Nursing', 'Political Science', 'Adobe Photoshop',
    'JavaScript', 'Art History', 'German', 'C', 'Law',
    'Latin', 'Adobe Illustrator', 'R', 'Art Theory', 'Chinese',
    'Italian', 'Japanese', 'Portuguese', 'NCLEX', 'Discrete Math',
    'Romanian', 'Elementary (K-6th)', 'Organic Chemistry', 'TAKS', 'Business',
    'Public Speaking', 'Special Needs', 'Biostatistics', 'Macintosh', 'Praxis',
    'Russian', 'Computer Science', 'Track & Field',
]

In [None]:
# NOTE(review): `feature`, `mid_subjects` and `unpopular_subjects` are not
# defined in the cells shown here -- confirm they exist before running this.
def _subject_tier(subject):
    """Return 0/1/2 for a popular/mid/unpopular subject, or None if unlisted.

    Tiers are tested in that order and the first match wins, so a subject that
    appears in several lists is only ever credited to the earliest one.
    """
    if subject in popular_subjects:
        return 0
    if subject in mid_subjects:
        return 1
    if subject in unpopular_subjects:
        return 2
    return None


popular_list, mid_list, unpopular_list = [], [], []
for subjects in df_eng[feature]:
    # Per-row tallies, indexed by the tier number returned by _subject_tier.
    tier_totals = [0, 0, 0]
    for subject in subjects:
        tier = _subject_tier(subject)
        if tier is not None:
            tier_totals[tier] += 1
    popular_list.append(tier_totals[0])
    mid_list.append(tier_totals[1])
    unpopular_list.append(tier_totals[2])

df_eng['num_popular_subjects'] = np.array(popular_list)
df_eng['num_mid_subjects'] = np.array(mid_list)
df_eng['num_unpopular_subjects'] = np.array(unpopular_list)

# engineer edu degree categories

# Substring markers for each degree category. The undergrad list must not
# contain '': the empty string is a substring of every string, which would set
# undergrad_degree to 1 on every row and make the feature constant.
undergrad = ['BA', 'B.A.', 'BS', 'B.S.']
postgrad = ['MS', 'M.S', 'MA', 'M.A', 'masters', 'Masters', 'CA', 'C.A',
            'MBA', 'M.B.A', 'MD', 'M.D', 'PhD', 'Ph.D', 'Ph.D.', 'Graduate']
certified = ['education', 'Ced', 'certified']

# NOTE(review): matching is a case-sensitive substring test, so short markers
# also fire inside longer tokens (e.g. 'BA' matches within 'MBA', 'MA' within
# 'MATH'). Kept as-is; tighten to token matching if that is not intended.


def _mentions_any(text, markers):
    """Return 1 if `text` contains at least one non-empty marker, else 0.

    Non-string values (e.g. NaN for a missing education entry) yield 0 instead
    of raising TypeError from the `in` test. Empty markers are ignored because
    '' is contained in every string.
    """
    if not isinstance(text, str):
        return 0
    return int(any(marker and marker in text for marker in markers))


undergrad_degree = [_mentions_any(e, undergrad) for e in df_eng['edu']]
postgrad_degree = [_mentions_any(e, postgrad) for e in df_eng['edu']]
certified_degree = [_mentions_any(e, certified) for e in df_eng['edu']]

df_eng['undergrad_degree'] = np.array(undergrad_degree)
df_eng['postgrad_degree'] = np.array(postgrad_degree)
df_eng['certified_degree'] = np.array(certified_degree)

# get university rankings
df_wur = pd.read_csv('world-university-rankings_cwurData.csv')
df_wur_select = df_wur[['institution', 'world_rank']]

# Institution names from the first 30 rows of the rankings file, extended with
# common short names. Assumes the CSV rows are ordered best-first -- TODO confirm.
ranked_institutions = df_wur_select['institution'][:30]
school_short_names = ['Berkeley', 'Caltech', 'Harvard', 'Yale', 'Princeton', 'MIT', 'Stanford']
list_of_top_schools = list(ranked_institutions) + school_short_names

# 1 when the education text contains any top-school name as a substring, else 0.
top_school = [
    int(any(school in e for school in list_of_top_schools))
    for e in df_eng['edu']
]
df_eng['top_school'] = np.array(top_school)

def _space_separated_piece_count(value):
    """Return how many pieces str(value) splits into on single spaces.

    Equal to the number of ' ' characters plus one, so an empty string counts
    as 1 and every extra consecutive space adds one to the result.
    NOTE(review): other whitespace (newlines, tabs) is not treated as a
    separator -- confirm this is the intended length measure.
    """
    return str(value).count(" ") + 1


df_eng['bio_count'] = df_eng['bio'].apply(_space_separated_piece_count)
df_eng['desc_count'] = df_eng['descriptions'].apply(_space_separated_piece_count)