<h1><center>Q1</center></h1>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

df = pd.read_csv('../Samson/salary_df_car_fut.csv')

In [2]:
#list of text and numeric column names
textcolnames = ['Company', 'Title', 'Address', 'Emp_type', 'Seniority', 'Industry', 'Responsibility', 'Requirements']
numericcolnames = ['Salary_avg']

#drop column
df.drop('Unnamed: 0', axis=1, inplace=True)

#clean 'Salary' column
df = df[df['Salary'] != 'NONE']
df['Salary_lower'] = df['Salary'].apply(lambda x: x.replace('$','').replace(',','').split('to')[0]).astype('float')
df['Salary_higher'] = df['Salary'].apply(lambda x: x.replace('$','').replace(',','').split('to')[1]).astype('float')
df['Salary_avg'] = (df['Salary_lower'] + df['Salary_higher']) /2
df['Salary_avg'] = df['Salary_avg'].apply(lambda x: round(x/12,1) if x > 30000 else x)
df.drop(['Salary', 'Salary_lower', 'Salary_higher'], axis=1, inplace=True)

#clean text columns
df[textcolnames] = df[textcolnames].apply(lambda x: x.str.lower())
df[textcolnames] = df[textcolnames].apply(lambda x: x.str.replace('[^\w\s]',' '))

#function to combine text columns for CountVec to process
def combine_text_columns(data_frame, to_drop=numericcolnames):
    """ converts all text in each row of data_frame to single vector """
    
    # Drop non-text columns that are in the df
    to_drop = set(to_drop) & set(data_frame.columns.tolist())
    text_data = data_frame.drop(to_drop, axis=1)
    
    # Replace nans with blanks
    text_data.fillna("", inplace=True)
    
    # Join all text items in a row that have a space in between
    return text_data.apply(lambda x: " ".join(x), axis=1)

#### CREATING TARGET VARIABLE

In [3]:
#create predictor
X = combine_text_columns(df)

#create target variable
df['Salary_binary'] = df['Salary_avg'].apply(lambda x: 1 if x > 10000 else 0)
print('number of high salaried rows: {}'.format(len(df[df['Salary_binary']  == 1])))
print('number of low salaried rows: {}'.format(len(df[df['Salary_binary']  == 0])))

#create new dataframe with predictors and target
df2 = pd.DataFrame(X,columns=['all_text'])
df2['over10k'] = df['Salary_binary']

number of high salaried rows: 211
number of low salaried rows: 1091


#### CLASSIFICATION

In [4]:
#train_test_split
from sklearn.model_selection import train_test_split
y = df2['over10k']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, 
                                                    test_size=0.2, stratify=df2['over10k'])


from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

#add to stopwords list
from nltk.corpus import stopwords

stopwordslist = stopwords.words('english')
stopwordslist.extend(['requirements', 'roles', 'responsibilities', 'qualifications', 
                    'responsibilitiesresponsibilities', 'scope','requirementseducation',
                     'required', 'na', 'n a'])


#randomforestclassifier (an ensemble method that using averaging)
forest = Pipeline([
    ('vectorizer', CountVectorizer(stop_words=stopwordslist)),
    ('clf', RandomForestClassifier())
                ])

forest.fit(X_train, y_train)
print('randomforest')
print(forest.score(X_test, y_test))
print('='*40)

#logisticregression
logistic = Pipeline([
    ('vectorizer', CountVectorizer(stop_words=stopwordslist)),
    ('clf', LogisticRegression())
                ])

logistic.fit(X_train, y_train)
print('logistic regression')
print(logistic.score(X_test, y_test))
print('='*40)
print('base line')
print(y.value_counts() / len(y))


randomforest
0.8314176245210728
logistic regression
0.8390804597701149
base line
0    0.837942
1    0.162058
Name: over10k, dtype: float64


#### FEATURE SELECTION

In [5]:
#initialize model
forest = RandomForestClassifier()
cv = CountVectorizer(stop_words=stopwordslist)
X_1 = cv.fit_transform(X)

#train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_1.toarray(), df2['over10k'], random_state=1, test_size=0.2, stratify=df2['over10k'])
forest.fit(X_train, y_train)
forest.score(X_test, y_test)

#feature importances
f_impt = forest.feature_importances_
idx = cv.get_feature_names()
df_forestfeatures = pd.DataFrame(f_impt,  index=idx, columns=['feature_impt'])
df_forestfeatures.sort_values('feature_impt', ascending=False, inplace=True) #how do I track down these 20 words? 
df_forestfeatures[:20]

Unnamed: 0,feature_impt
lead,0.012041
risk,0.009351
services,0.007561
prioritisation,0.007349
ftl,0.006422
senior,0.006029
arrange,0.005928
strategy,0.005577
drive,0.005102
thus,0.005008


In [6]:
#function to convert each column feature into a list of words
def featurewordlist(df, col_name):
    
    sentences = []
    for x in df[col_name]:
        sentences.append(x)

    words = []
    for sentence in sentences:
        words.extend(sentence.split())

    return list(set(words))

#creating the list of words for each feature
company = featurewordlist(df, 'Company')
title = featurewordlist(df, 'Title')
address = featurewordlist(df, 'Address')
emp_type = featurewordlist(df, 'Emp_type')
seniority = featurewordlist(df, 'Seniority')
industry = featurewordlist(df, 'Industry')
responsibility = featurewordlist(df, 'Responsibility')
requirements =featurewordlist(df, 'Requirements')

#creating empty lists to do a word count 
company_count = [] 
title_count = []
address_count = []
emp_type_count = []
seniority_count = []
industry_count = []
responsibility_count = []
requirements_count = []

countdict = {'company': company_count, 
             'title': title_count, 
             'address': address_count, 
             'emp_type': emp_type_count, 
             'seniority': seniority_count, 
             'industry': industry_count, 
             'responsibility': responsibility_count, 
             'requirements': requirements_count}

#extract only the top 20 word features
top20feats = df_forestfeatures['feature_impt'][:20].index

#loop through to print word feature and the feature column it belongs to
for x in top20feats:
    print (x)
    if x in company:
        print ('list is company')
        company_count.append(x)
    elif x in title:
        print ('list is title')
        title_count.append(x)
    elif x in address:
        print ('list is address')
        address_count.append(x)
    elif x in emp_type:
        print ('list is emp_type')
        emp_type_count.append(x)
    elif x in seniority:
        print ('list is seniority')
        seniority_count.append(x)
    elif x in industry:
        print ('list is industry')
        industry_count.append(x)
    elif x in responsibility:
        print ('list is responsibility')
        responsibility_count.append(x)
    elif x in requirements:
        print ('list is requirements')
        requirements_count.append(x)
    else:
        print ('word not in list, how can that be?')
    print ('='*20)

lead
list is title
risk
list is company
services
list is company
prioritisation
list is responsibility
ftl
list is responsibility
senior
list is title
arrange
list is responsibility
strategy
list is company
drive
list is address
thus
list is responsibility
wealth
list is title
cloud
list is title
approximately
list is responsibility
scoring
list is title
including
list is responsibility
believe
list is responsibility
click
list is responsibility
practice
list is title
years
list is responsibility
analytics
list is company


In [7]:
for key, value in countdict.items():
    #print value
    print(key.upper() + ':', len([item for item in value]))

COMPANY: 4
TITLE: 6
ADDRESS: 1
EMP_TYPE: 0
SENIORITY: 0
INDUSTRY: 0
RESPONSIBILITY: 9
REQUIREMENTS: 0


#### N-GRAMS

In [8]:
#initialize model
forest = RandomForestClassifier()
cv = CountVectorizer(stop_words=stopwordslist, ngram_range=(1,2))
X = cv.fit_transform(X).toarray()
y = df2['over10k']

#train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, stratify=df2['over10k'])
forest.fit(X_train, y_train)
print (forest.score(X_test, y_test))

#feature importances
f_impt = forest.feature_importances_
idx = cv.get_feature_names()
df_forestfeatures = pd.DataFrame(f_impt,  index=idx, columns=['feature_impt'])
df_forestfeatures.sort_values('feature_impt', ascending=False, inplace=True) #how do I track down these 20 words? 
df_forestfeatures[:20]

0.8544061302681992


Unnamed: 0,feature_impt
senior management,0.013077
business product,0.005581
grow committed,0.005158
geographies,0.005154
banking,0.005053
performance reporting,0.004571
assigned time,0.004555
effectiveness,0.004507
liquidity risk,0.004244
energy environments,0.004244


#### RANDOMIZED GRID SEARCH

In [9]:
from sklearn.model_selection import RandomizedSearchCV

random_grid = {'bootstrap': [True, False],
 'max_depth': [20, 40, 60, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 1000, 1800]}

rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 2, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

print(rf_random.best_params_)

rf_random.best_estimator_.score(X_test, y_test)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] n_estimators=200, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=40, bootstrap=True 
[CV] n_estimators=200, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=40, bootstrap=True 
[CV] n_estimators=200, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=40, bootstrap=True 
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=60, bootstrap=False 
[CV]  n_estimators=200, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=40, bootstrap=True, total=  52.1s
[CV]  n_estimators=200, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=40, bootstrap=True, total=  54.7s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=60, bootstrap=False 
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=60, bootstrap=False 
[

[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  1.1min remaining:  1.1min


[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=60, bootstrap=False, total= 1.2min
[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=60, bootstrap=False, total=  44.6s
[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=60, bootstrap=False, total=  45.7s


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  1.8min finished


{'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': True}


0.842911877394636

#### BAGGING

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
rf = RandomForestClassifier()

bagging = BaggingClassifier(base_estimator = rf, max_samples=0.5, max_features=0.5)

print ("RF Score:\t", cross_val_score(rf, X, y, cv=3, n_jobs=-1).mean())
print ("Bagging Score:\t", cross_val_score(bagging, X, y, cv=3, n_jobs=-1).mean())

RF Score:	 0.8387143952812247
Bagging Score:	 0.8387179428523801


<h1><center>Q2</center></h1>

#### ADMIN

In [11]:
#deciding which column to use as the target variable
df = df.drop('Salary_binary', axis=1)
print('{} unique values in Seniority columns.'.format(len(df['Seniority'].value_counts())))
print('{} times less unique values than Title column'.format(927/float(48)))

48 unique values in Seniority columns.
19.3125 times less unique values than Title column


#### DATA CLEANING 

In [12]:
#check
filtered = df.groupby('Seniority')['Seniority'].filter(lambda x: len(x) > 3)
df2 = df[df['Seniority'].isin(filtered)]

#data cleaning on the column of interest
df['Seniority'] = df['Seniority'].apply(lambda x: x.strip().replace('fresh', 'entry').replace('level', '').replace('manager', 'management'))
df['Seniority'] = df['Seniority'].apply(lambda x: x.split())

#function to remove duplicates
def unique_list(listwithduplicates):
    ulist = []
    [ulist.append(x) for x in listwithduplicates if x not in ulist]
    return ulist

df['Seniority'] = df['Seniority'].apply(unique_list)
df['Seniority'] = df['Seniority'].apply(lambda x: ' '.join(x).replace('management', 'executive').split())
df['Seniority'] = df['Seniority'].apply(unique_list)
df['Seniority'] = df['Seniority'].apply(lambda x: sorted(x))
df['Seniority'] = df['Seniority'].apply(lambda x: ' '.join(x))
df = df[df['Seniority'] != 'none']

#create target column
df['non_exec'] = df['Seniority'].apply(lambda x: 0 if 'exec' in x else 1)

#grouping my columns into numeric or text
numericcolnames = ['Salary_avg']
textcolnames.remove('Seniority')
targetcolname = ['non_exec']
nontargetcolnames = numericcolnames + textcolnames

#### CLASSIFICATION

In [13]:
#creating my predictor and target columns 
cv = CountVectorizer(stop_words=stopwordslist)
cv.fit(combine_text_columns(df[textcolnames]))

text = pd.DataFrame(cv.fit_transform(combine_text_columns(df[textcolnames])).toarray(), columns=cv.get_feature_names())

text['Salary_avg'] = df[numericcolnames]
text['non_exec'] = df[targetcolname]
text = text[np.isfinite(text['Salary_avg'])]

X = text.iloc[:,:-1]
y = text.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=text.iloc[:,-1])

#random forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
print('RANDOM FOREST:')
print(rf.score(X_test, y_test))

#logistic regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
print('='*20)
print('LOGISTIC REGRESSION:')
print(lr.score(X_test, y_test))
print('='*20)
print('base line')
print(y.value_counts() / len(y))



#input top 20 features into a dataframe
featsimpt = pd.DataFrame(rf.feature_importances_ , index=X_train.columns, columns=['feats_impt'])
featsimpt.sort_values('feats_impt', ascending=False, inplace=True)
feats20 = featsimpt[:20]
feats20

RANDOM FOREST:
0.7246376811594203
LOGISTIC REGRESSION:
0.6763285024154589
base line
0.0    0.738372
1.0    0.261628
Name: non_exec, dtype: float64


Unnamed: 0,feats_impt
Salary_avg,0.042059
roi,0.005297
functional,0.0047
travelling,0.004504
europe,0.004004
state,0.003839
screen,0.003753
data,0.003547
11,0.003467
conduct,0.003456


In [14]:
company_count = [] 
title_count = []
address_count = []
emp_type_count = []
seniority_count = []
industry_count = []
responsibility_count = []
requirements_count = []

countdict = {'company': company_count, 
             'title': title_count, 
             'address': address_count, 
             'emp_type': emp_type_count, 
             'seniority': seniority_count, 
             'industry': industry_count, 
             'responsibility': responsibility_count, 
             'requirements': requirements_count}

feats19 = feats20[1:].index

for x in feats19:
    print (x)
    if x in company:
        print ('appears in feature company')
        company_count.append(x)
    elif x in title:
        print ('appears in feature title')
        title_count.append(x)
    elif x in address:
        print ('appears in feature address')
        address_count.append(x)
    elif x in emp_type:
        print ('appears in feature emp_type')
        emp_type_count.append(x)
    elif x in seniority:
        print ('appears in feature seniority')
        seniority_count.append(x)
    elif x in industry:
        print ('appears in feature industry')
        industry_count.append(x)
    elif x in responsibility:
        print ('appears in feature responsibility')
        responsibility_count.append(x)
    elif x in requirements:
        print ('appears in feature requirements')
        requirements_count.append(x)
    else:
        print ('word not in list, how can that be?')
    print ('='*20)

roi
appears in feature responsibility
functional
appears in feature title
travelling
appears in feature responsibility
europe
appears in feature company
state
appears in feature company
screen
appears in feature responsibility
data
appears in feature company
11
appears in feature title
conduct
appears in feature title
parallel
appears in feature responsibility
playing
appears in feature responsibility
postgres
appears in feature responsibility
commercially
appears in feature responsibility
chips
appears in feature responsibility
development
appears in feature title
jee
appears in feature requirements
follows
appears in feature responsibility
game
appears in feature responsibility
realization
appears in feature responsibility


In [19]:
for key, value in countdict.items():
    #print value
    print(key, len([item for item in value if item]))

company 3
title 4
address 0
emp_type 0
seniority 0
industry 0
responsibility 11
requirements 1


#### N-GRAMS

In [16]:
#initialize model
forest = RandomForestClassifier()
cv = CountVectorizer(stop_words=stopwordslist, ngram_range=(1,2))

#train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, stratify=y)
forest.fit(X_train, y_train)
print (forest.score(X_test, y_test))

0.6956521739130435


#### RANDOMIZED SEARCH  CV

In [17]:
from sklearn.model_selection import RandomizedSearchCV

random_grid = {'bootstrap': [True, False],
 'max_depth': [20, 40, 60, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 1000, 1800]}

rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 2, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

print(rf_random.best_params_)

rf_random.best_estimator_.score(X_test, y_test)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] n_estimators=200, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=40, bootstrap=True 
[CV] n_estimators=200, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=40, bootstrap=True 
[CV] n_estimators=200, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=40, bootstrap=True 
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=60, bootstrap=False 
[CV]  n_estimators=200, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=40, bootstrap=True, total=   1.8s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=60, bootstrap=False 
[CV]  n_estimators=200, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=40, bootstrap=True, total=   1.9s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=60, bootstrap=False 
[

[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:    2.6s remaining:    2.6s


[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=60, bootstrap=False, total=   2.3s
[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=60, bootstrap=False, total=   1.7s
[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=60, bootstrap=False, total=   1.7s


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    4.3s finished


{'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': True}


0.7391304347826086

#### OTHER THINGS I WILL NEED TO TRY

1. SVM
2. TFID VECTORIZOR
3. FEATURE ENGINEERING
4. BOOSTING TECHNIQUES

#### LINKS

1. https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
2. https://towardsdatascience.com/ensemble-methods-in-machine-learning-what-are-they-and-why-use-them-68ec3f9fef5f