### Load necessary packages and read the pre-processed data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23. Prefer the standalone package (a scikit-learn dependency) and fall
# back to the vendored copy only on very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# Load the pre-processed data from a local pickle file into a DataFrame.
# NOTE(review): unpickling executes arbitrary code; only load files you trust.
df = pd.read_pickle('job_with_title.pkl')

In [None]:
# Preview the first few rows to sanity-check the loaded data.
df.head()

In [None]:
# Features are the 'description' column; the target is the 'title_id' column.
X, y = df['description'], df['title_id']

In [None]:

# Convert the description text into a sparse TF-IDF matrix.
# NOTE(review): the vectorizer is fitted on every document before any
# cross-validation or train/test split happens, so vocabulary and IDF
# statistics include held-out text — consider fitting inside a Pipeline.
tfidf = TfidfVectorizer(
    sublinear_tf=True,        # sublinear term-frequency scaling
    min_df=5,                 # minimum document frequency for a term
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),       # unigrams and bigrams
    stop_words='english',
)
X = tfidf.fit_transform(X)
X.shape

### Model selection

In [None]:

# Candidate classifiers, compared by k-fold cross-validated accuracy on the
# TF-IDF features.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5  # number of cross-validation folds

# Collect one (model_name, fold_idx, accuracy) row per model per fold.
# (The former placeholder `cv_df = pd.DataFrame(index=...)` was dead code:
# it was overwritten below without ever being read.)
entries = []
for model in models:
    model_name = type(model).__name__
    fold_accuracies = cross_val_score(model, X, y, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(fold_accuracies)
    )
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
# Create the figure BEFORE plotting so the requested size applies to the
# strip plot. Calling plt.figure() afterwards opened a second, empty figure
# and left the plot at the default size. Adjust figsize to taste.
plt.figure(figsize=(5, 25))
# Alternative view of the same data:
# sns.boxplot(x='model_name', y='accuracy', data=cv_df)
# One point per (model, fold) accuracy.
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=6, jitter=True, edgecolor="gray", linewidth=2)

In [None]:
# Average cross-validated accuracy for each candidate model.
cv_df.groupby('model_name')['accuracy'].mean()


### Linear SVC Model Evaluation

#### Model optimization

In [None]:

# Lookup table of distinct (search_title, title_id) pairs, ordered by id,
# plus a plain dict mapping each title to its id.
title_id_df = (
    df[['search_title', 'title_id']]
    .drop_duplicates()
    .sort_values('title_id')
)
title_to_id = dict(title_id_df.values)

# Hold out 30% of the rows for evaluation; the DataFrame index is split
# alongside so rows can be traced back to `df`.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    X, y, df.index, test_size=0.3, random_state=0
)

In [None]:
# Display the (search_title, title_id) lookup table.
title_id_df

In [None]:
# Display the search_title -> title_id mapping.
title_to_id

In [None]:
# Train a LinearSVC with default hyper-parameters on the training split only;
# the held-out test split is used for the confusion matrix.
model = LinearSVC()
model.fit(X_train, y_train)

#### Confusion matrix

In [None]:
# Confusion matrix of the fitted model on the held-out test split.
# The label order is passed explicitly so matrix rows/columns are guaranteed
# to line up with the tick labels taken from title_id_df. Otherwise
# confusion_matrix infers labels from y_test/predictions, which shifts the
# axes if a class is absent from the test split.
class_ids = title_id_df['title_id'].values
class_names = title_id_df['search_title'].values

conf_mat = confusion_matrix(y_test, model.predict(X_test), labels=class_ids)
fig, ax = plt.subplots(figsize=(8, 6))
# Draw on the axes created above rather than relying on the implicit
# "current axes".
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

#### Most related words for each job title.

In [None]:
# Refit on the full dataset (train + test) before inspecting coefficients.
model.fit(X, y)

N = 2  # how many top unigrams / bigrams to print per job title

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()`; use whichever this install provides.
_get_feature_names = (
    getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
)
# The vocabulary does not change between titles, so build the array once
# instead of on every loop iteration.
feature_names = np.asarray(_get_feature_names())

for search_title, title_id in sorted(title_to_id.items()):
    # NOTE(review): assumes title_id values index the rows of model.coef_
    # directly (ids 0..n_classes-1, more than two classes) — confirm.
    # Features ordered from highest to lowest coefficient for this title.
    ranked = feature_names[np.argsort(model.coef_[title_id])[::-1]]
    unigrams = [v for v in ranked if len(v.split(' ')) == 1][:N]
    bigrams = [v for v in ranked if len(v.split(' ')) == 2][:N]

    print("# {}:".format(search_title))
    print("   ## Most correlated unigrams: {}".format(',   '.join(unigrams)))
    print("   ## Most correlated bigrams: {}".format(',   '.join(bigrams)))


In [None]:

# Persist the fitted classifier and its vectorizer; both are required
# together at prediction time (raw text -> tfidf.transform -> model.predict).
joblib.dump(model, 'indeed_LinearSVC.pkl')
joblib.dump(tfidf, 'tfidfVectorizer.pkl')

In [None]:
# Sample job posting used to smoke-test the saved model. It is wrapped in a
# one-element list because the vectorizer's transform() is given an iterable
# of documents. The text is raw, unprocessed posting content and is kept verbatim.
job_post = ["""
Data Scientist
SAVE

Home Depot
Houston, TX
Apply on Glassdoor
Apply on LinkedInApply on The Home Depot CareersApply on ZipRecruiterApply on The Home Depot Jobs | Jobs At Home DepotApply on Science Jobs
18 days agoFull-time
Position Description:

POSITION PURPOSE

A Data Scientist leverages their technical abilities to synthesize complex analytical tasks into easily understood data-driven stories. Data Scientists are responsible for organizing, analyzing, and then sharing insights gleaned from data. This positions develops predictive systems and algorithms for identifying trends and driving business solutions. This position also utilizes industry-leading standards for working with very large datasets to extract meaningful business information using statistics, machine learning, and heuristics. This position operates with minimal supervision, and once given general assignments, prioritizes and executes tasks.

MAJOR TASKS, RESPONSIBILITES AND KEY ACCOUNTABILITIES

20% - Designs and develops algorithms and models that use large datasets to derive business insights 20% - Establishes scalable, efficient processes for large scale data analyses, model development, and model implementation 20% - Ensures the quality of work output by displaying a keen attention to detail 20% - Presents findings in easily understood ways, focuses on how the data analytics fits into the bigger picture 10% - Mentors and develops the technical skills of Analysts and Sr. Analysts 10% - Seeks further knowledge on key developments within data science, technical skill sets, and additional data sources within Home Depot

NATURE AND SCOPE

This position reports to Manager or Sr Manager.

This position has 0 direct reports.

ENVIRONMENTAL JOB REQUIREMENTS

Environment:

Located in a comfortable indoor area. Any unpleasant conditions would be infrequent and not objectionable.

Travel:

Typically requires overnight travel less than 10% of the time.

Additional Environmental Job Requirements:

ESSENTIAL SKILLS:

MINIMUM QUALIFICATIONS

Must be eighteen years of age or older.

Must be legally permitted to work in the United States.

Additional Minimum Qualifications:

Education Required:

The knowledge, skills and abilities typically acquired through the completion of a bachelor's degree program or equivalent degree in a field of study related to the job.

Years of Relevant Work Experience: 4 years

Physical Requirements:

Most of the time is spent sitting in a comfortable position and there is frequent opportunity to move about. On rare occasions there may be a need to move or lift light articles.

Additional Qualifications:

Preferred Qualifications:

Masters Degree in Computer Science, Math, Engineering, or related quantitative field and 3+ years experience in position offered or related position

Knowledge, Skills, Abilities and Competencies:- Extensive experience in a hands on analytical role, with focus on Root Cause Analysis and Strong relational database skills (Access, SQL Server, Postgres, etc.) and SQL skills (writing complex queries to pull large sets of data, performing analysis using SQL queries) Ability to process large amounts of data through high throughput computing tools (HTCondor, Hadoop, etc.) Strong knowledge in statistical analysis and model building using software (R, SAS, etc.) Experience in data visualization and building dashboards (Tableau, R, Excel, etc.) Strong understanding of Object Oriented Programming Language (C++, Java, Python, etc.) Proficiency in Excel (Pivot Tables, V-Lookup, Macros, VBA, etc.) a must

We are an Equal Opportunity Employer and do not discriminate against any employee or applicant for employment because of race, color, sex, age, national origin, religion, sexual orientation, gender identity, status as a veteran, and basis of disability or any other federal, state or local protected class
"""]

In [None]:
# Reload the persisted classifier and vectorizer, rebinding `model` and
# `tfidf` to the objects read from disk.
# NOTE(review): joblib files are pickle-based; only load files you trust.
model = joblib.load('indeed_LinearSVC.pkl')
tfidf = joblib.load('tfidfVectorizer.pkl')

In [None]:
# Vectorize the sample posting with the fitted vocabulary, then classify it.
# The result holds the predicted label(s) from the `title_id` target.
job_post_features = tfidf.transform(job_post)
model.predict(job_post_features)