In [1]:
from sklearn.datasets import load_files
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
import progressbar
import warnings
import numpy as np
import pickle


warnings.filterwarnings("ignore")

In [2]:
dataset = load_files('Dataset/new_descriptions', shuffle=False, encoding='utf-8')

In [3]:
dataset.keys()

dict_keys(['data', 'filenames', 'target', 'target_names', 'DESCR'])

In [5]:
dataset['target_names']

['Bank_Of_America',
 'Bealls',
 'EBS_Minds_IT',
 'ICON_Technologies',
 'Lorhan',
 'Nordstrom']

In [6]:
data = pd.DataFrame({'data': dataset['data'], 'target': dataset['target']})
pd.set_option('display.max_rows', data.shape[0])
pd.set_option('display.max_columns', data.shape[1])
data


Unnamed: 0,data,target
0,Job Description:,0
1,"Responsible for developing, enhancing, modifyi...",0
2,"Work very closely with the Trading Desk, Quant...",0
3,Essential Skill Requirements:,0
4,Strong Python skills preferred. Will consider ...,0
5,"Rates business knowledge (risk, pricing, yield...",0
6,Experience of Interest Rate Derivatives and bo...,0
7,"Excellent communication skills, and good atten...",0
8,Ability to work on large scale IT projects wit...,0
9,Grab the opportunity to achieve your full pote...,0


In [7]:
data[data['target'] == 2].data[44]

'help organizations move faster, lower IT costs, and scale applications'

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.0001, # For Overfitting
    #test_size=0.15,
    random_state=42
)

In [22]:
Counter(y_train), Counter(y_test)

(Counter({0: 49, 1: 20, 2: 29, 3: 19, 4: 35}), Counter({2: 1}))

In [10]:
X_train

['Dairy Milk chocolate, the Creme Egg and Roses selection box',
 'manages and franchises a broad portfolio of hotels and related lodging facilities',
 'free the joy',
 'Here for you during that midlife crisis',
 'Dairy Milk chocolate',
 'The company builds tools that make collaborating and writing software easier, and enable powerful collaboration, code review, and code management for open source and private projects.',
 'Yahoo! Inc. is a global Internet brand. To users, the company provides owned and operated online properties and services (Yahoo! Properties, Offerings, or Owned and Operated sites). Yahoo! also extends its marketing platform and access to Internet users beyond Yahoo! Properties through its distribution network of third-party entities who have integrated its advertising offerings into their Websites or their other offerings. To advertisers and publishers, it provides a range of marketing solutions and tools that enable businesses to reach users who visit Yahoo! Propert

In [36]:
vect = CountVectorizer()
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [37]:
x = vect.transform([X_train[0]])
x

<1x1621 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [39]:
[(i, x[0, i]) for i in range(1621) if x[0, i]]

[(84, 1),
 (175, 1),
 (229, 1),
 (338, 1),
 (356, 1),
 (460, 1),
 (904, 1),
 (1270, 1),
 (1302, 1),
 (1455, 1)]

In [40]:
features = vect.get_feature_names()
features

['000',
 '100',
 '127',
 '147',
 '159',
 '160',
 '175',
 '1946',
 '1990s',
 '200',
 '25',
 '3d',
 '50',
 '500',
 '5th',
 '60',
 '601',
 '62',
 '68',
 '70',
 '90',
 'ability',
 'able',
 'about',
 'absolute',
 'accelerates',
 'access',
 'accessed',
 'accessibl',
 'accessible',
 'accessories',
 'accident',
 'accordingly',
 'account',
 'accounts',
 'accuracy',
 'accuweather',
 'across',
 'acting',
 'actions',
 'activating',
 'activities',
 'activity',
 'adapt',
 'addition',
 'address',
 'addressing',
 'adept',
 'adjusts',
 'administration',
 'administrator',
 'adobe',
 'adoption',
 'advances',
 'advantage',
 'advertisers',
 'advertising',
 'advises',
 'advising',
 'advocate',
 'aerodynamic',
 'affiliate',
 'after',
 'age',
 'agency',
 'agent',
 'agents',
 'agile',
 'agreements',
 'aims',
 'aircraft',
 'all',
 'allianz',
 'allowing',
 'allows',
 'alone',
 'also',
 'amazon',
 'american',
 'an',
 'analysis',
 'analyst',
 'analytics',
 'analyze',
 'and',
 'andrex',
 'angular',
 'annuities',
 '

In [41]:
[(features[i], x[0, i]) for i in range(x.shape[1]) if x[0, i]]

[('and', 1),
 ('box', 1),
 ('chocolate', 1),
 ('creme', 1),
 ('dairy', 1),
 ('egg', 1),
 ('milk', 1),
 ('roses', 1),
 ('selection', 1),
 ('the', 1)]

In [274]:
clfs = [
    KNeighborsClassifier(),
    MultinomialNB(),
    DecisionTreeClassifier(max_depth = 2, random_state=0),
    LogisticRegression(random_state=0),
    LinearSVC(random_state=0),
    SVC(random_state=0),
    RandomForestClassifier(random_state=0),
]

In [275]:
vect = CountVectorizer(binary=True)

for clf in clfs:
    print(str(clf.__class__))
    pipeline = Pipeline([
        ('vect', vect),
        ('clf', clf),
    ])
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_train)
    print(metrics.classification_report(y_train, y_pred, target_names=['accuweather',
                                                                       'allianz',
                                                                       'amazon',
                                                                       'citigroup',
                                                                       'fujitsu',
                                                                       'garmin',
                                                                       'github',
                                                                       'ibm',
                                                                       'metlife',
                                                                       'uber',
                                                                       'yahoo',
                                                                       'zurich']))

    y_pred = pipeline.predict(X_test)
    print(metrics.classification_report(y_test, y_pred, target_names=['accuweather',
                                                                       'allianz',
                                                                       'amazon',
                                                                       'citigroup',
                                                                       'fujitsu',
                                                                       'garmin',
                                                                       'github',
                                                                       'ibm',
                                                                       'metlife',
                                                                       'uber',
                                                                       'yahoo',
                                                                       'zurich']))

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
             precision    recall  f1-score   support

accuweather       0.26      0.93      0.41        14
    allianz       0.47      0.89      0.62        19
     amazon       0.70      0.78      0.74        27
  citigroup       0.84      0.76      0.80        21
    fujitsu       0.76      0.68      0.72        19
     garmin       0.92      0.71      0.80        17
     github       0.82      0.53      0.64        17
        ibm       0.88      0.79      0.83        19
    metlife       1.00      0.11      0.20        18
       uber       0.86      0.43      0.57        14
      yahoo       1.00      0.36      0.53        14
     zurich       1.00      0.50      0.67        16

avg / total       0.79      0.64      0.64       215

             precision    recall  f1-score   support

accuweather       0.24      0.83      0.37         6
    allianz       0.75      0.75      0.75         4
     amazon       0.33      1.0

Cómo se puede ver en el análisis anterior el modelo que mejor clasifico los datos fue LinearSVC. Por lo que se proseguirá con ese y se harán las optimizaciones finales.

In [276]:
pipeline = Pipeline([
    ('vect', CountVectorizer(binary=True)),
    ('clf', LinearSVC(random_state=0)),
])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_train)
print(metrics.classification_report(y_train, y_pred, target_names=['accuweather',
                                                                       'allianz',
                                                                       'amazon',
                                                                       'citigroup',
                                                                       'fujitsu',
                                                                       'garmin',
                                                                       'github',
                                                                       'ibm',
                                                                       'metlife',
                                                                       'uber',
                                                                       'yahoo',
                                                                       'zurich']))

y_pred = pipeline.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=['accuweather',
                                                                   'allianz',
                                                                   'amazon',
                                                                   'citigroup',
                                                                   'fujitsu',
                                                                   'garmin',
                                                                   'github',
                                                                   'ibm',
                                                                   'metlife',
                                                                   'uber',
                                                                   'yahoo',
                                                                   'zurich']))

             precision    recall  f1-score   support

accuweather       1.00      1.00      1.00        14
    allianz       1.00      1.00      1.00        19
     amazon       1.00      1.00      1.00        27
  citigroup       1.00      1.00      1.00        21
    fujitsu       1.00      1.00      1.00        19
     garmin       1.00      1.00      1.00        17
     github       1.00      1.00      1.00        17
        ibm       1.00      1.00      1.00        19
    metlife       1.00      1.00      1.00        18
       uber       1.00      1.00      1.00        14
      yahoo       1.00      1.00      1.00        14
     zurich       1.00      1.00      1.00        16

avg / total       1.00      1.00      1.00       215

             precision    recall  f1-score   support

accuweather       1.00      1.00      1.00         6
    allianz       0.75      0.75      0.75         4
     amazon       0.20      1.00      0.33         1
  citigroup       0.25      0.50      0.33

In [277]:
description = 'everyones private driver'
dataset['target_names'][int(pipeline.predict([description]))]

'uber'

Luego de hacer algunas pruebas, se procede a optimizar el modelo elejido buscando subir un poco los resultados en test y mejorar la precision.

In [278]:
param_grid = {
    'vect__binary': [True],
    'vect__lowercase': [True],
    'vect__sublinear_tf': [True, False],
    'vect__ngram_range': [(1, 3), (1, 4),(1, 7)],
    'vect__strip_accents': ['ascii'],
    'vect__analyzer': ['word'],
    'vect__min_df': [1, 2, 3],
    'vect__max_df': [1.],
    'clf__multi_class' : ['ovr', 'crammer_singer'],
    'clf__random_state': [0],
    'clf__fit_intercept':[True, False],
    'clf__loss':['hinge', 'squared_hinge'],
    'clf__C':[1.0, 0.1],
}

params_list = list(ParameterGrid(param_grid))

pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

results = []
print(len(params_list))

bar = progressbar.ProgressBar(maxval=len(params_list), 
                              widgets=[progressbar.Bar('=', '[', ']'), ' ', 
                                       progressbar.Percentage()])
bar.start()


i = 0
barra = ''
for params in params_list:
    bar.update(i+1)
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred, average='macro')
    result = {'acc': acc, 'f1': f1}
        
    results.append({
        **result,
        **params,
    })
    i += 1
bar.finish()

[                                                                        ]   1%

288




In [279]:
results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)[:10]

Unnamed: 0,acc,clf__C,clf__fit_intercept,clf__loss,clf__multi_class,clf__random_state,f1,vect__analyzer,vect__binary,vect__lowercase,vect__max_df,vect__min_df,vect__ngram_range,vect__strip_accents,vect__sublinear_tf
216,0.842105,0.1,False,hinge,ovr,0,0.825498,word,True,True,1.0,1,"(1, 3)",ascii,True
217,0.842105,0.1,False,hinge,ovr,0,0.825498,word,True,True,1.0,1,"(1, 3)",ascii,False
218,0.842105,0.1,False,hinge,ovr,0,0.825498,word,True,True,1.0,1,"(1, 4)",ascii,True
219,0.842105,0.1,False,hinge,ovr,0,0.825498,word,True,True,1.0,1,"(1, 4)",ascii,False
220,0.842105,0.1,False,hinge,ovr,0,0.803276,word,True,True,1.0,1,"(1, 7)",ascii,True
221,0.842105,0.1,False,hinge,ovr,0,0.803276,word,True,True,1.0,1,"(1, 7)",ascii,False
234,0.842105,0.1,False,hinge,crammer_singer,0,0.803276,word,True,True,1.0,1,"(1, 3)",ascii,True
235,0.842105,0.1,False,hinge,crammer_singer,0,0.803276,word,True,True,1.0,1,"(1, 3)",ascii,False
236,0.842105,0.1,False,hinge,crammer_singer,0,0.803276,word,True,True,1.0,1,"(1, 4)",ascii,True
237,0.842105,0.1,False,hinge,crammer_singer,0,0.803276,word,True,True,1.0,1,"(1, 4)",ascii,False


Finalmente nos quedamos con la configuración que mejor a dado:

In [301]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer(
        binary=True,
        lowercase=True,
        strip_accents='ascii',
        analyzer='word',
        min_df=1,
        max_df=1.,
        ngram_range=(1, 3),
    )),
    ('clf', LinearSVC(C=0.1, loss='hinge', multi_class='ovr', penalty='l2', random_state=0, fit_intercept=False)),
])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=['accuweather',
                                                                   'allianz',
                                                                   'amazon',
                                                                   'citigroup',
                                                                   'fujitsu',
                                                                   'garmin',
                                                                   'github',
                                                                   'ibm',
                                                                   'metlife',
                                                                   'uber',
                                                                   'yahoo',
                                                                   'zurich']))
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

             precision    recall  f1-score   support

accuweather       0.86      1.00      0.92         6
    allianz       0.75      0.75      0.75         4
     amazon       1.00      1.00      1.00         1
  citigroup       0.25      0.50      0.33         2
    fujitsu       1.00      0.50      0.67         2
     garmin       1.00      0.67      0.80         3
     github       1.00      1.00      1.00         3
        ibm       1.00      1.00      1.00         1
    metlife       1.00      0.50      0.67         2
       uber       1.00      0.83      0.91         6
      yahoo       1.00      1.00      1.00         5
     zurich       0.75      1.00      0.86         3

avg / total       0.89      0.84      0.85        38

[[6 0 0 0 0 0 0 0 0 0 0 0]
 [0 3 0 1 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 1 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 2 0 0 0 0 0 1]
 [0 0 0 0 0 0 3 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0]
 [0 1 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 1 0 

In [8]:
param_grid = {
    'vect__binary': [True],
    'vect__lowercase': [True],
    'vect__sublinear_tf': [True, False],
    'vect__ngram_range': [(1, 3), (1, 4),(1, 7)],
    'vect__strip_accents': ['ascii'],
    'vect__analyzer': ['word'],
    'vect__min_df': [1, 2, 3, 5, 6],
    'vect__max_df': [1.],
    'clf__multi_class' : ['ovr'],
    'clf__random_state': [0],
    'clf__fit_intercept':[True],
    'clf__C':[1.0, 0.1, 100, 10],
    'clf__n_jobs': [-1],
}

params_list = list(ParameterGrid(param_grid))

pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

results = []
print(len(params_list))

bar = progressbar.ProgressBar(maxval=len(params_list), 
                              widgets=[progressbar.Bar('=', '[', ']'), ' ', 
                                       progressbar.Percentage()])
bar.start()


i = 0
barra = ''
for params in params_list:
    bar.update(i+1)
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_train)
    acc = metrics.accuracy_score(y_train, y_pred)
    f1 = metrics.f1_score(y_train, y_pred, average='macro')
    result = {'acc': acc, 'f1': f1}
        
    results.append({
        **result,
        **params,
    })
    i += 1
bar.finish()

[                                                                        ]   0%

120




In [9]:
results_df = pd.DataFrame(results)
pd.set_option('display.max_rows', results_df.shape[0])
pd.set_option('display.max_columns', results_df.shape[1])
results_df.sort_values(['acc', 'f1'], ascending=False)[:10]

Unnamed: 0,acc,clf__C,clf__fit_intercept,clf__multi_class,clf__n_jobs,clf__random_state,f1,vect__analyzer,vect__binary,vect__lowercase,vect__max_df,vect__min_df,vect__ngram_range,vect__strip_accents,vect__sublinear_tf
60,0.988304,100.0,True,ovr,-1,0,0.985364,word,True,True,1.0,1,"(1, 3)",ascii,True
61,0.988304,100.0,True,ovr,-1,0,0.985364,word,True,True,1.0,1,"(1, 3)",ascii,False
62,0.988304,100.0,True,ovr,-1,0,0.985364,word,True,True,1.0,1,"(1, 4)",ascii,True
63,0.988304,100.0,True,ovr,-1,0,0.985364,word,True,True,1.0,1,"(1, 4)",ascii,False
64,0.988304,100.0,True,ovr,-1,0,0.985364,word,True,True,1.0,1,"(1, 7)",ascii,True
65,0.988304,100.0,True,ovr,-1,0,0.985364,word,True,True,1.0,1,"(1, 7)",ascii,False
90,0.976608,10.0,True,ovr,-1,0,0.978458,word,True,True,1.0,1,"(1, 3)",ascii,True
91,0.976608,10.0,True,ovr,-1,0,0.978458,word,True,True,1.0,1,"(1, 3)",ascii,False
92,0.976608,10.0,True,ovr,-1,0,0.978458,word,True,True,1.0,1,"(1, 4)",ascii,True
93,0.976608,10.0,True,ovr,-1,0,0.978458,word,True,True,1.0,1,"(1, 4)",ascii,False


In [10]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer(
        lowercase= True,
        binary=True,
        strip_accents='ascii',
        analyzer='word',
        min_df=1,
        max_df=1.,
        ngram_range=(1, 3),
    )),
    ('clf', LogisticRegression(C=100.0, multi_class='ovr', penalty='l2', random_state=0, fit_intercept=True)),
])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_train)
print(metrics.classification_report(y_train, y_pred, target_names=['Bealls', 'EBS_Minds_IT', 'ICON_Technologies', 'Lorhan', 'Nordstrom']))
cm = metrics.confusion_matrix(y_train, y_pred)
print(cm)

                   precision    recall  f1-score   support

           Bealls       1.00      1.00      1.00        19
     EBS_Minds_IT       1.00      1.00      1.00        49
ICON_Technologies       1.00      0.89      0.94        19
           Lorhan       0.94      1.00      0.97        30
        Nordstrom       1.00      1.00      1.00        19

      avg / total       0.99      0.99      0.99       171

[[19  0  0  0  0  0]
 [ 0 49  0  0  0  0]
 [ 0  0 17  2  0  0]
 [ 0  0  0 30  0  0]
 [ 0  0  0  0 19  0]
 [ 0  0  0  0  0 35]]


In [12]:
description = 'Bachelors degree (computer or engineering related field) and 8 years of experience as a project or program manager.'
dataset['target_names'][int(pipeline.predict([description]))]

'ICON_Technologies'

In [49]:
pred_array = pipeline.predict_proba([description])[0]

In [50]:
best_pred = pred_array.argsort()[-3:][::-1]
best_pred

array([13, 16,  3])

In [51]:
for pred in best_pred:
    #print(pred)
    print(str(dataset['target_names'][pred]) + "   " + str(pred_array[pred]))


marriott   0.1972232688324579
publix_super_markets   0.061514024222963315
cadbury_schweppes   0.05167765451020214


In [24]:
import pickle

filename = 'job_description_classifier'
f = open(filename, 'wb')
pickle.dump(pipeline, f)