In [393]:
from sklearn.datasets import load_files
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
import progressbar
import warnings
import numpy as np
import pickle


warnings.filterwarnings("ignore")

In [320]:
dataset = load_files('Dataset/descriptions', shuffle=False, encoding='utf-8')

In [321]:
dataset.keys()

dict_keys(['target_names', 'filenames', 'data', 'target', 'DESCR'])

In [322]:
dataset['target_names']

['accuweather',
 'allianz',
 'amazon',
 'citigroup',
 'fujitsu',
 'garmin',
 'github',
 'ibm',
 'metlife',
 'uber',
 'yahoo',
 'zurich']

In [323]:
data = pd.DataFrame({'data': dataset['data'], 'target': dataset['target']})


In [324]:
data[data['target'] == 2].data[44]

'help organizations move faster, lower IT costs, and scale applications'

In [325]:
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.0001, # For Overfitting
    #test_size=0.15,
    random_state=42
)

In [326]:
Counter(y_train), Counter(y_test)

(Counter({0: 20,
          1: 23,
          2: 28,
          3: 23,
          4: 21,
          5: 20,
          6: 20,
          7: 20,
          8: 20,
          9: 19,
          10: 19,
          11: 19}),
 Counter({9: 1}))

In [269]:
vect = CountVectorizer()
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [270]:
x = vect.transform([X_train[0]])
x

<1x692 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [271]:
[(i, x[0, i]) for i in range(662) if x[0, i]]

[(38, 1), (105, 1), (143, 1), (273, 1), (309, 1), (398, 1), (486, 1), (533, 1)]

In [272]:
features = vect.get_feature_names()
features

['000',
 '100',
 '147',
 '159',
 '160',
 '175',
 '200',
 '50',
 '70',
 '90',
 'ability',
 'access',
 'accessed',
 'accessible',
 'accident',
 'accordingly',
 'accounts',
 'accuweather',
 'across',
 'activity',
 'adapt',
 'administration',
 'advances',
 'advantage',
 'advertisers',
 'advertising',
 'advises',
 'affiliate',
 'agency',
 'aims',
 'aircraft',
 'all',
 'allianz',
 'also',
 'amazon',
 'an',
 'analysis',
 'analytics',
 'and',
 'annuities',
 'anywhere',
 'apis',
 'application',
 'applications',
 'approximately',
 'apps',
 'archive',
 'are',
 'areas',
 'armed',
 'around',
 'array',
 'art',
 'as',
 'ask',
 'asset',
 'at',
 'atm',
 'automated',
 'automotive',
 'available',
 'aviation',
 'aws',
 'backed',
 'bank',
 'banking',
 'banks',
 'barcode',
 'based',
 'be',
 'beat',
 'because',
 'been',
 'benefit',
 'better',
 'between',
 'beyond',
 'big',
 'both',
 'branches',
 'brand',
 'brighthouse',
 'bring',
 'broad',
 'brokerage',
 'bug',
 'build',
 'builds',
 'built',
 'business',
 'b

In [273]:
[(features[i], x[0, i]) for i in range(x.shape[1]) if x[0, i]]

[('and', 1),
 ('clients', 1),
 ('corporate', 1),
 ('high', 1),
 ('institutional', 1),
 ('net', 1),
 ('public', 1),
 ('sector', 1),
 ('worth', 1)]

In [274]:
clfs = [
    KNeighborsClassifier(),
    MultinomialNB(),
    DecisionTreeClassifier(max_depth = 2, random_state=0),
    LogisticRegression(random_state=0),
    LinearSVC(random_state=0),
    SVC(random_state=0),
    RandomForestClassifier(random_state=0),
]

In [275]:
vect = CountVectorizer(binary=True)

for clf in clfs:
    print(str(clf.__class__))
    pipeline = Pipeline([
        ('vect', vect),
        ('clf', clf),
    ])
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_train)
    print(metrics.classification_report(y_train, y_pred, target_names=['accuweather',
                                                                       'allianz',
                                                                       'amazon',
                                                                       'citigroup',
                                                                       'fujitsu',
                                                                       'garmin',
                                                                       'github',
                                                                       'ibm',
                                                                       'metlife',
                                                                       'uber',
                                                                       'yahoo',
                                                                       'zurich']))

    y_pred = pipeline.predict(X_test)
    print(metrics.classification_report(y_test, y_pred, target_names=['accuweather',
                                                                       'allianz',
                                                                       'amazon',
                                                                       'citigroup',
                                                                       'fujitsu',
                                                                       'garmin',
                                                                       'github',
                                                                       'ibm',
                                                                       'metlife',
                                                                       'uber',
                                                                       'yahoo',
                                                                       'zurich']))

<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
             precision    recall  f1-score   support

accuweather       0.26      0.93      0.41        14
    allianz       0.47      0.89      0.62        19
     amazon       0.70      0.78      0.74        27
  citigroup       0.84      0.76      0.80        21
    fujitsu       0.76      0.68      0.72        19
     garmin       0.92      0.71      0.80        17
     github       0.82      0.53      0.64        17
        ibm       0.88      0.79      0.83        19
    metlife       1.00      0.11      0.20        18
       uber       0.86      0.43      0.57        14
      yahoo       1.00      0.36      0.53        14
     zurich       1.00      0.50      0.67        16

avg / total       0.79      0.64      0.64       215

             precision    recall  f1-score   support

accuweather       0.24      0.83      0.37         6
    allianz       0.75      0.75      0.75         4
     amazon       0.33      1.0

Cómo se puede ver en el análisis anterior el modelo que mejor clasifico los datos fue LinearSVC. Por lo que se proseguirá con ese y se harán las optimizaciones finales.

In [276]:
pipeline = Pipeline([
    ('vect', CountVectorizer(binary=True)),
    ('clf', LinearSVC(random_state=0)),
])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_train)
print(metrics.classification_report(y_train, y_pred, target_names=['accuweather',
                                                                       'allianz',
                                                                       'amazon',
                                                                       'citigroup',
                                                                       'fujitsu',
                                                                       'garmin',
                                                                       'github',
                                                                       'ibm',
                                                                       'metlife',
                                                                       'uber',
                                                                       'yahoo',
                                                                       'zurich']))

y_pred = pipeline.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=['accuweather',
                                                                   'allianz',
                                                                   'amazon',
                                                                   'citigroup',
                                                                   'fujitsu',
                                                                   'garmin',
                                                                   'github',
                                                                   'ibm',
                                                                   'metlife',
                                                                   'uber',
                                                                   'yahoo',
                                                                   'zurich']))

             precision    recall  f1-score   support

accuweather       1.00      1.00      1.00        14
    allianz       1.00      1.00      1.00        19
     amazon       1.00      1.00      1.00        27
  citigroup       1.00      1.00      1.00        21
    fujitsu       1.00      1.00      1.00        19
     garmin       1.00      1.00      1.00        17
     github       1.00      1.00      1.00        17
        ibm       1.00      1.00      1.00        19
    metlife       1.00      1.00      1.00        18
       uber       1.00      1.00      1.00        14
      yahoo       1.00      1.00      1.00        14
     zurich       1.00      1.00      1.00        16

avg / total       1.00      1.00      1.00       215

             precision    recall  f1-score   support

accuweather       1.00      1.00      1.00         6
    allianz       0.75      0.75      0.75         4
     amazon       0.20      1.00      0.33         1
  citigroup       0.25      0.50      0.33

In [277]:
description = 'everyones private driver'
dataset['target_names'][int(pipeline.predict([description]))]

'uber'

Luego de hacer algunas pruebas, se procede a optimizar el modelo elejido buscando subir un poco los resultados en test y mejorar la precision.

In [278]:
param_grid = {
    'vect__binary': [True],
    'vect__lowercase': [True],
    'vect__sublinear_tf': [True, False],
    'vect__ngram_range': [(1, 3), (1, 4),(1, 7)],
    'vect__strip_accents': ['ascii'],
    'vect__analyzer': ['word'],
    'vect__min_df': [1, 2, 3],
    'vect__max_df': [1.],
    'clf__multi_class' : ['ovr', 'crammer_singer'],
    'clf__random_state': [0],
    'clf__fit_intercept':[True, False],
    'clf__loss':['hinge', 'squared_hinge'],
    'clf__C':[1.0, 0.1],
}

params_list = list(ParameterGrid(param_grid))

pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

results = []
print(len(params_list))

bar = progressbar.ProgressBar(maxval=len(params_list), 
                              widgets=[progressbar.Bar('=', '[', ']'), ' ', 
                                       progressbar.Percentage()])
bar.start()


i = 0
barra = ''
for params in params_list:
    bar.update(i+1)
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred, average='macro')
    result = {'acc': acc, 'f1': f1}
        
    results.append({
        **result,
        **params,
    })
    i += 1
bar.finish()

[                                                                        ]   1%

288




In [279]:
results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)[:10]

Unnamed: 0,acc,clf__C,clf__fit_intercept,clf__loss,clf__multi_class,clf__random_state,f1,vect__analyzer,vect__binary,vect__lowercase,vect__max_df,vect__min_df,vect__ngram_range,vect__strip_accents,vect__sublinear_tf
216,0.842105,0.1,False,hinge,ovr,0,0.825498,word,True,True,1.0,1,"(1, 3)",ascii,True
217,0.842105,0.1,False,hinge,ovr,0,0.825498,word,True,True,1.0,1,"(1, 3)",ascii,False
218,0.842105,0.1,False,hinge,ovr,0,0.825498,word,True,True,1.0,1,"(1, 4)",ascii,True
219,0.842105,0.1,False,hinge,ovr,0,0.825498,word,True,True,1.0,1,"(1, 4)",ascii,False
220,0.842105,0.1,False,hinge,ovr,0,0.803276,word,True,True,1.0,1,"(1, 7)",ascii,True
221,0.842105,0.1,False,hinge,ovr,0,0.803276,word,True,True,1.0,1,"(1, 7)",ascii,False
234,0.842105,0.1,False,hinge,crammer_singer,0,0.803276,word,True,True,1.0,1,"(1, 3)",ascii,True
235,0.842105,0.1,False,hinge,crammer_singer,0,0.803276,word,True,True,1.0,1,"(1, 3)",ascii,False
236,0.842105,0.1,False,hinge,crammer_singer,0,0.803276,word,True,True,1.0,1,"(1, 4)",ascii,True
237,0.842105,0.1,False,hinge,crammer_singer,0,0.803276,word,True,True,1.0,1,"(1, 4)",ascii,False


Finalmente nos quedamos con la configuración que mejor a dado:

In [301]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer(
        binary=True,
        lowercase=True,
        strip_accents='ascii',
        analyzer='word',
        min_df=1,
        max_df=1.,
        ngram_range=(1, 3),
    )),
    ('clf', LinearSVC(C=0.1, loss='hinge', multi_class='ovr', penalty='l2', random_state=0, fit_intercept=False)),
])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=['accuweather',
                                                                   'allianz',
                                                                   'amazon',
                                                                   'citigroup',
                                                                   'fujitsu',
                                                                   'garmin',
                                                                   'github',
                                                                   'ibm',
                                                                   'metlife',
                                                                   'uber',
                                                                   'yahoo',
                                                                   'zurich']))
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

             precision    recall  f1-score   support

accuweather       0.86      1.00      0.92         6
    allianz       0.75      0.75      0.75         4
     amazon       1.00      1.00      1.00         1
  citigroup       0.25      0.50      0.33         2
    fujitsu       1.00      0.50      0.67         2
     garmin       1.00      0.67      0.80         3
     github       1.00      1.00      1.00         3
        ibm       1.00      1.00      1.00         1
    metlife       1.00      0.50      0.67         2
       uber       1.00      0.83      0.91         6
      yahoo       1.00      1.00      1.00         5
     zurich       0.75      1.00      0.86         3

avg / total       0.89      0.84      0.85        38

[[6 0 0 0 0 0 0 0 0 0 0 0]
 [0 3 0 1 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 1 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 2 0 0 0 0 0 1]
 [0 0 0 0 0 0 3 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0]
 [0 1 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 1 0 

In [327]:
param_grid = {
    'vect__binary': [True],
    'vect__lowercase': [True],
    'vect__sublinear_tf': [True, False],
    'vect__ngram_range': [(1, 3), (1, 4),(1, 7)],
    'vect__strip_accents': ['ascii'],
    'vect__analyzer': ['word'],
    'vect__min_df': [1, 2, 3, 5, 6],
    'vect__max_df': [1.],
    'clf__multi_class' : ['ovr'],
    'clf__random_state': [0],
    'clf__fit_intercept':[True],
    'clf__C':[1.0, 0.1, 100, 10],
    'clf__n_jobs': [-1],
}

params_list = list(ParameterGrid(param_grid))

pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

results = []
print(len(params_list))

bar = progressbar.ProgressBar(maxval=len(params_list), 
                              widgets=[progressbar.Bar('=', '[', ']'), ' ', 
                                       progressbar.Percentage()])
bar.start()


i = 0
barra = ''
for params in params_list:
    bar.update(i+1)
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_train)
    acc = metrics.accuracy_score(y_train, y_pred)
    f1 = metrics.f1_score(y_train, y_pred, average='macro')
    result = {'acc': acc, 'f1': f1}
        
    results.append({
        **result,
        **params,
    })
    i += 1
bar.finish()

[=                                                                       ]   1%

120




In [328]:
results_df = pd.DataFrame(results)
results_df.sort_values(['acc', 'f1'], ascending=False)[:10]

Unnamed: 0,acc,clf__C,clf__fit_intercept,clf__multi_class,clf__n_jobs,clf__random_state,f1,vect__analyzer,vect__binary,vect__lowercase,vect__max_df,vect__min_df,vect__ngram_range,vect__strip_accents,vect__sublinear_tf
0,1.0,1.0,True,ovr,-1,0,1.0,word,True,True,1.0,1,"(1, 3)",ascii,True
1,1.0,1.0,True,ovr,-1,0,1.0,word,True,True,1.0,1,"(1, 3)",ascii,False
2,1.0,1.0,True,ovr,-1,0,1.0,word,True,True,1.0,1,"(1, 4)",ascii,True
3,1.0,1.0,True,ovr,-1,0,1.0,word,True,True,1.0,1,"(1, 4)",ascii,False
4,1.0,1.0,True,ovr,-1,0,1.0,word,True,True,1.0,1,"(1, 7)",ascii,True
5,1.0,1.0,True,ovr,-1,0,1.0,word,True,True,1.0,1,"(1, 7)",ascii,False
60,1.0,100.0,True,ovr,-1,0,1.0,word,True,True,1.0,1,"(1, 3)",ascii,True
61,1.0,100.0,True,ovr,-1,0,1.0,word,True,True,1.0,1,"(1, 3)",ascii,False
62,1.0,100.0,True,ovr,-1,0,1.0,word,True,True,1.0,1,"(1, 4)",ascii,True
63,1.0,100.0,True,ovr,-1,0,1.0,word,True,True,1.0,1,"(1, 4)",ascii,False


In [395]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer(
        binary=True,
        strip_accents='ascii',
        analyzer='word',
        min_df=1,
        max_df=1.,
        ngram_range=(1, 3),
    )),
    ('clf', LogisticRegression(C=1.0, multi_class='ovr', penalty='l2', random_state=0, fit_intercept=True)),
])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_train)
print(metrics.classification_report(y_train, y_pred, target_names=['accuweather',
                                                                   'allianz',
                                                                   'amazon',
                                                                   'citigroup',
                                                                   'fujitsu',
                                                                   'garmin',
                                                                   'github',
                                                                   'ibm',
                                                                   'metlife',
                                                                   'uber',
                                                                   'yahoo',
                                                                   'zurich']))
cm = metrics.confusion_matrix(y_train, y_pred)
print(cm)

             precision    recall  f1-score   support

accuweather       1.00      1.00      1.00        20
    allianz       1.00      1.00      1.00        23
     amazon       1.00      1.00      1.00        28
  citigroup       1.00      1.00      1.00        23
    fujitsu       1.00      1.00      1.00        21
     garmin       1.00      1.00      1.00        20
     github       1.00      1.00      1.00        20
        ibm       1.00      1.00      1.00        20
    metlife       1.00      1.00      1.00        20
       uber       1.00      1.00      1.00        19
      yahoo       1.00      1.00      1.00        19
     zurich       1.00      1.00      1.00        19

avg / total       1.00      1.00      1.00       252

[[20  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 23  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 28  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 23  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 21  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 20  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 2

In [396]:
description = 'repositories'
dataset['target_names'][int(pipeline.predict([description]))]

'github'

In [397]:
pred_array = pipeline.predict_proba([description])[0]

In [398]:
best_pred = pred_array.argsort()[-3:][::-1]
best_pred

array([6, 2, 3])

In [399]:
for pred in best_pred:
    #print(pred)
    print(str(dataset['target_names'][pred]) + "   " + str(pred_array[pred]))


github   0.2559808449419713
amazon   0.08370827219471051
citigroup   0.07122536590251825


In [400]:
import pickle

filename = 'company_clasifier_final_model'
f = open(filename, 'wb')
pickle.dump(pipeline, f)