Skip to content
This repository has been archived by the owner on Oct 24, 2019. It is now read-only.

Commit

Permalink
Current progress.
Browse files Browse the repository at this point in the history
- Add fine tuning
  • Loading branch information
whydinkov committed May 30, 2019
1 parent 0629d2b commit d696d80
Show file tree
Hide file tree
Showing 7 changed files with 182 additions and 32 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -106,4 +106,5 @@ venv.bak/
# custom
.vscode/
notebooks/
*.log
*.log
*.xlsx
23 changes: 17 additions & 6 deletions src/classifier/sklearn/ranlp_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Dense, Dropout
from keras.models import Sequential
# from keras.wrappers.scikit_learn import KerasClassifier
# from keras.layers import Dense, Dropout
# from keras.models import Sequential

dir_path = os.path.dirname(os.path.realpath(__file__))

Expand Down Expand Up @@ -59,7 +59,7 @@ def _pipe_column(name):
]))


def make(classifier, columns, oversampler=None):
def make(classifier, columns, oversampler=None, clf_params={}):
feat_pipes = []

for column in columns:
Expand All @@ -71,18 +71,29 @@ def make(classifier, columns, oversampler=None):
feat_pipes.append(__lsa_text)
else:
feat_pipes.append(_pipe_column(column))

default_params = {
'clf__max_iter': 1000,
'clf__random_state': 0,
'clf__multi_class': "auto"
}

if oversampler:
return Pipeline([
pipeline = Pipeline([
('feats', FeatureUnion(feat_pipes)),
('oversampler', oversampler),
('clf', classifier)
])
else:
return Pipeline([
pipeline = Pipeline([
('feats', FeatureUnion(feat_pipes)),
('clf', classifier)
])

params = {**default_params, **clf_params}
pipeline.set_params(**params)

return pipeline

# def _mlp_arch(input_dim):
# model = Sequential()
Expand Down
51 changes: 51 additions & 0 deletions src/experiments/ranlp/logistic_regression/feature_combination.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# imports, config
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from src.preprocessing.transformator import get_df
from src.evaluation.compare import compare_classifiers
from src.classifier.sklearn import ranlp_pipelines
from src.data_retrieval.helpers import database
import pandas as pd


def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Used below as a drop-in replacement for ``warnings.warn``.
    """
    return None


# Swap the stdlib entry point for the no-op above, so code that looks up
# ``warnings.warn`` after this line emits nothing.
warnings.warn = warn


# Load every article from MongoDB and convert it into the DataFrame that the
# pipelines select their columns from.
db = database.MongoDB()

df = get_df(list(db.get_articles()))

# models
# NOTE(review): not referenced anywhere below in this script.
feature_sets = ['bg_bert', 'bg_xlm', 'bg_styl', 'bg_lsa',
                'en_use', 'en_nela', 'en_bert', 'en_elmo']

# Column groups in the order they are added to the model: 'top_n' uses the
# first n groups. NOTE(review): presumably ordered by how well each group
# scored on its own -- confirm against the earlier experiments.
ranked_groups = [
    ['bg_lsa_title', 'bg_lsa_text'],
    ['en_elmo_title', 'en_elmo_text'],
    ['en_use_title', 'en_use_text'],
    ['en_bert_title', 'en_bert_text'],
    ['bg_bert_title', 'bg_bert_text'],
    ['meta_media'],
    ['bg_xlm_title', 'bg_xlm_text'],
    ['en_nela_title', 'en_nela_text'],
    # The hand-written 'top_9' list spelled the last column 'bg_styl_textx';
    # every other column follows '<set>_title' / '<set>_text'.
    ['bg_styl_title', 'bg_styl_text'],
]

# Build ('top_1', [...]) ... ('top_9', [...]) cumulatively instead of
# repeating the growing list by hand.
features = []
selected = []
for rank, group in enumerate(ranked_groups, start=1):
    # ``+`` creates a new list, so entries appended earlier are not mutated.
    selected = selected + group
    features.append((f'top_{rank}', selected))

oversampler = None

# The same hyper-parameters are applied to every feature combination.
clf_params = {'clf__C': 1.5, 'clf__solver': 'liblinear', 'clf__tol': 0.01}

models = []
for name, feature_list in features:
    # A fresh classifier per pipeline so the models do not share an estimator.
    clf = LogisticRegression()
    model = ranlp_pipelines.make(clf, feature_list,
                                 oversampler=oversampler,
                                 clf_params=clf_params)

    # evaluation
    models.append((name, model))

compare_classifiers(models, df, df['label'], silent=False, plot=False)
19 changes: 19 additions & 0 deletions src/experiments/ranlp/logistic_regression/plot_top_n.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import matplotlib.pyplot as plt

# (label, value) pairs, one per 'top_n' feature combination.
# NOTE(review): the values look like scores copied from a
# feature-combination run -- confirm their origin and metric.
feats = [
    ('top_1', 0.4825940755),
    ('top_2', 0.4860865399),
    ('top_3', 0.498893917),
    ('top_4', 0.3657454759),
    ('top_5', 0.3657454759),
    ('top_6', 0.3808515722),
    ('top_7', 0.3808515722),
    ('top_8', 0.3847330075),
]

labels = [label for label, _ in feats]
values = [value for _, value in feats]

fig, ax = plt.subplots()
ax.set_ylim(0.2, 0.6)
# Dashed horizontal reference line.
# NOTE(review): 0.3030 is presumably a baseline score -- confirm.
ax.axhline(0.3030, ls='--')
plt.plot(labels, values, '-o')
plt.show()

32 changes: 17 additions & 15 deletions src/experiments/ranlp/oversampling/fine_tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,40 +35,42 @@ def warn(*args, **kwargs):
en_feats = [x for x in all_feats if x.startswith('en_')] + ['meta_media']

features = [
('bg_bert_title_text', ['bg_bert_title', 'bg_bert_text']),
('bg_xlm_title_text', ['bg_xlm_title', 'bg_xlm_text']),
('bg_styl_title_text', ['bg_styl_title', 'bg_styl_text']),
('bg_lsa_title_text', ['bg_lsa_title', 'bg_lsa_text']),
('all_bg', bg_feats),
('en_use_title_text', ['en_use_title', 'en_use_text']),
('en_nela_title_text', ['en_nela_title', 'en_nela_text']),
('en_bert_title_text', ['en_bert_title', 'en_bert_text']),
('en_elmo_title_text', ['en_elmo_title', 'en_elmo_text']),
('all_en', en_feats),
('all', all_feats)
#('bg_bert_title_text', ['bg_bert_title', 'bg_bert_text']),
#('bg_xlm_title_text', ['bg_xlm_title', 'bg_xlm_text']),
#('bg_styl_title_text', ['bg_styl_title', 'bg_styl_text']),
#('bg_lsa_title_text', ['bg_lsa_title', 'bg_lsa_text']),
#('all_bg', bg_feats),
#('en_use_title_text', ['en_use_title', 'en_use_text']),
#('en_nela_title_text', ['en_nela_title', 'en_nela_text']),
#('en_bert_title_text', ['en_bert_title', 'en_bert_text']),
#('en_elmo_title_text', ['en_elmo_title', 'en_elmo_text']),
# ('all_en', en_feats),
('all', all_feats)
]
oversampler = None

# evaluation
param_grid = {
'clf__tol': [1e-10, 1e-8, 1e-4, 1e-2, 1e-1], # 1e-4
# 'clf__tol': [1e-2], # [1e-10, 1e-8, 1e-4, 1e-2, 1e-1], # 1e-4
'clf__C': [0.05, 0.15, 0.25, 0.35, 0.50, 0.75, 1, 1.25, 1.5, 2], # 1,
'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
'clf__solver': ['liblinear'] #['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

for name, feature_list in features:
model = ranlp_pipelines.make(LogisticRegression(
random_state=0,
multi_class="auto",
max_iter=2500
max_iter=1000
), feature_list)

print(name)

gs = GridSearchCV(model,
param_grid=param_grid,
scoring='accuracy',
cv=5,
error_score=-1,
verbose=1,
verbose=10000,
n_jobs=-1,
iid=False,
return_train_score=True)
Expand Down
21 changes: 11 additions & 10 deletions src/experiments/ranlp/oversampling/oversampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
db = database.MongoDB()

clf = LogisticRegression(
C=0.5,
C=1.5,
tol=0.01,
random_state=0,
multi_class="auto",
Expand All @@ -26,14 +26,15 @@

for oversampler in [None, SMOTE(), ADASYN(), RandomOverSampler(random_state=0)]:
models = [
('bg_bert_title_text', ranlp_pipelines.make(clf, oversampler, ['bg_bert_title', 'bg_bert_text', 'bg_bert_cos'])),
('bg_xlm_title_text', ranlp_pipelines.make(clf, oversampler, ['bg_xlm_title', 'bg_xlm_text', 'bg_xlm_cos'])),
('bg_styl_title_text', ranlp_pipelines.make(clf, oversampler, ['bg_styl_title', 'bg_styl_text'])),
('meta_media', ranlp_pipelines.make(clf, oversampler, ['meta_media'])),
('en_use_title_text', ranlp_pipelines.make(clf, oversampler, ['en_use_title', 'en_use_text', 'en_use_cos'])),
('en_nela_title_text', ranlp_pipelines.make(clf, oversampler, ['en_nela_title', 'en_nela_text', 'en_nela_cos'])),
('en_bert_title_text', ranlp_pipelines.make(clf, oversampler, ['en_bert_title', 'en_bert_text', 'en_bert_cos'])),
('en_elmo_title_text', ranlp_pipelines.make(clf, oversampler, ['en_elmo_title', 'en_elmo_text', 'en_elmo_cos'])),
('bg_lsa', ranlp_pipelines.make(clf, ['bg_lsa_title', 'bg_lsa_text'], oversampler=oversampler))
#('bg_bert_title_text', ranlp_pipelines.make(clf, ['bg_bert_title', 'bg_bert_text'], oversampler=oversampler)),
# ('bg_xlm_title_text', ranlp_pipelines.make(clf, oversampler, ['bg_xlm_title', 'bg_xlm_text', 'bg_xlm_cos'])),
# ('bg_styl_title_text', ranlp_pipelines.make(clf, oversampler, ['bg_styl_title', 'bg_styl_text'])),
# ('meta_media', ranlp_pipelines.make(clf, oversampler, ['meta_media'])),
# ('en_use_title_text', ranlp_pipelines.make(clf, oversampler, ['en_use_title', 'en_use_text', 'en_use_cos'])),
# ('en_nela_title_text', ranlp_pipelines.make(clf, oversampler, ['en_nela_title', 'en_nela_text', 'en_nela_cos'])),
# ('en_bert_title_text', ranlp_pipelines.make(clf, oversampler, ['en_bert_title', 'en_bert_text', 'en_bert_cos'])),
# ('en_elmo_title_text', ranlp_pipelines.make(clf, oversampler, ['en_elmo_title', 'en_elmo_text', 'en_elmo_cos'])),
]
print(f'Oversampler: {oversampler}')
print(f'Oversampler: { oversampler}')
compare_classifiers(models, df, df['label'], silent=False, plot=False)
65 changes: 65 additions & 0 deletions src/experiments/ranlp/save_tuned_predictions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import numpy as np
import pandas as pd

from src.data_retrieval.helpers import database
from src.classifier.sklearn import ranlp_pipelines
from src.evaluation.compare import compare_classifiers
from src.preprocessing.transformator import get_df
from sklearn.model_selection import cross_val_predict, GridSearchCV

from sklearn.linear_model import LogisticRegression

# Fetch all articles once and keep the raw list alongside the DataFrame
# derived from it.
db = database.MongoDB()

articles = list(db.get_articles())
df = get_df(articles)

clf = LogisticRegression()

feature_sets = ['bg_bert', 'bg_xlm', 'bg_styl', 'bg_lsa',
                'en_use', 'en_nela', 'en_bert', 'en_elmo']

# Every feature set contributes a '<set>_title' and a '<set>_text' column;
# 'meta_media' is added once at the end.
all_feats = []
for prefix in feature_sets:
    all_feats += [prefix + '_title', prefix + '_text']
all_feats += ['meta_media']


# Grid searched for each feature column. The trailing comments are kept from
# the original; presumably they record the library defaults -- confirm.
param_grid = {
    'clf__tol': [1e-10, 1e-8, 1e-4, 1e-2, 1e-1],  # 1e-4
    'clf__C': [0.05, 0.15, 0.25, 0.35, 0.50, 0.75, 1, 1.25, 1.5, 2],  # 1,
    'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],  # lbfgs
}

print('All features count: ', len(all_feats))

# Keyword arguments shared by every grid search below.
search_options = dict(
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    error_score=-1,
    verbose=1,
    n_jobs=-1,
    iid=False,
    return_train_score=True,
)

for feature_set in all_feats:
    # One pipeline per single feature column.
    model = ranlp_pipelines.make(clf, [feature_set])

    gs = GridSearchCV(model, **search_options)
    gs.fit(df, df['label'])

    # Class probabilities from 5-fold cross-validation of the best pipeline
    # found by the grid search.
    pred = cross_val_predict(
        gs.best_estimator_,
        df,
        df['label'],
        cv=5,
        method='predict_proba',
    )

    # NOTE(review): pairing by position assumes the rows of ``df`` (and so of
    # ``pred``) are in the same order as ``articles`` -- confirm that
    # ``get_df`` neither reorders nor drops rows.
    for article, article_pred in zip(articles, pred):
        tuned = article.setdefault('tuned_predictions', {})
        tuned[feature_set] = article_pred.tolist()

        db.save_article(article)

    print(f'Done for {feature_set}')

0 comments on commit d696d80

Please sign in to comment.