In [1]:
import json
import os
from datetime import datetime
from time import time

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
# Path of the JSON corpus. Later cells read its 'dev' and 'test' entries, each of which
# is indexed with 'tokens' and 'labels'.
DATASET_FILE = 'dataset.json'

# Pin the encoding: JSON is exchanged as UTF-8, and relying on the locale default makes
# the load platform-dependent for non-ASCII tokens.
with open(DATASET_FILE, encoding='utf-8') as file:
    dataset = json.load(file)

In [3]:
def feachure_extractor(sent):
    """Build one feature dict per token of a sentence.

    Args:
        sent: sequence of token strings making up a single sentence.

    Returns:
        A list with one dict per token, in sentence order, holding:
        'word' (the token), 'word-1' (the previous token, or '<S>' for the
        first one), 'is title' (``str.istitle``), 'start' / 'end' (whether the
        token is first / last in the sentence) and 'is_upper' (``str.isupper``).
        An empty sentence yields an empty list.
    """
    return [
        {
            'word': token,
            # '<S>' stands in for the missing left neighbour of the first token.
            'word-1': sent[pos - 1] if pos else '<S>',
            'is title': token.istitle(),
            'start': pos == 0,
            'end': pos == len(sent) - 1,
            'is_upper': token.isupper(),
        }
        for pos, token in enumerate(sent)
    ]


In [4]:
# Flatten the 'dev' split into token-level training data: each sentence contributes one
# feature dict per token, and its labels are appended in the same order
# (presumably one label per token -- confirm against the dataset).
feachures_dataset, labels_dataset = [], []
for sent, labels in zip(dataset['dev']['tokens'], dataset['dev']['labels']):
    feachures_dataset += feachure_extractor(sent)
    labels_dataset += labels




In [5]:
# Learn the feature vocabulary from the dev-split feature dicts.
vec = DictVectorizer().fit(feachures_dataset)

# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version has it, otherwise fall back.
_list_feature_names = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names
# Build the (100k+ entry) name list once, as a plain list so the printed form is the same
# whichever API produced it.
feature_names = list(_list_feature_names())
print("Total number of features: {}\nFeature list: {}".format(len(feature_names), feature_names))



Total number of features: 124332


In [6]:
# Encode the dev-split feature dicts with the already-fitted vectorizer.
features_vectorized = vec.transform(feachures_dataset)

# Hold out 33% of the dev tokens for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    features_vectorized,
    labels_dataset,
    test_size=0.33,
    random_state=42,
)


#lrc = LogisticRegression(random_state=42, solver="sag", multi_class="multinomial",max_iter=1000, verbose=1)


In [7]:
# Hyper-parameter grid for the SGD pipeline; keys use the '<step>__<parameter>' form.
# 'vect__*' options (max_df, max_features, ngram_range) are not searched because the
# pipeline has no active 'vect' step.
parametersSGD = dict(
    tfidf__use_idf=(True, False),
    tfidf__norm=('l1', 'l2'),
    clf__alpha=(1e-5, 1e-6),
    clf__penalty=('l2', 'elasticnet'),
    clf__max_iter=(80, 100, 200),
)

### SGD
# TF-IDF re-weighting followed by a linear classifier trained with SGD. The 'vect' step
# stays disabled because the data is vectorized before it is handed to the pipeline.
pipelineSGD = Pipeline([
    #('vect', DictVectorizer()),
    ('tfidf', TfidfTransformer()),
    # SGD shuffles the training data between epochs; seed it (same seed as the
    # train/validation split) so repeated grid searches give the same scores.
    ('clf', SGDClassifier(random_state=42)),
])

In [8]:
# Flatten the 'test' split the same way as the dev data: per-token feature dicts plus
# the labels appended sentence by sentence.
test_feachures_dataset, test_labels_dataset = [], []
for sent, labels in zip(dataset['test']['tokens'], dataset['test']['labels']):
    test_feachures_dataset += feachure_extractor(sent)
    test_labels_dataset += labels

# Reuse the vectorizer fitted on the dev split (transform only, no refit).
X_test = vec.transform(test_feachures_dataset)
y_test = test_labels_dataset

In [9]:
def benchmark(X_train, X_val, X_test, y_train, y_val, y_test, pipeline, parameters, label=''):
    """Grid-search ``pipeline`` over ``parameters`` and report on validation and test data.

    The search is fitted on the training data with ``GridSearchCV`` (all cores, default
    cross-validation). Progress and results are printed; the pipeline step names, the
    grid, the best score and parameters, and both classification reports are also written
    to a timestamped log file under ``./iterations/``.

    Args:
        X_train, y_train: data the grid search is fitted on.
        X_val, y_val: held-out data used for the validation report.
        X_test, y_test: data used for the test report.
        pipeline: scikit-learn ``Pipeline``; the class of its last step names the log file.
        parameters: dict parameter grid passed to ``GridSearchCV``.
        label: optional tag embedded in the log file name.

    Returns:
        None; results are only printed and logged.
    """
    log_dir = './iterations/'
    # Create the log directory on demand; otherwise open() raises FileNotFoundError
    # the first time the notebook runs in a fresh checkout.
    os.makedirs(log_dir, exist_ok=True)

    # Name the log after the final step (the estimator). Index 1 is the estimator only
    # for two-step pipelines and would mislabel the log for any other length.
    estimator_name = pipeline.steps[-1][1].__class__.__name__
    date_time = datetime.now().strftime("%m-%d-%Y__%H-%M-%S")

    with open(log_dir + estimator_name + '-' + label + '--' + date_time, 'w+') as f:
        def emit(text):
            """Print ``text`` and mirror it into the log file."""
            print(text)
            print(text, file=f)

        grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
        step_names = [name for name, _ in pipeline.steps]

        print("Performing grid search...")
        print("pipeline:", step_names)
        print("pipeline:", step_names, file=f)
        print("\n", file=f)
        print(parameters, file=f)
        print("\n", file=f)

        print("parameters:")
        print(parameters)
        t0 = time()
        grid_search.fit(X_train, y_train)
        print("done in %0.3fs" % (time() - t0))
        print()
        emit("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        for param_name in sorted(parameters.keys()):
            emit("\t%s: %r" % (param_name, best_parameters[param_name]))

        print("\n", file=f)

        # Validation and test reports go through the same predict -> report -> echo steps.
        for title, X_eval, y_eval in (
            ('### Validation report', X_val, y_val),
            ('### Test report', X_test, y_test),
        ):
            print(title)
            emit(classification_report(y_eval, grid_search.predict(X_eval)))
        



In [10]:
# Run the grid search for the SGD pipeline; results are printed and written to a log file.
benchmark(
    X_train=X_train, X_val=X_val, X_test=X_test,
    y_train=y_train, y_val=y_val, y_test=y_test,
    pipeline=pipelineSGD, parameters=parametersSGD,
)

Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'clf__alpha': (1e-05, 1e-06), 'clf__penalty': ('l2', 'elasticnet'), 'clf__max_iter': (80, 100, 200)}
Fitting 5 folds for each of 48 candidates, totalling 240 fits
done in 67.315s

Best score: 0.998
Best parameters set:
	clf__alpha: 1e-06
	clf__max_iter: 100
	clf__penalty: 'elasticnet'
	tfidf__norm: 'l2'
	tfidf__use_idf: True
### Validation report
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    367659
        True       0.88      0.96      0.92      5131

    accuracy                           1.00    372790
   macro avg       0.94      0.98      0.96    372790
weighted avg       1.00      1.00      1.00    372790

### Test report
              precision    recall  f1-score   support

       False       0.97      1.00      0.98      4542
        True       0.00      0.00      0.00       155

    accuracy    

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   48.6s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  1.1min finished
