<a href="https://colab.research.google.com/github/vondersam/sdgs_text_classifier/blob/master/experiments/traditional_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
base_dir = "gdrive/My Drive/fastai-v3/sdgs/"
labelled_dataset = base_dir + "dataset/cleanup_labelled.csv"
CROSS_FOLDS = f"{base_dir}dataset/cross_validation/"

Mounted at /content/gdrive


In [0]:
labelled = pd.read_csv(labelled_dataset)
labelled.labels = labelled.labels.str.split('|').apply(lambda x: [int(i) for i in x])
mlb = MultiLabelBinarizer()

data_x = labelled[['text']].values
x = np.array([x[0] for x in data_x.tolist()])
y = mlb.fit_transform(labelled.labels)

stop_words = set(stopwords.words('english'))
labels = [str(i) for i in range(1,18)]

In [0]:
splits = []
for fold in os.listdir(CROSS_FOLDS):
    train_index = np.load(f"{CROSS_FOLDS}{fold}/train.npy")
    val_index = np.load(f"{CROSS_FOLDS}{fold}/val.npy")
    splits.append((train_index, val_index))

In [0]:
def grid_search(x, y, parameters, pipeline, splits):
    '''Train pipeline, test and print results'''
    gs = GridSearchCV(pipeline, 
                      parameters, 
                      cv=splits, 
                      n_jobs=5, 
                      verbose=10, 
                      return_train_score=True, 
                      scoring='f1_micro')
    gs.fit(x, y)
    print()
    print("Best parameters set:")
    print(gs.best_estimator_.steps)
    print()
    results = gs.cv_results_
    print(f"Mean train scores: {results['mean_train_score']}")
    print(f"Mean validation scores: {results['mean_test_score']}")

# Naive Bayes

In [69]:
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])
parameters = {
                'tfidf__max_df': (0.25, 0.5, 0.75),
                'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
                'clf__estimator__alpha': (1e-2, 1e-3)
            }
grid_search(x, y, parameters, pipeline, splits)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:    7.7s
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:   16.5s
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:   32.6s
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:   44.8s
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  1.0min
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:  1.3min
[Parallel(n_jobs=5)]: Done  51 tasks      | elapsed:  1.7min
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:  2.1min
[Parallel(n_jobs=5)]: Done  75 tasks      | elapsed:  2.5min
[Parallel(n_jobs=5)]: Done  90 out of  90 | elapsed:  2.9min finished



Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.25, max_features=None,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)), ('clf', OneVsRestClassifier(estimator=MultinomialNB(alpha=0.01, class_prior=None,

# Support Vector Machine

In [72]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LinearSVC())),
])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__C": [0.01, 0.1, 1],
    "clf__estimator__class_weight": ['balanced', None],
}
grid_search(x, y, parameters, pipeline, splits)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:    3.8s
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:   13.6s
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:   31.6s
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:   43.4s
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  1.1min
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:  1.3min
[Parallel(n_jobs=5)]: Done  51 tasks      | elapsed:  1.7min
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:  2.0min
[Parallel(n_jobs=5)]: Done  75 tasks      | elapsed:  2.5min
[Parallel(n_jobs=5)]: Done  88 tasks      | elapsed:  3.0min
[Parallel(n_jobs=5)]: Done 103 tasks      | elapsed:  3.6min
[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed:  4.3min
[Parallel(n_jobs=5)]: Done 135 tasks      | elapsed:  5.0min
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:  5.5min
[Parallel(n_jobs=5)]: Done 171 tasks      | elapsed:  6.2min
[Parallel(


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight='balanced', dual

# Logistic Regression

In [73]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'))),
])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__C": [0.01, 0.1, 1],
    "clf__estimator__class_weight": ['balanced', None],
    "clf__estimator__multi_class": ['ovr', 'multinomial']
}
grid_search(x, y, parameters, pipeline, splits)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:    8.6s
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:   38.3s
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:  1.6min
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:  2.2min
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  3.2min
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:  3.8min
[Parallel(n_jobs=5)]: Done  51 tasks      | elapsed:  5.4min
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:  6.9min
[Parallel(n_jobs=5)]: Done  75 tasks      | elapsed:  8.9min
[Parallel(n_jobs=5)]: Done  88 tasks      | elapsed: 10.9min
[Parallel(n_jobs=5)]: Done 103 tasks      | elapsed: 12.4min
[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed: 14.0min
[Parallel(n_jobs=5)]: Done 135 tasks      | elapsed: 15.6min
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed: 17.6min
[Parallel(n_jobs=5)]: Done 171 tasks      | elapsed: 20.3min
[Parallel(


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.25, max_features=None,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight='balan

# Trees

In [74]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', DecisionTreeClassifier()),
])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]
}
grid_search(x, y, parameters, pipeline, splits)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:   19.6s
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:  1.4min
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:  3.4min
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:  4.6min
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  6.9min
[Parallel(n_jobs=5)]: Done  41 out of  45 | elapsed:  9.9min remaining:   58.0s
[Parallel(n_jobs=5)]: Done  45 out of  45 | elapsed: 10.0min finished



Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.25, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=Non

# KNN

In [75]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', KNeighborsClassifier()),
])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__n_neighbors': (2,3,4,5),
    'clf__weights': ('uniform', 'distance'),
    'clf__metric': ['minkowski'],
    'clf__algorithm': ('ball_tree', 'kd_tree', 'brute')
}
grid_search(x, y, parameters, pipeline, splits)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:   23.9s
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:   59.0s
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:  1.6min
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:  2.4min
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  3.3min
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:  4.0min
[Parallel(n_jobs=5)]: Done  51 tasks      | elapsed:  4.8min
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:  5.3min
[Parallel(n_jobs=5)]: Done  75 tasks      | elapsed:  6.0min
[Parallel(n_jobs=5)]: Done  88 tasks      | elapsed:  6.7min
[Parallel(n_jobs=5)]: Done 103 tasks      | elapsed:  8.1min
[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed:  9.5min
[Parallel(n_jobs=5)]: Done 135 tasks      | elapsed: 11.0min
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed: 11.8min
[Parallel(n_jobs=5)]: Done 171 tasks      | elapsed: 12.7min
[Parallel(


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=None,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)), ('clf', KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='minkowsk

