<a href="https://colab.research.google.com/github/vondersam/sdgs_text_classifier/blob/master/experiments/traditional_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
%matplotlib inline
import numpy as np
import pandas as pd 
import matplotlib
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import ShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import MultiLabelBinarizer
import seaborn as sns
import re
import os
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [50]:
from google.colab import drive

# Mount Google Drive inside the Colab VM (force_remount refreshes a stale mount).
mount_point = '/content/gdrive'
drive.mount(mount_point, force_remount=True)

# Build the dataset paths from the mount point itself so they resolve
# regardless of the kernel's current working directory.
base_dir = mount_point + "/My Drive/fastai-v3/sdgs/dataset/"
labelled_dataset = base_dir + "cleanup_labelled.csv"
unlabelled_dataset = base_dir + "cleanup_unlabelled_u.csv"

Mounted at /content/gdrive


In [0]:
# Read the labelled corpus; the `labels` column stores '|'-separated integer ids.
labelled = pd.read_csv(labelled_dataset)
split_labels = labelled["labels"].str.split('|')
labelled["labels"] = split_labels.apply(lambda parts: [int(part) for part in parts])

# Binarize the per-row label lists into a multi-label indicator matrix.
mlb = MultiLabelBinarizer()
data_y = mlb.fit_transform(labelled["labels"])

# Double-bracket selection keeps the text as a 2-D (n_samples, 1) array.
data_x = labelled[['text']].values

# English stop words from NLTK, used by the TF-IDF vectorizers below.
stop_words = set(stopwords.words('english'))

In [0]:
# Split the data, leaving 1/3 out for testing.
# NOTE: despite the variable name, ShuffleSplit is a plain random split; it does
# not stratify by label. A single seeded split is drawn so that the train/test
# partition is reproducible across kernel restarts.
stratified_split = ShuffleSplit(n_splits=1, test_size=0.33, random_state=42)
train_index, test_index = next(stratified_split.split(data_x, data_y))
x_train, x_test = data_x[train_index], data_x[test_index]
y_train, y_test = data_y[train_index], data_y[test_index]

# Flatten the (n_samples, 1) text arrays into plain lists of strings,
# which is the input format TfidfVectorizer expects.
train_x = [x[0] for x in x_train.tolist()]
test_x = [x[0] for x in x_test.tolist()]

# NOTE(review): these keep only the FIRST label column of the indicator matrix.
# The model cells below pass the full y_train / y_test instead, so these two
# names look like leftovers; confirm before relying on them.
train_y = [y[0] for y in y_train.tolist()]
test_y = [y[0] for y in y_test.tolist()]

In [0]:
# Candidate label ids 1..17 (the model cells below pass their own label names).
labels = list(range(1,18))

def grid_search(train_x, train_y, test_x, test_y, labels, parameters, pipeline,
                cv=2, n_jobs=3):
    '''Tune ``pipeline`` with a grid search, then print metrics on the test data.

    Parameters
    ----------
    train_x, train_y : training inputs and targets passed to ``GridSearchCV.fit``.
    test_x, test_y : held-out inputs and targets used for the printed report.
    labels : iterable of display names for the classes, or None. Entries are
        converted to ``str`` because ``classification_report`` requires string
        ``target_names``; None falls back to scikit-learn's default names.
    parameters : parameter grid forwarded to ``GridSearchCV``.
    pipeline : the estimator / ``Pipeline`` to tune.
    cv : number of cross-validation folds (default 2).
    n_jobs : number of parallel workers (default 3).

    Returns
    -------
    None. Results are printed.
    '''
    grid_search_tune = GridSearchCV(pipeline, parameters, cv=cv, n_jobs=n_jobs, verbose=10)
    grid_search_tune.fit(train_x, train_y)

    print()
    print("Best parameters set:")
    print(grid_search_tune.best_estimator_.steps)
    print()

    # measuring performance on test set
    print("Applying best classifier on test data:")
    best_clf = grid_search_tune.best_estimator_
    predictions = best_clf.predict(test_x)

    target_names = None if labels is None else [str(label) for label in labels]
    print(classification_report(test_y, predictions, target_names=target_names))

    # ROC-AUC is a ranking metric: feed it continuous scores when the fitted
    # model exposes them, instead of thresholded 0/1 predictions.
    if hasattr(best_clf, "predict_proba"):
        scores = best_clf.predict_proba(test_x)
    elif hasattr(best_clf, "decision_function"):
        scores = best_clf.decision_function(test_x)
    else:
        scores = predictions
    # Fall back to the hard predictions if the score layout does not line up
    # with the targets (e.g. two probability columns for a 1-D binary target).
    if np.shape(scores) != np.shape(predictions):
        scores = predictions
    print("ROC-AUC:", roc_auc_score(test_y, scores))

# Naive Bayes

In [83]:
# TF-IDF features feeding one MultinomialNB per label (one-vs-rest).
pipeline = Pipeline([
    # TfidfVectorizer expects a list of stop words; newer scikit-learn
    # releases reject a set, so convert it here.
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    ('clf', OneVsRestClassifier(MultinomialNB(
        fit_prior=True, class_prior=None))),
])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__estimator__alpha': (1e-2, 1e-3)
}
# `mlb.classes` is the (unset) constructor argument; the fitted label ids live
# in `mlb.classes_`. Pass them as strings so the report rows carry real ids.
target_names = [str(c) for c in mlb.classes_]
grid_search(train_x, y_train, test_x, y_test, target_names, parameters, pipeline)

Fitting 2 folds for each of 18 candidates, totalling 36 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.5s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    3.5s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    7.2s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:   11.2s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   17.0s
[Parallel(n_jobs=3)]: Done  36 out of  36 | elapsed:   23.2s finished



Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.25, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)), ('clf', OneVsRestClassifier(estimator=MultinomialNB(alpha=0.01, class_prior=None,

  'precision', 'predicted', average, warn_for)


# Support Vector Machine

In [86]:
# TF-IDF features feeding one LinearSVC per label (one-vs-rest).
pipeline = Pipeline([
    # TfidfVectorizer expects a list of stop words; newer scikit-learn
    # releases reject a set, so convert it here.
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    ('clf', OneVsRestClassifier(LinearSVC())),
])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__C": [0.01, 0.1, 1],
    "clf__estimator__class_weight": ['balanced', None],
}
# `mlb.classes` is the (unset) constructor argument; the fitted label ids live
# in `mlb.classes_`. Pass them as strings so the report rows carry real ids.
target_names = [str(c) for c in mlb.classes_]
grid_search(train_x, y_train, test_x, y_test, target_names, parameters, pipeline)

Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    0.9s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    4.6s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    8.0s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:   12.1s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   16.8s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   23.6s
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   29.9s
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:   39.5s
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:   47.4s
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done 108 out of 108 | elapsed:  1.7min finished



Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=None,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight='balanced', dual

  'precision', 'predicted', average, warn_for)


# Logistic Regression

In [87]:
# TF-IDF features feeding one LogisticRegression (SAG solver) per label (one-vs-rest).
pipeline = Pipeline([
    # TfidfVectorizer expects a list of stop words; newer scikit-learn
    # releases reject a set, so convert it here.
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'))),
])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__C": [0.01, 0.1, 1],
    "clf__estimator__class_weight": ['balanced', None],
}
# `mlb.classes` is the (unset) constructor argument; the fitted label ids live
# in `mlb.classes_`. Pass them as strings so the report rows carry real ids.
target_names = [str(c) for c in mlb.classes_]
grid_search(train_x, y_train, test_x, y_test, target_names, parameters, pipeline)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    2.2s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:   15.0s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:   27.7s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:   49.0s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  1.5min
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:  2.0min
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:  2.3min
[Parallel(n_jobs=3)]: Done  79 tasks      | elapsed:  2.8min
[Parallel(n_jobs=3)]: Done  92 tasks      | elapsed:  3.5min
[Parallel(n_jobs=3)]: Done 108 out of 108 | elapsed:  3.9min finished



Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.25, max_features=None,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight='balan

  'precision', 'predicted', average, warn_for)
