<a href="https://colab.research.google.com/github/vondersam/sdgs_text_classifier/blob/master/experiments/traditional_ml_cross_entropy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#!python -m spacy download en_core_web_lg

In [0]:
import numpy as np
import pandas as pd 
import string
import os

### SKLEARN ###
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score, hamming_loss, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA, NMF

### NLTK ###
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag


### SPACY ###
#import spacy
#spacy_stop_words = spacy.lang.en.stop_words.STOP_WORDS
#nlp = spacy.load('en_core_web_lg', disable=['ner', 'parser'])

from joblib import dump, load
from pathlib import Path

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
# Mount Google Drive so the dataset and the output folders are reachable from Colab.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# Every project path is derived from one base directory on the mounted drive.
# NOTE(review): base_dir is relative, so it presumably relies on the working
# directory being /content (the mount point's parent) - confirm.
base_dir = "gdrive/My Drive/fastai-v3/sdgs/"
labelled_dataset = f"{base_dir}dataset/cleanup_labelled.csv"  # labelled corpus (CSV)
CROSS_FOLDS = f"{base_dir}dataset/cross_validation/"          # one sub-folder per fold
OUTPUT_DIR = f"{base_dir}traditional_ml/"                     # fitted pipelines are saved here

In [0]:
# Read the labelled corpus, then turn each '|'-separated label string into a list of ints.
df = pd.read_csv(labelled_dataset)
df.labels = df.labels.str.split('|').apply(lambda parts: list(map(int, parts)))

# Preprocessors

In [0]:
class SpacyPreprocessor(BaseEstimator, TransformerMixin):
    """
    Scikit-learn transformer that turns raw documents into lists of spaCy lemmas.

    Stopwords and punctuation-only tokens are dropped. The class relies on two
    module-level names, ``nlp`` (a loaded spaCy pipeline) and
    ``spacy_stop_words`` (a collection of stopword strings); both are currently
    commented out in the imports cell and must be enabled before this class is
    instantiated or used.
    """

    def __init__(self):
        self.stopwords  = spacy_stop_words
        self.punct      = set(string.punctuation)

    def fit(self, X, y=None):
        """No-op: the transformer has nothing to learn. Returns ``self``."""
        return self

    def inverse_transform(self, X):
        """Join each token list back into a single space-separated string."""
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        """Return one list of lemmas per document in ``X``."""
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        """
        Yield the lemma of every token in ``document`` that is neither a
        stopword nor made up solely of punctuation characters.
        """
        for token in nlp(document):

            # Disregard stopwords. The stopword collection holds strings, so the
            # token's text has to be compared, not the Token object itself
            # (a Token is never a member of a set of str, so nothing was filtered).
            if token.text.lower() in self.stopwords:
                continue

            # Disregard punctuation
            if all(char in self.punct for char in token.text):
                continue

            # yield lemmatized tokens
            yield token.lemma_

In [0]:
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """
    Scikit-learn transformer that turns raw documents into lists of WordNet lemmas.

    Each document is word-tokenised and POS-tagged with NLTK; tokens are
    optionally lower-cased and stripped, stopwords and punctuation-only tokens
    are dropped, and the survivors are lemmatised using their POS tag.

    Parameters
    ----------
    stopwords : iterable of str, optional
        Tokens to discard. Defaults to NLTK's English stopword list.
    punct : iterable of str, optional
        Characters regarded as punctuation. Defaults to ``string.punctuation``.
    lower : bool, default True
        Lower-case every token before filtering. With ``lower=False``,
        capitalised stopwords will not match a lower-case stopword list.
    strip : bool, default True
        Strip surrounding whitespace, then ``'_'``, then ``'*'`` from tokens.
    """

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        # Previously all four arguments were accepted but silently ignored, and
        # `lower` / `strip` were never stored on the instance.
        self.lower      = lower
        self.strip      = strip
        self.stopwords  = set(stopwords) if stopwords is not None else set(sw.words('english'))
        self.punct      = set(punct) if punct is not None else set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """No-op: the transformer has nothing to learn. Returns ``self``."""
        return self

    def inverse_transform(self, X):
        """Join each token list back into a single space-separated string."""
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        """Return one list of lemmas per document in ``X``."""
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        """Yield the lemma of every token of ``document`` that survives filtering."""
        # getattr with the historical defaults keeps instances that were pickled
        # before `lower` / `strip` existed as attributes working after reload.
        lower = getattr(self, 'lower', True)
        strip = getattr(self, 'strip', True)

        for token, tag in pos_tag(word_tokenize(document)):
            if lower:
                token = token.lower()
            if strip:
                token = token.strip().strip('_').strip('*')

            # Disregard stopwords
            if token in self.stopwords:
                continue

            # Disregard punctuation (this also drops tokens left empty by stripping,
            # because all() over an empty string is True)
            if all(char in self.punct for char in token):
                continue

            # yield lemmatized tokens
            yield self.lemmatize(token, tag)

    def lemmatize(self, token, tag):
        """
        Lemmatise ``token`` using the WordNet POS derived from the first letter
        of ``tag``; any unrecognised (or empty) tag is treated as a noun.
        """
        wordnet_pos = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[:1], wn.NOUN)
        return self.lemmatizer.lemmatize(token, wordnet_pos)

In [0]:
def identity(arg):
    """
    Hand back ``arg`` exactly as received (same object, no copy).

    Passed as the ``tokenizer`` of the TF-IDF vectorizers so that documents
    which are already lists of tokens go through unchanged.
    """
    passthrough = arg
    return passthrough

# Pipeline with preprocessor, vectorizer and model

In [0]:
def run_classifier(train_x, train_y, arch, preprocessor=None):
    """
    Build and fit a preprocessing -> TF-IDF -> classifier pipeline.

    Parameters
    ----------
    train_x : iterable of str
        Raw training documents (they are fed to ``preprocessor`` first).
    train_y : array-like
        Training targets passed straight to ``Pipeline.fit``.
    arch : {'svm', 'nb', 'lg', 'knn'}
        Which classifier to use: linear SVM, multinomial naive Bayes,
        logistic regression (all wrapped in one-vs-rest) or k-nearest neighbours.
    preprocessor : transformer, optional
        Step that turns each document into a list of tokens. When omitted a
        fresh ``NLTKPreprocessor()`` is created for this call (the previous
        default was a single instance built at definition time and shared by
        every pipeline).

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with steps ``preprocessor``, ``tfidf``, ``multilabel``.

    Raises
    ------
    ValueError
        If ``arch`` is not one of the supported names.
    """
    def _tfidf(ngram_range, max_df):
        # One place for the vectorizer settings shared by every architecture.
        # Documents arrive already tokenised, hence tokenizer=identity and
        # lowercase=False. analyzer='word' and smooth_idf=True are scikit-learn's
        # defaults, so spelling them out for all branches changes nothing.
        return TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                    dtype=np.float64, encoding='utf-8',
                    input='content', lowercase=False, max_df=max_df, max_features=None,
                    min_df=1, ngram_range=ngram_range, norm='l2', preprocessor=None,
                    smooth_idf=True,
                    stop_words=None,
                    strip_accents=None, sublinear_tf=False,
                    tokenizer=identity, use_idf=True,
                    vocabulary=None)

    if arch == 'svm':
        clf = OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight='balanced', dual=True,
                                            fit_intercept=True, intercept_scaling=1,
                                            loss='squared_hinge', max_iter=1000,
                                            multi_class='ovr', penalty='l2',
                                            random_state=None, tol=0.0001,
                                            verbose=0))
        word_vectorizer = _tfidf(ngram_range=(1, 2), max_df=0.25)

    elif arch == 'nb':
        clf = OneVsRestClassifier(estimator=MultinomialNB(alpha=0.01, class_prior=None,
                                                fit_prior=True))
        word_vectorizer = _tfidf(ngram_range=(1, 3), max_df=0.25)

    elif arch == 'lg':
        # NOTE(review): `multi_class` is deprecated on LogisticRegression in recent
        # scikit-learn releases - confirm against the version this notebook runs on.
        clf = OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight='balanced',
                                                     dual=False, fit_intercept=True,
                                                     intercept_scaling=1,
                                                     l1_ratio=None, max_iter=4000,
                                                     multi_class='ovr',
                                                     n_jobs=None, penalty='l2',
                                                     random_state=None,
                                                     solver='sag', tol=0.0001,
                                                     verbose=0, warm_start=False),
                                                     n_jobs=None)
        word_vectorizer = _tfidf(ngram_range=(1, 3), max_df=0.25)

    elif arch == 'knn':
        clf = KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='distance')
        word_vectorizer = _tfidf(ngram_range=(1, 3), max_df=0.5)

    else:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError on `clf`.
        raise ValueError(
            f"Unknown arch {arch!r}; expected one of 'svm', 'nb', 'lg', 'knn'."
        )

    if preprocessor is None:
        preprocessor = NLTKPreprocessor()

    pipe = Pipeline([('preprocessor', preprocessor), ('tfidf', word_vectorizer), ('multilabel', clf)])
    pipe.fit(train_x, train_y)
    return pipe

# Evaluation

In [0]:
def metrics_avg(models_testx_testy, labels_):
    """
    Average test metrics over several fitted models (one per fold).

    Parameters
    ----------
    models_testx_testy : sequence of (model, test_x, test_y)
        Each model must expose ``predict``; ``test_x`` / ``test_y`` are the
        inputs and true targets it is evaluated on.
    labels_ : list of str
        Passed to ``classification_report`` as ``target_names``.

    Returns
    -------
    tuple
        ``(report, hamming, roc)``: the element-wise mean of the per-fold
        classification-report DataFrames, the mean Hamming loss and the mean
        micro-averaged ROC AUC.

    Raises
    ------
    ValueError
        If ``models_testx_testy`` is empty.
    """
    def calc(model, test_x, test_y):
        # Metrics for a single fold.
        predictions = model.predict(test_x)
        metrics = classification_report(test_y, predictions, target_names=labels_, output_dict=True)
        metrics_df = pd.DataFrame.from_dict(metrics)
        h = hamming_loss(test_y, predictions)
        # NOTE(review): the ROC AUC is computed from model.predict() output rather
        # than from scores/probabilities - confirm this is intended.
        roc = roc_auc_score(test_y, predictions, average='micro')
        return metrics_df, h, roc

    n = len(models_testx_testy)
    if n == 0:
        raise ValueError("metrics_avg needs at least one (model, test_x, test_y) entry.")

    # Accumulate over every fold in one loop. The earlier version special-cased
    # the first entry and referenced an undefined name (`test_y_1_first`),
    # which raised NameError on every call.
    metrics_agg = None
    ham = 0.0
    roc = 0.0
    for model, test_x, test_y in models_testx_testy:
        metrics, h, r = calc(model, test_x, test_y)
        metrics_agg = metrics if metrics_agg is None else metrics_agg + metrics
        ham += h
        roc += r

    return metrics_agg/n, ham/n, roc/n

# Train the models

In [0]:
# Binarise the per-document label lists into the target matrix.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df.labels) # labels

# 2-D array with a single 'text' column; rows are unpacked with t[0] further down.
x = df[['text']].values # text

# Display names for the classification report.
# NOTE(review): hard-coded to "1".."17"; presumably matches mlb.classes_ - confirm.
labels = list(map(str, range(1, 18)))

In [0]:
arch = 'knn'

# Create the output folder once, before training. parents=True also creates
# OUTPUT_DIR itself when it does not exist yet (a bare mkdir would fail).
save_dir = Path(f"{OUTPUT_DIR}{arch}/")
save_dir.mkdir(parents=True, exist_ok=True)

models = []
# os.listdir returns entries in arbitrary order; sort so folds are processed
# (and printed) in a reproducible sequence.
for fold in sorted(os.listdir(CROSS_FOLDS)):
    # Load predefined indices for train and test (the val.npy split is not used here)
    train_index = np.load(f"{CROSS_FOLDS}{fold}/train.npy")
    test_index = np.load(f"{CROSS_FOLDS}{fold}/test.npy")

    print(fold)

    # Load train and test texts (x has one column, so take element 0 of each row)
    x_train = [t[0] for t in x[train_index].tolist()]
    x_test = [t[0] for t in x[test_index].tolist()]

    # Load train and test labels
    y_train, y_test = y[train_index], y[test_index]

    # Fit model on fold data and keep it together with its held-out test split
    model = run_classifier(x_train, y_train, arch=arch)
    models.append((model, x_test, y_test))

    # Save model
    file_dir = save_dir/f"{arch}_{fold}.joblib"
    dump(model, file_dir)
print(f"Finished trainin {arch}")

# Evaluate on test

In [0]:
# Average the per-fold test metrics of the models trained above; the result is a
# tuple (classification-report frame, Hamming loss, micro ROC AUC).
results = metrics_avg(models, labels)

In [129]:
# Third element of the tuple returned by metrics_avg: the fold-averaged micro ROC AUC.
results[2]

0.7635767039814408

# Load and evaluate saved models on test data

In [0]:
# Rebuild the (model, test_x, test_y) triples from the pipelines saved on disk.
loaded_arch = 'nb'
loaded_models = []
for fold in os.listdir(CROSS_FOLDS):
    # Held-out indices of this fold
    test_index = np.load(f"{CROSS_FOLDS}{fold}/test.npy")

    # x has a single column, so element 0 of every row is the document text
    x_test = [row[0] for row in x[test_index].tolist()]
    y_test = y[test_index]

    # File written by the training cell for this architecture and fold
    load_dir = Path(f"{OUTPUT_DIR}{loaded_arch}/") / f"{loaded_arch}_{fold}.joblib"

    # joblib.load unpickles the file: only load artefacts from a trusted source
    loaded_model = load(load_dir)
    loaded_models.append((loaded_model, x_test, y_test))
print(f"Finished loading the {loaded_arch} models.")

Finished loading the nb models.


In [0]:
# Evaluate the reloaded models on their folds' test data; same tuple layout as
# metrics_avg returns: (classification-report frame, Hamming loss, micro ROC AUC).
print(f"Obtaining results for {loaded_arch}: ")
loaded_results = metrics_avg(loaded_models, labels)

In [0]:
# Third element of the tuple returned by metrics_avg: the fold-averaged micro ROC AUC.
loaded_results[2]

0.7915714104037355