<a href="https://colab.research.google.com/github/vondersam/sdgs_text_classifier/blob/master/experiments/traditional_ml_cross_entropy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
#!pip install iterative-stratification
#!python -m spacy download en_core_web_lg

Collecting iterative-stratification
  Downloading https://files.pythonhosted.org/packages/9d/79/9ba64c8c07b07b8b45d80725b2ebd7b7884701c1da34f70d4749f7b45f9a/iterative_stratification-0.1.6-py3-none-any.whl
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.6
Collecting en_core_web_lg==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz#egg=en_core_web_lg==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz (826.9MB)
[K     |████████████████████████████████| 826.9MB 1.1MB/s 
[?25hBuilding wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.1.0-cp36-none-any.whl size=828255076 sha256=271eedc8c6419b2fa8c81dab6582a6fef5da79e006a303b6c52371cde8ad95dd
  Stored in d

In [33]:
%matplotlib inline
import numpy as np
import pandas as pd 
import string

### SKLEARN ###
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, hamming_loss, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

### NLTK ###
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

### SPACY ###
import spacy
nlp = spacy.load('en_core_web_lg', disable=['ner', 'parser'])
spacy_stop_words = spacy.lang.en.stop_words.STOP_WORDS

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
base_dir = "gdrive/My Drive/fastai-v3/sdgs/dataset/"
labelled_dataset = base_dir + "cleanup_labelled.csv"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
df = pd.read_csv(labelled_dataset)
df.labels = df.labels.str.split('|').apply(lambda x: [int(i) for i in x])

In [0]:
class SpacyPreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self):
        self.stopwords  = spacy_stop_words
        self.punct      = set(string.punctuation)

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        for token in nlp(document):

            # Disregard stopwords
            if token in self.stopwords:
                continue

            # Disregard punctuation
            if all(char in self.punct for char in token.text):
                continue

            # yield lemmatized tokens
            yield token.lemma_

In [0]:
class NLTKPreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.stopwords  = set(sw.words('english'))
        self.punct      = set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        for token, tag in pos_tag(word_tokenize(document)):
            token = token.lower()
            token = token.strip()
            token = token.strip('_')
            token = token.strip('*')

            # Disregard stopwords
            if token in self.stopwords:
                continue

            # Disregard punctuation
            if all(char in self.punct for char in token):
                continue

            # yield lemmatized tokens
            lemma = self.lemmatize(token, tag)
            yield lemma

    def lemmatize(self, token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)
        return self.lemmatizer.lemmatize(token, tag)

In [0]:
def identity(arg):
    """
    Simple identity function works as a passthrough.
    """
    return arg

In [0]:
def run_classifier(train_x, train_y, type_, preprocessor=NLTKPreprocessor()):
  if type_ == 'svm':
    clf = OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight='balanced', dual=True,
                                        fit_intercept=True, intercept_scaling=1,
                                        loss='squared_hinge', max_iter=1000,
                                        multi_class='ovr', penalty='l2',
                                        random_state=None, tol=0.0001,
                                        verbose=0))
    
    word_vectorizer = TfidfVectorizer(binary=False, decode_error='strict',
                encoding='utf-8', dtype=np.float64,
                input='content', lowercase=False, max_df=0.25, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=None,
                strip_accents=None, sublinear_tf=False,
                tokenizer=identity, use_idf=True,
                vocabulary=None)

  elif type_ == 'nb':
    clf = OneVsRestClassifier(estimator=MultinomialNB(alpha=0.01, class_prior=None,
                                            fit_prior=True))
                          
    word_vectorizer = TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=np.float64, encoding='utf-8',
                input='content', lowercase=False, max_df=0.25, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=None,
                strip_accents=None, sublinear_tf=False,
                tokenizer=identity, use_idf=True,
                vocabulary=None)
    
  elif type_ == 'lg':
    clf = OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight='balanced',
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=2000,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='sag', tol=0.0001,
                                                 verbose=0, warm_start=False),
                                                 n_jobs=None)
    
    word_vectorizer = TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=np.float64, encoding='utf-8',
                input='content', lowercase=False, max_df=0.25, max_features=None,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=None,
                strip_accents=None, sublinear_tf=False,
                tokenizer=identity, use_idf=True,
                vocabulary=None)
    
  pipe = Pipeline([('preprocessor', preprocessor), ('tfidf', word_vectorizer), ('multilabel', clf)])
  pipe.fit(train_x, train_y)
  return pipe

In [0]:
def metrics_avg(models_testx_testy, labels_):
  def calc(model, test_x, test_y):
    predictions = model.predict(test_x)
    metrics = classification_report(test_y, predictions, target_names=labels_, output_dict=True)
    metrics_df = pd.DataFrame.from_dict(metrics)
    h = hamming_loss(test_y, predictions)
    roc = roc_auc_score(test_y, predictions, average='micro')
    return metrics_df, h, roc
    
  model_1, test_x_1, test_y_1 = models_testx_testy[0]
  metrics_agg, ham, roc = calc(model_1, test_x_1, test_y_1)
  n = len(models_testx_testy)
  
  for model, test_x, test_y_1 in models_testx_testy[1:]:
    metrics, h, r = calc(model, test_x, test_y_1)
    metrics_agg += metrics
    ham += h
    roc += r
  
  return metrics_agg/n, ham/n, roc/n

In [112]:
mskf = MultilabelStratifiedKFold(n_splits=10, random_state=0)
mlb = MultiLabelBinarizer()
results = []
x = df[['text']].values # text
y = mlb.fit_transform(df.labels) # labels
count = 0
labels = [str(i) for i in range(1,18)]

for train_index, test_index in mskf.split(x, y):
  count += 1
  print(f"Fold no. {count}")

  x_train, x_test = [t[0] for t in x[train_index].tolist()], [t[0] for t in x[test_index].tolist()]
  y_train, y_test = y[train_index], y[test_index]
  
  model =  run_classifier(x_train, y_train, 'lg')
  results.append((model, x_test, y_test))

Fold no. 1




Fold no. 2




Fold no. 3




Fold no. 4




Fold no. 5




Fold no. 6




Fold no. 7




Fold no. 8




Fold no. 9




Fold no. 10




In [113]:
avg_results = metrics_avg(results, labels)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [116]:
avg_results[2]

0.8823201752913954