<a href="https://colab.research.google.com/github/tobiasSviderski/assigmentDataScienceProduct/blob/main/pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# CETM47 Assignment 2
from sklearn.datasets import fetch_20newsgroups # Download the dataset

# Stop-word removal and lemmatization packages
import nltk
import string 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Pipeline packages
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

# Preprocessing for MultinomialNB model with GensimWord2VecVectorizer
from sklearn.preprocessing import MinMaxScaler
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, log_loss

# Download the NLTK resources this notebook relies on: the stop-word lists
# (nltk.corpus.stopwords), the 'punkt' models used by word_tokenize, and the
# WordNet data ('wordnet' plus 'omw-1.4') used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec
import re


class GensimWord2VecVectorizer(BaseEstimator, TransformerMixin):
    """
    Turn documents into fixed-length vectors by averaging Word2Vec word vectors.

    ``fit`` tokenises the training documents and trains a Word2Vec model on
    them; ``transform`` maps every document to the mean of the vectors of its
    in-vocabulary tokens. A document without any in-vocabulary token is mapped
    to the zero vector.

    Documents may be raw strings (they are lower-cased and split into word
    tokens) or already-tokenised iterables of strings (used as given).

    gensim's own gensim.sklearn_api.W2VTransformer doesn't support out of vocabulary words,
    hence we roll out our own.

    All the parameters except ``stopwords_action`` are gensim.models.Word2Vec's parameters.

    https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec

    Parameters
    ----------
    stopwords_action : iterable of str or None, default=None
        Tokens to drop from every document before training and before
        averaging. ``None`` disables stop-word filtering.
    """

    # Word tokens of a lower-cased document: runs of ASCII letters/digits,
    # optionally joined by internal apostrophes or hyphens ("don't", "e-mail").
    _TOKEN_PATTERN = re.compile(r"[a-z0-9]+(?:['-][a-z0-9]+)*")

    def __init__(self, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
                 sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5,
                 ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0,
                 trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False,
                 callbacks=(), max_final_vocab=None, stopwords_action=None):

        # Parameters are stored unmodified so that scikit-learn's
        # get_params / set_params / clone keep working.
        self.vector_size = vector_size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = max_vocab_size
        self.sample = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.sg = sg
        self.hs = hs
        self.negative = negative
        self.ns_exponent = ns_exponent
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.epochs = epochs
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.batch_words = batch_words
        self.compute_loss = compute_loss
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab
        self.stopwords_action = stopwords_action

    def fit(self, X, y=None):
        """Train the Word2Vec model on the tokenised documents in ``X``.

        ``y`` is accepted for scikit-learn pipeline compatibility and ignored.
        Returns ``self``.
        """
        stop_words = self._stop_word_set()
        # Word2Vec expects an iterable of token lists. Passing raw strings
        # would make it treat every single character as a "word".
        sentences = [self._tokenize(document, stop_words) for document in X]
        self.model_ = Word2Vec(
            sentences=sentences, corpus_file=None,
            vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count,
            max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed,
            workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs,
            negative=self.negative, ns_exponent=self.ns_exponent, cbow_mean=self.cbow_mean,
            hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word,
            trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words,
            compute_loss=self.compute_loss, callbacks=self.callbacks,
            max_final_vocab=self.max_final_vocab)
        return self

    def transform(self, X):
        """Return an array of shape (n_documents, vector_size) of document vectors."""
        stop_words = self._stop_word_set()
        embeddings = [self._get_embedding(document, stop_words) for document in X]
        if not embeddings:
            # Keep the result two-dimensional even when there are no documents.
            return np.zeros((0, self.vector_size), dtype=np.float32)
        return np.vstack(embeddings)

    def _stop_word_set(self):
        """Return the configured stop words as a set for O(1) membership tests."""
        if self.stopwords_action is None:
            return frozenset()
        return frozenset(self.stopwords_action)

    def _tokenize(self, document, stop_words):
        """Split one document into tokens and drop those found in ``stop_words``."""
        if isinstance(document, str):
            tokens = self._TOKEN_PATTERN.findall(document.lower())
        else:
            # Already tokenised input is used as given.
            tokens = list(document)
        if stop_words:
            tokens = [token for token in tokens if token not in stop_words]
        return tokens

    def _get_embedding(self, words, stop_words=None):
        """Average the vectors of the in-vocabulary tokens of one document.

        ``words`` is a raw string or an iterable of tokens. ``stop_words`` is
        an optional pre-built set; when omitted it is derived from
        ``stopwords_action``. Returns a float32 vector of length
        ``vector_size``; all zeros when no token is in the vocabulary.
        """
        if stop_words is None:
            stop_words = self._stop_word_set()

        vocabulary = self.model_.wv.key_to_index
        valid_words = [token for token in self._tokenize(words, stop_words)
                       if token in vocabulary]

        if not valid_words:
            return np.zeros(self.vector_size, dtype=np.float32)
        # Indexing KeyedVectors with a list of keys yields one row per token.
        return np.mean(self.model_.wv[valid_words], axis=0)

In [None]:
# Fetch the 20 Newsgroups train and test splits with identical settings:
# shuffled with a fixed seed, and with headers, footers and quoted replies
# stripped from every post.
newsgroup_train, newsgroup_test = (
    fetch_20newsgroups(
        subset=split,
        shuffle=True,
        random_state=42,
        remove=('headers', 'footers', 'quotes'),
    )
    for split in ('train', 'test')
)

In [None]:
# Experiment plan:
#   1. a basic pipeline with no extra preprocessing
#   2. a pipeline with the noise-removal / lower-casing option
#   3. a pipeline with lemmatization and stop-word removal
#   4. a pipeline built on word2vec embeddings
import itertools

# Names of the preprocessing switches explored by the experiments.
use_noise_lower = "noise_lower"
use_lemming = "lemming"
use_stopwords = "stopwords"

combinations = [use_noise_lower, use_lemming, use_stopwords]

# Every on/off assignment of the switches, each stored as a list of
# (switch name, enabled) pairs in the order given by ``combinations``.
every_combination = [
    [(name, enabled) for name, enabled in zip(combinations, flags)]
    for flags in itertools.product([True, False], repeat=len(combinations))
]

# Estimators evaluated by the experiments. Only Multinomial Naive Bayes is
# enabled for now; DecisionTreeClassifier(), RandomForestClassifier() and an
# MLP classifier are candidates still to be added.
classifiers = [MultinomialNB(alpha=0.1)]

In [None]:
def runModel(classifier):
    """Evaluate ``classifier`` under every preprocessing combination.

    Prints the classifier, then for each entry of ``every_combination``
    prints the option list and runs the bag-of-words pipeline with those
    switches. Finishes with one run of the word2vec pipeline.
    """
    print(classifier)

    for options in every_combination:
        # Each entry is a list of (switch name, enabled) pairs.
        switches = dict(options)
        lemming_value = switches[use_lemming]
        noise_lower_value = switches[use_noise_lower]
        stopwords_value = switches[use_stopwords]

        print(options)
        runPipelineNormal(classifier, lemming_value, noise_lower_value, stopwords_value)

    # Word-embedding variant of the pipeline.
    print("Running pipeline with word2vec")
    runPipelineWordEmbeddings(classifier)

In [None]:
def runPipelineNormal(classifier, lemming_value, noise_lower_value, stopwords_value):
    """Train and evaluate a bag-of-words / TF-IDF pipeline on 20 Newsgroups.

    Parameters
    ----------
    classifier : estimator
        Final pipeline step; it must provide ``predict`` and ``predict_proba``.
    lemming_value : bool
        Use a custom tokenizer that strips punctuation, tokenizes with NLTK
        and lemmatizes every token with WordNet.
    noise_lower_value : bool
        Switches the vectorizer to unigrams + bigrams.
    stopwords_value : bool
        Remove English stop words (inside the custom tokenizer when
        lemmatizing, otherwise through the vectorizer's ``stop_words``).

    Returns
    -------
    dict
        ``{'accuracy': ..., 'log_loss': ...}`` measured on the test split.
    """
    # Create the pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier),
    ])

    # Load the stop words once. The set gives O(1) membership tests instead of
    # re-reading and scanning the NLTK list for every single token.
    english_stopwords = stopwords.words('english') if stopwords_value else []
    stopword_set = frozenset(english_stopwords)

    # Shared by every tokenize() call instead of being rebuilt per document.
    lemmatizer = WordNetLemmatizer()
    punctuation_table = str.maketrans('', '', string.punctuation)

    def tokenize(text):
        """Strip punctuation, tokenize, optionally drop stop words, lemmatize."""
        tokens = word_tokenize(text.translate(punctuation_table))

        # This tokenizer is only installed when lemmatizing, so the
        # stop-word switch alone decides whether to filter here.
        if stopwords_value:
            tokens = [token for token in tokens if token not in stopword_set]

        return [lemmatizer.lemmatize(token) for token in tokens]

    if lemming_value:
        pipeline.set_params(vect__tokenizer=tokenize)

    # Without the custom tokenizer, let the vectorizer remove the stop words.
    if stopwords_value and not lemming_value:
        pipeline.set_params(vect__stop_words=english_stopwords)

    # NOTE(review): the flag is named "noise_lower" (noise removal and
    # lower-casing) but what it enables is bigram features. Behaviour is kept
    # as-is; confirm whether this is the intended experiment.
    if noise_lower_value:
        pipeline.set_params(vect__ngram_range=(1, 2))

    # Train the model
    pipeline.fit(newsgroup_train.data, newsgroup_train.target)

    # Evaluate the model on the test set
    predicted_accuracy = accuracy_score(newsgroup_test.target, pipeline.predict(newsgroup_test.data))
    predicted_log_loss = log_loss(newsgroup_test.target, pipeline.predict_proba(newsgroup_test.data))

    print("Accuracy: {}".format(predicted_accuracy))
    print("Log Loss: {}".format(predicted_log_loss))

    return {
        'accuracy': predicted_accuracy,
        'log_loss': predicted_log_loss,
    }

In [None]:
def runPipelineWordEmbeddings(classifier):
    """Train and evaluate a Word2Vec-embedding pipeline on 20 Newsgroups.

    Documents are embedded with ``GensimWord2VecVectorizer`` (skip-gram,
    50 dimensions, English stop words removed) and rescaled to [0, 1]
    before being passed to ``classifier``.

    Parameters
    ----------
    classifier : estimator
        Final pipeline step; it must provide ``predict`` and ``predict_proba``.

    Returns
    -------
    dict
        ``{'accuracy': ..., 'log_loss': ...}`` measured on the test split.
    """
    pipeline = Pipeline([
        ('w2v', GensimWord2VecVectorizer(vector_size=50, min_count=3, sg=1,
                                         alpha=0.025)),
        ('scale', MinMaxScaler(copy=True, feature_range=(0, 1))),
        ('clf', classifier),
    ])

    # Stop words are always removed in the embedding pipeline.
    pipeline.set_params(w2v__stopwords_action=stopwords.words('english'))

    # Train the model (once: a second fit would only retrain from scratch).
    pipeline.fit(newsgroup_train.data, newsgroup_train.target)

    # Evaluate the model on the test set
    predicted_accuracy = accuracy_score(newsgroup_test.target, pipeline.predict(newsgroup_test.data))
    predicted_log_loss = log_loss(newsgroup_test.target, pipeline.predict_proba(newsgroup_test.data))

    print("Accuracy: {}".format(predicted_accuracy))
    print("Log Loss: {}".format(predicted_log_loss))

    return {
        'accuracy': predicted_accuracy,
        'log_loss': predicted_log_loss,
    }

In [None]:
# Full experiment sweep over every classifier (currently disabled):
# for classifier in classifiers:
#     runModel(classifier)

# Run only the word2vec pipeline with the first configured classifier.
runPipelineWordEmbeddings(classifiers[0])

no special words  w wnerng f nne u here cul enlghen e n h cr  w
he her .  w  2-r pr cr, lke  be fr he le 60
erl 70.  w clle  brckln. he r were rell ll. n n,
he frn buper w epre fr he re f he b. h  
ll  knw. f nne cn elle  el ne, engne pec, er
f prucn, where h cr  e, hr, r whever nf u
hve n h funk lkng cr, plee e-l.
[' ', 'w', ' ', 'w', 'n', 'e', 'r', 'n', 'g', ' ', 'f', ' ', 'n', 'n', 'e', ' ', 'u', ' ', 'h', 'e', 'r', 'e', ' ', 'c', 'u', 'l', ' ', 'e', 'n', 'l', 'g', 'h', 'e', 'n', ' ', 'e', ' ', 'n', ' ', 'h', ' ', 'c', 'r', ' ', ' ', 'w', '\n', 'h', 'e', ' ', 'h', 'e', 'r', ' ', '.', ' ', ' ', 'w', ' ', ' ', '2', '-', 'r', ' ', 'p', 'r', ' ', 'c', 'r', ',', ' ', 'l', 'k', 'e', ' ', ' ', 'b', 'e', ' ', 'f', 'r', ' ', 'h', 'e', ' ', 'l', 'e', ' ', '6', '0', '\n', 'e', 'r', 'l', ' ', '7', '0', '.', ' ', ' ', 'w', ' ', 'c', 'l', 'l', 'e', ' ', ' ', 'b', 'r', 'c', 'k', 'l', 'n', '.', ' ', 'h', 'e', ' ', 'r', ' ', 'w', 'e', 'r', 'e', ' ', 'r', 'e', 'l', 'l', ' ', 'l', 'l', '.', ' ', 'n', 

BdbQuit: ignored