In [1]:
import os

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from gensim.models import KeyedVectors
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics.classification import accuracy_score

from textvec import vectorizers

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Load IMDB data

In [3]:
def get_binary_imdb_data(base_dir, num_files=5000, random_state=None):
    """Read positive and negative reviews into shuffled train/test frames.

    ``base_dir`` must contain ``train/`` and ``test/`` directories, each with
    ``pos/`` and ``neg/`` sub-directories holding one review per file.

    Parameters
    ----------
    base_dir : str
        Root directory of the corpus.
    num_files : int or None, default 5000
        Maximum number of files read from *each* ``pos``/``neg`` directory.
        ``None`` or a negative value reads every file.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``sklearn.utils.shuffle``; pass an int to get the same
        row order on every run.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
        Each frame has a ``y`` column (1 for ``pos``, 0 for ``neg``) and a
        ``text`` column with the raw file contents, rows shuffled.
    """
    # (sub-directory, label) pairs, in the order rows are collected.
    sentiments = (('pos', 1), ('neg', 0))
    # A slice bound of None means "no limit"; negative values never matched the
    # original `i == num_files` check, so they are treated as unlimited too.
    limit = num_files if num_files is not None and num_files >= 0 else None

    def get_df(corpus_type):
        texts = []
        y = []
        for sentiment, label in sentiments:
            sentiment_dir = os.path.join(base_dir, corpus_type, sentiment)
            # os.listdir returns entries in arbitrary order; sorting makes the
            # selected subset identical across runs and machines.
            for f_name in sorted(os.listdir(sentiment_dir))[:limit]:
                f_path = os.path.join(sentiment_dir, f_name)
                # Decode explicitly instead of relying on the locale default,
                # which is not UTF-8 on every platform.
                with open(f_path, encoding='utf-8') as f:
                    texts.append(f.read())
                y.append(label)
        return pd.DataFrame({'y': y, 'text': texts})

    train_data = shuffle(get_df('train'), random_state=random_state)
    test_data = shuffle(get_df('test'), random_state=random_state)
    return train_data, test_data

In [4]:
# Root of the extracted aclImdb corpus. Set the IMDB_DATA_DIR environment
# variable to point at your own copy; the path below is only the fallback.
IMDB_DATA_DIR = os.environ.get("IMDB_DATA_DIR", "/home/enio/Загрузки/data/aclImdb")
train, test = get_binary_imdb_data(IMDB_DATA_DIR, num_files=5000)

## Tokenize text

In [5]:
# Each token is an ASCII letter followed by one or more word characters,
# so every token is at least two characters long.
tokenizer = RegexpTokenizer(r"[A-Za-z]\w+")
train_tokenized = list(map(tokenizer.tokenize, train.text))
test_tokenized = list(map(tokenizer.tokenize, test.text))

# Load Embeddings

In [6]:
# Path to the word2vec binary. Set the WORD2VEC_PATH environment variable to
# use your own copy; the path below is only the fallback.
WORD2VEC_PATH = os.environ.get(
    "WORD2VEC_PATH",
    "/home/enio/ds/embeddings/word2vec/GoogleNews-vectors-negative300.bin.gz",
)
embeddings = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

All word vectors should be normalized to unit **L2 norm**:

In [7]:
# gensim >= 4.0 deprecates `init_sims(replace=True)` in favour of
# `unit_normalize_all()`; call whichever the installed version provides.
# Both overwrite the stored vectors with their unit-length versions.
if hasattr(embeddings, "unit_normalize_all"):
    embeddings.unit_normalize_all()
else:
    embeddings.init_sims(replace=True)

# Create pipeline
You can use `SifVectorizer` in a scikit-learn `Pipeline` like any other vectorizer:

In [8]:
sif_vectorizer = vectorizers.SifVectorizer(embeddings, alpha=0.1, npc=1)
forest = RandomForestClassifier(n_jobs=-1, n_estimators=600)
pipeline = Pipeline([("vectorizer", sif_vectorizer), ("clf", forest)])

# This pipeline has no tokenizing step, so it is fed the pre-tokenized documents.
pipeline.fit(train_tokenized, train.y)
preds = pipeline.predict(test_tokenized)
accuracy = accuracy_score(test.y, preds)
print(f"test accuracy: {accuracy}")

test accuracy: 0.8032


# Advanced pipeline usage
You can also create your own Tokenizer class to make working with the `Pipeline` class even more convenient:

In [9]:
class Tokenizer(TransformerMixin, BaseEstimator):
    """Pipeline step that splits raw strings into lists of regex tokens.

    Inheriting from ``BaseEstimator`` provides ``get_params``/``set_params``,
    so the step can be cloned and tuned (e.g. ``tokenizer__token_pattern``)
    like any other scikit-learn estimator.

    Parameters
    ----------
    token_pattern : str
        Regular expression handed to ``nltk.tokenize.RegexpTokenizer``.
    """

    def __init__(self, token_pattern):
        self.token_pattern = token_pattern
        # Built in `fit` (or lazily in `transform`) from `token_pattern`.
        self.tokenizer = None

    def fit(self, X, y=None):
        """Build the tokenizer from ``token_pattern``; ``X`` and ``y`` are unused."""
        self.tokenizer = RegexpTokenizer(self.token_pattern)
        return self

    def transform(self, X):
        """Return one list of tokens per string in ``X``."""
        # Nothing is learned from data, so build the tokenizer on demand
        # instead of failing with an AttributeError when `fit` was skipped.
        if self.tokenizer is None:
            self.fit(X)
        return [
            self.tokenizer.tokenize(sent)
            for sent in X
        ]

In [10]:
text_tokenizer = Tokenizer(r"[A-Za-z]\w+")
sif_vectorizer = vectorizers.SifVectorizer(embeddings, alpha=0.1, npc=1)
forest = RandomForestClassifier(n_jobs=-1, n_estimators=600)
pipeline = Pipeline([
    ("tokenizer", text_tokenizer),
    ("vectorizer", sif_vectorizer),
    ("clf", forest),
])

# Tokenization is now the first pipeline step, so `fit` and `predict`
# receive the raw, non-tokenized text.
pipeline.fit(train.text, train.y)
preds = pipeline.predict(test.text)
accuracy = accuracy_score(test.y, preds)
print(f"test accuracy: {accuracy}")

test accuracy: 0.8046
