<h1><center>This is a notebook, which illustrates the usage of ukrainina_from_russian module</center></h1>

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
from scipy.spatial.distance import cdist
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from pymystem3 import Mystem
import plotly.express as px
from time import time
from IPython.display import clear_output
from sys import getsizeof

import warnings
warnings.filterwarnings('ignore')

UKR_CHANNELS = [
    'Труха⚡️Украина', 'Лачен пишет', 'Украинская правда. Главное',
    'Вы хотите как на Украине?', 'Борис Філатов', 'RAGNAROCK PRIVET',
    'УНИАН - новости Украины | война с Россией | новини України | війна з Росією',
    'Украина 24/7 Новости | Война | Новини', 'Быть Или',
    'Украина Сейчас: новости, война, Россия'
]

UKR_LETTERS = ['ї', 'є', 'ґ', 'і']

CHEAT_WORDS = [
    '03', '04', '05', '1378', '2022', '3801', '3806', '4149', '4276',
    '4279', '9521', '9842', 'akimapachev', 'amp', 'anna', 'com',
    'daily', 'diza', 'donbass', 'epoddubny', 'https', 'index', 'me',
    'news', 'opersvodki', 'pravda', 'rus', 'rvvoenkor', 'sashakots',
    'ua', 'wargonzo', 'www', 'www pravda', 'мид', 'труха', 'труха украина',
    'украина сейчас', 'pravda com', 'daily news', 'com ua', 'https www',
    'me rvvoenkor', 'rus news', 'ua rus', 'wargonzo наш'
]

def time_decorator(function):
    from time import time
    def inner(*args, **kwargs):
        start = time()
        result = function(*args, **kwargs)
        elapsed_time = round(time() - start, 2)
        output = f'{function.__name__} took {elapsed_time} seconds.'
        print(output)
        return result
    return inner

class Preprocessor:
    
    def __init__(self, data=None):
        """
        A class for the preprocessing purposes. Main methods icnludes:
        reading, cleaning, lemmatizing and vectorizing the data.
        """
        self.data = data
        self.lemmas = None
        self.X_train = None
        self.X_test = None
        self.ukr_train = None
        self.ukr_test = None
        self.channel_train = None
        self.channel_test = None
        self.percent_ukr = 0
        self.percent_rus = 1
        self.lemmatized = False
        self.vectorized = False
        self.cheat_words = CHEAT_WORDS
    
    @time_decorator
    def read_data(self, filename='random_msgs.csv', sep='¶∆',
                  header=None):
        """
        Reads the csv file into 4 columns:
        channel
        date of publication
        message
        ukrainian - 1 if ukrainian channel, 0 - otherwise.
        """
        if self.data is None:
            self.data = pd.read_csv(filename, sep=sep, header=header)
            self.data.columns = ['channel', 'date', 'msg']
            self.data['ukrainian'] = self.data['channel'].\
            apply(lambda x: 1 if x in UKR_CHANNELS else 0)
            self.data['ukrainian'] = self.data['ukrainian'].astype('int8')
            self.data = self.data[self.data['channel'] != 'вечеряємо']
            self.percent_ukr = self.data['ukrainian'].mean()
            self.percent_rus = 1 - self.percent_ukr
    
    def get_data(self):
        """
        Method to get the df.
        """
        return self.data
    
    def get_percents_ukr_rus(self):
        """
        Method to get the percentage of ukrainian and russian messages among
        the dataset.
        """
        return self.percent_ukr, self.percent_rus
    
    @time_decorator
    def preprocess(self, remove_ukr_msgs=True, cut_less_than=18):
        """
        This method:
        removes short messages (with less than 18 characters);
        removes messages with ukrainian letters.
        """
        if remove_ukr_msgs:
            for letter in UKR_LETTERS:
                self.data = self.data[self.data['msg'].str.lower().\
                                        str.contains(letter) == False]
        self.data = self.data[self.data['msg'].str.len() > cut_less_than]
        self.data = self.data.reset_index(drop=True)
        self.percent_ukr = self.data['ukrainian'].mean()
        self.percent_rus = 1 - self.percent_ukr
    
    @time_decorator
    def lemmatize(self, *sentences):
        """
        This method has 2 usages:
        internal; i.e. to lemmatize all messages in the dataset. Runs about 2.5
        minutes.
        outside; to lemmatize a given sequence of sentences.
        """
        mystem = Mystem()
        if not sentences:
            if not self.lemmatized:
                def preprocess_text(text):
                    tokens = mystem.lemmatize(text.lower())
                    text = " ".join(tokens)
                    return text

                self.data['msg'] = self.data['msg'].apply(preprocess_text)
                self.lemmas = self.data['msg'].copy()
                self.lemmatized = True
        else:
            result = []
            for sentence in sentences:
                tokens = mystem.lemmatize(sentence.lower())
                result.append(' '.join(tokens))
            return result
    
    def get_lemmas(self):
        """
        Method to get lemmatized messages.
        """
        return self.lemmas
    
    def train_test_split(self, random_state=1, train_size=.8):
        """
        This method clones scikit-learn train_test_split.
        """
        self.X_train, self.X_test, self.ukr_train, self.ukr_test,\
        self.channel_train, self.channel_test = \
        train_test_split(
            self.data['msg'], self.data['ukrainian'], self.data['channel'],
            random_state=random_state, train_size=train_size
        )
    
    def get_train_test_split(self):
        """
        Returns the train and test part.
        """
        return self.X_train, self.X_test, self.ukr_train, self.ukr_test,\
        self.channel_train, self.channel_test
    
    @time_decorator
    def vectorize(self, ngram_range=(1,1), sublinear_tf=True, binary=False):
        """
        This method creates a pipeline of CountVectorizer() and TfidfTransformer().
        If CountVectorizer is needed - use count_transform method.
        If TfidfVectorizer is needed - just call a tfidf_transform method.
        """
        try:
            if not self.vectorized:
                self.tfidf = Pipeline([
                    ('vect', CountVectorizer(binary=binary, ngram_range=ngram_range)),
                    ('tfidf', TfidfTransformer(sublinear_tf=sublinear_tf))
                ]).fit(self.X_train)
                self.vect = self.tfidf['vect']
                self.vectorized = True
        except TypeError:
            print("You didn't initialize data or train_test_split.")
        
    
    def get_vectorizer(self, tfidf=True):
        """
        Returns the actual vectorizer.
        """
        return self.vectorizer
    
    @time_decorator
    def tfidf_transform(self):
        """
        Applies TfidfTransform to data.
        """
        try:
            self.vectorizer = self.tfidf
            X_train = self.X_train = self.vectorizer.transform(self.X_train).T
            X_test =  self.X_test = self.vectorizer.transform(self.X_test).T
            return X_train, X_test
        except AttributeError:
            print("You didn't initialize read_data, train_test_split or vectorize.")
    
    @time_decorator
    def count_transform(self):
        """
        Applies CountTransform to data.
        """
        try:
            self.vectorizer = self.vect
            X_train = self.X_train = self.vectorizer.transform(self.X_train).asfptype().T
            X_test = self.X_test = self.vectorizer.transform(self.X_test).asfptype().T
            return X_train, X_test
        except AttributeError:
            print("You didn't initialize read_data, train_test_split or vectorize.")
    
    @time_decorator
    def remove_cheat_words(self, method='manual', freq_pivot=.5,
                           cheat_words=CHEAT_WORDS):
        """
        Removes cheat_words, like channel tags, social media links or
        authors names.
        """
        if method == 'manual':
            delete_mask = np.zeros(self.X_train.shape[0], dtype=bool)
            delete_mask[np.isin(np.array(
                    self.vectorizer.get_feature_names_out()), cheat_words)
            ] = True
            self.X_train = self.X_train.T[:, ~delete_mask].T
            self.X_test = self.X_test.T[:, ~delete_mask].T
            self.delete_mask = delete_mask
            self.cheat_words = np.array(
                self.vectorizer.get_feature_names_out()
            ).T[delete_mask]
        else:
            delete_mask = np.zeros(self.X_train.shape[0], dtype=bool)
            for channel in self.channel_trainchannel_train.unique():
                arr = self.X_train.T[self.channel_train == channel]
                delete_mask |= np.array((np.sum(arr > 0, axis=0) / arr.shape[0]) > .5)[0]

            self.X_train = self.X_train.T[:, ~delete_mask].T
            self.X_test = self.X_test.T[:, ~delete_mask].T
            self.delete_mask = delete_mask
            self.cheat_words = np.array(
                self.vectorizer.get_feature_names_out()
            ).T[delete_mask]

    def get_cheat_words(self):
        """
        Returns the deleted cheat_words.
        """
        return self.cheat_words
    
    def get_delete_mask(self):
        """
        Returns the mask of cheat_words, which can be applied onto vectorizer matrix.
        """
        return self.delete_mask

class Predictor:
    
    def __init__(self, SVD=[None, None, None]):
        self.Terms, self.S, self.Documents = SVD
        if not self.S:
            self.calculated_svd = False
        else:
            self.calculated_svd = True
    
    def get_SVD(self):
        if self.calculated_svd:
            return self.Terms, self.S, self.Documents
        return 'You need to calculate SVD first'
    
    @time_decorator
    def train_LSA(self, X_train, ukr_train, k=150):
        if not self.calculated_svd:
            self.Terms, self.S, self.Documents = svds(X_train, k=k)
            self.ukr_centre = np.array([np.mean(self.Documents.T[ukr_train == 1], axis=0)])
            self.rus_centre = np.array([np.mean(self.Documents.T[ukr_train == 0], axis=0)])
            self.calculated_svd = True
    
    @time_decorator
    def predict_LSA(self, X_pred):
        Documents_pred = np.diag(1 / self.S) @ self.Terms.T @ X_pred
        dist_to_ukr = cdist(self.ukr_centre, Documents_pred.T, metric='euclidean')[0]
        dist_to_rus = cdist(self.rus_centre, Documents_pred.T, metric='euclidean')[0]
        ukr_pred = self.ukr_pred = np.array([dist_to_ukr < dist_to_rus]).reshape((-1, 1))
        return ukr_pred
    
    def evaluate(self, ukr_test):
        ukr_test = np.array(ukr_test).astype(bool).reshape((-1, 1))
        self.accuracy = round(100 * np.sum(self.ukr_pred == ukr_test) / len(ukr_test), 2)
        return self.accuracy
    
    def train_NBC(self, X_train, ukr_train, percent_ukr=0):
        '''
        Need to use CountVectorizer(binary=True) for this one.
        '''
        self.terms_prob_ukr = np.mean(X_train.T[ukr_train == 1], axis=0)
        self.terms_prob_rus = np.mean(X_train.T[ukr_train == 0], axis=0)
        self.percent_ukr = percent_ukr
        self.percent_rus = 1 - percent_ukr
    
    def predict_NBC(self, X_pred):
        self.ukr_prob = self.percent_ukr * X_pred.T * self.terms_prob_ukr.T
        self.rus_prob = self.percent_rus * X_pred.T * self.terms_prob_rus.T
        ukr_pred = self.ukr_pred = self.ukr_prob > self.rus_prob

    def train_LR(self, X_train, ukr_train):
        self.logistic_regression = LogisticRegression(random_state=1).fit(X_train.T, ukr_train)
    
    def predict_LR(self, X_pred):
        ukr_pred = self.ukr_pred = np.array([self.logistic_regression.predict(X_pred.T)]).reshape((-1, 1))

# An example of using Preprocessor

In [2]:
preprocessor = Preprocessor()
preprocessor.read_data()
preprocessor.get_data().sample(5)

read_data took 0.61 seconds.


Unnamed: 0,channel,date,msg,ukrainian
73112,Украинская правда. Главное,2022-05-15 14:45:48+00:00,Эвакуационный автомобиль с детьми попал под об...,1
68963,Украина 24/7 Новости | Война | Новини,2022-03-26 19:55:59+00:00,"❗Мэр Славутича сообщил, что рашисты оккупирова...",1
103982,Пул N3,2022-05-04 12:07:13+00:00,Украинские маркетологи и политтехнологи очевид...,0
38649,УНИАН - новости Украины | война с Россией | но...,2022-05-16 07:06:53+00:00,"🔥Харьков оживает В понедельник, 16 мая, в пос...",1
94986,РИА Новости,2022-03-23 12:23:52+00:00,⚡️Путин: принято решение в кратчайшие сроки пе...,0


In [3]:
preprocessor.preprocess()
preprocessor.get_data().shape

preprocess took 1.04 seconds.


(138059, 4)

be carefull; this line runs approx. 2:30 minutes.

In [4]:
preprocessor.lemmatize()

lemmatize took 146.94 seconds.


In [5]:
preprocessor.train_test_split()
preprocessor.vectorize(ngram_range=(1,2), binary=True, sublinear_tf=True)
X_train, X_test = preprocessor.tfidf_transform()
X_train.shape, X_test.shape

vectorize took 7.82 seconds.
tfidf_transform took 6.4 seconds.


((1434042, 110447), (1434042, 27612))

or preprocessor.count_transform() if CountVectorizer model is needed.

In [6]:
preprocessor.remove_cheat_words()
mask = preprocessor.get_delete_mask()
preprocessor.get_cheat_words()

remove_cheat_words took 3.23 seconds.


array(['03', '04', '05', '1378', '2022', '3801', '3806', '4149', '4276',
       '4279', '9521', '9842', 'akimapachev', 'amp', 'anna', 'com',
       'com ua', 'daily', 'daily news', 'diza', 'donbass', 'epoddubny',
       'https', 'https www', 'index', 'me', 'me rvvoenkor', 'news',
       'opersvodki', 'pravda', 'pravda com', 'rus', 'rus news',
       'rvvoenkor', 'sashakots', 'ua', 'ua rus', 'wargonzo',
       'wargonzo наш', 'www', 'www pravda', 'мид', 'труха',
       'труха украина', 'украина сейчас'], dtype=object)

In [7]:
X_train, X_test, ukr_train, ukr_test, channel_train, channel_test = \
preprocessor.get_train_test_split()
X_train.shape, X_test.shape, ukr_train.shape, ukr_test.shape, channel_train.shape, channel_test.shape

((1433997, 110447), (1433997, 27612), (110447,), (27612,), (110447,), (27612,))

# An example of using Predictor

Now, let's say we want to predict whether the sentence belongs to ukrainian social media or russian.

In [8]:
sentence = 'Все пленные с "Азовстали" содержатся в ДНР, \
их будет судить трибунал на территории республики — глава ДНР Денис Пушилин.'

Firstly, need to lemmatiza and transform our sentence.

In [9]:
lemmatized = preprocessor.lemmatize(sentence)
vectorizer = preprocessor.get_vectorizer()
transformed = vectorizer.transform(lemmatized)[:, ~mask].asfptype().T
transformed.shape

lemmatize took 0.62 seconds.


(1433997, 1)

be carefull; this line runs approx. 1 to 60 minutes, depending on k.

In [10]:
predictor = Predictor()

In [11]:
predictor.train_LSA(X_train, ukr_train, k=300)
svd = predictor.get_SVD()

train_LSA took 138.37 seconds.


In [12]:
pred = predictor.predict_LSA(transformed)[0][0]
pred

predict_LSA took 126.8 seconds.


False

Result is False := **russian**.

# Finding the best parameters for 3 models

In [2]:
preprocessor = Preprocessor()
preprocessor.read_data()
preprocessor.lemmatize()
DATA = preprocessor.get_data()

read_data took 0.67 seconds.
lemmatize took 154.41 seconds.


## Latent Sematic Analysis

We have 2 options for vectorizing method, namely:

* CountVectorizer() a.k.a Bag-of-words
* TfidfVectorizer() - term-frequency inverse document frequency.

In the first cell I'll find the best parameters for BOW.

In [3]:
for ngram_range in [(1, 1), (1, 2)]:
    for binary in [True, False]:
        preprocessor = Preprocessor(DATA)
        preprocessor.train_test_split()
        preprocessor.vectorize(ngram_range=ngram_range, binary=binary, sublinear_tf=True)    
        preprocessor.count_transform()
        preprocessor.remove_cheat_words()
        X_train, X_test, ukr_train, ukr_test, channel_train, channel_test = \
        preprocessor.get_train_test_split()

        predictor = Predictor()
        predictor.train_LSA(X_train, ukr_train, k=500)
        ukr_pred = predictor.predict_LSA(X_test)
        accuracy = predictor.evaluate(ukr_test)
        print('-----------------------------------------')
        print(f'ngram={ngram_range}, binary={binary}, accuracy: {accuracy}%')
        print('-----------------------------------------')

vectorize took 2.46 seconds.
count_transform took 2.9 seconds.
remove_cheat_words took 0.13 seconds.
train_LSA took 80.66 seconds.
predict_LSA took 0.72 seconds.
-----------------------------------------
ngram=(1, 1), binary=True, accuracy: 80.05%
-----------------------------------------
vectorize took 2.46 seconds.
count_transform took 2.96 seconds.
remove_cheat_words took 0.13 seconds.
train_LSA took 79.89 seconds.
predict_LSA took 0.66 seconds.
-----------------------------------------
ngram=(1, 1), binary=False, accuracy: 79.2%
-----------------------------------------
vectorize took 8.02 seconds.
count_transform took 6.38 seconds.
remove_cheat_words took 3.61 seconds.
train_LSA took 272.23 seconds.
predict_LSA took 36.16 seconds.
-----------------------------------------
ngram=(1, 2), binary=True, accuracy: 81.72%
-----------------------------------------
vectorize took 8.1 seconds.
count_transform took 6.33 seconds.
remove_cheat_words took 3.51 seconds.
train_LSA took 296.86 sec

In the following cell, I'll run the same code, but with Tfidf.

In [6]:
for cut_less_than in [0, 18, 100, 500, 1000]:
    preprocessor = Preprocessor(DATA)
    preprocessor.preprocess(cut_less_than=cut_less_than)
    preprocessor.train_test_split()
    preprocessor.vectorize(ngram_range=(1,2), binary=True, sublinear_tf=True)    
    preprocessor.tfidf_transform()
    preprocessor.remove_cheat_words()
    X_train, X_test, ukr_train, ukr_test, channel_train, channel_test = \
    preprocessor.get_train_test_split()

    predictor = Predictor()
    predictor.train_LSA(X_train, ukr_train, k=500)
    ukr_pred = predictor.predict_LSA(X_test)
    accuracy = predictor.evaluate(ukr_test)
    print('-----------------------------------------')
    print(f'cut_less_than={cut_less_than}, accuracy: {accuracy}%')
    print('-----------------------------------------')

preprocess took 1.31 seconds.
vectorize took 7.86 seconds.
tfidf_transform took 6.32 seconds.
remove_cheat_words took 3.48 seconds.
train_LSA took 284.68 seconds.
predict_LSA took 22.25 seconds.
-----------------------------------------
cut_less_than=0, accuracy: 83.58%
-----------------------------------------
preprocess took 1.35 seconds.
vectorize took 7.55 seconds.
tfidf_transform took 6.38 seconds.
remove_cheat_words took 3.76 seconds.
train_LSA took 266.77 seconds.
predict_LSA took 18.34 seconds.
-----------------------------------------
cut_less_than=18, accuracy: 83.68%
-----------------------------------------
preprocess took 1.36 seconds.
vectorize took 7.44 seconds.
tfidf_transform took 6.3 seconds.
remove_cheat_words took 3.74 seconds.
train_LSA took 255.81 seconds.
predict_LSA took 21.96 seconds.
-----------------------------------------
cut_less_than=100, accuracy: 85.91%
-----------------------------------------
preprocess took 1.38 seconds.
vectorize took 4.73 seconds.


## Naive Bayes Classifier

Naive Bayes Classifier held the worst results (a few percents better than non-negative matrix factorization or k-nearest neighbors), but here are the parameters for this model.

In [9]:
for ngram_range in [(1, 1), (1, 2)]:
    preprocessor = Preprocessor(DATA)
    preprocessor.train_test_split()
    preprocessor.vectorize(ngram_range=ngram_range, binary=True, sublinear_tf=True)    
    preprocessor.count_transform()
    preprocessor.remove_cheat_words()
    X_train, X_test, ukr_train, ukr_test, channel_train, channel_test = \
    preprocessor.get_train_test_split()

    predictor = Predictor()
    predictor.train_NBC(X_train, ukr_train)
    ukr_pred = predictor.predict_NBC(X_test)
    accuracy = predictor.evaluate(ukr_test)
    print('-----------------------------------------')
    print(f'ngram={ngram_range}, binary={True}, accuracy: {accuracy}%')
    print('-----------------------------------------')

vectorize took 2.46 seconds.
count_transform took 2.92 seconds.
remove_cheat_words took 0.14 seconds.
-----------------------------------------
ngram=(1, 1), binary=True, accuracy: 56.88%
-----------------------------------------
vectorize took 8.02 seconds.
count_transform took 6.65 seconds.
remove_cheat_words took 3.44 seconds.
-----------------------------------------
ngram=(1, 2), binary=True, accuracy: 56.88%
-----------------------------------------


## Logistic Regression

The same as with LSA model, here we'll test 2 situations:

* for CountVectorizer
* for TfidfVectorizer

In [7]:
for cut_less_than in [0, 18, 100, 500, 1000]:
    preprocessor = Preprocessor(DATA)
    preprocessor.preprocess(cut_less_than=cut_less_than)
    preprocessor.train_test_split()
    preprocessor.vectorize(ngram_range=ngram_range, binary=True, sublinear_tf=True)    
    preprocessor.count_transform()
    preprocessor.remove_cheat_words()
    X_train, X_test, ukr_train, ukr_test, channel_train, channel_test = \
    preprocessor.get_train_test_split()

    predictor = Predictor()
    predictor.train_LR(X_train, ukr_train)
    ukr_pred = predictor.predict_LR(X_test)
    accuracy = predictor.evaluate(ukr_test)
    print('-----------------------------------------')
    print(f'ngram={ngram_range}, cut_less_than={cut_less_than}, accuracy: {accuracy}%')
    print('-----------------------------------------')

preprocess took 1.28 seconds.
vectorize took 7.68 seconds.
count_transform took 6.17 seconds.
remove_cheat_words took 3.51 seconds.
-----------------------------------------
ngram=(1, 2), cut_less_than=0, accuracy: 89.92%
-----------------------------------------
preprocess took 1.29 seconds.
vectorize took 7.62 seconds.
count_transform took 6.06 seconds.
remove_cheat_words took 3.45 seconds.
-----------------------------------------
ngram=(1, 2), cut_less_than=18, accuracy: 90.55%
-----------------------------------------
preprocess took 1.28 seconds.
vectorize took 7.54 seconds.
count_transform took 6.08 seconds.
remove_cheat_words took 3.32 seconds.
-----------------------------------------
ngram=(1, 2), cut_less_than=100, accuracy: 91.26%
-----------------------------------------
preprocess took 1.28 seconds.
vectorize took 4.8 seconds.
count_transform took 3.59 seconds.
remove_cheat_words took 2.76 seconds.
-----------------------------------------
ngram=(1, 2), cut_less_than=500,

This section is for TfidfVectorizer.

In [7]:
best_pred = 0
best_options = [(1, 1), True, None]
for ngram_range in [(1, 1), (1, 2)]:
    for binary in [True, False]:
        preprocessor = Preprocessor(DATA)
        preprocessor.train_test_split()
        preprocessor.vectorize(ngram_range=ngram_range, binary=binary, sublinear_tf=True)    
        preprocessor.tfidf_transform()
        preprocessor.remove_cheat_words()
        X_train, X_test, ukr_train, ukr_test, channel_train, channel_test = \
        preprocessor.get_train_test_split()

        predictor = Predictor()
        predictor.train_LR(X_train, ukr_train)
        ukr_pred = predictor.predict_LR(X_test)
        accuracy = predictor.evaluate(ukr_test)
        print('-----------------------------------------')
        print(f'ngram={ngram_range}, binary={binary}, accuracy: {accuracy}%')
        print('-----------------------------------------')

vectorize took 2.44 seconds.
tfidf_transform took 2.97 seconds.
remove_cheat_words took 0.14 seconds.
-----------------------------------------
ngram=(1, 1), binary=True, accuracy: 86.59%
-----------------------------------------
vectorize took 2.44 seconds.
tfidf_transform took 2.97 seconds.
remove_cheat_words took 0.13 seconds.
-----------------------------------------
ngram=(1, 1), binary=False, accuracy: 86.57%
-----------------------------------------
vectorize took 8.01 seconds.
tfidf_transform took 6.74 seconds.
remove_cheat_words took 3.49 seconds.
-----------------------------------------
ngram=(1, 2), binary=True, accuracy: 88.75%
-----------------------------------------
vectorize took 8.02 seconds.
tfidf_transform took 6.77 seconds.
remove_cheat_words took 3.56 seconds.
-----------------------------------------
ngram=(1, 2), binary=False, accuracy: 88.7%
-----------------------------------------
