# Comparison of some Bag-of-words representations

## Show me the code

In [143]:
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

import pandas as pd
import sklearn
import nltk

In [20]:
from sklearn.datasets import fetch_20newsgroups

In [21]:
CATEGORIES = ["comp.graphics", "sci.space", "rec.sport.baseball"]

train_data = fetch_20newsgroups(subset='train', categories=CATEGORIES)
test_data = fetch_20newsgroups(subset='test', categories=CATEGORIES)

In [22]:
train_data.data[:2]

["From: jk87377@lehtori.cc.tut.fi (Kouhia Juhana)\nSubject: Re: More gray levels out of the screen\nOrganization: Tampere University of Technology\nLines: 21\nDistribution: inet\nNNTP-Posting-Host: cc.tut.fi\n\nIn article <1993Apr6.011605.909@cis.uab.edu> sloan@cis.uab.edu\n(Kenneth Sloan) writes:\n>\n>Why didn't you create 8 grey-level images, and display them for\n>1,2,4,8,16,32,64,128... time slices?\n\nBy '8 grey level images' you mean 8 items of 1bit images?\nIt does work(!), but it doesn't work if you have more than 1bit\nin your screen and if the screen intensity is non-linear.\n\nWith 2 bit per pixel; there could be 1*c_1 + 4*c_2 timing,\nthis gives 16 levels, but they are linear if screen intensity is\nlinear.\nWith 1*c_1 + 2*c_2 it works, but we have to find the best\ncompinations -- there's 10 levels, but 16 choises; best 10 must be\nchosen. Different compinations for the same level, varies a bit, but\nthe levels keeps their order.\n\nReaders should verify what I wrote... :-

In [23]:
train_data.target[:2]

array([0, 0])

In [24]:
# Checking if the nltk dependecies are already satisfied. If not, downloads the dependencies
nltk.download('rslp')
nltk.download('stopwords')

def text_tokenizer(text):
    stemmer = nltk.stem.RSLPStemmer()
    lower_case = text.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(lower_case)
    return [stemmer.stem(t) for t in tokens]

[nltk_data] Downloading package rslp to
[nltk_data]     /home/wespatrocinio/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/wespatrocinio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [120]:
counter = CountVectorizer(
    encoding='utf-8',
    decode_error='replace',
    strip_accents='unicode',
    analyzer='word',
    binary=True,
    stop_words=nltk.corpus.stopwords.words("english"),
    tokenizer=text_tokenizer
)

In [121]:
tfidf = TfidfVectorizer(
    encoding='utf-8',
    decode_error='replace',
    strip_accents='unicode',
    analyzer='word',
    binary=False,
    stop_words=nltk.corpus.stopwords.words("english"),
    tokenizer=text_tokenizer
)

In [122]:
hashing = HashingVectorizer(
    encoding='utf-8',
    decode_error='replace',
    strip_accents='unicode',
    analyzer='word',
    binary=False,
    stop_words=nltk.corpus.stopwords.words("english"),
    tokenizer=text_tokenizer
)

In [153]:
tf = TfidfVectorizer(
    max_df=0.5,
    min_df=2,
    stop_words='english',
    use_idf=False
)

In [123]:
counter.fit(train_data.data)

CountVectorizer(analyzer='word', binary=True, decode_error='replace',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',... 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"],
        strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function text_tokenizer at 0x7f2086f3d730>,
        vocabulary=None)

In [124]:
c_train = pd.DataFrame(counter.transform(train_data.data).toarray())

In [125]:
c_test = pd.DataFrame(counter.transform(test_data.data).toarray())

In [126]:
c_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25198,25199,25200,25201,25202,25203,25204,25205,25206,25207
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
tfidf.fit(train_data.data)

TfidfVectorizer(analyzer='word', binary=False, decode_error='replace',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',... 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"],
        strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function text_tokenizer at 0x7f2086f3d730>,
        use_idf=True, vocabulary=None)

In [47]:
t_train = pd.DataFrame(tfidf.transform(train_data.data).toarray())

In [48]:
t_test = pd.DataFrame(tfidf.transform(test_data.data).toarray())

In [34]:
t_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25198,25199,25200,25201,25202,25203,25204,25205,25206,25207
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [154]:
tf.fit(train_data.data)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=False,
        vocabulary=None)

In [155]:
tf_train = pd.DataFrame(tf.transform(train_data.data).toarray())

In [156]:
tf_test = pd.DataFrame(tf.transform(test_data.data).toarray())

In [157]:
tf_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14935,14936,14937,14938,14939,14940,14941,14942,14943,14944
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [161]:
tf_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14935,14936,14937,14938,14939,14940,14941,14942,14943,14944
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [135]:
c_dt = DecisionTreeClassifier()

In [136]:
c_dt = c_dt.fit(c_train, train_data.target)

In [137]:
c_test_predict = c_dt.predict(c_test)

In [138]:
print(metrics.classification_report(test_data.target, c_test_predict))

             precision    recall  f1-score   support

          0       0.81      0.85      0.83       389
          1       0.84      0.85      0.85       397
          2       0.85      0.80      0.82       394

avg / total       0.83      0.83      0.83      1180



In [139]:
t_dt = DecisionTreeClassifier(max_depth=9)

In [140]:
t_dt = t_dt.fit(t_train, train_data.target)

In [141]:
t_test_predict = t_dt.predict(t_test)

In [142]:
print(metrics.classification_report(test_data.target, t_test_predict))

             precision    recall  f1-score   support

          0       0.74      0.84      0.79       389
          1       0.93      0.69      0.79       397
          2       0.74      0.82      0.78       394

avg / total       0.80      0.79      0.79      1180



In [163]:
tf_dt = DecisionTreeClassifier(max_depth=9)

In [164]:
tf_dt = tf_dt.fit(tf_train, train_data.target)

In [165]:
tf_test_predict = tf_dt.predict(tf_test)

In [166]:
print(metrics.classification_report(test_data.target, tf_test_predict))

             precision    recall  f1-score   support

          0       0.57      0.94      0.71       389
          1       0.94      0.54      0.68       397
          2       0.89      0.70      0.78       394

avg / total       0.80      0.72      0.72      1180



In [69]:
from sklearn.linear_model import LogisticRegression

In [86]:
c_lr = LogisticRegression(
    solver='lbfgs',
    multi_class="multinomial",
    C=2,
    penalty='l2',
    fit_intercept=False,
    max_iter=3,
    random_state=42,
)

In [87]:
c_lr.fit(c_train, train_data.target)

LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=3, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [88]:
c_lr_pred = c_lr.predict(c_test)

In [89]:
print(metrics.classification_report(test_data.target, c_lr_pred))

             precision    recall  f1-score   support

          0       0.92      0.93      0.92       389
          1       0.94      0.97      0.95       397
          2       0.95      0.92      0.94       394

avg / total       0.94      0.94      0.94      1180



In [82]:
t_lr = LogisticRegression(
    solver='lbfgs',
    multi_class="multinomial",
    C=2,
    penalty='l2',
    fit_intercept=False,
    max_iter=3,
    random_state=42,
)

In [83]:
t_lr.fit(t_train, train_data.target)

LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=3, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [84]:
t_lr_pred = t_lr.predict(t_test)

In [82]:
t_lr = LogisticRegression(
    solver='lbfgs',
    multi_class="multinomial",
    C=2,
    penalty='l2',
    fit_intercept=False,
    max_iter=3,
    random_state=42,
)

In [83]:
t_lr.fit(t_train, train_data.target)

LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=3, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [84]:
t_lr_pred = t_lr.predict(t_test)

In [85]:
print(metrics.classification_report(test_data.target, t_lr_pred))

             precision    recall  f1-score   support

          0       0.94      0.96      0.95       389
          1       0.98      0.99      0.98       397
          2       0.97      0.95      0.96       394

avg / total       0.97      0.97      0.97      1180



In [85]:
print(metrics.classification_report(test_data.target, t_lr_pred))

             precision    recall  f1-score   support

          0       0.94      0.96      0.95       389
          1       0.98      0.99      0.98       397
          2       0.97      0.95      0.96       394

avg / total       0.97      0.97      0.97      1180

