In [1]:
bards_words =["The fool doth think he is wise,",
              "but the wise man knows himself to be a fool"]

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

vect=CountVectorizer()
vect.fit(bards_words)

In [3]:
# Fitting the CountVectorizer consists of the tokenization of the training data and building of the vocabulary, which we can access as the vocabulary_ attribute

print("Vocabulary size: {}".format(len(vect.vocabulary_)))
print("Vocabulary content: \n{}".format(vect.vocabulary_))

Vocabulary size: 13
Vocabulary content: 
{'the': 9, 'fool': 3, 'doth': 2, 'think': 10, 'he': 4, 'is': 6, 'wise': 12, 'but': 1, 'man': 8, 'knows': 7, 'himself': 5, 'to': 11, 'be': 0}


In [4]:
# To create the bag-of-words representation for the training data, we call the transform method

bag_of_words=vect.transform(bards_words)
print("bag_of_words {}".format(repr(bag_of_words)))

bag_of_words <Compressed Sparse Row sparse matrix of dtype 'int64'
	with 16 stored elements and shape (2, 13)>


In [5]:
# converting sparse ,matrix to a dense matrix

print("Dense representation of bag_of_words: \n{}".format(bag_of_words.toarray()))

Dense representation of bag_of_words: 
[[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


In [42]:
import pandas as pd

review_df=pd.read_csv("datasets/IMDB Dataset.csv")

X,y=review_df["review"],review_df["sentiment"]

In [43]:
X=[doc.replace("<br />", " ") for doc in X]

In [44]:
import numpy as np
print("samples per class: {}".format(y.value_counts()))

samples per class: sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [45]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)

In [46]:
vect=CountVectorizer().fit(X_train)

In [47]:
X_train=vect.transform(X_train)

In [48]:
print("X_train: \n{}".format(repr(X_train)))

X_train: 
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 5089993 stored elements and shape (37500, 90506)>


In [51]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

scores=cross_val_score(LogisticRegression(max_iter=1000),X_train,y_train,cv=5)

In [52]:
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

Mean cross-validation accuracy: 0.88


In [54]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

Best cross-validation score: 0.89
Best parameters:  {'C': 0.1}


In [55]:
X_test=vect.transform(X_test)
print("Test score: {:.2f}".format(grid.score(X_test, y_test)))

Test score: 0.89


## Stopwords

Another way that we can get rid of uninformative words is by discarding words that are too frequent to be informative. There are two main approaches: using a language-specific list of stopwords, or discarding words that appear too frequently. scikit-learn has a built-in list of English stopwords in the feature_extraction.text module

In [63]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

print("Number of stop words: {}".format(len(ENGLISH_STOP_WORDS)))
print("Every 10th stopword: \n{}".format(list(ENGLISH_STOP_WORDS)[::10]))

Number of stop words: 318
Every 10th stopword: 
['done', 'whereas', 'my', 'somewhere', 'again', 'detail', 'therein', 'few', 'many', 'until', 'above', 'whereafter', 'than', 'or', 'six', 'onto', 'do', 'back', 'those', 'full', 'eg', 'among', 'its', 'thereafter', 'thereby', 'give', 'seems', 'other', 'if', 'wherever', 'see', 'had']


In [64]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)

In [65]:
vect=CountVectorizer(min_df=5,stop_words="english").fit(X_train)
X_train=vect.transform(X_train)

In [66]:
print("X_train with stop words:\n{}".format(repr(X_train)))

X_train with stop words:
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 3205987 stored elements and shape (37500, 32365)>


In [69]:
grid=GridSearchCV(LogisticRegression(max_iter=1000),param_grid=param_grid,cv=5)
grid.fit(X_train,y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

Best cross-validation score: 0.89


In [70]:
X_test=vect.transform(X_test)

In [71]:
print("Testing accuracy: {:.2f}".format(grid.score(X_test,y_test)))

Testing accuracy: 0.89


In [72]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)

In [73]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

pipe=make_pipeline(TfidfVectorizer(min_df=5),LogisticRegression(max_iter=1000))

param_grid={'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}

grid=GridSearchCV(pipe,param_grid=param_grid,cv=5)
grid.fit(X_train,y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

Best cross-validation score: 0.90


In [77]:
print("bards_words:\n{}".format(bards_words))

bards_words:
['The fool doth think he is wise,', 'but the wise man knows himself to be a fool']


In [82]:
cv=CountVectorizer(ngram_range=(1,1)).fit(bards_words)
print("Vocabulary size: {}".format(len(cv.vocabulary_)))
print("Vocabulary: \n{}".format(cv.get_feature_names_out()))

Vocabulary size: 13
Vocabulary: 
['be' 'but' 'doth' 'fool' 'he' 'himself' 'is' 'knows' 'man' 'the' 'think'
 'to' 'wise']


In [83]:
cv=CountVectorizer(ngram_range=(2,2)).fit(bards_words)
print("Vocabulary size: {}".format(len(cv.vocabulary_)))
print("Vocab content: \n{}".format(cv.get_feature_names_out()))

Vocabulary size: 14
Vocab content: 
['be fool' 'but the' 'doth think' 'fool doth' 'he is' 'himself to'
 'is wise' 'knows himself' 'man knows' 'the fool' 'the wise' 'think he'
 'to be' 'wise man']


In [84]:
print("Transformed data (dense: \n{})".format(cv.transform(bards_words).toarray()))

Transformed data (dense: 
[[0 0 1 1 1 0 1 0 0 1 0 1 0 0]
 [1 1 0 0 0 1 0 1 1 0 1 0 1 1]])


In [86]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)

In [87]:
pipe=make_pipeline(TfidfVectorizer(min_df=5),LogisticRegression(max_iter=1000))

param_grid={"logisticregression__C":[0.001, 0.01, 0.1, 1, 10, 100],
            "tfidfvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)]}

grid=GridSearchCV(pipe,param_grid=param_grid,cv=5)
grid.fit(X_train,y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters:\n{}".format(grid.best_params_))

Best cross-validation score: 0.91
Best parameters:
{'logisticregression__C': 10, 'tfidfvectorizer__ngram_range': (1, 3)}
