In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
import mglearn

  _nan_object_mask = _nan_object_array != _nan_object_array


In [3]:
from sklearn.datasets import load_files
rev_train = load_files("txt_data/train")
text_train, y_train = rev_train.data, rev_train.target

In [4]:
text_train = [doc.replace(b"<br />", b" ") for doc in text_train]

In [7]:
np.bincount(y_test)

array([12500, 12500])

In [6]:
rev_test = load_files("txt_data/test")
text_test, y_test = rev_test.data, rev_test.target

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
w = ["The fool doth think he is wise,","but the wise man knows himself to be a fool"]
vect.fit(w)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [9]:
vect.vocabulary_

{'be': 0,
 'but': 1,
 'doth': 2,
 'fool': 3,
 'he': 4,
 'himself': 5,
 'is': 6,
 'knows': 7,
 'man': 8,
 'the': 9,
 'think': 10,
 'to': 11,
 'wise': 12}

In [12]:
bow = vect.transform(w)
print(bow)
print(bow.toarray())

  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 6)	1
  (0, 9)	1
  (0, 10)	1
  (0, 12)	1
  (1, 0)	1
  (1, 1)	1
  (1, 3)	1
  (1, 5)	1
  (1, 7)	1
  (1, 8)	1
  (1, 9)	1
  (1, 11)	1
  (1, 12)	1
[[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


In [13]:
vect = CountVectorizer().fit(text_train)
X_train = vect.transform(text_train)

In [14]:
X_train.shape

(25000, 74849)

In [18]:
fnames = vect.get_feature_names()
print(fnames[-10:])

['était', 'état', 'étc', 'évery', 'êxtase', 'ís', 'ísnt', 'østbye', 'über', 'üvegtigris']


In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
np.mean(scores)



0.88127999999999995

In [36]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C':[0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_score_)



0.88148


In [37]:
print(grid.best_params_)

{'C': 0.1}


In [38]:
X_test = vect.transform(text_test)
grid.score(X_test, y_test)

0.87292000000000003

In [23]:
vect = CountVectorizer(min_df=5).fit(text_train)
X_train = vect.transform(text_train)

In [28]:
fnames = vect.get_feature_names()
print(fnames[:10])
print(len(fnames))

['00', '000', '007', '00s', '01', '02', '03', '04', '05', '06']
27271


In [33]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
list(ENGLISH_STOP_WORDS)[::20]

['please',
 'for',
 'several',
 'call',
 'cant',
 'or',
 'too',
 're',
 'you',
 'anyone',
 'nowhere',
 'us',
 'seeming',
 'have',
 'were',
 'hundred']

In [34]:
vect = CountVectorizer(min_df=5, stop_words='english').fit(text_test)
X_train = vect.transform(text_train)

In [35]:
X_train.shape

(25000, 26446)

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(TfidfVectorizer(min_df=5, norm=None), LogisticRegression())
param_grid = {'logisticregression__C':[0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(text_train, y_train)
print(grid.best_score_)



0.89392


In [45]:
vectorizer = grid.best_estimator_.named_steps['tfidfvectorizer']
X_train = vectorizer.transform(text_train)
X_train.shape

(25000, 27271)

In [47]:
max_value = X_train.max(axis=0).toarray().ravel()
print(max_value)

[ 20.15985557  34.37105041  16.70816476 ...,  25.46284132  28.0047349
   9.33491163]


In [48]:
sorted_by_tfidf = max_value.argsort()
feature_names = np.array(vectorizer.get_feature_names())
print(feature_names[sorted_by_tfidf[:20]])


['poignant' 'disagree' 'instantly' 'importantly' 'lacked' 'occurred'
 'currently' 'altogether' 'nearby' 'undoubtedly' 'directs' 'fond' 'stinker'
 'avoided' 'emphasis' 'commented' 'disappoint' 'realizing' 'downhill'
 'inane']


In [49]:
print(feature_names[sorted_by_tfidf[-20:]])

['coop' 'homer' 'dillinger' 'hackenstein' 'gadget' 'taker' 'macarthur'
 'vargas' 'jesse' 'basket' 'dominick' 'the' 'victor' 'bridget' 'victoria'
 'khouri' 'zizek' 'rob' 'timon' 'titanic']


In [51]:
sorted_by_idf = np.argsort(vectorizer.idf_)
print(feature_names[sorted_by_idf[-20:]])
print(feature_names[sorted_by_idf[:20]])

['tenement' 'tenements' 'tenets' 'preclude' 'gravelly' 'grandest' 'gravely'
 'greendale' 'tenure' 'calligraphy' 'tepper' 'prc' 'greenstreet'
 'callaghan' 'caligula' 'pratfall' 'prankster' 'prakash' 'grayce' 'émigré']
['the' 'and' 'of' 'to' 'this' 'is' 'it' 'in' 'that' 'but' 'for' 'with'
 'was' 'as' 'on' 'movie' 'not' 'have' 'one' 'be']


In [61]:
cv = CountVectorizer(ngram_range=(2,10)).fit(w)
print(cv.vocabulary_)

{'the fool': 33, 'fool doth': 13, 'doth think': 9, 'think he': 46, 'he is': 18, 'is wise': 23, 'the fool doth': 34, 'fool doth think': 14, 'doth think he': 10, 'think he is': 47, 'he is wise': 19, 'the fool doth think': 35, 'fool doth think he': 15, 'doth think he is': 11, 'think he is wise': 48, 'the fool doth think he': 36, 'fool doth think he is': 16, 'doth think he is wise': 12, 'the fool doth think he is': 37, 'fool doth think he is wise': 17, 'the fool doth think he is wise': 38, 'but the': 1, 'the wise': 39, 'wise man': 51, 'man knows': 28, 'knows himself': 24, 'himself to': 20, 'to be': 49, 'be fool': 0, 'but the wise': 2, 'the wise man': 40, 'wise man knows': 52, 'man knows himself': 29, 'knows himself to': 25, 'himself to be': 21, 'to be fool': 50, 'but the wise man': 3, 'the wise man knows': 41, 'wise man knows himself': 53, 'man knows himself to': 30, 'knows himself to be': 26, 'himself to be fool': 22, 'but the wise man knows': 4, 'the wise man knows himself': 42, 'wise ma

In [62]:
vect = CountVectorizer(max_features=10000, max_df=.15)
X = vect.fit_transform(text_train)
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_topics=10, learning_method="batch", max_iter=25)
document_topics = lda.fit_transform(X)



In [66]:
lda.components_.shape
document_topics.shape

(25000, 10)

In [65]:
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())
mglearn.tools.print_topics(topics=range(10), feature_names=feature_names, sorting=sorting, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
action        horror        music         comedy        war           
effects       director      musical       role          world         
zombie        quite         song          cast          american      
special       interesting   songs         performance   documentary   
fight         nothing       dance         actors        us            
evil          script        jane          funny         history       
zombies       pretty        dancing       actor         years         
monster       though        singing       excellent     our           
fi            seems         rock          wonderful     black         
sci           however       kelly         always        political     


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
poli

In [68]:
m = np.argsort(document_topics[:, 2])[::-1]
for i in m[:10]:
    print(text_train[i] + b"\n")

b'Fred Astaire and Ginger Rogers, Hollywood\'s premiere dance team, were usually dressed to the nines and gliding through elaborately exaggerated Art Deco sets in the 1930\'s. However, they go a bit more downscale for this 1936 outing, the fifth of their ten musicals together. This time, Astaire foregoes his top hat, white tie and tails to become a bubblegum-chewing sailor named "Bake" Baker; and Rogers plays dance hall entertainer Sherry Martin, who was Bake\'s partner - dancing and otherwise - before he enlisted. Consequently, unlike the mistaken identity ploys and romantic hesitancies prevalent in most of their previous pairings, they are already a couple from the film\'s outset.  Directed by Mark Sandrich (who guided five of their pairings), the film bears a narrative similarity to 1935\'s "Roberta" in which they are but one of two couples featured in the storyline. In fact, Randolph Scott plays the other male lead in both films, this time as Bake\'s womanizing crewmate, "Bilge" Sm

In [None]:
fig, ax = plt.subplots(1,2, figsize=(10, 10))
topic_names = ["{:>2}".format(i) + " ".join(words) for i, words in enumerate(feature_names[sorting[:, :2]])]
for col in [0,1]:
    start = col * 50
    end = (col+1) * 50
    ax[col].barh(np.arange(50), np.sum(document_topics