# Load Dataset

In [2]:
from sklearn.datasets import load_files
import numpy as np

# data train
reviews_train = load_files("data/aclImdb/train/")
text_train, y_train = reviews_train.data, reviews_train.target

print("tipe data text_train: {}".format(type(text_train)))
print("panjang data text_train: {}".format(len(text_train)))
print("data pertama text_train: {}".format(text_train[1]))

# data test
reviews_test = load_files("data/aclImdb/test/")
text_test, y_test = reviews_test.data, reviews_test.target

print("tipe data text_test: {}".format(type(text_test)))
print("panjang data text_test: {}".format(len(text_test)))
print("jumlah sample tiap kelas untuk data testing: {}".format(np.bincount(y_test)))
print("data pertama text_test: {}".format(text_test[1]))

tipe data text_train: <class 'list'>
panjang data text_train: 25000
data pertama text_train: b'Words can\'t describe how bad this movie is. I can\'t explain it by writing only. You have too see it for yourself to get at grip of how horrible a movie really can be. Not that I recommend you to do that. There are so many clich\xc3\xa9s, mistakes (and all other negative things you can imagine) here that will just make you cry. To start with the technical first, there are a LOT of mistakes regarding the airplane. I won\'t list them here, but just mention the coloring of the plane. They didn\'t even manage to show an airliner in the colors of a fictional airline, but instead used a 747 painted in the original Boeing livery. Very bad. The plot is stupid and has been done many times before, only much, much better. There are so many ridiculous moments here that i lost count of it really early. Also, I was on the bad guys\' side all the time in the movie, because the good guys were so stupid. "Ex

# Preprocessing

In [3]:
import re

REPLACE_TANPA_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_DENGAN_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_TANPA_SPACE.sub("", line.decode('utf-8').lower()) for line in reviews]
    reviews = [REPLACE_DENGAN_SPACE.sub(" ", line) for line in reviews]
    
    return reviews

text_train = preprocess_reviews(text_train)
text_test = preprocess_reviews(text_test)

In [4]:
# transform ke bag of words
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(max_features=1000, max_df=15)
X = vect.fit_transform(text_train)

# Latent Diricihlet Allocation

In [6]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=10, learning_method="batch",
                                max_iter=25, random_state=30)

document_topics = lda.fit_transform(X)

In [7]:
lda.components_.shape

(10, 1000)

In [8]:
# sorting
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

# get feature names
feature_names = np.array(vect.get_feature_names())

In [10]:
# plot
import mglearn

In [11]:
mglearn.tools.print_topics(topics=range(10), feature_names=feature_names,
                           sorting=sorting, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
chiba         creasy        gamera        shin          myrtle        
cena          sarne         dillinger     gannon        jigsaw        
hanzo         deathstalker  scarecrows    paperhouse    kornbluth     
pia           pita          mabel         vivah         darius        
abu           victorias     dahmer        mj            aztec         
aweigh        batwoman      harilal       tremors       dominick      
taker         lamas         della         chairman      carface       
benoit        cortez        floriane      baseketball   caruso        
ahmad         wai           durbin        nell          delia         
munchies      waqt          morgana       ae            barman        


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
maca

In [14]:
# 100 topics
lda100 = LatentDirichletAllocation(n_components=100, learning_method="batch",
                                   max_iter=25, random_state=30)
document_topics100 = lda100.fit_transform(X)

In [18]:
topics = np.array([10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95])

sorting = np.argsort(lda100.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())

mglearn.tools.print_topics(topics=topics, feature_names=feature_names,
                           sorting=sorting, topics_per_chunk=7, n_words=20)

topic 10      topic 15      topic 20      topic 25      topic 30      topic 35      topic 40      
--------      --------      --------      --------      --------      --------      --------      
floriane      kali          taker         flamenco      dahmer        jed           kornbluth     
renoir        heaton        benoit        fencing       cortez        doolittle     baseketball   
orca          reloaded      booker        culp          beta          chikatilo     mordrid       
whales        jackies       marjorie      jud           delon         wargames      gandalf       
iago          ming          cena          moby          kattan        lamb          zucker        
oberon        gamers        brock         saura         prague        barrett       bikers        
desdemona     mona          bubba         calf          stifler       mammoth       pitcher       
iphigenia     livingston    colonies      swashbuckling silverstone   aweigh        aragorn       
ra        

In [28]:
topic90 = np.argsort(document_topics100[:, 90])[::-1]

for i in topic90[:2]:
    print(".".join(text_train[i].split(".")[:2]) + ".\n")

i just saw behind bedroom doors and this was the first softcore flick with a solid story behind it that ive seen in a while we begin with two neighborly couples  vivian and james fenway julia kruis and eric carrington and lillian and gabe harris nicole sheridan and chris gustafson vivian appears to be a housewife james is a lawyer running for district attorney lillian works in real estate and gabe is a successful plastic surgeon got all that now lets get into it enter abby played beautifully by porn star chelsea blue shes renting the house across the street from the fenways and lives all by herself at the beginning of the movie james looks out his window and sees abby engaged in playtime with her girlfriend gigi played by prolific pornstress monique alexander and secretly begins to wonder what it would be like to be with her the next day abby gets acquainted with all four of them and appears to be a nice woman who just happens to be living an alternative lifestyle she makes a pass at v