# Sentiment Analysis

# load data

In [1]:
from sklearn.datasets import load_files

reviews_train = load_files("data/aclImdb/train/")

text_train, y_train = reviews_train.data, reviews_train.target

print("tipe data text_train: {}".format(type(text_train)))
print("panjang data text_train: {}".format(len(text_train)))
print("data pertama text_train: {}".format(text_train[1]))

tipe data text_train: <class 'list'>
panjang data text_train: 25000
data pertama text_train: b'Words can\'t describe how bad this movie is. I can\'t explain it by writing only. You have too see it for yourself to get at grip of how horrible a movie really can be. Not that I recommend you to do that. There are so many clich\xc3\xa9s, mistakes (and all other negative things you can imagine) here that will just make you cry. To start with the technical first, there are a LOT of mistakes regarding the airplane. I won\'t list them here, but just mention the coloring of the plane. They didn\'t even manage to show an airliner in the colors of a fictional airline, but instead used a 747 painted in the original Boeing livery. Very bad. The plot is stupid and has been done many times before, only much, much better. There are so many ridiculous moments here that i lost count of it really early. Also, I was on the bad guys\' side all the time in the movie, because the good guys were so stupid. "Ex

In [2]:
import numpy as np

print("jumlah sample tiap kelas untuk data training: {}".format(np.bincount(y_train)))

jumlah sample tiap kelas untuk data training: [12500 12500]


In [3]:
reviews_test = load_files("data/aclImdb/test/")

text_test, y_test = reviews_test.data, reviews_test.target

print("tipe data text_test: {}".format(type(text_test)))
print("panjang data text_test: {}".format(len(text_test)))
print("jumlah sample tiap kelas untuk data testing: {}".format(np.bincount(y_test)))
print("data pertama text_test: {}".format(text_test[1]))

tipe data text_test: <class 'list'>
panjang data text_test: 25000
jumlah sample tiap kelas untuk data testing: [12500 12500]
data pertama text_test: b'I don\'t know how this movie has received so many positive comments. One can call it "artistic" and "beautifully filmed", but those things don\'t make up for the empty plot that was filled with sexual innuendos. I wish I had not wasted my time to watch this movie. Rather than being biographical, it was a poor excuse for promoting strange and lewd behavior. It was just another Hollywood attempt to convince us that that kind of life is normal and OK. From the very beginning I asked my self what was the point of this movie,and I continued watching, hoping that it would change and was quite disappointed that it continued in the same vein. I am so glad I did not spend the money to see this in a theater!'


## Preprocessing

In [5]:
import re

REPLACE_TANPA_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_DENGAN_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_TANPA_SPACE.sub("", line.decode('utf-8').lower()) for line in reviews]
    reviews = [REPLACE_DENGAN_SPACE.sub(" ", line) for line in reviews]
    
    return reviews

text_train = preprocess_reviews(text_train)
text_test = preprocess_reviews(text_test)

In [6]:
print(text_train[0])

zero day leads you to think even re think why two boys young men would do what they did   commit mutual suicide via slaughtering their classmates it captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own mutual world via coupled destruction it is not a perfect movie but given what money time the filmmaker and actors had   it is a remarkable product in terms of explaining the motives and actions of the two young suicide murderers it is better than elephant   in terms of being a film that gets under our rationalistic skin it is a far far better film than almost anything you are likely to see  flawed but honest with a terrible honesty


### Membentuk Bag-of-Words

In [7]:
baw_words = ["Saya menyukai bermain sepakbola di lapangan sepakbola",
             "Budi bermain sepakbola bersama teman-teman di lapangan"]

In [10]:
# tokenisasi
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
vect.fit(baw_words)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [12]:
print(len(vect.vocabulary_))
print(vect.vocabulary_)

9
{'saya': 6, 'menyukai': 5, 'bermain': 0, 'sepakbola': 7, 'di': 3, 'lapangan': 4, 'budi': 2, 'bersama': 1, 'teman': 8}


In [14]:
bag_of_words = vect.transform(baw_words)
print(repr(bag_of_words))
print(bag_of_words)

<2x9 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>
  (0, 0)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1
  (0, 7)	2
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (1, 3)	1
  (1, 4)	1
  (1, 7)	1
  (1, 8)	2


In [15]:
# merubah menjadi array
print(bag_of_words.toarray())

[[1 0 0 1 1 1 1 2 0]
 [1 1 1 1 1 0 0 1 2]]


## BoW data Imdb

In [16]:
vect = CountVectorizer().fit(text_train)
X_train = vect.transform(text_train)
print(repr(X_train))

<25000x92715 sparse matrix of type '<class 'numpy.int64'>'
	with 3461902 stored elements in Compressed Sparse Row format>


In [17]:
feature_names = vect.get_feature_names()
print(len(feature_names))
print(feature_names[:20])

92715
['00', '000', '0000000000001', '000001', '0001', '00015', '001', '002', '003830', '006', '007', '0079', '0080', '0083', '00s', '01', '010', '0130', '02', '020410html']


In [18]:
# uji klasifikasi dengan logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)

print(grid.best_score_)
print(grid.best_params_)

0.88688
{'C': 0.1}


In [19]:
X_test = vect.transform(text_test)
print(grid.score(X_test, y_test))

0.87952


In [20]:
# mengatasi data/fitur yang tidak berguna
vect = CountVectorizer(min_df=5).fit(text_train)
X_train = vect.transform(text_train)

In [21]:
print(repr(X_train))

<25000x27994 sparse matrix of type '<class 'numpy.int64'>'
	with 3364141 stored elements in Compressed Sparse Row format>


In [23]:
feature_names = vect.get_feature_names()
print(len(feature_names))
print(feature_names[:20])
print(feature_names[20000:20011])

27994
['00', '007', '00s', '01', '02', '05', '06', '07', '08', '09', '10', '100', '1000', '10000', '100000', '1000000', '100th', '101', '102', '103']
['rasuk', 'rat', 'ratchet', 'rate', 'rated', 'rates', 'rathbone', 'rather', 'rating', 'ratings', 'ratio']


In [24]:
# retrain
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)

print(grid.best_score_)

0.88648


## Menghapus Stopwords

In [30]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

print(len(ENGLISH_STOP_WORDS))
print(list(ENGLISH_STOP_WORDS)[:10])

318
['though', 'your', 'cry', 'via', 'hereupon', 'ltd', 'nothing', 'whereupon', 'five', 'across']


In [31]:
# tambah stop_words pada CountVectorizer
vect = CountVectorizer(min_df=5, stop_words="english").fit(text_train)
X_train = vect.transform(text_train)

print(repr(X_train))

<25000x27688 sparse matrix of type '<class 'numpy.int64'>'
	with 2161888 stored elements in Compressed Sparse Row format>


In [32]:
# gunakan GridSearchCV
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)

print(grid.best_score_)

0.88152


## Rescaling Data dengan tf-idf

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(TfidfVectorizer(min_df=5, norm=None), LogisticRegression())
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(text_train, y_train)

print(grid.best_score_)

0.8938


In [39]:
vectorizer = grid.best_estimator_.named_steps['tfidfvectorizer']

# transform data traininig
X_train = vectorizer.transform(text_train)

# cari nilai maximum dari fitur
max_value = X_train.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()

# nama fitur
feature_names = np.array(vectorizer.get_feature_names())

print("Fitur dengan tf-idf terkecil:")
print(feature_names[sorted_by_tfidf[:20]])

print("Fitur dengan tf-idf terbesar:")
print(feature_names[sorted_by_tfidf[-20:]])

Fitur dengan tf-idf terkecil:
['poignant' 'disagree' 'instantly' 'importantly' 'lacked' 'currently'
 'occurred' 'altogether' 'nearby' 'undoubtedly' 'fond' 'directs' 'avoided'
 'emphasis' 'commented' 'stinker' 'disappoint' 'realizing' 'downhill'
 'inane']
Fitur dengan tf-idf terbesar:
['kornbluth' 'europa' 'ripley' 'roy' 'blob' 'gadget' 'dillinger'
 'hackenstein' 'basket' 'homer' 'dominick' 'bridget' 'taker' 'vargas'
 'jesse' 'victor' 'timon' 'the' 'rob' 'titanic']


In [40]:
sorted_by_idf = np.argsort(vectorizer.idf_)
print("Fitur dengan idf terkecil:")
print(feature_names[sorted_by_idf[:100]])

Fitur dengan idf terkecil:
['the' 'and' 'of' 'to' 'this' 'is' 'in' 'it' 'that' 'for' 'but' 'with'
 'was' 'as' 'on' 'movie' 'not' 'have' 'be' 'one' 'are' 'film' 'all' 'at'
 'you' 'its' 'an' 'by' 'from' 'so' 'like' 'who' 'his' 'if' 'out' 'just'
 'about' 'they' 'or' 'has' 'he' 'there' 'some' 'good' 'what' 'more' 'when'
 'time' 'very' 'up' 'even' 'only' 'no' 'my' 'see' 'would' 'can' 'really'
 'story' 'which' 'had' 'well' 'me' 'than' 'much' 'were' 'their' 'get'
 'other' 'been' 'do' 'most' 'her' 'also' 'into' 'first' 'made' 'dont'
 'great' 'how' 'will' 'because' 'make' 'people' 'way' 'bad' 'could' 'any'
 'after' 'too' 'then' 'movies' 'them' 'we' 'watch' 'think' 'acting' 'seen'
 'characters' 'she']


# Final Predict

In [43]:
# score predict
grid.score(text_test, y_test)

0.88508