In [1]:
import os
import sys
import tarfile
import time
import urllib.request


source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
target = 'aclImdb_v1.tar.gz'


def reporthook(count, block_size, total_size):
    global start_time
    if count == 0:
        start_time = time.time()
        return
    duration = time.time() - start_time
    progress_size = int(count * block_size)
    speed = progress_size / (1024.**2 * duration)
    percent = count * block_size * 100. / total_size

    sys.stdout.write("\r%d%% | %d MB | %.2f MB/s | %d sec elapsed" %
                    (percent, progress_size / (1024.**2), speed, duration))
    sys.stdout.flush()


# if not os.path.isdir('aclImdb') and not os.path.isfile('aclImdb_v1.tar.gz'):
urllib.request.urlretrieve(source, target)

('aclImdb_v1.tar.gz', <http.client.HTTPMessage at 0x2a57409a8e0>)

In [2]:
with tarfile.open(target, 'r:gz') as tar:
        tar.extractall()

In [3]:
!pip install pyprind

Collecting pyprind
  Downloading PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB)
Installing collected packages: pyprind
Successfully installed pyprind-2.11.3


In [5]:
import pyprind
import pandas as pd
import os

# `basepath`를 압축 해제된 영화 리뷰 데이터셋이 있는
# 디렉토리로 바꾸세요

basepath = 'aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 
                      'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], 
                           ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

  df = df.append([[txt, labels[l]]],
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:07:11


In [6]:
import numpy as np

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))

In [7]:
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [8]:
import pandas as pd

df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [10]:
df.shape

(50000, 2)

In [23]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining, the weather is sweet, and one and one is two',
        'zzz zz'
])
new = count.fit_transform(docs)

In [25]:
new.toarray()

array([[0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0],
       [2, 3, 2, 1, 1, 1, 2, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]], dtype=int64)

In [31]:
count = CountVectorizer()
X = df.iloc[:,0]
print(X.head())
y = df.iloc[:,1]
print(y.head())
count.fit(X)
X_vec = count.transform(X)

0    In 1974, the teenager Martha Moxley (Maggie Gr...
1    OK... so... I really like Kris Kristofferson a...
2    ***SPOILER*** Do not read this, if you think a...
3    hi for all the people who have seen this wonde...
4    I recently bought the DVD, forgetting just how...
Name: review, dtype: object
0    1
1    0
2    0
3    1
4    0
Name: sentiment, dtype: int64


In [21]:
col = np.unique(np.array(" ".join(docs).split()))toarray

In [22]:
pd.Series(col)

0          The
1          and
2           is
3          one
4      shining
5     shining,
6          sun
7        sweet
8       sweet,
9          the
10         two
11     weather
dtype: object

In [32]:
word_ind = count.vocabulary_

In [33]:
ind_word = { v:k for k,v in word_ind.items()}

In [34]:
ind_word[0]

'00'

In [35]:
X_vec.toarray()[:,0]

MemoryError: Unable to allocate 38.0 GiB for an array with shape (50000, 101895) and data type int64

In [36]:
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [37]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [38]:
preprocessor(df.loc[0, 'review'][-50:])

'is seven title brazil not available'

In [39]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [40]:
df['review'] = df['review'].apply(preprocessor)

In [41]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    return text.split()


def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [57]:
df.review

0        in 1974 the teenager martha moxley maggie grac...
1        ok so i really like kris kristofferson and his...
2         spoiler do not read this if you think about w...
3        hi for all the people who have seen this wonde...
4        i recently bought the dvd forgetting just how ...
                               ...                        
49995    ok lets start with the best the building altho...
49996    the british heritage film industry is out of c...
49997    i don t even know where to begin on this one i...
49998    richard tyler is a little boy who is scared of...
49999    i waited long to watch this movie also because...
Name: review, Length: 50000, dtype: object

In [58]:
from collections import Counter

In [62]:
word_list = []
for words in df.review:
    word_list.extend(words.split())

In [63]:
word_cnt = Counter(word_list)

In [65]:
used_word = word_cnt.most_common()[:10000]

In [66]:
used_word = [i[0] for i in used_word]

In [69]:
df.head(1)

Unnamed: 0,review,sentiment
0,in 1974 the teenager martha moxley maggie grac...,1


In [72]:
word_index = {v: i+1 for i, v in enumerate(used_word)}

In [74]:
index_word = {v:k for k,v in word_index.items()}

In [80]:
w =[]
for i in used_word:
    if i in df.review[0].split(): 
        w.append(i)

In [82]:
len(w)

129

In [90]:
len(df.review[0].split())

240

ValueError: could not convert string to float: 'dfaf kkk kk '

In [92]:
review = []
for idx, value in enumerate(df.review):
    w = []
    for word in used_word:
        if word in value.split(): 
            w.append(word)
        review.append(w)

NameError: name 'pd_Series' is not defined

In [96]:
from sklearn.feature_extraction.text import CountVectorizer
countvector = CountVectorizer()
countvector.fit(imdb.review)
X = countvector.transform(imdb.review).toarray()

NameError: name 'imdb' is not defined

In [1]:
from urllib.requestuest import urlopen

ModuleNotFoundError: No module named 'urllib.requestuest'