In [7]:
import tarfile

In [10]:
!tar -zxf aclImdb_v1.tar

In [8]:
cd Downloads/

/Users/hasegawatakashikana/Downloads


In [14]:
import pyprind

In [13]:
!pip install PyPrind

Collecting PyPrind
  Downloading https://files.pythonhosted.org/packages/1e/30/e76fb0c45da8aef49ea8d2a90d4e7a6877b45894c25f12fb961f009a891e/PyPrind-2.11.2-py3-none-any.whl
Installing collected packages: PyPrind
Successfully installed PyPrind-2.11.2
[33mYou are using pip version 19.0.3, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [15]:
!pip install --upgrade pip

Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/f9/fb/863012b13912709c13cf5cfdbfb304fa6c727659d6290438e1a88df9d848/pip-19.1-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 510kB/s ta 0:00:01
[?25hInstalling collected packages: pip
  Found existing installation: pip 19.0.3
    Uninstalling pip-19.0.3:
      Successfully uninstalled pip-19.0.3
Successfully installed pip-19.1


In [16]:
import pandas as pd
import os

In [100]:
#"basepath"の値を展開した映画レビューデータセットのディレクトリに置き換える
basepath = "aclImdb"
labels = {"pos":1,"neg":0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ("test","train"):
    for l in ("pos","neg"):
        path = os.path.join(basepath,s,l)
        for file in os.listdir(path):
            with open(os.path.join(path,file),"r",encoding="utf-8") as infile:
                txt = infile.read()
            df = df.append([[txt,labels[l]]],ignore_index =True)
            pbar.update()
df.columns =["review","sentiment"]

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:05:44


In [101]:
import numpy as np
np.random.seed(0)
#行の順番をシャッフル
df = df.reindex(np.random.permutation(df.index))
df.to_csv("movie_data.csv",index = False,encoding="utf-8")

In [102]:
df  = pd.read_csv("movie_data.csv",encoding="utf-8")
df.head(3)

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0


In [103]:
from sklearn.feature_extraction.text import CountVectorizer

In [104]:
count = CountVectorizer()
docs = np.array([
    "The sun is shining",
    "The weather is sweet",
    "The sun is shining, the weather is sweet, and one and one is two"])
bag = count.fit_transform(docs)

In [105]:
print(count.vocabulary_)

{'weather': 8, 'two': 7, 'and': 0, 'sweet': 5, 'sun': 4, 'the': 6, 'shining': 3, 'one': 2, 'is': 1}


In [106]:
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [107]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True,norm="l2",smooth_idf=True)
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [108]:
import re

In [109]:
def preprocessor(text):
    text = re.sub("<[^>]*>","",text)
    emotions = re.findall("(?::|;|=)(?:-)?(?:\)|\(D|P)",text)
    text = (re.sub("[\W]+"," ",text.lower())+" ".join(emotions).replace("-"," "))
    return text

In [110]:
preprocessor(df.loc[20, 'review'][-50:])

'mice and men it seems 1 2 of four '

In [111]:
df["review"] = df["review"].apply(preprocessor)

In [112]:
def tokenizer(text):
    return text.split()

tokenizer("runners like running and thus they run")

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [56]:
!pip install nltk



In [113]:
from nltk import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

tokenizer_porter("runners like running and thus they run")

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [114]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hasegawatakashikana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [115]:
from nltk.corpus import stopwords
stop = stopwords.words("english")
[w for w in tokenizer_porter("a runner likes running and runs a lot")[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [116]:
X_train = df.loc[:25000,"review"].values
y_train = df.loc[:25000,"sentiment"].values
X_test = df.loc[25000:,"review"].values
y_test = df.loc[25000:,"sentiment"].values

In [117]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None)
param_grid = [{"vect__ngram_range":[(1,1)],
              "vect__stop_words":[stop,None],
              "vect__tokenizer":[tokenizer,tokenizer_porter],
              "clf__penalty":["l1","l2"],
              "clf__C":[1.0,10.0,100.0]},
              {"vect__ngram_range":[(1,1)],
              "vect__stop_words":[stop,None],
              "vect__tokenizer":[tokenizer,tokenizer_porter],
              "vect__use_idf":[False],
              "vect__norm":[None],
              "clf__penalty":["l1","l2"],
              "clf__C":[1.0,10.0,100.0]}]

In [120]:
lr_tfidf = Pipeline([("vect",tfidf),("clf",LogisticRegression(random_state=0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf,param_grid,scoring="accuracy",cv=5,verbose=1,n_jobs=-1)
gs_lr_tfidf.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [119]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values





tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [121]:

basepath = 'aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 
                      'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], 
                           ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:06:00


In [123]:
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False, encoding='utf-8')
df = pd.read_csv('movie_data.csv', encoding='utf-8')
count = CountVectorizer()
docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining, the weather is sweet, and one and one is two'])
bag = count.fit_transform(docs)
np.set_printoptions(precision=2)
tfidf = TfidfTransformer(use_idf=True, 
                         norm='l2', 
                         smooth_idf=True)
tf_is = 3
n_docs = 3
idf_is = np.log((n_docs+1) / (3+1))
tfidf_is = tf_is * (idf_is + 1)
tfidf = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)
raw_tfidf = tfidf.fit_transform(count.fit_transform(docs)).toarray()[-1]

l2_tfidf = raw_tfidf / np.sqrt(np.sum(raw_tfidf**2))

def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

preprocessor(df.loc[0, 'review'][-50:])




preprocessor("</a>This :) is :( a test :-)!")




df['review'] = df['review'].apply(preprocessor)



In [125]:
porter = PorterStemmer()

def tokenizer(text):
    return text.split()


def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]




tokenizer('runners like running and thus they run')




tokenizer_porter('runners like running and thus they run')





nltk.download('stopwords')





stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:]
if w not in stop]



# # Training a logistic regression model for document classification

# Strip HTML and punctuation to speed up the GridSearch later:



X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values





tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [str.split],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [str.split],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

if 'TRAVIS' in os.environ:
    gs_lr_tfidf.verbose=2
    X_train = df.loc[:250, 'review'].values
    y_train = df.loc[:250, 'sentiment'].values
    X_test = df.loc[25000:25250, 'review'].values
    y_test = df.loc[25000:25250, 'sentiment'].values



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hasegawatakashikana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [126]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 14.0min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...e, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'vect__tokenizer': [<method 'split' of 'str' objects>], 'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself...se_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=T

In [128]:
print("Best parameter set:%s"%gs_lr_tfidf.best_params_)

Best parameter set:{'vect__tokenizer': <method 'split' of 'str' objects>, 'clf__C': 10.0, 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'clf__penalty': 'l2'}


In [129]:
print("CV Accuracy:%.3f"% gs_lr_tfidf.best_score_)

CV Accuracy:0.893


In [131]:
clf = gs_lr_tfidf.best_estimator_
print("Test Accuracy:%.3f"%clf.score(X_test,y_test))

Test Accuracy:0.900


In [139]:
stop = stopwords.words("english")
def tokenizer(text):
    text = re.sub("<[^>]*>","",text)
    emotions = re.findall("(?::|;|=)(?:-)?(?:\)|\(D|P)",text.lower())
    text = (re.sub("[\W]+"," ",text.lower())+" ".join(emotions).replace("-"," "))
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [133]:
def stream_docs(path):
    with open(path,"r",encoding="utf-8")as csv:
        next(csv)
        for line in csv:
            text,label = line[:-3],int(line[-2])
            yield text,label

In [134]:
next(stream_docs(path="movie_data.csv"))

('"My family and I normally do not watch local movies for the simple reason that they are poorly made, they lack the depth, and just not worth our time.<br /><br />The trailer of ""Nasaan ka man"" caught my attention, my daughter in law\'s and daughter\'s so we took time out to watch it this afternoon. The movie exceeded our expectations. The cinematography was very good, the story beautiful and the acting awesome. Jericho Rosales was really very good, so\'s Claudine Barretto. The fact that I despised Diether Ocampo proves he was effective at his role. I have never been this touched, moved and affected by a local movie before. Imagine a cynic like me dabbing my eyes at the end of the movie? Congratulations to Star Cinema!! Way to go, Jericho and Claudine!!"',
 1)

In [135]:
def get_minibatch(doc_stream,size):
    docs,y=[],[]
    try:
        for _ in range(size):
            text,label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopAsyncIteration:
        return None,None
    return docs,y

In [141]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error="ignore",
                        n_features=2**21,
                        preprocessor=None,
                        tokenizer=tokenizer)
clf = SGDClassifier(loss="log",random_state=1,n_iter=1)
doc_stream = stream_docs(path='movie_data.csv')

In [142]:
pbar = pyprind.ProgBar(45)
classes = np.array([0,1])
for _ in range(45):
    X_train,y_train = get_minibatch(doc_stream,size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train,y_train,classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:32:22


In [143]:
X_test,y_test = get_minibatch(doc_stream,size=5000)
X_test = vect.transform(X_test)
print("Accuracy: %.3f"%clf.score(X_test,y_test))

Accuracy: 0.868


In [144]:
df = pd.read_csv("movie_data.csv",encoding="utf-8")
count = CountVectorizer(stop_words="english",max_df=.1,max_features=5000)
X = count.fit_transform(df["review"].values)

In [148]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_topics=10,random_state=123,learning_method="batch")

In [149]:
X_topics = lda.fit_transform(X)

In [150]:
lda.components_.shape

(10, 5000)

In [157]:
n_top_words =5
feature_names = count.get_feature_names()
for topic_idx,topic in enumerate(lda.components_):
    print("Topic %d:"%(topic_idx+1))
    print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]]))

Topic 1:
worst minutes awful script stupid
Topic 2:
family mother father girl children
Topic 3:
american dvd war music tv
Topic 4:
human audience cinema art feel
Topic 5:
police guy car dead murder
Topic 6:
horror house gore blood sex
Topic 7:
role performance comedy actor performances
Topic 8:
series episode war episodes season
Topic 9:
book version original effects read
Topic 10:
action fight guy guys cool


In [162]:
horror = X_topics[:,5].argsort()[::-1]
for iter_idx,movie_idx in enumerate(horror[:3]):
    print("\nHorror movie #%d:"%(iter_idx+1))
    print(df["review"][movie_idx][:300],"...")


Horror movie #1:
Emilio Miraglia's first Giallo feature, The Night Evelyn Came Out of the Grave, was a great combination of Giallo and Gothic horror - and this second film is even better! We've got more of the Giallo side of the equation this time around, although Miraglia doesn't lose the Gothic horror stylings tha ...

Horror movie #2:
House of Dracula works from the same basic premise as House of Frankenstein from the year before; namely that Universal's three most famous monsters; Dracula, Frankenstein's Monster and The Wolf Man are appearing in the movie together. Naturally, the film is rather messy therefore, but the fact that ...

Horror movie #3:
This film marked the end of the "serious" Universal Monsters era (Abbott and Costello meet up with the monsters later in "Abbott and Costello Meet Frankentstein"). It was a somewhat desparate, yet fun attempt to revive the classic monsters of the Wolf Man, Frankenstein's monster, and Dracula one "la ...


In [169]:
print(X_topics[4])

[0.5  0.   0.11 0.21 0.   0.   0.   0.   0.18 0.  ]


In [170]:
import pickle

In [172]:
dest = os.path.join("movieclassifier","pkl_objects")
if not os.path.exists(dest):
    os.makedirs(dest)
pickle.dump(stop,open(os.path.join(dest,"stopwords.pkl"),"wb"),protocol=4)
pickle.dump(clf,open(os.path.join(dest,"classifier.pkl"),"wb"),protocol=4)

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
import re
import os 
import pickle

cur_dir = os.path.dirname(__file__)
stop = pickle.load(open(os.path.join(cur_dir,"pkl_objects","stopwords.pkl"),"rb"))

def tokenizer(text):
    text = re.sub("<[^>]*>","",text)
    emotions = re.findall("(?::|;|=)(?:-)?(?:\)|\(D|P)",text)
    text = (re.sub("[\W]+"," ",text.lower())+" ".join(emotions).replace("-"," "))
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

vect = HashingVectorizer(decode_error="ignore",n_features=2**21,preprocessor=None,tokenizer=tokenizer)