In [1]:
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
ROOT = '../neuralnets/aclImdb/train/pos/'

In [3]:
reviews = []
for file in os.listdir(ROOT):
    path = os.path.join(ROOT, file)
    if os.path.isfile(path):
        with open(path, 'r') as fin:
            reviews.append(fin.read())

In [4]:
len(reviews)

12500

In [5]:
for i in range(3):
    print(reviews[i])
    print('=' * 150)

Not wishing to give *anything* away here, I would just say this technically excellent, flawlessly acted and uplifting little flic will reward the viewer with an excellent hour and a half's entertainment: It will amuse, surprise, possibly embarrass occasionally and almost certainly tug at the heartstrings from time to time, as it approaches the inevitable, but not obvious, ending without becoming clichéd or predictable in any way. Most definitely recommended.<br /><br />A previous User's Comment gives 8 out of 10 for the film and 10 out of 10 for both Branagh and Bonham-Carter's outstanding performances - I agree entirely....
Wrestlemania 14 is not often looked as one of the great Wrestlemania's but I would personally put it, in my top 5, if not the top 3. It has so many great things, and it truly signified the birth of The Attitude Era, which was WWE's best era, in my opinion. HBK has the heart of a lion, and him putting over Austin like he did, on his way out, was pure class on his pa

In [6]:
# Feature extraction

vect = TfidfVectorizer(stop_words='english')
X = vect.fit_transform(reviews)

In [7]:
# NMF

N_TOPICS = 15
nmf = NMF(n_components=N_TOPICS)
W = nmf.fit_transform(X)
H = nmf.components_



In [8]:
# Top 10 words per topic

words = np.array(vect.get_feature_names())
topic_words = pd.DataFrame(np.zeros((N_TOPICS, 10)), index=[f'Topic {i + 1}' for i in range(N_TOPICS)],
                           columns=[f'Word {i + 1}' for i in range(10)]).astype(str)
for i in range(N_TOPICS):
    ix = H[i].argsort()[::-1][:10]
    topic_words.iloc[i] = words[ix]

topic_words

Unnamed: 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10
Topic 1,br,10,ll,spoilers,end,scene,yes,simply,spoiler,quite
Topic 2,movie,movies,watch,recommend,10,saw,acting,actors,definitely,excellent
Topic 3,film,films,director,characters,festival,cinema,scenes,work,art,plot
Topic 4,series,episode,episodes,season,tv,characters,trek,seasons,shows,television
Topic 5,life,family,father,young,son,man,mother,old,real,children
Topic 6,good,pretty,bad,really,acting,job,liked,nice,little,story
Topic 7,war,world,documentary,people,american,history,men,soldiers,human,hitler
Topic 8,role,performance,character,best,man,cast,actor,john,play,played
Topic 9,like,really,think,don,just,people,know,say,didn,lot
Topic 10,seen,ve,time,years,saw,dvd,version,best,book,tv


In [9]:
# Create a topic mapping

topic_mapping = {
    'Topic 4': 'TV',
    'Topic 5': 'Family',
    'Topic 7': 'War',
    'Topic 12': 'Comedy',
    'Topic 13': 'Horror',
    'Topic 15': 'Martial Arts'
}

In [10]:
# Recall the document-topic matrix, W

W = pd.DataFrame(W, columns=[f'Topic {i + 1}' for i in range(N_TOPICS)])
W['max_topic'] = W.apply(lambda x: topic_mapping.get(x.idxmax()), axis=1)
W[pd.notnull(W['max_topic'])].head(10)

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,max_topic
2,0.028322,0.0,0.021275,0.001176,0.000942,0.001808,0.029529,0.02225,0.006856,0.0,0.002504,0.0,0.0,0.001921,0.0,War
3,0.014833,0.022363,0.0,0.000262,0.035473,0.0,0.000446,0.0,0.016464,0.032632,0.006155,0.0,0.0,0.0,0.0,Family
15,0.0,0.0,0.0,0.009289,0.028384,0.0014,0.006928,0.017106,0.001325,0.003153,0.0,0.000422,0.0,0.006178,0.0,Family
16,0.000248,0.0,0.002036,0.0,0.015552,0.003178,0.0,0.023025,0.016249,0.0,0.005827,0.037478,0.004643,0.013323,0.010714,Comedy
18,0.029608,0.0,0.020422,0.002941,0.044098,0.010124,0.000616,0.006864,0.006026,0.0,0.013372,0.046496,0.0,0.0,0.0,Comedy
26,0.01518,0.000149,0.0,0.0,0.013366,0.012344,0.0,0.013591,0.015665,0.007913,0.0,0.038225,0.0,0.003229,0.00121,Comedy
27,0.031416,0.007336,0.0,0.002076,0.0,0.0,0.031924,0.0123,0.0,0.00402,0.008808,0.045844,0.0,0.001346,9.2e-05,Comedy
29,0.0,0.000944,0.0,0.0,0.0,0.015285,0.0,0.0,0.010599,0.0,0.0,0.017396,0.066754,0.00096,0.03612,Horror
30,0.023408,0.012247,0.017264,0.000356,0.015505,0.009911,0.000135,0.001073,0.0,0.004942,0.001184,0.047259,0.014336,0.0,0.0,Comedy
31,0.01234,0.003746,0.028889,0.0,0.011499,0.004294,0.006847,0.01097,0.003067,0.000634,0.001253,0.0,0.035176,0.007108,0.001346,Horror


In [13]:
reviews[26]

"This is the ultimate one-man show in which Eddie Murphy is at his very best. Just forget the Nutty Professor and the Distinguished Gentlemen, this is the real Eddie Murphy. His imitations of Mr. T. (pretending he is gay), Michael Jackson and other artists are killers. I think it's also quite daring to make fun of artists who where really popular in that time. My favorite act is the one where he is at his annual BBQ with the family and plays his drunken dad and aunt Bunny who falls from the stairs.<br /><br />This show is the best medicine when you feel down ! If you watch the sequel 'Raw' don't be disappointed. It's quite good too but doesn't match 'Delirious'."