In [1]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import os
import glob
from parse_corpus import parse_script_lines, parse_script_characters, parse_dialogue
from comparing_strings import getClosestSentiment

# NLTK's English stop-word list followed by the extra tokens this notebook also filters out.
stop_words = [*stopwords.words('english'), 'from', 'subject', 're', 'edu', 'use']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zroga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def sent_to_words(sentences):
    """Lazily tokenize sentences, yielding one token list per sentence.

    Each item of ``sentences`` is coerced with ``str`` and handed to gensim's
    ``simple_preprocess`` with ``deacc=True``.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_sentence in sentences:
        # deacc=True strips accent marks; punctuation is already dropped by the tokenizer
        yield tokenize(str(raw_sentence), deacc=True)
def remove_stopwords(texts):
    """Return each document's tokens with the entries of ``stop_words`` removed.

    Args:
        texts: Iterable of documents. Each one is passed through ``str`` and
            re-tokenized with ``simple_preprocess``, so a document that is
            already a list of tokens is tokenized from its string repr.

    Returns:
        A list containing one list of kept tokens per document.
    """
    # Snapshot the module-level list as a set once per call: membership tests
    # become O(1) instead of scanning the whole list for every token.
    blocked = frozenset(stop_words)
    return [[word for word in simple_preprocess(str(doc))
             if word not in blocked] for doc in texts]

In [3]:
# Single definition of the script location so both parsers read the same file.
script_path = 'Brazil_script.html'

# Character name -> integer count (presumably lines per character; confirm in parse_corpus).
characters = parse_script_characters(script_path)

print(characters)

# Parsed dialogue; the following cells index it by character name.
chr_lines = parse_dialogue(script_path)

{'INTERVIEWER': 7, 'HELPMANN': 29, 'JILL': 95, 'MRS. BUTTLE': 8, 'GIRL BUTTLE': 6, 'OFFICIAL': 5, 'CHARLIE': 4, 'BILL': 3, 'MR. KURTZMAN': 3, 'SAM': 356, 'THE FORCES': 2, 'JACK': 71, 'PORTER': 15, 'KURTZMAN': 37, 'DOCTOR': 7, 'MOTHER': 29, "MAITRE'D": 2, 'SPIRO': 15, 'MRS. TERRAIN': 21, 'SHIRLEY': 6, 'PHONE VOICE': 5, 'GENT': 10, 'TUTTLE': 18, 'SPOOR': 31, 'DOWSER': 24, 'NEIGHBOUR': 2, 'GIRL': 7, 'DR. JAFFE': 6, 'VOICE': 4, 'ALISON': 3, 'WARREN': 9, 'LIME': 16, 'TYPIST': 5, 'GUARD': 7, 'GUARD A': 7, 'GUARD B': 2, 'MATTHEWS': 5, 'OFFICIAL A': 3, "SAM'S POV": 2}


In [39]:

# Preview HELPMANN's dialogue as one string: every line gets a '. ' suffix, pieces are space-joined.
" ".join(f"{line!s}. " for line in chr_lines['HELPMANN'])

'Bad sportsmanship. A ruthless minority of people seems to have forgotten certain good old fashioned virtues. They just can\'t stand seeing the other fellow win. If these people would just play the game, instead of standing on the touch line heckling .  In fact, killing people  they\'d get a lot more out of life. .  David... in a free society information is the name of the game. You can\'t win the game if you\'re a man short. .  I understand this concern on behalf of the tax-payers. People want value for money and a cost-effective service. .  That is why we always insist on the principle of Information Retrieval Charges. These terrorists are not pulling their weight, and it\'s absolutely right and fair that those found guilty should pay for their periods of detention and the Information Retrieval Procedures used in their interrogation. .  On yes. Our morale is much higher than theirs, we\'re fielding all their strokes, running a lot of them out, and pretty consistently knocking them fo

In [5]:
# HELPMANN's dialogue lines are the documents for the topic model.
data = chr_lines['HELPMANN']
# Tokenize every line, then drop the stop words.
data_words = remove_stopwords(list(sent_to_words(data)))
# Show up to the first 30 tokens of the first document.
print(data_words[0][:30])

['bad', 'sportsmanship', 'ruthless', 'minority', 'people', 'seems', 'forgotten', 'certain', 'good', 'old', 'fashioned', 'virtues', 'stand', 'seeing', 'fellow', 'win', 'people', 'would', 'play', 'game', 'instead', 'standing', 'touch', 'line', 'heckling']


In [6]:
import gensim.corpora as corpora

# Vocabulary built from the tokenized documents.
id2word = corpora.Dictionary(data_words)
# The tokenized documents double as the corpus texts.
texts = data_words
# Bag-of-words form of every document: a list of (token id, count) pairs.
corpus = list(map(id2word.doc2bow, texts))
# Peek at up to the first 30 pairs of the first document.
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1)]


In [25]:
from pprint import pprint
# number of topics
num_topics = 10
# Fixed seed so a kernel restart trains from the same random initial state.
# NOTE(review): gensim documents that LdaMulticore results can still vary with
# worker-process scheduling; confirm whether that matters for this analysis.
random_seed = 42
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=random_seed)
# Print the top keywords of every topic; pass num_topics explicitly because
# print_topics() only shows 20 topics by default.
pprint(lda_model.print_topics(num_topics=num_topics))
# Apply the trained model to the corpus to get per-document topic weights.
doc_lda = lda_model[corpus]

[(0,
  '0.026*"help" + 0.026*"information" + 0.026*"know" + 0.026*"team" + '
  '0.026*"take" + 0.026*"retrieval" + 0.026*"onto" + 0.026*"wanted" + '
  '0.026*"would" + 0.026*"promised"'),
 (1,
  '0.045*"sam" + 0.023*"play" + 0.023*"feel" + 0.023*"bottle" + 0.023*"water" '
  '+ 0.023*"game" + 0.023*"laid" + 0.023*"assure" + 0.023*"understand" + '
  '0.023*"barley"'),
 (2,
  '0.055*"retrieval" + 0.055*"information" + 0.037*"sam" + 0.020*"guilty" + '
  '0.020*"right" + 0.020*"always" + 0.020*"absolutely" + 0.020*"periods" + '
  '0.020*"weight" + 0.020*"fair"'),
 (3,
  '0.055*"keep" + 0.029*"got" + 0.029*"ball" + 0.029*"waiting" + 0.029*"good" '
  '+ 0.029*"luck" + 0.029*"post" + 0.029*"fall" + 0.029*"last" + 0.029*"see"'),
 (4,
  '0.005*"help" + 0.005*"sam" + 0.005*"jack" + 0.005*"barbara" + '
  '0.005*"information" + 0.005*"gillian" + 0.005*"say" + 0.005*"retrieval" + '
  '0.005*"hello" + 0.005*"beginner"'),
 (5,
  '0.064*"sam" + 0.033*"point" + 0.033*"yes" + 0.033*"bit" + 0.033*"certain