In [1]:
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk
import plotly_express as px
from nltk.corpus import stopwords
from nltk import pos_tag
import os
from textparser import TextParser
import random
from numpy.linalg import norm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.decomposition import PCA
from scipy.linalg import norm, eigh

In [2]:
# Widen text columns in rendered frames so long strings are readable.
# The fully-qualified option key is used: abbreviated keys are resolved by
# pattern matching and can become ambiguous if pandas adds a similar option.
pd.set_option('display.max_colwidth', 115)

In [3]:
# OHCO: the index levels of the token table, from coarsest to finest.
OHCO = ['movie_id', 'scene_id', 'sent_num', 'token_num']

# Index prefixes that identify a movie, a scene, and a sentence respectively.
MOVIES, SCENES, SENTS = (OHCO[:depth] for depth in (1, 2, 3))

In [4]:
# Directory holding the exported corpus tables. Kept in one place so the
# notebook can be pointed at another location by editing a single line.
# NOTE(review): this is an absolute, machine-specific path.
DATA_DIR = '/Users/theothormann/Desktop/Data Science/Spring/DS5001/FinalData'

# Library (one row per movie), token-level corpus (indexed by OHCO),
# vocabulary (one row per term) and per-movie document table.
LIB = pd.read_csv(os.path.join(DATA_DIR, 'LIB.csv'), index_col='movie_id')
CORPUS = pd.read_csv(os.path.join(DATA_DIR, 'CORPUS.csv')).set_index(OHCO)
VOCAB = pd.read_csv(os.path.join(DATA_DIR, 'VOCAB.csv'), index_col='term_str')
DOC = pd.read_csv(os.path.join(DATA_DIR, 'DOC.csv'), index_col='movie_id')

## Bring in BOW and TFIDF Functions

### BOW Function

In [5]:
def create_bag(corpus, bag):
    """Count term occurrences per bag.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Token table with a ``term_str`` column; the names in ``bag`` must be
        resolvable by ``groupby`` (columns or index level names).
    bag : list of str
        Grouping keys that define one document.

    Returns
    -------
    pandas.DataFrame
        Single column ``n`` holding the count of each term in each bag,
        indexed by ``bag + ['term_str']``.
    """
    group_keys = bag + ['term_str']
    term_counts = corpus.groupby(group_keys)['term_str'].count()
    return term_counts.to_frame('n')

### TFIDF/DFIDF Function

In [6]:
def get_tfidf(BOW, tf_method='max', df_method='standard', item_type='term_str'):
    """Compute TF-IDF and DF-IDF from a bag-of-words table.

    Parameters
    ----------
    BOW : pandas.DataFrame
        Frame with a MultiIndex and a count column ``n``. The innermost index
        level is pivoted into columns by ``BOW.n.unstack()``.
    tf_method : {'sum', 'max', 'log', 'raw', 'bool'}
        Term-frequency weighting applied per document (row).
    df_method : {'standard', 'textbook', 'sklearn', 'sklearn_smooth'}
        Inverse-document-frequency formula.
    item_type : str
        Currently unused; kept so existing callers do not break.

    Returns
    -------
    TFIDF : pandas.DataFrame
        Document-by-term matrix of TF * IDF, with missing cells filled with 0.
    DFIDF : pandas.Series
        Per-term DF * IDF.

    Raises
    ------
    ValueError
        If ``tf_method`` or ``df_method`` is not one of the supported names.
    """
    # Doc-term count matrix. Pairs that never co-occur are NaN, not 0.
    DTCM = BOW.n.unstack()

    if tf_method == 'sum':
        TF = DTCM.div(DTCM.sum(axis=1), axis=0)
    elif tf_method == 'max':
        TF = DTCM.div(DTCM.max(axis=1), axis=0)
    elif tf_method == 'log':
        TF = np.log2(DTCM + 1)
    elif tf_method == 'raw':
        TF = DTCM
    elif tf_method == 'bool':
        # NaN casts to True under astype('bool'), which would mark every
        # term as present in every document. Zero-fill before casting.
        TF = DTCM.fillna(0).astype('bool').astype('int')
    else:
        raise ValueError(f"TF method {tf_method} not found.")

    # Document frequency: count() skips NaN, so this relies on absent
    # document/term pairs being NaN in DTCM.
    DF = DTCM.count()
    N_docs = len(DTCM)

    if df_method == 'standard':
        IDF = np.log10(N_docs / DF)
    elif df_method == 'textbook':
        IDF = np.log10(N_docs / (DF + 1))
    elif df_method == 'sklearn':
        IDF = np.log10(N_docs / DF) + 1
    elif df_method == 'sklearn_smooth':
        IDF = np.log10((N_docs + 1) / (DF + 1)) + 1
    else:
        raise ValueError(f"DF method {df_method} not found.")

    # IDF is indexed by term, so it broadcasts across the columns of TF.
    TFIDF = (TF * IDF).fillna(0)
    DFIDF = DF * IDF

    return TFIDF, DFIDF

## Apply functions to CORPUS

In [7]:
# Term counts per scene (bag = movie_id + scene_id).
BOW = create_bag(corpus=CORPUS, bag=SCENES)

In [8]:
# Scene-level TF-IDF with max-normalized term frequency and the default IDF.
TFIDF, DFIDF = get_tfidf(BOW, tf_method='max')

In [9]:
TFIDF

Unnamed: 0_level_0,term_str,0,000,0000,000i12i12,003559,01,01s,02,03,05,...,zoo,zoom,zooming,zooms,zouuu,zulu,zurbarans,zydeco,zzzzt,zzzzzzt
movie_id,scene_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Create the Appropriate TFIDF Table

In [10]:
# Part-of-speech tags to keep, grouped by prefix family (NN*, VB*, JJ*, RB*).
pos_set = (
    ['NN', 'NNS']
    + ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    + ['JJ', 'JJR', 'JJS']
    + ['RB', 'RBR', 'RBS']
)

In [11]:
# Keep only vocabulary rows whose most frequent POS tag is in pos_set.
keep_term = VOCAB['max_pos'].isin(pos_set)
filtered_VOCAB = VOCAB.loc[keep_term]

In [12]:
filtered_VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,n_pos,cat_pos,stop,stem_porter,stem_snowball,stem_lancaster,tfidf,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
05,1,2,0.000001,19.675911,VBD,1,{'VBD'},0,05,05,05,0.959321,12.471167
12500000,1,8,0.000001,19.675911,JJ,1,{'JJ'},0,12500000,12500000,12500000,0.127257,12.471167
13ths,1,5,0.000001,19.675911,NNS,1,{'NNS'},0,13th,13ths,13ths,2.078528,12.471167
167,1,3,0.000001,19.675911,NN,1,{'NN'},0,167,167,167,3.117792,12.471167
19th,3,4,0.000004,18.090948,JJ,2,"{'CD', 'JJ'}",0,19th,19th,19th,0.595930,32.658614
...,...,...,...,...,...,...,...,...,...,...,...,...,...
zoo,7,3,0.000008,16.868556,NN,1,{'NN'},0,zoo,zoo,zoo,1.555176,59.317228
zooming,2,7,0.000002,18.675911,VBG,1,{'VBG'},0,zoom,zoom,zoom,1.014757,22.942334
zouuu,1,5,0.000001,19.675911,NN,1,{'NN'},0,zouuu,zouuu,zouuu,0.566871,12.471167
zydeco,2,6,0.000002,18.675911,NN,1,{'NN'},0,zydeco,zydeco,zydeco,2.078528,12.471167


In [13]:
filtered_VOCAB.index

Index(['05', '12500000', '13ths', '167', '19th', '1ining', '1os', '1s', '2001',
       '224',
       ...
       'zombie', 'zombies', 'zone', 'zoned', 'zonked', 'zoo', 'zooming',
       'zouuu', 'zydeco', 'zzzzzzt'],
      dtype='object', name='term_str', length=21172)

In [14]:
# Alias of TFIDF (same object, no copy); it is narrowed to the filtered
# vocabulary in the following cell.
TFIDF_unstack = TFIDF

In [15]:
# Restrict the TF-IDF columns to the POS-filtered vocabulary terms.
TFIDF_unstack = TFIDF_unstack.loc[:, filtered_VOCAB.index]

In [16]:
# DF-IDF scores for the same POS-filtered terms.
filtered_DFIDF = DFIDF.loc[filtered_VOCAB.index]

In [17]:
# Keep the 1000 terms with the highest DF-IDF as the working feature set.
top_terms = filtered_DFIDF.sort_values(ascending=False).head(1000).index
TFIDF_main = TFIDF_unstack[top_terms]

In [18]:
TFIDF_main

Unnamed: 0_level_0,term_str,back,are,looks,then,just,be,door,see,is,have,...,park,staggers,awake,wanna,rather,moon,click,board,clearly,bar
movie_id,scene_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,1,0.000000,0.036036,0.000000,0.000000,0.000000,0.046383,0.093601,0.000000,0.037109,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
0,2,0.015222,0.030576,0.017431,0.000000,0.058971,0.000000,0.019855,0.020023,0.062973,0.021095,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
0,3,0.033488,0.033634,0.038348,0.000000,0.000000,0.173165,0.000000,0.044052,0.051952,0.046410,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
0,4,0.000000,0.100902,0.000000,0.000000,0.129737,0.129873,0.000000,0.000000,0.051952,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
0,5,0.077279,0.038808,0.022124,0.072207,0.124747,0.099903,0.000000,0.025414,0.029973,0.053550,...,0.0,0.0,0.000000,0.0,0.076568,0.0,0.0,0.153136,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,107,0.000000,0.000000,0.000000,0.134099,0.092669,0.046383,0.000000,0.000000,0.037109,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
36,108,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.218402,0.000000,0.086587,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
36,109,0.000000,0.042042,0.095870,0.104299,0.000000,0.000000,0.163801,0.000000,0.064941,0.000000,...,0.0,0.0,0.165897,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
36,110,0.024503,0.000000,0.028059,0.061053,0.015822,0.015838,0.047942,0.000000,0.063357,0.000000,...,0.0,0.0,0.048555,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0


In [19]:
# Average the scene-level TF-IDF vectors into one vector per movie.
collapsed = TFIDF_main.groupby(level='movie_id').mean()

In [20]:
collapsed

term_str,back,are,looks,then,just,be,door,see,is,have,...,park,staggers,awake,wanna,rather,moon,click,board,clearly,bar
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.026178,0.031365,0.021349,0.011329,0.016316,0.018253,0.024985,0.013033,0.03563,0.02179,...,0.0,0.000257,0.001002,0.001915,0.002512,0.0,0.004114,0.009182,0.002765,0.0
1,0.032777,0.053316,0.0319,0.039534,0.025595,0.021786,0.019021,0.017201,0.069666,0.053519,...,0.001838,0.0,0.0,0.001663,0.00123,0.0,0.0,0.0,0.006587,0.032177
2,0.027985,0.036697,0.037452,0.006273,0.024246,0.02911,0.015505,0.03105,0.054932,0.028549,...,0.002954,0.001471,0.010405,0.000829,0.001349,0.00584,0.0,0.001245,0.002168,0.005832
3,0.016656,0.016829,0.019579,0.015983,0.03222,0.044644,0.018317,0.017076,0.038076,0.02603,...,0.0,0.000483,0.001264,0.0,0.002486,0.0,0.000737,0.000983,0.0,0.004441
4,0.020085,0.038254,0.014164,0.011235,0.017543,0.022186,0.022825,0.015934,0.0357,0.020866,...,0.0,0.002061,0.0,0.001276,0.0,0.000601,0.004193,0.0,0.016099,0.001281
5,0.016856,0.018355,0.023761,0.01266,0.013169,0.02329,0.024993,0.007837,0.02817,0.009214,...,0.0,0.0,0.00115,0.003505,0.000687,0.00701,0.000825,0.005007,0.001454,0.000782
6,0.031878,0.018501,0.024819,0.039588,0.032494,0.024793,0.025351,0.009133,0.027423,0.017336,...,0.003437,0.004952,0.005034,0.000371,0.0,0.0,0.004498,0.002249,0.0,0.000325
7,0.032987,0.024189,0.008476,0.017206,0.026529,0.017185,0.029477,0.008634,0.034511,0.015359,...,0.0,0.001939,0.0,0.0,0.003463,0.0,0.000725,0.010449,0.004111,0.0
8,0.025096,0.010213,0.026,0.009364,0.010808,0.034502,0.016418,0.021006,0.029987,0.00597,...,0.0,0.00781,0.001468,0.0,0.0,0.0,0.0,0.000851,0.0,0.002206
9,0.024016,0.021043,0.023366,0.013354,0.013146,0.018299,0.052514,0.017756,0.051475,0.01678,...,0.0,0.001719,0.0,0.016802,0.000857,0.036413,0.0,0.000965,0.001669,0.0


# LDA

In [21]:
# Number of highest-weighted terms kept per topic when building topic labels.
n_top_terms = 7

In [22]:
# NOTE(review): not referenced in the visible part of the notebook;
# presumably a colorscale name for later plots — confirm or remove.
colors = "YlGnBu"

## Pragmas

In [23]:
# NOTE(review): this import sits mid-notebook rather than in the import cell
# at the top of the notebook.
import warnings
# Suppresses every warning category for the rest of the session, including
# deprecation and convergence warnings raised by later cells.
warnings.filterwarnings('ignore')

# Scene topic model

In [24]:
# Document unit for the topic model: one document per (movie_id, scene_id).
BAG = SCENES

# Prepare Data

## Convert `TOKENS` back to `DOCS`

Scikit Learn wants an F1 style corpus. We create one from our annotated `TOKENS` table, keeping only regular nouns.

In [25]:
# Alias of CORPUS (same object, no copy).
TOKENS = CORPUS

In [26]:
BAG

['movie_id', 'scene_id']

## Filter for Nouns

In [27]:
# Keep tokens tagged NN or NNS, then join each bag's terms into a single
# space-separated string.
noun_mask = TOKENS.pos.str.match(r'^NNS?$')
DOCS = (
    TOKENS[noun_mask]
    .groupby(BAG)['term_str']
    .apply(' '.join)
    .to_frame('doc_str')
)

In [28]:
DOCS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_str
movie_id,scene_id,Unnamed: 2_level_1
0,1,crypt searchlights stream windows metal forms shuttle tolling bell hull torch bursts metal sparks room torch cu...
0,2,bed wan female backrest equipment cheeriness dont viewport eyes floods shield slides ceiling breathtaking vista...
0,3,streams shafts stand poplars verdant meadow stalks bird hopping leaves steps sort cinerama video loop sits benc...
0,4,elevator doors part conversation dollying corridor accurate heavyweights there commerce commission administrati...
0,5,people what hours ways story board inquiry conference table suits faces goon personnel mugshot file prints face...


## Create Vector Space

We use Scikit Learn's CountVectorizer to convert our F1 corpus of scenes into a document-term vector space of word counts.

In [29]:
# Vectorize the per-scene strings into term counts, capped at 4000 features
# and with scikit-learn's built-in English stop-word list removed.
count_engine = CountVectorizer(stop_words='english', max_features=4000)
count_model = count_engine.fit_transform(DOCS['doc_str'])

# Feature names, in the same order as the columns of count_model.
TERMS = count_engine.get_feature_names_out()

In [30]:
# NOTE(review): this rebinds VOCAB, which earlier held the table loaded from
# VOCAB.csv. Cells that read VOCAB.max_pos cannot be re-run after this point
# without reloading the original table.
VOCAB = pd.DataFrame(index=pd.Index(TERMS, name='term_str'))

In [31]:
# Dense document-term count matrix: rows follow DOCS, columns follow TERMS.
DTM = pd.DataFrame(
    data=count_model.toarray(),
    index=DOCS.index,
    columns=TERMS,
)

In [32]:
# Number of documents in which each term occurs at least once.
term_present = DTM.astype('bool')
VOCAB['doc_count'] = term_present.astype('int').sum(axis=0)

# Total count of retained terms in each document.
DOCS['term_count'] = DTM.sum(axis=1)

In [33]:
DOCS.term_count.describe()

count    5624.000000
mean       24.465861
std        33.465799
min         0.000000
25%         5.000000
50%        12.000000
75%        30.000000
max       355.000000
Name: term_count, dtype: float64

# Generate Model

We run Scikit Learn's [LatentDirichletAllocation algorithm](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html#sklearn.decomposition.LatentDirichletAllocation) and extract the THETA and PHI tables.

In [34]:
# LDA estimator with 20 topics and a fixed seed for repeatable fits.
# NOTE(review): max_iter=5 is a small iteration budget — confirm the model
# has converged enough for the topics to be stable.
# NOTE(review): learning_offset presumably only matters for online learning;
# verify it has any effect with the default learning method.
lda_engine = LDA(n_components=20, max_iter=5, learning_offset=50, random_state=0)

## Topic Names

In [35]:
# Topic labels "T00", "T01", ... — one per LDA component. The count is read
# from the estimator so the labels cannot drift out of sync with
# n_components if that setting is changed.
n_topics = lda_engine.n_components
TNAMES = [f"T{str(x).zfill(len(str(n_topics)))}" for x in range(n_topics)]

## THETA

In [36]:
# Fit LDA on the count matrix. Despite its name, `lda_model` holds the array
# returned by fit_transform (used below as the document-topic table), not the
# estimator itself.
lda_model = lda_engine.fit_transform(count_model)

In [37]:
# Document-topic weights (THETA): one row per document in DOCS, one column
# per topic label.
THETA_chap = pd.DataFrame(lda_model, index=DOCS.index, columns=TNAMES)
# Name the column axis only after the labels are in place: assigning
# `.columns` afterwards replaces the Index and discards the name.
THETA_chap.columns.name = 'topic_id'

In [38]:
THETA_chap

Unnamed: 0_level_0,Unnamed: 1_level_0,T00,T01,T02,T03,T04,T05,T06,T07,T08,T09,T10,T11,T12,T13,T14,T15,T16,T17,T18,T19
movie_id,scene_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0.001220,0.001220,0.001220,0.826367,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.001220,0.151682,0.001220,0.001220,0.001220,0.001220
0,2,0.000459,0.000459,0.000459,0.184937,0.000459,0.217568,0.000459,0.000459,0.094858,0.000459,0.000459,0.000459,0.000459,0.000459,0.074908,0.000459,0.222756,0.198551,0.000459,0.000459
0,3,0.000847,0.000847,0.000847,0.367078,0.000847,0.000847,0.000847,0.000847,0.000847,0.000847,0.000847,0.000847,0.000847,0.000847,0.000847,0.000847,0.000847,0.617668,0.000847,0.000847
0,4,0.338077,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.005000,0.571923,0.005000
0,5,0.000538,0.000538,0.000538,0.000538,0.000538,0.000538,0.000538,0.000538,0.000538,0.000538,0.000538,0.569179,0.000538,0.000538,0.000538,0.000538,0.421144,0.000538,0.000538,0.000538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,107,0.001250,0.001250,0.001250,0.001250,0.001250,0.001250,0.001250,0.001250,0.001250,0.001250,0.001250,0.001250,0.001250,0.702899,0.001250,0.001250,0.001250,0.001250,0.274601,0.001250
36,108,0.010000,0.010000,0.010000,0.810000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000
36,109,0.001563,0.001563,0.001563,0.799062,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.172813,0.001563,0.001563,0.001563,0.001563
36,110,0.000450,0.000450,0.000450,0.717348,0.000450,0.000450,0.000450,0.187007,0.000450,0.000450,0.000450,0.000450,0.000450,0.000450,0.000450,0.000450,0.087988,0.000450,0.000450,0.000450


In [39]:
# Persist the document-topic table.
# NOTE(review): absolute, machine-specific output path.
THETA_chap.to_csv('/Users/theothormann/Desktop/Data Science/Spring/DS5001/FinalData/THETA.csv')

## PHI

In [40]:
# Topic-term weights (PHI): one row per topic, one column per term.
PHI_chap = pd.DataFrame(
    lda_engine.components_,
    index=pd.Index(TNAMES, name='topic_id'),
    columns=pd.Index(TERMS, name='term_str'),
)

In [41]:
PHI_chap.T

topic_id,T00,T01,T02,T03,T04,T05,T06,T07,T08,T09,T10,T11,T12,T13,T14,T15,T16,T17,T18,T19
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
17,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,1.050000,0.05000,0.050000,0.050000,0.050000,0.050000,5.050000,0.050000,0.050000,0.050000,0.050000,0.050000
38,0.050000,0.050000,0.050000,0.050000,0.050000,1.050000,0.050000,1.050000,0.050000,0.05000,0.050000,0.050000,0.050000,4.040881,0.050000,0.050000,0.050000,0.050000,1.059119,0.050000
aback,0.808993,1.050000,0.050000,0.050000,0.050000,4.573131,1.963123,0.723274,0.050000,0.05000,1.021290,0.050000,0.050000,1.147629,0.050000,0.050000,1.112560,0.050000,0.050000,0.050000
abdomen,0.050000,3.512081,0.050000,0.527435,4.034930,0.050000,2.112731,5.554375,0.050000,0.05000,1.050000,0.050000,0.050000,0.050000,0.055108,0.050000,1.553339,0.050000,0.050000,0.050000
ability,0.050000,0.050000,3.995939,0.050000,0.050619,1.050000,0.050000,0.050000,0.050000,0.05000,1.050000,0.691284,0.050000,1.050000,0.050000,0.050000,0.050000,4.462158,0.050000,0.050000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
youve,0.050000,6.002779,2.530335,4.841919,0.054753,1.570678,0.050000,0.050000,0.093859,9.53518,0.475682,0.053641,2.505868,1.359599,0.050000,2.504379,1.599348,13.435731,4.146617,0.089631
zero,0.050000,1.050000,1.437928,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,9.05000,1.662072,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,1.050000,0.050000
zombie,0.050000,1.050000,1.050000,0.050000,0.050000,2.050000,0.900707,0.050000,0.050000,0.05000,1.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,1.199293
zoo,0.050000,0.050000,0.050000,2.380522,0.050000,0.050000,2.050000,0.050000,0.050000,0.05000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,2.719478,0.050000,0.050000,0.050000


In [42]:
# Persist the topic-term table.
# NOTE(review): absolute, machine-specific output path.
PHI_chap.to_csv('/Users/theothormann/Desktop/Data Science/Spring/DS5001/FinalData/PHI.csv')

# Inspect Results

## Create `TOPICS` and get Top Terms per Topic

In [43]:
# Top terms per topic: reshape PHI to long form (topic_id, term_str) ->
# topic_weight, then within each topic sort by weight descending and keep the
# first n_top_terms term strings. The displayed result has one row per topic
# and columns 0..n_top_terms-1 holding the ranked terms.
TOPICS_chap = PHI_chap.stack().to_frame('topic_weight').groupby('topic_id')\
    .apply(lambda x: x.sort_values('topic_weight', ascending=False)\
        .head(n_top_terms).reset_index().drop('topic_id', axis=1)['term_str'])

In [44]:
TOPICS_chap

term_str,0,1,2,3,4,5,6
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
T00,camera,corridor,bg,floor,caml,hands,vo
T01,ship,light,man,bed,beat,door,water
T02,face,eyes,hair,hand,room,blood,hands
T03,door,room,eyes,floor,hall,hand,stairs
T04,door,house,eyes,kitchen,bathroom,room,walks
T05,beat,eyes,face,way,time,man,head
T06,arms,car,eyes,body,window,time,look
T07,car,body,face,eyes,hand,creature,feet
T08,face,inside,way,hand,wall,look,gun
T09,starling,dog,radio,ice,time,men,dogs


In [45]:
# Human-readable label per topic: the topic id followed by its top terms.
# The first n_top_terms positions of each row are the ranked term columns.
TOPICS_chap['label'] = TOPICS_chap.apply(
    lambda row: f"{row.name} {', '.join(row.iloc[:n_top_terms])}",
    axis=1,
)

In [46]:
print(TOPICS_chap.label.values)

['T00 camera, corridor, bg, floor, caml, hands, vo'
 'T01 ship, light, man, bed, beat, door, water'
 'T02 face, eyes, hair, hand, room, blood, hands'
 'T03 door, room, eyes, floor, hall, hand, stairs'
 'T04 door, house, eyes, kitchen, bathroom, room, walks'
 'T05 beat, eyes, face, way, time, man, head'
 'T06 arms, car, eyes, body, window, time, look'
 'T07 car, body, face, eyes, hand, creature, feet'
 'T08 face, inside, way, hand, wall, look, gun'
 'T09 starling, dog, radio, ice, time, men, dogs'
 'T10 way, hand, face, door, eyes, room, steps'
 'T11 water, head, eyes, hand, man, face, dog'
 'T12 wall, spiders, window, tv, face, water, town'
 'T13 night, car, sir, window, house, police, creatures'
 'T14 door, window, hand, street, car, floor, apartment'
 'T15 vo, eyes, tape, bed, way, hand, body'
 'T16 eyes, face, head, hand, floor, os, screen'
 'T17 time, room, head, eyes, thing, hand, day'
 'T18 phone, beat, os, table, hand, desk, room'
 'T19 car, road, door, truck, head, feet, cruise

## Sort Topics by Doc Weight

In [47]:
# Total weight each topic receives across all documents.
TOPICS_chap['doc_weight_sum'] = THETA_chap.sum(axis=0)

# Each topic's share of the total term weight in PHI.
topic_mass = PHI_chap.sum(axis=1)
TOPICS_chap['term_freq'] = topic_mass / topic_mass.sum()

In [48]:
TOPICS_chap

term_str,0,1,2,3,4,5,6,label,doc_weight_sum,term_freq
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
T00,camera,corridor,bg,floor,caml,hands,vo,"T00 camera, corridor, bg, floor, caml, hands, vo",247.184889,0.029099
T01,ship,light,man,bed,beat,door,water,"T01 ship, light, man, bed, beat, door, water",284.705459,0.040364
T02,face,eyes,hair,hand,room,blood,hands,"T02 face, eyes, hair, hand, room, blood, hands",303.041983,0.061379
T03,door,room,eyes,floor,hall,hand,stairs,"T03 door, room, eyes, floor, hall, hand, stairs",454.938226,0.089797
T04,door,house,eyes,kitchen,bathroom,room,walks,"T04 door, house, eyes, kitchen, bathroom, room, walks",320.928509,0.051432
T05,beat,eyes,face,way,time,man,head,"T05 beat, eyes, face, way, time, man, head",238.27654,0.060218
T06,arms,car,eyes,body,window,time,look,"T06 arms, car, eyes, body, window, time, look",199.715572,0.033627
T07,car,body,face,eyes,hand,creature,feet,"T07 car, body, face, eyes, hand, creature, feet",283.323926,0.056917
T08,face,inside,way,hand,wall,look,gun,"T08 face, inside, way, hand, wall, look, gun",234.76751,0.039063
T09,starling,dog,radio,ice,time,men,dogs,"T09 starling, dog, radio, ice, time, men, dogs",210.927647,0.026335


In [49]:
# Persist the topic table (top terms, label, doc_weight_sum, term_freq).
# This runs before the per-decade columns are added further down, so the
# saved file does not contain them.
# NOTE(review): absolute, machine-specific output path.
TOPICS_chap.to_csv('/Users/theothormann/Desktop/Data Science/Spring/DS5001/FinalData/TOPICS.csv')

# Explore Topics by Decade

Use the LIB table to get decade info.

In [50]:
# Keep only the text before the first ', ' in each title and lower-case it.
# NOTE(review): this overwrites LIB.movie_title in place, so the original
# titles are no longer available after this cell runs.
LIB['movie_title'] = LIB.movie_title.str.split(', ').str[0].str.lower()

Add mean topic weight for each topic by decade.

In [51]:
# Distinct decade values present in LIB, in ascending order.
decade_values = LIB.decade.value_counts().index
DECADES = sorted(decade_values.to_list())

In [52]:
# Mean topic weight per decade: attach each document's movie metadata,
# average the topic columns within each decade, then transpose so topics
# are rows and decades are columns.
decade_topic_means = (
    THETA_chap.join(LIB, on='movie_id')
    .groupby('decade')[TNAMES]
    .mean()
)
TOPICS_chap[DECADES] = decade_topic_means.T

In [53]:
TOPICS_chap

term_str,0,1,2,3,4,5,6,label,doc_weight_sum,term_freq,1980,1990,2000
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
T00,camera,corridor,bg,floor,caml,hands,vo,"T00 camera, corridor, bg, floor, caml, hands, vo",247.184889,0.029099,0.073252,0.027689,0.026353
T01,ship,light,man,bed,beat,door,water,"T01 ship, light, man, bed, beat, door, water",284.705459,0.040364,0.043071,0.028421,0.067081
T02,face,eyes,hair,hand,room,blood,hands,"T02 face, eyes, hair, hand, room, blood, hands",303.041983,0.061379,0.049363,0.07941,0.046203
T03,door,room,eyes,floor,hall,hand,stairs,"T03 door, room, eyes, floor, hall, hand, stairs",454.938226,0.089797,0.068156,0.082881,0.090837
T04,door,house,eyes,kitchen,bathroom,room,walks,"T04 door, house, eyes, kitchen, bathroom, room, walks",320.928509,0.051432,0.056059,0.060463,0.056385
T05,beat,eyes,face,way,time,man,head,"T05 beat, eyes, face, way, time, man, head",238.27654,0.060218,0.036958,0.043316,0.046545
T06,arms,car,eyes,body,window,time,look,"T06 arms, car, eyes, body, window, time, look",199.715572,0.033627,0.035561,0.035301,0.035564
T07,car,body,face,eyes,hand,creature,feet,"T07 car, body, face, eyes, hand, creature, feet",283.323926,0.056917,0.052567,0.047117,0.049986
T08,face,inside,way,hand,wall,look,gun,"T08 face, inside, way, hand, wall, look, gun",234.76751,0.039063,0.040706,0.063433,0.032831
T09,starling,dog,radio,ice,time,men,dogs,"T09 starling, dog, radio, ice, time, men, dogs",210.927647,0.026335,0.0404,0.019535,0.043157


In [54]:
TOPICS_chap.sort_values(1980, ascending = False).head(3)

term_str,0,1,2,3,4,5,6,label,doc_weight_sum,term_freq,1980,1990,2000
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
T00,camera,corridor,bg,floor,caml,hands,vo,"T00 camera, corridor, bg, floor, caml, hands, vo",247.184889,0.029099,0.073252,0.027689,0.026353
T03,door,room,eyes,floor,hall,hand,stairs,"T03 door, room, eyes, floor, hall, hand, stairs",454.938226,0.089797,0.068156,0.082881,0.090837
T18,phone,beat,os,table,hand,desk,room,"T18 phone, beat, os, table, hand, desk, room",392.113211,0.089722,0.066937,0.062149,0.075512


In [55]:
TOPICS_chap.sort_values(1990, ascending = False).head(3)

term_str,0,1,2,3,4,5,6,label,doc_weight_sum,term_freq,1980,1990,2000
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
T03,door,room,eyes,floor,hall,hand,stairs,"T03 door, room, eyes, floor, hall, hand, stairs",454.938226,0.089797,0.068156,0.082881,0.090837
T02,face,eyes,hair,hand,room,blood,hands,"T02 face, eyes, hair, hand, room, blood, hands",303.041983,0.061379,0.049363,0.07941,0.046203
T17,time,room,head,eyes,thing,hand,day,"T17 time, room, head, eyes, thing, hand, day",324.275656,0.081621,0.040361,0.073429,0.065263


In [56]:
TOPICS_chap.sort_values(2000, ascending = False).head(3)

term_str,0,1,2,3,4,5,6,label,doc_weight_sum,term_freq,1980,1990,2000
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
T03,door,room,eyes,floor,hall,hand,stairs,"T03 door, room, eyes, floor, hall, hand, stairs",454.938226,0.089797,0.068156,0.082881,0.090837
T18,phone,beat,os,table,hand,desk,room,"T18 phone, beat, os, table, hand, desk, room",392.113211,0.089722,0.066937,0.062149,0.075512
T01,ship,light,man,bed,beat,door,water,"T01 ship, light, man, bed, beat, door, water",284.705459,0.040364,0.043071,0.028421,0.067081
