In [1]:
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk
import plotly_express as px
from nltk.corpus import stopwords
from nltk import pos_tag
import os
from textparser import TextParser
import random
from numpy.linalg import norm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.decomposition import PCA
from scipy.linalg import norm, eigh
from scipy.spatial.distance import pdist
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt

In [2]:
# Let pandas print up to 200 rows before truncating a frame's display.
pd.options.display.max_rows = 200

In [3]:
# Index levels of the corpus, ordered from coarsest to finest.
OHCO = ['movie_id', 'scene_id', 'sent_num', 'token_num']

# Each "bag" level is the prefix of OHCO down to that depth.
MOVIES, SCENES, SENTS, TOKES = (OHCO[:depth] for depth in range(1, 5))

In [4]:
# Directory holding the project's CSV tables. The default is the original
# author's local path; set the DS5001_DATA_DIR environment variable to run
# the notebook against a copy of the data stored elsewhere.
DATA_DIR = os.environ.get(
    'DS5001_DATA_DIR',
    '/Users/theothormann/Desktop/Data Science/Spring/DS5001/FinalData',
)

# LIB and DOC are indexed by movie_id, CORPUS by the full OHCO, VOCAB by term_str.
LIB = pd.read_csv(os.path.join(DATA_DIR, 'LIB.csv'), index_col='movie_id')
CORPUS = pd.read_csv(os.path.join(DATA_DIR, 'CORPUS.csv')).set_index(OHCO)
VOCAB = pd.read_csv(os.path.join(DATA_DIR, 'VOCAB_noTForDF.csv'), index_col='term_str')
DOC = pd.read_csv(os.path.join(DATA_DIR, 'DOC.csv'), index_col='movie_id')

## Bring in BOW and TFIDF Functions

### BOW Function

In [5]:
def create_bag(corpus, bag):
    """Count term occurrences per bag.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Token table that exposes a ``term_str`` column and the keys named
        in ``bag`` (as columns or index levels).
    bag : list of str
        Grouping keys that define one bag.

    Returns
    -------
    pandas.DataFrame
        Single column ``n`` holding the count of ``term_str`` for each
        (bag keys..., term_str) combination.
    """
    group_keys = bag + ['term_str']
    term_counts = corpus.groupby(group_keys)['term_str'].count()
    return term_counts.to_frame('n')

### TFIDF Function

In [6]:
def TFIDF(BOW, tf_method, tf_norm_k=0.5):
    """Add term-frequency and TF-IDF columns to a bag-of-words table.

    Parameters
    ----------
    BOW : pandas.DataFrame
        Bag-of-words table with a count column ``n`` and a MultiIndex whose
        innermost level is the term (the level that ``unstack()`` pivots).
    tf_method : str
        Term-frequency weighting, one of:
        ``'sum'`` (count / total count in the bag),
        ``'max'`` (count / largest count in the bag),
        ``'log'`` (``log2(1 + count)``),
        ``'raw'`` (count),
        ``'double_norm'`` (``K + (1 - K) * count / max``),
        ``'binary'`` (1 when the term occurs in the bag).
    tf_norm_k : float, default 0.5
        The constant ``K`` used by ``'double_norm'``; ignored otherwise.

    Returns
    -------
    pandas.DataFrame
        A copy of ``BOW`` with added ``tf`` and ``tfidf`` columns. ``BOW``
        itself is not modified.

    Raises
    ------
    ValueError
        If ``tf_method`` is not one of the supported names.
    """
    # Bag-by-term count matrix; a (bag, term) pair that never occurs is NaN.
    DTCM = BOW.n.unstack()
    # Term-by-bag orientation so column-wise reductions are per bag.
    counts = DTCM.T

    if tf_method == 'sum':
        TF = counts / counts.sum()
    elif tf_method == 'max':
        TF = counts / counts.max()
    elif tf_method == 'log':
        TF = np.log2(1 + counts)
    elif tf_method == 'raw':
        TF = counts
    elif tf_method == 'double_norm':
        # Previously identical to 'max'; apply the K offset and rescaling.
        TF = tf_norm_k + (1 - tf_norm_k) * (counts / counts.max())
    elif tf_method == 'binary':
        # Compare instead of casting to bool: a bool cast turns NaN into True.
        TF = counts.gt(0).astype('int')
    else:
        raise ValueError(
            "Unknown tf_method %r; expected one of 'sum', 'max', 'log', "
            "'raw', 'double_norm', 'binary'" % (tf_method,)
        )
    TF = TF.T

    DF = DTCM.count()       # number of bags containing each term
    N = DTCM.shape[0]       # number of bags
    IDF = np.log2(N / DF)

    # Column assignment aligns on the index, so only (bag, term) pairs that
    # exist in BOW receive values.
    BOW_copy = BOW.copy()
    BOW_copy['tf'] = TF.stack()
    BOW_copy['tfidf'] = (TF * IDF).stack()

    return BOW_copy

### DFIDF Function

In [7]:
def DFIDF(BOW, tf_method=None):
    """Compute DF-IDF (document frequency times inverse document frequency) per term.

    Parameters
    ----------
    BOW : pandas.DataFrame
        Bag-of-words table with a count column ``n`` and a MultiIndex whose
        innermost level is the term (the level that ``unstack()`` pivots).
    tf_method : str, optional
        Accepted only so existing calls keep working. DF-IDF does not depend
        on term frequency, so the value is ignored.

    Returns
    -------
    pandas.Series
        ``DF * log2(N / DF)`` for each term, where ``DF`` is the number of
        bags containing the term and ``N`` is the number of bags.
    """
    # Bag-by-term count matrix; a (bag, term) pair that never occurs is NaN.
    DTCM = BOW.n.unstack()
    DF = DTCM.count()       # non-NaN cells per term column
    N = DTCM.shape[0]
    IDF = np.log2(N / DF)
    return DF * IDF

## Apply functions to the movie corpus

In [8]:
# One bag per (movie_id, scene_id) pair.
BOW = create_bag(corpus=CORPUS, bag=SCENES)

In [9]:
# NOTE(review): this rebinds the name TFIDF from the function to its result,
# so this cell cannot be re-run without first re-running the cell that
# defines the function.
TFIDF = TFIDF(BOW, tf_method='max')

In [10]:
# NOTE(review): this rebinds the name DFIDF from the function to its result,
# so this cell cannot be re-run without first re-running the cell that
# defines the function.
DFIDF = DFIDF(BOW, tf_method='max')

## Create the Appropriate TFIDF Table

In [11]:
# POS tags to keep, grouped by tag prefix (NN*, VB*, JJ*, RB*).
pos_set = (
    ['NN', 'NNS']
    + ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    + ['JJ', 'JJR', 'JJS']
    + ['RB', 'RBR', 'RBS']
)

In [12]:
# Keep only the vocabulary rows whose max_pos tag is in pos_set.
keep_pos_mask = VOCAB['max_pos'].isin(pos_set)
filtered_VOCAB = VOCAB[keep_pos_mask]

In [13]:
# Inspect the vocabulary rows that survived the max_pos filter.
filtered_VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,n_pos,cat_pos,stop,stem_porter,stem_snowball,stem_lancaster
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
05,1,2,0.000001,19.675911,VBD,1,{'VBD'},0,05,05,05
12500000,1,8,0.000001,19.675911,JJ,1,{'JJ'},0,12500000,12500000,12500000
13ths,1,5,0.000001,19.675911,NNS,1,{'NNS'},0,13th,13ths,13ths
167,1,3,0.000001,19.675911,NN,1,{'NN'},0,167,167,167
19th,3,4,0.000004,18.090948,JJ,2,"{'CD', 'JJ'}",0,19th,19th,19th
...,...,...,...,...,...,...,...,...,...,...,...
zoo,7,3,0.000008,16.868556,NN,1,{'NN'},0,zoo,zoo,zoo
zooming,2,7,0.000002,18.675911,VBG,1,{'VBG'},0,zoom,zoom,zoom
zouuu,1,5,0.000001,19.675911,NN,1,{'NN'},0,zouuu,zouuu,zouuu
zydeco,2,6,0.000002,18.675911,NN,1,{'NN'},0,zydeco,zydeco,zydeco


In [14]:
# The surviving terms; this index is used below to select columns and entries.
filtered_VOCAB.index

Index(['05', '12500000', '13ths', '167', '19th', '1ining', '1os', '1s', '2001',
       '224',
       ...
       'zombie', 'zombies', 'zone', 'zoned', 'zonked', 'zoo', 'zooming',
       'zouuu', 'zydeco', 'zzzzzzt'],
      dtype='object', name='term_str', length=21172)

In [15]:
# Pivot the tfidf column into a bag-by-term matrix; absent pairs become 0.
TFIDF_unstack = TFIDF['tfidf'].unstack(fill_value=0)

In [16]:
# Restrict the matrix columns to the POS-filtered terms.
TFIDF_unstack = TFIDF_unstack.loc[:, filtered_VOCAB.index]

In [17]:
# DFIDF values for the POS-filtered terms only.
filtered_DFIDF = DFIDF.loc[filtered_VOCAB.index]

In [18]:
# Keep the 1000 filtered terms with the largest DFIDF as the matrix columns.
top_dfidf_terms = filtered_DFIDF.sort_values(ascending=False).index[:1000]
TFIDF_main = TFIDF_unstack[top_dfidf_terms]

In [19]:
# Inspect the reduced TF-IDF matrix (rows: movie_id/scene_id, columns: selected terms).
TFIDF_main

Unnamed: 0_level_0,term_str,back,are,looks,then,just,be,door,see,is,have,...,park,staggers,awake,wanna,rather,moon,click,board,clearly,bar
movie_id,scene_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,1,0.000000,0.119710,0.000000,0.000000,0.000000,0.154082,0.310935,0.000000,0.123273,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
0,2,0.050565,0.101572,0.057904,0.000000,0.195899,0.000000,0.065956,0.066516,0.209191,0.070077,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
0,3,0.111243,0.111729,0.127389,0.000000,0.000000,0.575240,0.000000,0.146336,0.172582,0.154170,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
0,4,0.000000,0.335188,0.000000,0.000000,0.430977,0.431430,0.000000,0.000000,0.172582,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
0,5,0.256716,0.128918,0.073494,0.239867,0.414401,0.331869,0.000000,0.084425,0.099567,0.177889,...,0.0,0.0,0.000000,0.0,0.254353,0.0,0.0,0.508707,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,107,0.000000,0.000000,0.000000,0.445468,0.307841,0.154082,0.000000,0.000000,0.123273,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
36,108,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.725515,0.000000,0.287637,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
36,109,0.000000,0.139662,0.318472,0.346475,0.000000,0.000000,0.544137,0.000000,0.215728,0.000000,...,0.0,0.0,0.551099,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
36,110,0.081398,0.000000,0.093211,0.202815,0.052558,0.052613,0.159259,0.000000,0.210466,0.000000,...,0.0,0.0,0.161297,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0


In [20]:
# Average the scene-level TF-IDF rows within each movie.
collapsed = TFIDF_main.groupby(level='movie_id').mean()

In [21]:
# Inspect the per-movie mean TF-IDF table.
collapsed

term_str,back,are,looks,then,just,be,door,see,is,have,...,park,staggers,awake,wanna,rather,moon,click,board,clearly,bar
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.086962,0.104191,0.07092,0.037633,0.054199,0.060636,0.082997,0.043295,0.118359,0.072386,...,0.0,0.000853,0.003328,0.006362,0.008344,0.0,0.013667,0.030503,0.009185,0.0
1,0.108884,0.177111,0.105968,0.13133,0.085025,0.07237,0.063185,0.057139,0.231427,0.177785,...,0.006106,0.0,0.0,0.005525,0.004087,0.0,0.0,0.0,0.021881,0.106889
2,0.092966,0.121905,0.124413,0.020837,0.080543,0.096701,0.051508,0.103146,0.18248,0.094837,...,0.009814,0.004888,0.034564,0.002752,0.00448,0.019401,0.0,0.004136,0.007201,0.019374
3,0.055331,0.055905,0.065041,0.053093,0.107033,0.148303,0.060848,0.056726,0.126485,0.08647,...,0.0,0.001603,0.004199,0.0,0.008258,0.0,0.002449,0.003266,0.0,0.014752
4,0.066722,0.127076,0.047052,0.037321,0.058278,0.073701,0.075821,0.052932,0.118592,0.069317,...,0.0,0.006848,0.0,0.004239,0.0,0.001995,0.013929,0.0,0.053479,0.004255
5,0.055994,0.060975,0.078934,0.042057,0.043747,0.077367,0.083026,0.026033,0.093579,0.030609,...,0.0,0.0,0.003821,0.011643,0.002283,0.023286,0.00274,0.016633,0.00483,0.002597
6,0.105898,0.061459,0.082446,0.131507,0.107941,0.08236,0.084214,0.030339,0.091096,0.05759,...,0.011419,0.016451,0.016721,0.001231,0.0,0.0,0.014942,0.007471,0.0,0.001078
7,0.10958,0.080355,0.028156,0.057157,0.088127,0.057087,0.097922,0.028682,0.114643,0.051021,...,0.0,0.006441,0.0,0.0,0.011503,0.0,0.00241,0.034711,0.013657,0.0
8,0.083368,0.033928,0.086371,0.031107,0.035905,0.114614,0.054539,0.06978,0.099614,0.019831,...,0.0,0.025944,0.004875,0.0,0.0,0.0,0.0,0.002826,0.0,0.007329
9,0.079779,0.069903,0.077619,0.044361,0.04367,0.060787,0.174449,0.058984,0.170996,0.055742,...,0.0,0.005711,0.0,0.055814,0.002848,0.120963,0.0,0.003204,0.005545,0.0


In [22]:
# Persist the reduced scene-level TF-IDF matrix as CSV.
TFIDF_main.to_csv(
    path_or_buf='/Users/theothormann/Desktop/Data Science/Spring/DS5001/FinalData/TFIDF.csv'
)

In [23]:
# Persist the per-term DFIDF values (all terms, not only the POS-filtered ones) as CSV.
DFIDF.to_csv(
    path_or_buf='/Users/theothormann/Desktop/Data Science/Spring/DS5001/FinalData/DFIDF.csv'
)

# Add TFIDF and DFIDF to VOCAB table

In [24]:
# Mean tfidf per term across the rows of the TFIDF table, aligned to VOCAB
# on term_str. The column is selected before aggregating: the original
# `.mean('tfidf')` passed the column name into the `numeric_only` slot and
# averaged every column before picking one.
VOCAB['tfidf'] = TFIDF.groupby('term_str')['tfidf'].mean()
# DFIDF is already indexed by term, so assignment aligns on the index.
VOCAB['dfidf'] = DFIDF


In [25]:
# Persist the vocabulary table, now including the tfidf and dfidf columns, as CSV.
VOCAB.to_csv(
    path_or_buf='/Users/theothormann/Desktop/Data Science/Spring/DS5001/FinalData/VOCAB.csv'
)

In [26]:
# Inspect the vocabulary table with the added tfidf and dfidf columns.
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,n_pos,cat_pos,stop,stem_porter,stem_snowball,stem_lancaster,tfidf,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,32,1,0.000038,14.675911,CD,3,"{'CD', 'NN', 'VBP'}",0,0,0,0,0.660010,202.038913
000,1,3,0.000001,19.675911,CD,1,{'CD'},0,000,000,000,0.959321,12.471167
0000,1,4,0.000001,19.675911,CD,1,{'CD'},0,0000,0000,0000,12.471167,12.471167
000i12i12,1,9,0.000001,19.675911,CD,1,{'CD'},0,000i12i12,000i12i12,000i12i12,0.101392,12.471167
003559,1,6,0.000001,19.675911,CD,1,{'CD'},0,003559,003559,003559,3.117792,12.471167
...,...,...,...,...,...,...,...,...,...,...,...,...,...
zulu,1,4,0.000001,19.675911,CD,1,{'CD'},0,zulu,zulu,zulu,1.385685,12.471167
zurbarans,1,9,0.000001,19.675911,NNP,1,{'NNP'},0,zurbaran,zurbaran,zurb,0.244533,12.471167
zydeco,2,6,0.000002,18.675911,NN,1,{'NN'},0,zydeco,zydeco,zydeco,2.078528,12.471167
zzzzt,1,5,0.000001,19.675911,NNP,1,{'NNP'},0,zzzzt,zzzzt,zzzzt,0.566871,12.471167


# See highest DFIDF terms

In [30]:
# Drop terms whose max_pos is 'NNP', then show the 25 rows with the largest dfidf.
# Note that VOCAB itself is rebound to the filtered frame.
VOCAB = VOCAB.loc[VOCAB['max_pos'].ne('NNP')]
VOCAB.sort_values(by='dfidf', ascending=False).iloc[:25]

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,n_pos,cat_pos,stop,stem_porter,stem_snowball,stem_lancaster,tfidf,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
her,8704,3,0.010392,6.588448,PRP$,14,"{'NN', 'PDT', 'PRP$', 'VBD', 'PRP', 'NNP', 'VB...",1,her,her,her,0.492211,3013.216435
you,9752,3,0.011643,6.424428,PRP,20,"{'NN', 'CC', 'PRP', 'VB', 'FW', 'JJR', 'RB', '...",1,you,you,you,0.453602,3010.821233
from,3839,4,0.004583,7.769396,IN,10,"{'NN', 'VBD', 'NNP', 'VBZ', 'VB', 'IN', 'VBP',...",1,from,from,from,0.264047,3009.744026
she,6315,3,0.007539,7.051344,PRP,11,"{'NN', 'VBD', 'PRP', 'NNP', 'VBZ', 'VB', 'VBP'...",1,she,she,she,0.465421,3006.319236
into,3548,4,0.004236,7.88312,IN,8,"{'NN', 'VBD', 'NNP', 'VBZ', 'VB', 'IN', 'VBP',...",1,into,into,into,0.290933,3005.478071
i,7443,1,0.008886,6.814242,PRP,12,"{'NN', 'PRP', 'NNP', 'VBZ', 'VB', 'VBP', 'IN',...",1,i,i,i,0.493024,2997.636906
with,5161,4,0.006162,7.342476,IN,11,"{'NN', 'WRB', 'VBD', 'NNP', 'VBZ', 'VB', 'IN',...",1,with,with,with,0.268673,2995.261553
for,3566,3,0.004257,7.87582,IN,10,"{'NN', 'VBD', 'NNP', 'VB', 'IN', 'VBP', 'FW', ...",1,for,for,for,0.282503,2991.791252
out,4890,3,0.005838,7.420292,RP,13,"{'NN', 'VBD', 'RP', 'NNP', 'PRP', 'VBZ', 'VB',...",1,out,out,out,0.25396,2991.404072
his,7832,3,0.00935,6.740745,PRP$,7,"{'NN', 'PRP$', 'NNP', 'VB', 'RB', 'JJ', 'NNS'}",1,hi,his,his,0.375924,2988.457251
