In [1]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim, string
from nltk.corpus import stopwords
from collections import Counter
from gensim.summarization import summarize
from gensim.summarization import keywords
import pickle

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from tqdm import tqdm

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")

%matplotlib inline

In [2]:
# A custom function to clean the text before sending it into the vectorizer
def cleanText(text):
    """Normalise raw text into lowercase, punctuation-free, "."-delimited segments.

    Steps, in order:
      1. decode the HTML entities ``&amp;``, ``&gt;``, ``&lt;``;
      2. mask twitter-style @mentions with a placeholder;
      3. turn " ; " separators into newlines so they survive as segment breaks;
      4. remove every character in ``string.punctuation``;
      5. convert the remaining line breaks into "." segment delimiters;
      6. lowercase.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        Cleaned text. The only "." characters left are the ones inserted for
        line breaks / " ; " separators, so ``split(".")`` yields the segments.
    """
    # Entity decoding and mention masking must run BEFORE punctuation removal:
    # they key on '&', ';' and '@', all of which are in string.punctuation.
    # (Previously they ran afterwards and therefore could never match.)
    text = text.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
    text = re.sub(r"@[a-z0-9_]{1,15}", "@MENTION", text, flags=re.IGNORECASE)

    # replace ; keeps the sentence structure
    text = text.replace(" ; ", "\n")

    # get rid of punctuation (this also drops the '@' of the placeholder and
    # any '<' / '>' produced by entity decoding)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # get rid of newlines: each one becomes a "." segment delimiter
    text = text.strip().replace("\n", ". ").replace("\r", ".")

    # lowercase
    return text.lower()

def calc_score(ref, comp, debug=False):
    '''Return the fraction of distinct items in ref that also occur in comp.

    An argument that is not a list is converted with str() and split on
    whitespace. Duplicates are ignored (both sides are turned into sets).
    With debug=True the two sets and their intersection are printed.
    Returns 0 when ref contains no items.
    '''
    # Coerce both inputs to lists of items in one pass
    ref_items, comp_items = (
        seq if isinstance(seq, list) else str(seq).split()
        for seq in (ref, comp)
    )

    s_ref, s_comp = set(ref_items), set(comp_items)
    s_inter = s_comp & s_ref

    if debug:
        print(s_ref, len(s_ref))
        print(s_comp)
        print(s_inter, len(s_inter))

    # Guard against division by zero for an empty reference
    return len(s_inter) / len(s_ref) if s_ref else 0

In [3]:
# Stop-word set: NLTK's English list, a few contraction fragments,
# and scikit-learn's English list merged together
STOPLIST = {
    *stopwords.words('english'),
    "n't", "'s", "'m", "ca",
    *ENGLISH_STOP_WORDS,
}
# Token texts to discard outright: every single punctuation character
# plus a handful of multi-character leftovers
SYMBOLS = [
    *string.punctuation,
    "-----", "---", "...", "“", "”", "'ve", "\n", "", " ", "\n\n", "npr",
]

def lemming(data, keeptype=("NOUN", "PROPN", "NUM", "ADJ", "ADV"), doalpha=True, dostop=True):
    """Filter a sequence of tokens and return their lowercased lemmas.

    Parameters
    ----------
    data : iterable
        Tokens exposing ``text``, ``is_stop``, ``is_alpha``, ``pos_``,
        ``lemma_`` and ``lower_`` (e.g. the result of calling ``nlp(...)``).
    keeptype : sequence of str
        Part-of-speech tags (``tok.pos_`` values) to keep. An empty sequence
        disables part-of-speech filtering.
    doalpha : bool
        If true, drop tokens whose ``is_alpha`` is false.
    dostop : bool
        If true, drop tokens whose text is in ``STOPLIST`` or whose
        ``is_stop`` flag is set.

    Returns
    -------
    list of str
        One entry per surviving token: the stripped, lowercased lemma, or the
        token's lowercase form when the lemma is the "-PRON-" placeholder.
    """
    tokens = []
    for tok in data:
        # stoplist the tokens (custom list first, then the token's own flag)
        if dostop and (tok.text in STOPLIST or tok.is_stop):
            continue

        # stoplist symbols
        if tok.text in SYMBOLS:
            continue

        # keep alphabetic tokens only
        if doalpha and not tok.is_alpha:
            continue

        # Part-of-speech filter. Any non-empty keeptype filters; the previous
        # `len(keeptype) > 1` test skipped filtering for a single-tag list.
        if keeptype and tok.pos_ not in keeptype:
            continue

        # lemmatize; "-PRON-" is a placeholder lemma, so fall back to the
        # lowercase surface form for those tokens
        if tok.lemma_ != "-PRON-":
            tokens.append(tok.lemma_.lower().strip())
        else:
            tokens.append(tok.lower_)

    return tokens

In [4]:
# Load the spaCy pipeline registered under the 'en' shortcut
nlp = spacy.load('en')

# Word-level bag-of-words vectorizer.
# (max_features=50000 was considered as a vocabulary cap but is not enabled.)
vectorizer = CountVectorizer(
    analyzer='word',
    min_df=1,                          # a term needs to appear in at least one document
    token_pattern='[a-zA-Z0-9]{3,}',   # tokens are runs of at least 3 alphanumeric characters
)

In [None]:
# Data directory. It is defined here, before its first use, so that this cell
# runs on a fresh kernel (Restart & Run All) instead of relying on a value
# left behind by a later cell.
path = 'Data/NPR_story/'

# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were produced by a trusted source.
with open(path + 'info.pkl', 'rb') as f:
    alldic = pickle.load(f)

In [None]:
# Notebook-level accumulators (one entry per processed story)
all_token = []
all_token_alltype = []
all_sumtoken = []
all_gensim = []
N = 419 ##304 NPR, 96 conv, 419 story
path = 'Data/NPR_story/'

# Per-segment records keyed by wav path: {'text', 'sumtext', 'score'}
info_dic = {}

# Only story 0 is processed for now; use `for i in tqdm(range(N)):` for all stories.
for i in range(1):
    info_text = []
    test_scores = []
    test_lemlength = []
    test_length = []
    test_timelength = []
    test_maxvolume = []
    test_avevolume = []

    # Transcript, split into the "."-delimited segments produced by cleanText
    with open(path + str(i) + '_trans.txt', 'r') as myfile:
        test_text = cleanText(myfile.read()).split(".")
    # Reference text: only its first segment is used
    with open(path + str(i) + '.txt', 'r') as myfile:
        test_sumtext = cleanText(myfile.read()).split(".")[0]

    # Lemmatised reference tokens (no part-of-speech filtering)
    all_sumtoken.append(lemming(nlp(test_sumtext), doalpha=True, dostop=True, keeptype=[]))
    sum_token = all_sumtoken[-1]

    for j, temp_text in enumerate(test_text):
        wav_key = path + str(i) + '_' + str(j) + '.wav'

        # Per-clip metadata. The original looked this up in `mydic`, which is
        # never defined in this notebook; `alldic` (loaded from info.pkl) is
        # the only metadata dict available.
        # NOTE(review): confirm info.pkl is keyed by these wav paths.
        clip_info = alldic[wav_key]
        info_text.append(clip_info)

        # Score the segment once and reuse the value for both outputs
        test_token = lemming(nlp(temp_text), doalpha=True, dostop=True, keeptype=[])
        score = calc_score(test_token, sum_token)

        info_dic[wav_key] = {'text': temp_text, 'sumtext': test_sumtext, 'score': score}
        test_scores.append(score)
        test_lemlength.append(len(test_token))
        test_length.append(len(temp_text.split(" ")))
        test_timelength.append(clip_info['duration'])
        test_maxvolume.append(clip_info['max_dBFS'])
        test_avevolume.append(clip_info['dBFS'])

In [None]:
# One row per wav path, one column per recorded field
temp_dic = pd.DataFrame(info_dic).T
temp_dic.head(5)

In [None]:
# Per-segment score against the ratio of max_dBFS to dBFS.
# Alternative views that were tried:
#   ax.scatter(test_maxvolume, test_scores)
#   ax.scatter(np.array(test_timelength)/np.array(test_length), test_scores)
volume_ratio = np.array(test_maxvolume) / np.array(test_avevolume)

# Explicit figure/axes instead of clearing the implicit pyplot figure,
# with labels so the plot can be read on its own.
fig, ax = plt.subplots()
ax.scatter(volume_ratio, test_scores)
ax.set(xlabel="max_dBFS / dBFS", ylabel="score",
       title="Segment score vs. max/average volume ratio")
plt.show()

In [None]:
# Transcript segment whose max_dBFS / dBFS ratio is the largest
volume_ratio = np.array(test_maxvolume) / np.array(test_avevolume)
test_text[volume_ratio.argmax()]

In [None]:
# Tabulate the info.pkl contents: outer keys become rows, inner keys columns
df = pd.DataFrame(alldic).T

In [None]:
# Preview the first five rows
df.head(5)

In [None]:
# Read the second pickle from the data directory.
# NOTE: pickle.load can execute code embedded in the file; load trusted files only.
with open(f"{path}info_1.pkl", mode='rb') as f:
    tempdic = pickle.load(f)

In [None]:
# Display the object loaded from info_1.pkl in full (output may be long)
tempdic

In [None]:
# Tabulate the info_1.pkl contents (no transpose here) and preview five rows
df_temp = pd.DataFrame(data=tempdic)
df_temp.head(5)