In [4]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim, string
from nltk.corpus import stopwords
from collections import Counter

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from tqdm import tqdm

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
# A custom function to clean the text before sending it into the vectorizer
def cleanText(text):
    
    table = str.maketrans('', '', string.punctuation)
    # get rid of punctuation
    text = text.translate(table)
    
    # get rid of newlines
    text = text.strip().replace("\n", " ").replace("\r", " ")
    
    # replace twitter @mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    text = mentionFinder.sub("@MENTION", text)
    
    # replace HTML symbols
    text = text.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
    
    # lowercase
    text = text.lower()

    return text

def calc_score(ref, comp, debug=False):
    '''gives the number of items in ref that is also found in comp'''
    ## check if it is a list of strings
    if not isinstance(ref, list):
        ref = str(ref).split()
    if not isinstance(comp, list):
        comp = str(comp).split()
        
    s_ref = set(ref)
    s_comp = set(comp)
    s_inter = s_comp.intersection(s_ref)
    if debug:
        print(s_ref)
        print(s_comp)
        print(s_inter)
    return len(s_inter)/len(s_ref)

In [9]:
# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve", "\n", "", " ", "\n\n", "npr"]

def lemming(data, keeptype=["NOUN", "PROPN", "NUM", "ADJ", "ADV"], doalpha=True):
    tokens = []
    for tok in data:
        
        # stoplist the tokens
        if tok not in STOPLIST:
            pass
        else:
            continue
        
        # stoplist symbols
        if tok not in SYMBOLS:
            pass
        else: 
            continue
        
        ##check if the token is alpha
        if doalpha:
            if tok.is_alpha:
                pass
            else:
                continue
        else:
            pass
        
        
        ##check if the token is stopword
        if not tok.is_stop:
            pass
        else:
            continue
    
        ##check if the token is noun
        if len(keeptype) > 1:
            if tok.pos_ in keeptype:
                pass
            else:
                continue
        else:
            pass
    
        # lemmatize
        if tok.lemma_ != "-PRON-" :
            tokens.append(tok.lemma_.lower().strip())
        else:
            tokens.append(tok.lower_)
    
    # remove large strings of whitespace
    return tokens

In [7]:
# Initialize spacy 'en' model
nlp = spacy.load('en')

# Initialize vectorizer
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=1,                        # minimum reqd occurences of a word 
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

In [None]:
all_token = []
all_token_alltype = []
all_sumtoken = []
N = 300
for i in tqdm(range(N)):
    test_text = []
    test_sumtext   = []
    with open('Data/NPR/' + str(i) + '_trans.txt', 'r') as myfile:
        test_text.append(cleanText(myfile.read()))
    with open('Data/NPR/' + str(i) + '.txt', 'r') as myfile:
        test_sumtext.append(cleanText(myfile.read()))
    ## calculate the maximum score
    nlp_test_text = nlp("".join(test_text))
    all_token.append(lemming(nlp_test_text))
    all_token_alltype.append(lemming(nlp_test_text, doalpha=False, keeptype=[]))
    all_sumtoken.append(lemming(nlp("".join(test_sumtext)), doalpha=False, keeptype=[]))

In [8]:
base_rate = []
my_rate  = []
for i in tqdm(range(N)):
    ## calculate the maximum score
    test_token = all_token[i]
    test_token_alltype = all_token_alltype[i]
    test_sumtoken = all_sumtoken[i]
    best_score = calc_score(ref=test_sumtoken, comp=test_token, debug=False)
    #print("best score", best_score)

    length = len(test_sumtoken)
    common_words_alltype = [w[0] for w in Counter(test_token_alltype).most_common(length)]
    #print(" ".join(common_words))
    base_score =  calc_score(ref=test_sumtoken, comp=common_words_alltype, debug=False)
    #print("baseline score", base_score)

#     # Predict the topic
#     topic = predict_topic(text = [" ".join(test_token)], nwords=length)
#     #print(" ".join(topic))

    topic = [w[0] for w in Counter(test_token).most_common(length)]
    my_score = calc_score(ref=test_sumtoken, comp=topic, debug=False)
    #print("my score", my_score)
    #print(" ".join(test_sumtoken))
    base_rate.append(base_score/best_score)
    my_rate.append(my_score/best_score)

NameError: name 'N' is not defined