In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Python support modules
import re
import string
import datetime
import pickle 
from collections import Counter

# Spacy
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
#import en_core_web_sm
from spacy.pipeline import SentenceSegmenter

# NLTK
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import words, stopwords, wordnet

# Sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Helper functions in py file
from preprocessing_headlines import cleaned_headline
import importlib

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Tara8082/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
## Practice notebook


data = pd.read_csv('/Users/Tara8082/GIT/ProjectGIT/Project_4/miscellaneous_support/final_processed_headlines.csv')
pd.set_option('max_colwidth', 100)

# Preprocessing date column, converting to date time, dropping old date column

# data.date=data.date.str[:10]
# data['date'] = data['date'].str.replace('-','/')
# data['datetime'] = pd.to_datetime(data['date'], format="%Y/%m/%d")
# data.drop('date', axis=1, inplace=True)  
# data.head()

headlines = data[['content']].astype(str)

In [43]:
%%time
headlines['clean_content'] = headlines['content'].map(cleaned_headline).astype(str)

# if word is not proper noun, lower case it

CPU times: user 33.1 s, sys: 643 ms, total: 33.7 s
Wall time: 36.6 s


In [44]:
headlines.head()

Unnamed: 0,content,clean_content
0,"Biden leads Trump among Hispanic voters, 62% to 29%, a WSJ/NBC/Telemundo poll shows https://t.co...",Biden leads Trump among Hispanic voters to a WSJ NBC Telemundo poll shows
1,"“We can’t keep up with the laundry.” Covid-19 has turned the tourism industry upside down, creat...",We can t keep up with the laundry Covid has turned the tourism industry upside down creating ...
2,A large English study showed the number of people with Covid-19 antibodies declined significantl...,A large English study showed the number of people with Covid antibodies declined significantly o...
3,"The leaders of Microsoft, Coca-Cola, American Airlines and other companies discuss how business ...",The leaders of Microsoft Coca Cola American Airlines and other companies discuss how business is...
4,"After seven months of isolation, the pull of getting together is strong. But with Covid-19 hospi...",After seven months of isolation the pull of getting together is strong But with Covid hospitaliz...


In [45]:
standard_stop_words = stopwords.words("english")

# Open list of Twitter stop words

with open('stopwords.txt') as fp:
    # 1. iterate over file line-by-line
    # 2. strip line of newline symbols
    # 3. split line by spaces into list (of number strings)
    # 4. convert to string
    # 5. convert map object to list
    stopwords_data = [list(map(str, line.strip().split(' '))) for line in fp]

# Combining standard list and imported list of stop words

flat_stopwords_data = [item for sublist in stopwords_data for item in sublist]
stopwords = standard_stop_words + flat_stopwords_data
stopwords_set = set(stopwords) 

#adding 'breaking' to list of stop words
stopwords_set.update(['breaking', 'report', 'seven', 'people', 'happening', 'now', 'wsjbooks', 'wsj', 'wsjopinion', 'zyahna', 'zyairr', 'zydeco', 'zymere', 'zymergen', 'zynga', 'zz', 'wsj', 'nbc', 'zziya', 'zzz', 'zzzs', 'zwaan', 'zwack', 'zwang', 'zweibel', 'zweli', 'zwetsloot', 'zwickau','zwift', 'zwillinger', 'zwoolfe', 'zurfi', 'zuri', 'zury', 'zushaelinson', 'zutors','zuurbekom', 'zuzana', 'zverev', 'zvyagintsev', 'morning', 'brief', 'briefing', 'page', 'story', 'rundown', 'update', 'even', 'day', 'evening', 'publish', 'edition', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'])

#stopwords_set included standard stop words, twitter words, and common words to media accounts
print("There are {} stop words in this list.".format(len(stopwords_set)))



There are 541 stop words in this list.


In [46]:
disabled_components = ['tagger', 'parser']

nlp = spacy.load('en', disable=disabled_components)  # need to fix english model, need smaller english model
nlp.add_pipe(nlp.create_pipe('sentencizer'))

print(nlp.pipe_names)

['ner', 'sentencizer']


## SPACY PIPELINE

In [47]:
def lemmatize_nlp_pipe(doc):
    '''
    function that normalizes the text (lemmatization) if text is alpha and not in list of stopwords
    '''
    lemma_list = [str(token.lemma_) for token in doc
                 if token.is_alpha and token.text not in stopwords_set]
    lem_string = " ".join(lemma_list)
    return lem_string

In [56]:
def entity(doc):
    if doc.ents:
        for entity in doc.ents:
            print(entity.text, entity.start_char, entity.end_char, entity.label_)
    else:
        print("Not found")

In [57]:
def preprocessed_pipe(texts, batch_size=100):
    preprocessed_pipe = []
    for doc in nlp.pipe(texts, batch_size=batch_size):
        if doc.ents:
            preprocessed_pipe.append(lemmatize_nlp_pipe(entity(doc)))
        #else:
            #preprocessed_pipe.append(lemmatize_nlp_pipe(entity(doc)))
    return preprocessed_pipe

In [58]:
headlines['test'] = preprocessed_pipe(headlines['clean_content'], batch_size=10000)

Hispanic 24 32 NORP
WSJ 45 48 ORG
NBC Telemundo 49 62 ORG


TypeError: 'NoneType' object is not iterable

In [None]:
enti