### Read-In Data

In [1]:
import pandas as pd

In [3]:
combined = pd.read_csv('../data/Combined_News_DJIA.csv')
news = pd.read_csv('../data/RedditNews.csv')
market = pd.read_csv('../data/upload_DJIA_table.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/Combined_News_DJIA.csv'

In [None]:
print(combined.shape)
combined.head(1)

In [None]:
# Remove the bytes-literal wrapper (b"..." / b'...') left on the headlines.
def _strip_bytes_repr(cell):
    """Return ``cell`` without a leading ``b"`` / ``b'`` and its closing quote.

    Non-string cells are returned unchanged.  Only an exact ``b`` + quote
    prefix is removed: ``str.lstrip('b"')`` treats its argument as a *set of
    characters*, so it would also eat the first letters of a headline that
    starts with "b" and would never unwrap ``b'...'`` rows.

    Args:
        cell: Any DataFrame cell value.

    Returns:
        The stripped, unwrapped string, or ``cell`` itself when it is not a
        string.
    """
    if not isinstance(cell, str):
        return cell
    text = cell.strip()
    for quote in ('"', "'"):
        prefix = 'b' + quote
        if text.startswith(prefix):
            text = text[len(prefix):]
            # Drop the matching closing quote when the wrapper is complete.
            if text.endswith(quote):
                text = text[:-1]
            break
    return text


combined = combined.applymap(_strip_bytes_repr)
combined.head(2)

In [None]:
combined.tail(2)

In [None]:
print(news.shape)
news.head()

In [None]:
print(market.shape)
market.head()

### Tokenizer with Vanilla Python

In [None]:
news2 = news.copy()

In [None]:
import re

def tokenize(text):
    """Parse a string into a list of lowercase alphanumeric word tokens.

    Args:
        text (str): The string that the function will tokenize.

    Returns:
        list: Lowercased tokens, split on whitespace, with every character
            other than ASCII letters, digits and whitespace removed.
    """
    # Keep all whitespace (not only the literal space) so that words
    # separated by tabs or newlines are not glued into a single token.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return cleaned.lower().split()

In [None]:
from collections import Counter

# Tokenize every headline once and store the token lists next to the text.
news2['tokens'] = news2['News'].apply(tokenize)

# Tally how often each token occurs across all headlines.
word_counts = Counter()
for headline_tokens in news2['tokens']:
    word_counts.update(headline_tokens)

word_counts.most_common(10)

In [None]:
# ^ These are stop words.

## Tokenization with spaCy

In [None]:
# Sample Corpus from Combined DF
# -- Includes Only Top1 Headlines
corpus = combined[['Date', 'Label', 'Top1']]
print(corpus.shape)
corpus.head(3)

In [None]:
import spacy

# Load Neural Network
nlp = spacy.load("en_core_web_md")

In [None]:
doc_0 = nlp(corpus['Top1'][0])
doc_0

In [None]:
for token in doc_0:
    print(token.text)

In [None]:
import random

# spaCy Default Stop-Words
default_stop_words = nlp.Defaults.stop_words
print("Total Default Stop-Words:\n", len(default_stop_words))
# Check Out Some spaCy Default Stop-Words
print("7 Random Stop-Words")
# random.sample() requires a sequence: sampling directly from a set was
# deprecated in Python 3.9 and raises TypeError from 3.11 on, so draw from
# a sorted list of the stop-words instead.
set(random.sample(sorted(default_stop_words), 7))

## Removing Stop-Words with spaCy

In [None]:
corpus.shape

#### Pipe Functions -> Let Us Loop Through Multiple Documents

In [None]:
# Tokenize every headline, keeping only the lowercased text of tokens that
# are not stop-words, punctuation or whitespace.
# nlp.pipe runs the documents through the pipeline in batches:
#   for doc in nlp.pipe(iterable_of_docs, batch_size=500): ...
tokens = [
    [
        token.text.lower()
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    for doc in nlp.pipe(corpus['Top1'], batch_size=500)
]

# Create New Tokens Column in Corpus DF
corpus.insert(3, 'tokens', tokens)

In [None]:
# Get Verb Tokens

# For each headline, collect the lowercased text of every token whose
# part-of-speech tag is VERB (stop-words are not filtered out here).
verb_tokens = [
    [token.text.lower() for token in doc if token.pos_ == 'VERB']
    for doc in nlp.pipe(corpus['Top1'], batch_size=500)
]

# Create New Verb-Tokens Column in Corpus DF
corpus.insert(4, 'verb_tokens', verb_tokens)

In [None]:
corpus['tokens'].head()

In [None]:
corpus['verb_tokens'].head()

In [None]:
# How we Aggregate All Tokens
import itertools
aggregate_tokens = list(itertools.chain.from_iterable(corpus['tokens']))
aggregate_tokens[-5:]

In [None]:
# Aggregate Tokens of Corpus
print("Total Aggregate Tokens:", len(aggregate_tokens))

# Frequency table of the tokens that survived stop-word removal.
word_freq = Counter(aggregate_tokens)
# The ten most frequent tokens (words only, counts discarded).
# This list can be used to identify domain-specific stop words.
top_10 = [word for word, _count in word_freq.most_common(10)]
top_10

In [None]:
# Aggregate Verb-Tokens of Corpus
import itertools

# Flatten the per-headline verb lists into one corpus-wide list.
per_headline_verbs = corpus['verb_tokens']
aggregate_verb_tokens = list(itertools.chain.from_iterable(per_headline_verbs))
print("Total Aggregate Verb Tokens:", len(aggregate_verb_tokens))

# Frequency table of the verbs.
verb_freq = Counter(aggregate_verb_tokens)
# Top 10 Verbs (words only, counts discarded).
# NOTE: this rebinds `top_10`, replacing the top-10 token list built from
# `word_freq`.
top_10 = [verb for verb, _count in verb_freq.most_common(10)]
top_10

### Statistical Trimming -> Common Approach to Stop-Word Removal
#### The Idea is Such:
1. The words that appear most frequently may not provide any insight into the meaning of the document since they are so prevalent.
2. Words that appear infrequently also probably do not add much value, because they are mentioned so rarely

## Stemming & Lemmatization
#### This is a form of normalization.
Recognizing that killed == kill and batteries == battery in the right context.

These words share the same **root** words

#### Stemming: No Longer Recommended for Normalization
stemming = a process for removing the commoner morphological and inflexional endings from words in English (Think: ing, ed, s, ies)
- The process of stemming tokens is usually quick because it is **rule** based.
- Most stemming is done by well-documented algorithms such as Porter, Snowball, and Dawson.
- Stemming might still work well in applications where humans don't have to worry about reading the results.
- Search engines and, more broadly, information retrieval algorithms use stemming because it's so fast.

In [None]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

# Stemming only chops off word endings by rule, so it can produce
# non-words; the first pair shows that, the second a case where it works.
words = ['wolf', 'wolves']
better_example_words = ['love', 'loves']

for heading, examples in (
    ("Example of Stemming Limitations", words),
    ("\nExample of When Stemming Works", better_example_words),
):
    print(heading)
    for word in examples:
        print(ps.stem(word))

### Lemmatization: Higher Computational Cost

- More methodical than stemming.
- The goal is to transform a word into its base form called a **lemma**.
- Plural nouns with funky spellings get transformed to their singular form, and verbs are all transformed to their base (infinitive) form.

In [None]:
# Small Example of Lemmatization
# NOTE(review): "her" in the last sentence is presumably a typo for "here";
# left untouched because it changes what the pipeline outputs.
sent = "This is the start of our NLP adventures. We started here with spaCy. We are starting her with NLP."

# NOTE(review): this rebinds the global `nlp` (loaded earlier from
# "en_core_web_md") to the "en_core_web_lg" model, so everything that uses
# `nlp` from here on, including get_lemmas, runs on a different model than
# the tokenization cells above — confirm that this is intended.
nlp = spacy.load("en_core_web_lg")

# Run the sample text through the pipeline.
doc = nlp(sent)

# Lemma Attributes: print every token's text next to its lemma.
for token in doc:
    print(token.text, " - ", token.lemma_)

In [None]:
# Wrap it in a function

def get_lemmas(text):
    """Return the lemmas of the content-bearing tokens in ``text``.

    The text is processed with the module-level spaCy pipeline ``nlp``.
    Stop-words, punctuation, whitespace tokens and pronouns are skipped.

    Args:
        text (str): Raw text to lemmatize.

    Returns:
        list: ``token.lemma_`` of every token that was kept, in document
            order.
    """
    kept = []
    for token in nlp(text):
        if token.is_stop or token.is_punct or token.is_space:
            continue
        if token.pos_ == 'PRON':
            continue
        kept.append(token.lemma_)
    return kept

In [None]:
lemmas = corpus['Top1'].apply(get_lemmas)

# Create New Lemmas Column in Corpus DF 
corpus.insert(5, 'lemmas', lemmas)

In [None]:
corpus.head()

In [None]:
corpus.tail()

## What are the words associated with a positive label?

Scatter plot of top words.

In [None]:
corpus_a = corpus[['lemmas', 'Label']].copy()

In [None]:
corpus_a.head()

In [None]:
# This is a lot, let's break it up. --See Section Below
# Reshape to one row per (Label, lemma) pair:
#   1. spread each list of lemmas over its own columns,
#   2. re-attach the Label column by index,
#   3. drop the original list column,
#   4. melt the lemma columns into a single 'lemma' column.
df2 = (
    corpus_a['lemmas']
    .apply(pd.Series)
    .merge(corpus_a, left_index=True, right_index=True)
    .drop(columns='lemmas')
    .melt(id_vars=['Label'], value_name='lemma')
)
df2.head()

In [None]:
# Drop Rows that have ANY missing data
df2 = df2.dropna(axis=0)
print(df2.shape)
df2.head()

In [None]:
df2.tail()

In [None]:
# Average Label per lemma.  'Label' is selected explicitly so the result has
# exactly that one column, regardless of how the installed pandas version
# treats melt's leftover 'variable' column in GroupBy.mean().
df2 = df2.groupby('lemma')[['Label']].mean()

In [None]:
df2.describe()

In [None]:
df2

### Breaking Down the Above Data Organization

In [None]:
# OK, we start off with corpus_a
corpus_a.head()

In [None]:
# Then we make a Series out of each Row/Cell in the Lemmas Column
seriesed = corpus_a['lemmas'].apply(pd.Series)
# Each 'Cell' is Broken into an Entire Row
seriesed.head()

In [None]:
# Now we Merge these Rows of Lemmas BACK with corpus_a on their indexes.
# So essentially we concatenate columns...
merged = seriesed.merge(corpus_a, right_index=True, left_index = True)
merged.head()

In [None]:
# Then we drop the compressed Lemmas column since we don't
# need that information twice
dropped = merged.drop('lemmas', axis=1)
dropped.head()

In [None]:
# Where did variable come from?
melted = dropped.melt(id_vars = ['Label'], value_name = 'lemma')
melted.head()

In [None]:
melted.tail()

In [None]:
# There are values 0-34 for the variable column...
# this corresponds to the column integer names for every lemma.
melted['variable'].nunique()

In [None]:
# Drop Rows that have ANY missing data
dropped_nans = melted.dropna(axis=0)
print(dropped_nans.shape)
dropped_nans.head()

In [None]:
# Then we Group Each Lemma by Its Average Label.
# 'Label' is selected explicitly so the result has exactly one column: the
# later merge + two-name column rename relies on that, and it no longer
# depends on how the installed pandas version treats melt's leftover
# 'variable' column in GroupBy.mean().
grouped = dropped_nans.groupby('lemma')[['Label']].mean()
grouped

## The Analysis Continues

In [None]:
# Let's look at these means with their count in mind...
dropped_nans.groupby('lemma').count().head()

In [None]:
# Obtain Series of Lemma Count
count_series = dropped_nans.groupby('lemma').count()['Label']

In [None]:
# Combine Count and Label Average Columns
lemma_data = grouped.merge(count_series, right_index=True, left_index = True)
lemma_data.columns = ['label_avg', 'lemma_count']
lemma_data = lemma_data.reset_index()

In [None]:
lemma_data.head()

In [None]:
# I'm thinking a lot of the 0 and 1 averages are less interesting because
# they come from one instance...
lemma_data.sort_values('label_avg', ascending=False)

In [None]:
# ... Let's test this assumption.
# Flag, as a plain list of bools aligned with lemma_data's rows, every
# lemma that occurs exactly once.  Comparing the named column replaces the
# iterrows() loop, which read the count through positional `row[1][2]`;
# integer-position access on a label-indexed Series is deprecated in
# pandas 2.x.
count_one_to_one = (lemma_data['lemma_count'] == 1).tolist()

In [None]:
# Over Half of all Lemmas Fit In This 1-1 Category.
sum(count_one_to_one)/len(lemma_data)

In [None]:
# Let's keep only the lemmas that occur more than once.
keepers = ~pd.Series(count_one_to_one)
lemma_data_reduced = lemma_data[keepers]

In [None]:
# Now this seems to tell us more about what each lemma might mean
# for daily market performance.
lemma_data_reduced.sort_values('label_avg', ascending=False)

In [None]:
# Flag, as a plain list of bools aligned with lemma_data_reduced's rows,
# every lemma whose label average is exactly 1.0.
perf_one_avg = (lemma_data_reduced['label_avg'] == 1.0).tolist()

# The flag marks perfect label averages, not single occurrences (those were
# already filtered out of lemma_data_reduced), so the printed label says so.
print("Lemmas with a Perfect 1.0 Label Average:", sum(perf_one_avg))
print("Total Lemmas:", len(perf_one_avg))

In [None]:
assert len(lemma_data_reduced) == len(perf_one_avg)

In [None]:
lemma_data_reduced.reset_index().drop('index', axis=1)

In [None]:
# lemma_data_reduced was produced by boolean filtering, so it still carries
# the row labels of lemma_data.  Renumber it from 0 so the positional
# perf_one_avg mask lines up with its rows.
lemma_data_reduced = lemma_data_reduced.reset_index(drop=True)
# Keep only the lemmas whose label average is not exactly 1.0.
interesting_lemmas = lemma_data_reduced.loc[~pd.Series(perf_one_avg)]

In [None]:
interesting_lemmas.head()

In [None]:
# Half of all Lemma Counts fall between 2 and 8.
# To normalize the comparison between lemmas I will examine
# only Lemmas with such a count.
interesting_lemmas.describe()

In [None]:
# Examine Histogram without skewed right tail to visualize
# distribution of Lemma Counts
less_skew = interesting_lemmas[interesting_lemmas['lemma_count'] < 10]
less_skew.hist('lemma_count', bins=20)

In [None]:
# IQR Comparison: lemmas whose count lies in the 2-8 band, both ends
# inclusive.  The upper bound has to include 8 because the 4th-quartile
# group is defined as `lemma_count > 8`; with `< 8` here, lemmas counted
# exactly 8 times fell into neither group.
mask = interesting_lemmas['lemma_count'].between(2, 8)
interesting_lemmas_iqr = interesting_lemmas[mask]

In [None]:
interesting_lemmas_iqr.sort_values('label_avg', ascending=False)

In [None]:
interesting_lemmas_iqr.plot.scatter('label_avg', 'lemma_count')

In [None]:
# 4th Quartile Range Lemmas by Count
mask = (interesting_lemmas['lemma_count'] > 8)
Q4_lemmas = interesting_lemmas[mask]
Q4_lemmas.sort_values('label_avg', ascending=False)

In [None]:
Q4_lemmas.plot.scatter('lemma_count', 'label_avg')

In [None]:
top_iqr = interesting_lemmas_iqr.sort_values('label_avg', ascending=False)['lemma'].head(10)
top_iqr

In [None]:
bottom_iqr = interesting_lemmas_iqr.sort_values('label_avg', ascending=False)['lemma'].tail(10)
bottom_iqr

In [None]:
top_Q4 = Q4_lemmas.sort_values('label_avg', ascending=False)['lemma'].head(10)
top_Q4

In [None]:
bottom_Q4 = Q4_lemmas.sort_values('label_avg', ascending=False)['lemma'].tail(10)
bottom_Q4