# Unit 12 NLP Assigment
# Section One: 
## Using the newsapi for BTC and ETH to create Sentiment Analysis (SA) for each coin

# Initial imports

In [131]:

import os
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()
import nltk as nltk
from newsapi import NewsApiClient
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

%matplotlib inline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [132]:
# Read your api key environment variable
api_key = os.getenv("news_api")

In [133]:
# Create a newsapi client
newsapi = NewsApiClient(api_key=api_key)

In [134]:
# Fetch the top Bitcoin news articles
bitcoin_articles = newsapi.get_everything(q="bitcoin", language="en")
bitcoin_articles

#

{'status': 'ok',
 'totalResults': 7177,
 'articles': [{'source': {'id': 'wired', 'name': 'Wired'},
   'author': 'Paul Ford',
   'title': 'A Field Guide for Nature-Resistant Nerds',
   'description': 'Yes, yes, the dirt is horrifying. But it’s also how we make bitcoin apps.',
   'url': 'https://www.wired.com/story/a-field-guide-for-nature-resistant-nerds-microchips-climate-change/',
   'urlToImage': 'https://media.wired.com/photos/61086c497b8c62be3062fe82/191:100/w_1280,c_limit/WI090121_MG_Ford_01.jpg',
   'publishedAt': '2021-08-06T11:00:00Z',
   'content': 'When my wife started a little garden in our urban backyard, all I could think about were the worms. Also the bugs, and the dirt, which is of course filled with worms and bugs and composted corn cobs.… [+3499 chars]'},
  {'source': {'id': 'wired', 'name': 'Wired'},
   'author': 'Khari Johnson',
   'title': 'Why Not Use Self-Driving Cars as Supercomputers?',
   'description': 'Autonomous vehicles use the equivalent of 200 laptops to 

In [135]:
# Total bitcoin article results
print(f"Bitcoin article total: {bitcoin_articles['totalResults']}")

#Show sample bitcoin article - 3rd result
bitcoin_articles['articles'][2]

Bitcoin article total: 7177


{'source': {'id': 'techcrunch', 'name': 'TechCrunch'},
 'author': 'Richard Dal Porto',
 'title': 'Daily Crunch: Bitcoin ‘is a big part of our future,’ says Twitter CEO Jack Dorsey',
 'description': 'Hello friends and welcome to Daily Crunch, bringing you the most important startup, tech and venture capital news in a single package.',
 'url': 'http://techcrunch.com/2021/07/23/daily-crunch-bitcoin-is-a-big-part-of-our-future-says-twitter-ceo-jack-dorsey/',
 'urlToImage': 'https://techcrunch.com/wp-content/uploads/2020/10/GettyImages-1216921783.jpg?w=576',
 'publishedAt': '2021-07-23T22:10:45Z',
 'content': 'To get a roundup of TechCrunchs biggest and most important stories delivered to your inbox every day at 3 p.m. PDT, subscribe here.\r\nHello and welcome to Daily Crunch for July 23, 2021. Its been an i… [+5456 chars]'}

In [136]:
# Fetch the top Ethereum news articles
ethereum_articles = newsapi.get_everything(q="ethereum", language="en")
ethereum_articles

{'status': 'ok',
 'totalResults': 2423,
 'articles': [{'source': {'id': 'techcrunch', 'name': 'TechCrunch'},
   'author': 'Connie Loizos',
   'title': 'Crypto investors like Terraform Labs so much, they’re committing $150 million to its ‘ecosystem’',
   'description': 'There are many blockchain platforms competing for investors’ and developers’ attention right now, from the big daddy of them all, Ethereum, to so-called “Ethereum Killers” like Solana, which we wrote about in May. Often, these technologies are seen as so prom…',
   'url': 'http://techcrunch.com/2021/07/16/crypto-investors-like-terraform-labs-so-much-theyre-committing-150-million-to-its-ecosystem/',
   'urlToImage': 'https://techcrunch.com/wp-content/uploads/2020/06/GettyImages-1174590894.jpg?w=667',
   'publishedAt': '2021-07-16T16:00:55Z',
   'content': 'There are many blockchain platforms competing for investors’ and developers’ attention right now, from the big daddy of them all, Ethereum, to so-called “Ethereum Kille

In [137]:
# Total ethereum article results
print(f"Ethereum article total: {ethereum_articles['totalResults']}")

#Show sample ethereum article - 5th result
ethereum_articles['articles'][4]

Ethereum article total: 2423


{'source': {'id': 'reuters', 'name': 'Reuters'},
 'author': 'Reuters',
 'title': 'Ethereum major upgrade activated; ether stays lower - Reuters',
 'description': 'A major software upgrade on Ethereum, the second-largest blockchain network, was activated on Thursday, a move seen as stabilizing transaction fees on the network and subsequently reducing the supply of the ether token.',
 'url': 'https://www.reuters.com/technology/ethereum-major-upgrade-activated-ether-stays-lower-2021-08-05/',
 'urlToImage': 'https://www.reuters.com/resizer/0BxXzmVhK1PkIw75bBwMHxbHzvY=/1200x628/smart/filters:quality(80)/cloudfront-us-east-2.images.arcpublishing.com/reuters/NENHSOKGOJMQVFJQB2ZPYW4D4E.jpg',
 'publishedAt': '2021-08-05T12:48:00Z',
 'content': 'Representation of the Ethereum virtual currency standing on the PC motherboard is seen in this illustration picture, February 3, 2018. REUTERS/Dado Ruvic/IllustrationNEW YORK, Aug 5 (Reuters) - A maj… [+439 chars]'}

In [138]:
# Create the Bitcoin sentiment scores DataFrame
bitcoin_sentiment_df = []

for article in bitcoin_articles["articles"]:
    try:
        text = article["content"]
        date = article["publishedAt"][:10]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]

        bitcoin_sentiment_df.append({
            "text": text,
            "date": date,
            "compound": compound, 
            "positive": pos, 
            "negative": neg,
            "neutral": neu

        })
    except AttributeError:
        pass

# Create Dataframe
bitcoin_df = pd.DataFrame(bitcoin_sentiment_df)

# Arrange columns so that the date is first
bitcoin_df = bitcoin_df.set_index("date")
bitcoin_df.head()


Unnamed: 0_level_0,text,compound,positive,negative,neutral
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-08-06,When my wife started a little garden in our ur...,-0.34,0.0,0.061,0.939
2021-07-19,"Like Dogecoin devotees, the mayor of Reno, and...",0.6908,0.178,0.0,0.822
2021-07-23,To get a roundup of TechCrunchs biggest and mo...,0.624,0.127,0.0,0.873
2021-07-14,While retail investors grew more comfortable b...,0.7264,0.164,0.0,0.836
2021-07-21,"As longtime TechCrunch readers know well, Mich...",0.4939,0.127,0.0,0.873


In [139]:
# Create the Ethereum sentiment scores DataFrame
ethereum_sentiment_df = []

for article in ethereum_articles["articles"]:
    try:
        text = article["content"]
        date = article["publishedAt"][:10]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]

        ethereum_sentiment_df.append({
            "text": text,
            "date": date,
            "compound": compound, 
            "positive": pos, 
            "negative": neg,
            "neutral": neu

        })
    except AttributeError:
        pass

# Create Dataframe
ethereum_df = pd.DataFrame(ethereum_sentiment_df)

# Arrange columns so that the date is first
ethereum_df = ethereum_df.set_index("date")
ethereum_df.head()


Unnamed: 0_level_0,text,compound,positive,negative,neutral
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-07-16,There are many blockchain platforms competing ...,0.3612,0.075,0.0,0.925
2021-07-29,Blockchain infrastructure startups are heating...,-0.2411,0.0,0.061,0.939
2021-07-14,While retail investors grew more comfortable b...,0.7264,0.164,0.0,0.836
2021-08-05,Cent was founded in 2017 as an ad-free creator...,0.6956,0.19,0.0,0.81
2021-08-05,Representation of the Ethereum virtual currenc...,0.0,0.0,0.0,1.0


In [140]:
# Describe the Bitcoin Sentiment
bitcoin_df.describe()

Unnamed: 0,compound,positive,negative,neutral
count,20.0,20.0,20.0,20.0
mean,0.059435,0.0609,0.04645,0.89265
std,0.471962,0.060969,0.083549,0.093352
min,-0.8271,0.0,0.0,0.653
25%,-0.307,0.0,0.0,0.85325
50%,0.0,0.056,0.0,0.9105
75%,0.467575,0.1,0.07625,0.95425
max,0.7264,0.178,0.287,1.0


In [141]:
# Describe the Ethereum Sentiment
ethereum_df.describe()

Unnamed: 0,compound,positive,negative,neutral
count,20.0,20.0,20.0,20.0
mean,0.174515,0.05605,0.01345,0.9305
std,0.312339,0.073898,0.02766,0.08117
min,-0.34,0.0,0.0,0.782
25%,0.0,0.0,0.0,0.84275
50%,0.0,0.0,0.0,0.9695
75%,0.402575,0.1295,0.0,1.0
max,0.7264,0.19,0.072,1.0


# Section Two: Natural Language Processing

## Tokenize the text for BTC using NLTK and Python

In [142]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [143]:
# Code to download corpora
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [144]:
# Instantiate the lemmatizer
lemmatizer = WordNetLemmatizer()

# Create a list of stopwords
print(stopwords.words('english'))


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [145]:
# We can define our own list of stopwords to add to the default nltk words
sw_addon = {'http', 'https', 'www','.com'}



In [146]:
#Show sample bitcoin article - 3rd result
bitcoin_articles['articles'][2]

{'source': {'id': 'techcrunch', 'name': 'TechCrunch'},
 'author': 'Richard Dal Porto',
 'title': 'Daily Crunch: Bitcoin ‘is a big part of our future,’ says Twitter CEO Jack Dorsey',
 'description': 'Hello friends and welcome to Daily Crunch, bringing you the most important startup, tech and venture capital news in a single package.',
 'url': 'http://techcrunch.com/2021/07/23/daily-crunch-bitcoin-is-a-big-part-of-our-future-says-twitter-ceo-jack-dorsey/',
 'urlToImage': 'https://techcrunch.com/wp-content/uploads/2020/10/GettyImages-1216921783.jpg?w=576',
 'publishedAt': '2021-07-23T22:10:45Z',
 'content': 'To get a roundup of TechCrunchs biggest and most important stories delivered to your inbox every day at 3 p.m. PDT, subscribe here.\r\nHello and welcome to Daily Crunch for July 23, 2021. Its been an i… [+5456 chars]'}

In [147]:
# Complete the tokenizer function
def tokenizer(text):
    """Tokenizes text."""

# Convert the words to lowercase
    sw = set(stopwords.words('english'))
    regex = re.compile("[^a-zA-Z ]")

# Remove the punctuation from text
    re_clean = regex.sub('', text)

# Create a tokenized list of the words
    words = word_tokenize(re_clean)

# Lemmatize words into root words
    lem = [lemmatizer.lemmatize(word) for word in words]
    tokens = [word.lower() for word in lem if word.lower() not in sw]
    return tokens
    

In [148]:
print(set(tokenizer(text))) 

{'bitcoin', 'tslao', 'february', 'virtual', 'front', 'char', 'reutersdado', 'ruvicjuly', 'representations', 'inc', 'tesla', 'seen', 'currency', 'logo', 'w', 'illustration', 'electriccar', 'taken', 'maker', 'reuters'}


In [171]:
# Create a new tokens column for Bitcoin
bitcoin_df["tokens"] = bitcoin_df.text.apply(tokenizer)
bitcoin_df.head()


Unnamed: 0_level_0,text,compound,positive,negative,neutral,tokens
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-08-06,When my wife started a little garden in our ur...,-0.34,0.0,0.061,0.939,"[wife, started, little, garden, urban, backyar..."
2021-07-19,"Like Dogecoin devotees, the mayor of Reno, and...",0.6908,0.178,0.0,0.822,"[like, dogecoin, devotee, mayor, reno, leader,..."
2021-07-23,To get a roundup of TechCrunchs biggest and mo...,0.624,0.127,0.0,0.873,"[get, roundup, techcrunchs, biggest, important..."
2021-07-14,While retail investors grew more comfortable b...,0.7264,0.164,0.0,0.836,"[retail, investor, grew, comfortable, buying, ..."
2021-07-21,"As longtime TechCrunch readers know well, Mich...",0.4939,0.127,0.0,0.873,"[longtime, techcrunch, reader, know, well, mic..."


In [187]:
# Create a new tokens column for Ethereum
ethereum_df["tokens"] = ethereum_df.text.apply(tokenizer)
ethereum_df.head()

Unnamed: 0_level_0,text,compound,positive,negative,neutral,tokens
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-07-16,There are many blockchain platforms competing ...,0.3612,0.075,0.0,0.925,"[many, blockchain, platform, competing, invest..."
2021-07-29,Blockchain infrastructure startups are heating...,-0.2411,0.0,0.061,0.939,"[blockchain, infrastructure, startup, heating,..."
2021-07-14,While retail investors grew more comfortable b...,0.7264,0.164,0.0,0.836,"[retail, investor, grew, comfortable, buying, ..."
2021-08-05,Cent was founded in 2017 as an ad-free creator...,0.6956,0.19,0.0,0.81,"[cent, wa, founded, adfree, creator, network, ..."
2021-08-05,Representation of the Ethereum virtual currenc...,0.0,0.0,0.0,1.0,"[representation, ethereum, virtual, currency, ..."


## Using ngrams and word frequency for each coin. 
Use NLTK to produce the n-grams for N = 2. 
List the top 10 words for each coin.

In [None]:
 from collections import Counter
from nltk import ngrams

In [None]:
# Generate the Bitcoin N-grams where N=2


In [None]:
# Generate the Ethereum N-grams where N=2


In [None]:
# Function token_count generates the top 10 words for a given coin
def token_count(tokens, N=3):
    """Returns the top N tokens from the frequency count"""
    return Counter(tokens).most_common(N)

In [None]:
# Use token_count to get the top 10 words for Bitcoin


In [None]:
# Use token_count to get the top 10 words for Ethereum


## Using Word Clouds to summarize the news for each coin

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = [20.0, 10.0]

In [None]:
# Generate the Bitcoin word cloud


In [None]:
# Generate the Ethereum word cloud


# Section Three: Named Entity Recognition
## building a named entity recognition model for BTC & ETH
## visualizing the tags using SpaCy

In [None]:
import spacy
from spacy import displacy

In [None]:
# Download the language model for SpaCy
# !python -m spacy download en_core_web_sm

In [None]:
# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

## BTC NER

In [None]:
# Concatenate all of the Bitcoin text together


In [None]:
# Run the NER processor on all of the text


In [None]:
# Add a title to the document


In [None]:
# Render the visualization


In [None]:
# List all Entities


## ETH NER

In [None]:
# Concatenate all of the Ethereum text together


In [None]:
# Run the NER processor on all of the text


In [None]:
# Render the visualization


In [None]:
# List all Entities
