In [1]:
import pandas as pd
from langdetect import detect
import re

In [2]:
import nltk

# Download the lexicon
#nltk.download("vader_lexicon")

# Import the lexicon 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# SentimentIntensityAnalyzer.polarity_score()function provides the polarity of the text rendering the dictionary format
# of 4 keys neg, neu, pos and compound
# neg, neu, and pos should add to 1
# Compound is overall and is between -1 and 1
# Create an instance of SentimentIntensityAnalyzer
sent_analyzer = SentimentIntensityAnalyzer()

# Extract Data

In [3]:
# Read in doge coin news and market data
doge_df = pd.read_csv("../../CryptoNews/doge.csv")

# Transform the Data

## Functions to Transform Data

In [4]:
# Function to remove unwanted characters and to split the articles into lists for each date
def clean_article_list(articles):   
    a = articles.replace("\\n",'')
    return re.split(', \W', a)

### Cleaning Function
Eliminate all articles not in english.
Vader will be more accurate with separate article titles

In [6]:
def clean_data(df):
    # Cute df to just grab "begins_at" and "articles"
    cut_df = df.loc[:, ["begins_at", "articles"]]
    
    # Replace articles columns with the cleaned lists
    cut_df["articles"] = cut_df["articles"].apply(lambda x: clean_article_list(x)) 

    # Separate the 'articles' column so that each row has a single column
    exploded_df = cut_df.explode("articles")
    
    # Make a language column to detect the language of each article
    exploded_df["language"] = exploded_df["articles"].apply(lambda x: detect(x))

    # Make a new df where the articles are just in English
    clean_df = exploded_df.loc[exploded_df["language"]=='en',:]

    return clean_df

### Vader Function

In [10]:
def apply_vader(df):
    # Predict sentiment for each article
    df["vader_prediction"] = df["articles"].apply(lambda text: sent_analyzer.polarity_scores(text)['compound'])

    # Regroup the df by the date
    df_over_time = df.loc[:,['begins_at','vader_prediction']].groupby('begins_at').mean()
    return df_over_time

### Sentiment Function

In [11]:
# Function to gives a polarity rather than numerical output
def format_output(prediction):
  
  polarity = "neutral"

  if(prediction>= 0.05):
    polarity = "positive"

  elif(prediction<= -0.05):
    polarity = "negative"

  return polarity

## Apply Transform Functions to Data

In [7]:
# Clean the data via the clean function 
doge_clean = clean_data(doge_df)
doge_clean

Unnamed: 0,begins_at,articles,language
0,2021-01-02,"['Dogecoin Spikes 120%: If History Repeats, Th...",en
0,2021-01-02,Dogecoin Sees 125% Increase In Trading On Satu...,en
0,2021-01-02,Dogecoin (DOGE) surges over 100% as Bitcoin bu...,en
0,2021-01-02,"Ripple CTO Reveals His Crypto Holdings, XRP an...",en
0,2021-01-02,Here’s Why Analysts Think Ethereum Will Soon M...,en
...,...,...,...
796,2023-03-09,Why Dogecoin Is Getting Hammered By Benzinga',en
796,2023-03-09,Why Dogelon Mars (ELON) Is Up 3% Today',en
796,2023-03-09,The Future Of Crypto: Commit To Locking Your D...,en
796,2023-03-09,New Meme Crypto Big Eyes Coin Excels with its ...,en


In [12]:
# Apply vader function
doge_vader = apply_vader(doge_clean)
doge_vader

Unnamed: 0_level_0,vader_prediction
begins_at,Unnamed: 1_level_1
2021-01-02,0.183320
2021-01-03,0.007371
2021-01-04,0.011000
2021-01-05,0.183663
2021-01-06,0.200137
...,...
2023-03-05,0.082570
2023-03-06,0.104460
2023-03-07,0.056780
2023-03-08,-0.074480


In [13]:
# Apply format function to get polarity
doge_vader["sentiment"] = doge_vader["vader_prediction"].apply(lambda x: format_output(x))
doge_vader

Unnamed: 0_level_0,vader_prediction,sentiment
begins_at,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-02,0.183320,positive
2021-01-03,0.007371,neutral
2021-01-04,0.011000,neutral
2021-01-05,0.183663,positive
2021-01-06,0.200137,positive
...,...,...
2023-03-05,0.082570,positive
2023-03-06,0.104460,positive
2023-03-07,0.056780,positive
2023-03-08,-0.074480,negative


# Load Data