# Import dependencies

In [2]:
import pandas as pd
import sqlite3
from sqlite3 import Error

### NLTK

In [2]:
import nltk

# Download the VADER lexicon (one-time setup; uncomment on first run)
#nltk.download("vader_lexicon")

# Import the VADER sentiment analyzer class
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# SentimentIntensityAnalyzer.polarity_scores() returns the polarity of a text as a
# dictionary with 4 keys: neg, neu, pos and compound
# neg, neu, and pos should add to 1
# compound is the overall score and is between -1 and 1
# Create a single shared instance of SentimentIntensityAnalyzer for the cells below
sent_analyzer = SentimentIntensityAnalyzer()

In [4]:
# Quick sanity check: score one sentence and show the full VADER score dictionary
sentence = "VADER is pretty good at identifying the underlying sentiment of a text!"
example_scores = sent_analyzer.polarity_scores(sentence)
print(example_scores)

{'neg': 0.0, 'neu': 0.585, 'pos': 0.415, 'compound': 0.75}


# Load

In [3]:
# Create a connection to the SQLite database
database = "market_data.db"

# sqlite3.connect() raises on failure instead of returning None, so a
# `conn is not None` check can never take its error branch. Use try/finally so
# the connection is closed even when one of the queries raises.
conn = sqlite3.connect(database)
try:
    # Read both tweet tables from the SQLite database into Pandas DataFrames
    tweets_1 = pd.read_sql_query("SELECT * FROM tweets_musk_2021_2022", conn)
    tweets_2 = pd.read_sql_query("SELECT * FROM tweets_musk_2023", conn)
finally:
    # Close the connection to the SQLite database
    conn.close()

tweets_1.head()

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1.47666e+18,1.47662e+18,2021-12-31 01:11:23 Arabian Standard Time,12/31/2021,1:11:23,400,44196397,elonmusk,Elon Musk,,...,,,,,,"[{'screen_name': 'roshanpateI', 'name': 'Rosha...",,,,
1,1.47666e+18,1.47664e+18,2021-12-31 00:47:53 Arabian Standard Time,12/31/2021,0:47:53,400,44196397,elonmusk,Elon Musk,,...,,,,,,"[{'screen_name': 'tesla_raj', 'name': 'Tesla R...",,,,
2,1.47665e+18,1.47625e+18,2021-12-31 00:28:51 Arabian Standard Time,12/31/2021,0:28:51,400,44196397,elonmusk,Elon Musk,,...,,,,,,"[{'screen_name': 'CSmithson80', 'name': 'Chris...",,,,
3,1.47662e+18,1.47625e+18,2021-12-30 22:23:14 Arabian Standard Time,12/30/2021,22:23:14,400,44196397,elonmusk,Elon Musk,,...,,,,,,"[{'screen_name': 'BLKMDL3', 'name': 'Zack', 'i...",,,,
4,1.47662e+18,1.47625e+18,2021-12-30 22:15:45 Arabian Standard Time,12/30/2021,22:15:45,400,44196397,elonmusk,Elon Musk,,...,,,,,,"[{'screen_name': 'mims', 'name': 'Christopher ...",,,,


# Clean up!

In [4]:
# Keep only the "date" and "text" columns of the 2023 table
df_2223_cut = tweets_2[["date", "text"]]
print("The number of tweets in this dataset: {}".format(df_2223_cut.shape[0]))


The number of tweets in this dataset: 4154


In [5]:
# Cast date to datetime and keep just the calendar date (drop the timestamp).
# The dtype carries an explicit unit: unit-less "datetime64" is rejected by
# astype() on pandas >= 2.0.
df_2223_cut = df_2223_cut.astype({"date": "datetime64[ns]"})

# Vectorised accessor instead of a row-wise lambda; yields datetime.date objects
df_2223_cut["date"] = df_2223_cut["date"].dt.date

In [6]:
# Build a clearly named frame for the 2023 data.
# rename() returns a new DataFrame, so df_2223_cut itself is left untouched;
# the "text" column becomes "tweet".
df_2023 = df_2223_cut.rename(columns={"text": "tweet"})

###  2021-2022

In [7]:
# Keep only the "date" and "tweet" columns of the 2021-2022 table.
# Tweets from the 2022 dataset end in March 2022, so they will not overlap with
# the 2023 dataset, which begins in July 2022.
df_2021_cut = tweets_1[["date", "tweet"]]
df_2021_cut

Unnamed: 0,date,tweet
0,12/31/2021,@roshanpateI 🤣 $7
1,12/31/2021,@tesla_raj Many UI improvements coming
2,12/31/2021,@CSmithson80 @heydave7 @BLKMDL3 @mims This cha...
3,12/30/2021,@BLKMDL3 @mims Predicting macroeconomics is ch...
4,12/30/2021,"@mims If history is any guide, not many will m..."
...,...,...
4138,1/3/2022,https://t.co/LA9hPzVlGx
4139,1/2/2022,Let’s make the roaring 20’s happen!
4140,1/2/2022,Great work by Tesla team worldwide!
4141,1/1/2022,@BLKMDL3 @Tesla 🔥


In [8]:
# Cast date to datetime and keep just the calendar date (drop the timestamp).
# The dtype carries an explicit unit: unit-less "datetime64" is rejected by
# astype() on pandas >= 2.0.
df_2021_cut = df_2021_cut.astype({"date": "datetime64[ns]"})
# Vectorised accessor instead of a row-wise lambda; yields datetime.date objects
df_2021_cut["date"] = df_2021_cut["date"].dt.date

# Transform

### Concat

In [9]:
# Stack the 2021-2022 and 2023 frames into one.
# Each frame keeps its own row labels, so index values repeat in the result.
df_EMusk = pd.concat([df_2021_cut, df_2023])
df_EMusk

Unnamed: 0,date,tweet
0,2021-12-31,@roshanpateI 🤣 $7
1,2021-12-31,@tesla_raj Many UI improvements coming
2,2021-12-31,@CSmithson80 @heydave7 @BLKMDL3 @mims This cha...
3,2021-12-30,@BLKMDL3 @mims Predicting macroeconomics is ch...
4,2021-12-30,"@mims If history is any guide, not many will m..."
...,...,...
4149,2023-03-27,@CatherinScience That we extend consciousness ...
4150,2023-03-27,@cb_doge Twitter is the news
4151,2023-03-27,@Rainmaker1973 That’s what people who have the...
4152,2023-03-27,@Rainmaker1973 Reminds me of The Marker from D...


# Identify Tweets with Crypto buzzwords

In [35]:
# Keep only the tweets that mention a crypto buzzword.
# Only Doge for now (bitcoin|Bitcoin was cut from the pattern).
mentions_doge = df_EMusk["tweet"].str.contains('doge|Doge')
df_EMusk_doge = df_EMusk.loc[mentions_doge]
# Independent copy so later column additions do not touch df_EMusk_doge
df_EMusk_just_doge = df_EMusk_doge.copy()
df_EMusk_just_doge

Unnamed: 0,date,tweet
50,2021-12-23,@BillyM2k @jack @gladstein @rahilla @farokh @samkazemian @TheSmarmyBum @ethereum That’s why I’m ...
176,2021-12-14,Tesla will make some merch buyable with Doge &amp; see how it goes
223,2021-12-10,@GailAlfarATX @SawyerMerritt @dogecoin Imbued gold/bronze color would be sick
377,2021-11-26,@GailAlfarATX @SpaceX @RGVaerialphotos @cnunezimages @kanyewest @SciGuySpace @Erdayastronaut @do...
387,2021-11-25,@BillyM2k @WSBChairman @joannabanananaa @dogeofficialceo @GailAlfarATX @Kristennetten @greg16676...
...,...,...
4100,2023-03-28,@dogeofficialceo Where is Elvis these days?
4112,2023-03-28,@cb_doge Trying my best for the humans
4139,2023-03-27,@cb_doge That was wild
4150,2023-03-27,@cb_doge Twitter is the news


# Vader

In [27]:
def apply_vader(df, analyzer=None):
    """Score every tweet with VADER and return the mean compound score per date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "tweet" column (text to score) and a "date" column
        (grouping key). NOTE: a "vader_prediction" column is added to ``df``
        in place.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text) -> dict`` with a
        ``'compound'`` key. Defaults to the notebook-level ``sent_analyzer``.

    Returns
    -------
    pandas.DataFrame
        Indexed by "date", with one "vader_prediction" column holding the mean
        compound score of that date's tweets.
    """
    if analyzer is None:
        # Fall back to the shared analyzer created in the NLTK setup cell
        analyzer = sent_analyzer

    # Predict the compound sentiment score for each tweet
    df["vader_prediction"] = df["tweet"].apply(
        lambda text: analyzer.polarity_scores(text)['compound']
    )

    # Regroup the df by the date and get the mean sentiment
    df_over_time = df.loc[:, ['date', 'vader_prediction']].groupby('date').mean()
    return df_over_time

In [28]:
# Function that gives a polarity label rather than a numerical output
def format_output(value):
    """Map a numeric sentiment score to "positive", "negative" or "neutral".

    Scores of 0.05 or more are "positive", scores of -0.05 or less are
    "negative", and anything else (including NaN) is "neutral".
    """
    if value >= 0.05:
        return "positive"
    if value <= -0.05:
        return "negative"
    return "neutral"

In [29]:
# Run the predictions on the Doge-only tweets.
# The previous argument, `df_EMusk_just_crypto`, is never defined in this
# notebook and raises NameError on a fresh kernel; the filtered frame built in
# the buzzword cell is `df_EMusk_just_doge`.
df_EMusk_Vader = apply_vader(df_EMusk_just_doge)
df_EMusk_Vader

Unnamed: 0_level_0,vader_prediction
date,Unnamed: 1_level_1
2021-02-04,-0.212300
2021-02-06,0.000000
2021-02-07,0.000000
2021-02-08,-0.164550
2021-02-10,0.000000
...,...
2023-03-31,0.421500
2023-04-01,0.771200
2023-04-02,0.216743
2023-04-03,0.210750


In [30]:
# Label each daily mean score with its polarity via format_output
daily_scores = df_EMusk_Vader["vader_prediction"]
df_EMusk_Vader["sentiment"] = daily_scores.map(format_output)
df_EMusk_Vader

Unnamed: 0_level_0,vader_prediction,sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-02-04,-0.212300,negative
2021-02-06,0.000000,neutral
2021-02-07,0.000000,neutral
2021-02-08,-0.164550,negative
2021-02-10,0.000000,neutral
...,...,...
2023-03-31,0.421500,positive
2023-04-01,0.771200,positive
2023-04-02,0.216743,positive
2023-04-03,0.210750,positive


In [33]:
# How many days fall into each polarity label?
df_EMusk_Vader.loc[:, "sentiment"].value_counts()

positive    66
neutral     62
negative    22
Name: sentiment, dtype: int64

# Save df as CSV

In [34]:
# Write the per-date sentiment table (date index included) to disk
output_csv = "../Sentiments/EMusk_just_DOGE_since2021.csv"
df_EMusk_Vader.to_csv(output_csv)