# Import dependencies

In [1]:
# Dataset used: https://www.kaggle.com/datasets/gpreda/elon-musk-tweets 

In [1]:
import pandas as pd

### NTLK

In [2]:
import nltk

# Download the lexicon
#nltk.download("vader_lexicon")

# Import the lexicon 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# SentimentIntensityAnalyzer.polarity_score()function provides the polarity of the text rendering the dictionary format
# of 4 keys neg, neu, pos and compound
# neg, neu, and pos should add to 1
# Compound is overall and is between -1 and 1
# Create an instance of SentimentIntensityAnalyzer
sent_analyzer = SentimentIntensityAnalyzer()

In [4]:
# Example! 
sentence = "VADER is pretty good at identifying the underlying sentiment of a text!"
print(sent_analyzer.polarity_scores(sentence))

{'neg': 0.0, 'neu': 0.585, 'pos': 0.415, 'compound': 0.75}


# Load

### Import data from July 2022-March 2023

In [4]:
# Read in data for July 2022 - March 2023 
pd.set_option('display.max_colwidth', 100)
df_2223 = pd.read_csv("EMusk_Resources/elon_musk_tweets.csv")
df_2223.head(2)

Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,retweets,favorites,is_retweet
0,1544379368478212100,Elon Musk,,"Mars & Cars, Chips & Dips",2009-06-02 20:12:29+00:00,101240855,115,13503,True,2022-07-05 17:55:09+00:00,@BillyM2k I find the gold toe sock – inevitably off kilter &amp; washed out – a little troubling...,,Twitter for iPhone,335,6542,False
1,1544377493263720450,Elon Musk,,"Mars & Cars, Chips & Dips",2009-06-02 20:12:29+00:00,101240806,115,13503,True,2022-07-05 17:47:42+00:00,"Sock Con, the conference for socks",,Twitter for iPhone,1451,30753,False


In [5]:
# cut df to just be the "date" and "text" columns
df_2223_cut = df_2223.loc[:, ["date", "text"]]
print("The number of tweets in this dataset: {}".format(len(df_2223_cut)))


The number of tweets in this dataset: 4154


In [6]:
df_2223_cut = df_2223_cut.astype({"date": "datetime64"})

df_2223_cut['date'] = df_2223_cut['date'].apply( lambda x: x.date() )

In [7]:
# Make a copy of the df and rename it to be more clear
# Rename text to tweet
df_2023 = df_2223_cut.copy()
df_2023.rename(columns={"text": "tweet"}, inplace=True)

### Import data from 2021

In [8]:
df_2021 = pd.read_csv("EMusk_Resources/2021_musk.csv")
df_2021_cut = df_2021.loc[:, ["date", "tweet"]]
df_2021_cut

Unnamed: 0,date,tweet
0,2021-12-31,@roshanpateI 🤣 $7
1,2021-12-31,@tesla_raj Many UI improvements coming
2,2021-12-31,@CSmithson80 @heydave7 @BLKMDL3 @mims This chart is a big deal
3,2021-12-30,"@BLKMDL3 @mims Predicting macroeconomics is challenging, to say the least. My gut feel is maybe ..."
4,2021-12-30,"@mims If history is any guide, not many will make it past the next recession"
...,...,...
3110,2021-01-02,@flcnhvy Tesla is responsible for 2/3 of all the personal &amp; professional pain in my life com...
3111,2021-01-02,"So proud of the Tesla team for achieving this major milestone! At the start of Tesla, I thought ..."
3112,2021-01-02,"@newscientist Um, we have giant fusion reactor in the sky that works with no maintenance"
3113,2021-01-02,@comma_ai Tesla Full Self-Driving will work at a safety level well above that of the average dri...


In [15]:
df_2021_cut = df_2021_cut.astype({"date": "datetime64"})
df_2021_cut['date'] = df_2021_cut['date'].apply( lambda x: x.date() )

### import data from 2022 and cut to before 2022/07/05 when df_justtweets starts

In [10]:
df_2022 = pd.read_csv("EMusk_Resources/2022_musk.csv")

In [18]:
df_2022_cut = df_2022.loc[:, ["date", "tweet"]]
df_2022_cut

Unnamed: 0,date,tweet
0,2022-03-05,"@SpacePadreIsle In a way, this is free QA haha"
1,2022-03-05,@teslaownersSV A beautiful home for life
2,2022-03-05,@SpacePadreIsle Some Starlink terminals near conflict areas were being jammed for several hours ...
3,2022-03-05,@RationalEtienne @RogerYar Exactly
4,2022-03-05,"@RogerYar All news sources are partially propaganda, some more than others"
...,...,...
1023,2022-01-03,https://t.co/LA9hPzVlGx
1024,2022-01-02,Let’s make the roaring 20’s happen!
1025,2022-01-02,Great work by Tesla team worldwide!
1026,2022-01-01,@BLKMDL3 @Tesla 🔥


In [20]:
# Change date to datetime and cut dataframe to before July 5
mask = (df_2022_cut['date'] < '2022-7-5')
df_2022_b4_july5 = df_2022_cut.loc[mask]

In [21]:
df_2022_b4_july5 = df_2022_b4_july5.astype({"date": "datetime64"})
df_2022_b4_july5['date'] = df_2022_b4_july5['date'].apply( lambda x: x.date() )

In [22]:
df_2022_b4_july5

Unnamed: 0,date,tweet
0,2022-03-05,"@SpacePadreIsle In a way, this is free QA haha"
1,2022-03-05,@teslaownersSV A beautiful home for life
2,2022-03-05,@SpacePadreIsle Some Starlink terminals near conflict areas were being jammed for several hours ...
3,2022-03-05,@RationalEtienne @RogerYar Exactly
4,2022-03-05,"@RogerYar All news sources are partially propaganda, some more than others"
...,...,...
1023,2022-01-03,https://t.co/LA9hPzVlGx
1024,2022-01-02,Let’s make the roaring 20’s happen!
1025,2022-01-02,Great work by Tesla team worldwide!
1026,2022-01-01,@BLKMDL3 @Tesla 🔥


# Transform

### Concat

In [23]:
# List the dfs to concat
dfs = [df_2021_cut, df_2022_b4_july5, df_2023]
df_EMusk = pd.concat(dfs)
df_EMusk

Unnamed: 0,date,tweet
0,2021-12-31,@roshanpateI 🤣 $7
1,2021-12-31,@tesla_raj Many UI improvements coming
2,2021-12-31,@CSmithson80 @heydave7 @BLKMDL3 @mims This chart is a big deal
3,2021-12-30,"@BLKMDL3 @mims Predicting macroeconomics is challenging, to say the least. My gut feel is maybe ..."
4,2021-12-30,"@mims If history is any guide, not many will make it past the next recession"
...,...,...
4149,2023-03-27,@CatherinScience That we extend consciousness to the stars and understand the Universe
4150,2023-03-27,@cb_doge Twitter is the news
4151,2023-03-27,@Rainmaker1973 That’s what people who have the woke mind virus look like to me
4152,2023-03-27,@Rainmaker1973 Reminds me of The Marker from Dead Space


In [24]:
df_EMusk = df_EMusk.astype({"date": "datetime64"})
df_EMusk['date'] = df_EMusk['date'].apply( lambda x: x.date() )
df_EMusk

Unnamed: 0,date,tweet
0,2021-12-31,@roshanpateI 🤣 $7
1,2021-12-31,@tesla_raj Many UI improvements coming
2,2021-12-31,@CSmithson80 @heydave7 @BLKMDL3 @mims This chart is a big deal
3,2021-12-30,"@BLKMDL3 @mims Predicting macroeconomics is challenging, to say the least. My gut feel is maybe ..."
4,2021-12-30,"@mims If history is any guide, not many will make it past the next recession"
...,...,...
4149,2023-03-27,@CatherinScience That we extend consciousness to the stars and understand the Universe
4150,2023-03-27,@cb_doge Twitter is the news
4151,2023-03-27,@Rainmaker1973 That’s what people who have the woke mind virus look like to me
4152,2023-03-27,@Rainmaker1973 Reminds me of The Marker from Dead Space


# Identify Tweets with Crypto buzzwords

In [25]:
# Take just the tweets about cryt
df_EMusk_crypto = df_EMusk[df_EMusk["tweet"].str.contains('doge|Doge')]   # Cut |bitcoin|Bitcoin' 
df_EMusk_just_crypto = df_EMusk_crypto.copy()

In [26]:
df_EMusk_just_crypto

Unnamed: 0,date,tweet
50,2021-12-23,@BillyM2k @jack @gladstein @rahilla @farokh @samkazemian @TheSmarmyBum @ethereum That’s why I’m ...
176,2021-12-14,Tesla will make some merch buyable with Doge &amp; see how it goes
223,2021-12-10,@GailAlfarATX @SawyerMerritt @dogecoin Imbued gold/bronze color would be sick
377,2021-11-26,@GailAlfarATX @SpaceX @RGVaerialphotos @cnunezimages @kanyewest @SciGuySpace @Erdayastronaut @do...
387,2021-11-25,@BillyM2k @WSBChairman @joannabanananaa @dogeofficialceo @GailAlfarATX @Kristennetten @greg16676...
...,...,...
4100,2023-03-28,@dogeofficialceo Where is Elvis these days?
4112,2023-03-28,@cb_doge Trying my best for the humans
4139,2023-03-27,@cb_doge That was wild
4150,2023-03-27,@cb_doge Twitter is the news


# Vader

In [27]:
def apply_vader(df):
    # Predict sentiment for each article
    df["vader_prediction"] = df["tweet"].apply(lambda text: sent_analyzer.polarity_scores(text)['compound'])

    # Regroup the df by the date and get the mean sentiment
    df_over_time = df.loc[:,['date','vader_prediction']].groupby('date').mean()
    return df_over_time

In [28]:
# Function to gives a polarity rather than numerical output
def format_output(value):
  
  polarity = "neutral"

  if(value>= 0.05):
    polarity = "positive"

  elif(value<= -0.05):
    polarity = "negative"

  return polarity

In [29]:
# Run the predictions
df_EMusk_Vader = apply_vader(df_EMusk_just_crypto)
df_EMusk_Vader

Unnamed: 0_level_0,vader_prediction
date,Unnamed: 1_level_1
2021-02-04,-0.212300
2021-02-06,0.000000
2021-02-07,0.000000
2021-02-08,-0.164550
2021-02-10,0.000000
...,...
2023-03-31,0.421500
2023-04-01,0.771200
2023-04-02,0.216743
2023-04-03,0.210750


In [30]:
df_EMusk_Vader["sentiment"] = df_EMusk_Vader["vader_prediction"].apply(lambda x: format_output(x))
df_EMusk_Vader

Unnamed: 0_level_0,vader_prediction,sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-02-04,-0.212300,negative
2021-02-06,0.000000,neutral
2021-02-07,0.000000,neutral
2021-02-08,-0.164550,negative
2021-02-10,0.000000,neutral
...,...,...
2023-03-31,0.421500,positive
2023-04-01,0.771200,positive
2023-04-02,0.216743,positive
2023-04-03,0.210750,positive


In [33]:
df_EMusk_Vader["sentiment"].value_counts()

positive    66
neutral     62
negative    22
Name: sentiment, dtype: int64

In [22]:
df_EMusk_Vader.to_csv("../Sentiments/EMusk_just_DOGE_since2021.csv")

In [None]:
df