# Merging data frames, cleaning tweets data and compiling final data frame for analyses [CARLOS, TAMARI: NLP, MINYEONG: SpaceX and Tesla flags] 

In [1]:
# Initial imports
# import os
import pandas as pd
from pathlib import Path
import numpy as np
from datetime import date, timedelta
from matplotlib import pyplot

In [2]:
# Display all the text in columns
pd.set_option('display.max_colwidth',None)

## Elon Musks Tweet Data
#### Start cleaning data frame for NLP and Tesla and SpaceX Flags

In [3]:
# Import csv file with tweets for elon musk
file_name='elon_tweets.csv'
file_path=Path(f"../Resources/{file_name}")
tweets_df = pd.read_csv(file_path,parse_dates=True, infer_datetime_format=True) # we do not do date as index just yet


# we change the name to date column - we will drop this field later. We need a date field that shows off market hour tweets as t+1 
tweets_df.rename(columns={'date':'date original'},inplace=True)
tweets_df['date original']=pd.to_datetime(tweets_df['date original'])

# Make tweets made after market hours fall into the following day
# Define market hour limit as everything after 16hs 00 min 00 sec
min_hour=16
min_minute=0
min_second=0

# we create the new field equalt to date original 
tweets_df['date']=tweets_df['date original'].copy()

# we add 1 day to date original if the tweet occured off market hours
tweets_df.loc[(tweets_df['date original'].dt.hour>=min_hour) & (tweets_df['date original'].dt.minute>min_minute) & (tweets_df['date original'].dt.second>min_second), 'date'] = tweets_df['date original']+timedelta(days=1)

# Drop original date and make the new date column as index
tweets_df.drop(columns={'date original'}, inplace=True)
tweets_df.set_index('date', inplace=True)

# We eliminate the seconds from Tweets data frame
tweets_df.index = tweets_df.index.date

# Create new data tweet with aggregated info
new_tweets_df=pd.DataFrame()
new_tweets_df['tweet'] = tweets_df['tweet'].groupby(tweets_df.index).agg(' '.join).sort_index()
new_tweets_df['tweet count']=tweets_df['tweet'].groupby(tweets_df.index).count().sort_index()
new_tweets_df['number likes']=tweets_df['nlikes'].groupby(tweets_df.index).sum().sort_index()
new_tweets_df['number replies']=tweets_df['nreplies'].groupby(tweets_df.index).sum().sort_index()
new_tweets_df['number retweets']=tweets_df['nretweets'].groupby(tweets_df.index).sum().sort_index()
tweets_df.head(12)

Unnamed: 0,tweet,nlikes,nreplies,nretweets
2021-01-18,@kellyreid The rate-limiting part or process in cell production is constantly changing,3120,256,119
2021-01-18,"@FrancisSuarez @CityofMiami Cars &amp; trucks stuck in traffic generate megatons of toxic gases &amp; particulate, but @boringcompany road tunnels under Miami would solve traffic &amp; be an example to the world. Spoke with @RonDeSantisFL about tunnels last week. If Governor &amp; Mayor want this done, we will do it.",8861,1566,938
2021-01-18,"@RationalEtienne @OwenSparks_ @Tesla With our giant casting machines, we are literally trying to make full-size cars in the same way that toy cars are made",6115,336,463
2021-01-18,@lexfridman @tegmark Tegmark is an exceptionally smart &amp; good human,8142,223,232
2021-01-18,"@OwenSparks_ The best manufacturing technology is in ultra high volume industries, like food &amp; beverage, some medical (eg syringes) &amp; toys",5020,210,218
2021-01-18,Battery cell production is the fundamental rate-limiter slowing down a sustainable energy future. Very important problem.,131699,5698,10424
2021-01-18,@p_ferragu Looking into this. No question that FSD should be viewed as reasonably valuable when doing a trade-in.,6171,625,274
2021-01-18,@tobyliiiiiiiiii @Erdayastronaut For sure,3852,106,79
2021-01-18,@Erdayastronaut Glad you’re ok,8142,114,150
2021-01-18,@Virgin_Orbit Congratulations!,4085,84,165


In [4]:
new_tweets_df.tail()

Unnamed: 0,tweet,tweet count,number likes,number replies,number retweets
2021-01-14,"@skorusARK Prototypes are easy, volume production is hard, positive cash flow is excruciating @Tesla Physics @Erdayastronaut Detanking &amp; inspections now. Good progress towards our “Hop in &amp; go to Mars!” goal. All three static fires completed &amp; no RUDs! @justpaulinelol @Erdayastronaut @SpaceX Wow, a lot has happened in 10 years! @OfficialJlipper Fair enough haha https://t.co/ho7yGXAS3a",7,559632,14544,38701
2021-01-15,"@SuperclusterHQ @w00ki33 Fallout New Texas @Breedlove22 @benmezrich Only Chuck Norris can divide by zero @Cerberu21014829 @Breedlove22 @benmezrich Good point @Breedlove22 @benmezrich The thing we call money is just an information system for labor allocation. What actually matters is making goods &amp; providing services. We should look at currencies from an information theory standpoint. Whichever has least error &amp; latency will win. Monty Python is amazing https://t.co/UJq94IWT88 @RationalEtienne @tobyliiiiiiiiii @Erdayastronaut They sure can twist the knife in that show! @tobyliiiiiiiiii @Erdayastronaut Probably wise @Erdayastronaut We’re making major improvements to ease of engine swap. Needs to be a few hours at most. @PPathole @johnkrausphotos @SpaceX Two of the engines need slight repairs, so will be switched out @johnkrausphotos @SpaceX Nice shot @realOmarAbdalah We don’t have high school internships, but please apply when you’re in college!",11,169339,9381,12439
2021-01-16,@DMC_Ryan @C_R_H_M @Tesla Cybertruck doesn’t need a garage @DMC_Ryan @Tesla It will be awesome @owenshift Good point @signalapp Your server-side code is doing too much @Wikipedia Happy birthday Wikipedia! So glad you exist. @TheOnion Guess you been watching Cobra Kai,6,127906,3605,4135
2021-01-17,@MrBeastYT I whistle,1,125897,1542,2171
2021-01-18,"@kellyreid The rate-limiting part or process in cell production is constantly changing @FrancisSuarez @CityofMiami Cars &amp; trucks stuck in traffic generate megatons of toxic gases &amp; particulate, but @boringcompany road tunnels under Miami would solve traffic &amp; be an example to the world. Spoke with @RonDeSantisFL about tunnels last week. If Governor &amp; Mayor want this done, we will do it. @RationalEtienne @OwenSparks_ @Tesla With our giant casting machines, we are literally trying to make full-size cars in the same way that toy cars are made @lexfridman @tegmark Tegmark is an exceptionally smart &amp; good human @OwenSparks_ The best manufacturing technology is in ultra high volume industries, like food &amp; beverage, some medical (eg syringes) &amp; toys Battery cell production is the fundamental rate-limiter slowing down a sustainable energy future. Very important problem. @p_ferragu Looking into this. No question that FSD should be viewed as reasonably valuable when doing a trade-in. @tobyliiiiiiiiii @Erdayastronaut For sure @Erdayastronaut Glad you’re ok @Virgin_Orbit Congratulations! @teslaownersSV This is a good one",11,199707,9709,13329


In [5]:
# Import csv file with tweets for elon musk
file_name='elon_tweets.csv'
file_path=Path(f"../Resources/{file_name}")
tweets_df = pd.read_csv(file_path,parse_dates=True, infer_datetime_format=True) # we do not do date as index just yet


# we change the name to date column - we will drop this field later. We need a date field that shows off market hour tweets as t+1 
tweets_df.rename(columns={'date':'date original'},inplace=True)
tweets_df['date original']=pd.to_datetime(tweets_df['date original'])

# Make tweets made after market hours fall into the following day
# Define market hour limit as everything after 16hs 00 min 00 sec
min_hour=16
min_minute=0
min_second=0

# we create the new field equalt to date original 
tweets_df['date']=tweets_df['date original'].copy()

# we add 1 day to date original if the tweet occured off market hours
tweets_df.loc[(tweets_df['date original'].dt.hour>=min_hour) & (tweets_df['date original'].dt.minute>min_minute) & (tweets_df['date original'].dt.second>min_second), 'date'] = tweets_df['date original']+timedelta(days=1)

# Drop original date and make the new date column as index
tweets_df.drop(columns={'date original'}, inplace=True)
tweets_df.set_index('date', inplace=True)

# We eliminate the seconds from Tweets data frame
tweets_df.index = tweets_df.index.date

# Cleaning tweets
# Multiple tweets per day in 1
#aggregate tweees, sum number of tweets per day, sumnumber of likes per tweet, etc
new_tweets_df = tweets_df.groupby(tweets_df.index).agg(' '.join).sort_index()
new_tweets_df['tweet count']=tweets_df['tweet'].groupby(tweets_df.index).count().sort_index()
new_tweets_df['tweet'] = tweets_df['tweet'].groupby(tweets_df.index).agg(' '.join).sort_index()
new_tweets_df['tweet count']=tweets_df['tweet'].groupby(tweets_df.index).count().sort_index()
new_tweets_df['number likes']=tweets_df['nlikes'].groupby(tweets_df.index).sum().sort_index()
new_tweets_df['number replies']=tweets_df['nreplies'].groupby(tweets_df.index).sum().sort_index()
new_tweets_df['number retweets']=tweets_df['nretweets'].groupby(tweets_df.index).sum().sort_index()



## Tesla and QQQ Stock Price Data
### Cleaning and Y generation

In [6]:
file_name='stock_price.csv'
file_path=Path(f"../Resources/{file_name}")
stock_price_df = pd.read_csv(file_path,index_col='date',parse_dates=True, infer_datetime_format=True)

# We create the Ys once the data frame is final

# We do the same format change to data  to make sure that the fields are comparable
stock_price_df.index = stock_price_df.index.date
stock_price_df.head(10)

Unnamed: 0,TSLA,QQQ,earnings flag
2010-06-29,4.778,39.031284,0
2010-06-30,4.766,38.437302,0
2010-07-01,4.392,38.329292,0
2010-07-02,3.84,38.221321,0
2010-07-06,3.222,38.338306,0
2010-07-07,3.16,39.562256,0
2010-07-08,3.492,39.77824,0
2010-07-09,3.48,40.156231,0
2010-07-12,3.41,40.27322,0
2010-07-13,3.628,40.795197,0


## Merging Dataframes
 * Stock Data + Raw Tweets 

In [7]:
# Join data frames. Outer is used to not leave any data point behind. 
 
merged_df=stock_price_df.join(new_tweets_df,how='outer')
merged_df.index = pd.to_datetime(merged_df.index)

# Keep data starting in Tesla's IPO
merged_df=merged_df.loc['2010-06-29':]

merged_df.tail()
# merged_df.loc['2011-12-01':'2011-12-06'].head(6)

Unnamed: 0,TSLA,QQQ,earnings flag,tweet,tweet count,number likes,number replies,number retweets
2021-01-16,,,,@DMC_Ryan @C_R_H_M @Tesla Cybertruck doesn’t need a garage @DMC_Ryan @Tesla It will be awesome @owenshift Good point @signalapp Your server-side code is doing too much @Wikipedia Happy birthday Wikipedia! So glad you exist. @TheOnion Guess you been watching Cobra Kai,6.0,127906.0,3605.0,4135.0
2021-01-17,,,,@MrBeastYT I whistle,1.0,125897.0,1542.0,2171.0
2021-01-18,,,,"@kellyreid The rate-limiting part or process in cell production is constantly changing @FrancisSuarez @CityofMiami Cars &amp; trucks stuck in traffic generate megatons of toxic gases &amp; particulate, but @boringcompany road tunnels under Miami would solve traffic &amp; be an example to the world. Spoke with @RonDeSantisFL about tunnels last week. If Governor &amp; Mayor want this done, we will do it. @RationalEtienne @OwenSparks_ @Tesla With our giant casting machines, we are literally trying to make full-size cars in the same way that toy cars are made @lexfridman @tegmark Tegmark is an exceptionally smart &amp; good human @OwenSparks_ The best manufacturing technology is in ultra high volume industries, like food &amp; beverage, some medical (eg syringes) &amp; toys Battery cell production is the fundamental rate-limiter slowing down a sustainable energy future. Very important problem. @p_ferragu Looking into this. No question that FSD should be viewed as reasonably valuable when doing a trade-in. @tobyliiiiiiiiii @Erdayastronaut For sure @Erdayastronaut Glad you’re ok @Virgin_Orbit Congratulations! @teslaownersSV This is a good one",11.0,199707.0,9709.0,13329.0
2021-01-19,844.549988,316.410004,0.0,,,,,
2021-01-20,850.450012,323.769989,0.0,,,,,


## Cleaning Merged Data Frame
 * Move tweets to next trading day (i.e. push forward tweets made on weekends and holidays)

In [8]:
def workdays(d, end, weekend=(6, 7)):
    days = []
    while d.date() <= end.date():
        if d.isoweekday() in weekend:
            days.append(d)
        d += datetime.timedelta(days=1)
    return days

In [9]:
# clean_df=merged_df.copy()
# target_df['TSLA']=clean_df['TSLA'].copy()
# variable_df['tweet']=clean_df['tweet'].copy()
# target='TSLA'
# target_df['{target}']

# # def holiday_adjustment(target_df, variable_df,clean_df):
# clean_df['NO_NaN']=np.where(target_df['{target}'].notnull() & variable_df.notnull(),variable_df,'')
# clean_df['NaN1']=np.where(target_df.shift(1).isnull() & variable_df.notnull().shift(1),variable_df.shift(1),'')
# clean_df['NaN2']=np.where(target_df.shift(1).isnull() & variable_df.notnull().shift(1),clean_df['NaN1'].shift(1),'')
# clean_df['NaN3']=np.where(target_df.shift(1).isnull() & variable_df.notnull().shift(1),clean_df['NaN2'].shift(1),'')
# clean_df['NaN4']=np.where(target_df.shift(1).isnull() & variable_df.notnull().shift(1),clean_df['NaN3'].shift(1),'')


# clean_df['tweet clean']= \
# clean_df['NO_NaN'][target_df.notnull()] + ' ' + \
# clean_df['NaN1'][target_df.notnull()] + ' ' + \
# clean_df['NaN2'][target_df.notnull()] + ' ' + \
# clean_df['NaN3'][clean_df['TSLA'].notnull()] + ' ' + \
# clean_df['NaN3'][clean_df['TSLA'].notnull()] 

# # Drop intermidiate columns created
# clean_df.drop(columns=['NO_NaN','NaN1','NaN2','NaN3','NaN4'],inplace=True)

# new_variable=clean_df['tweet clean'].copy()
# # return new_variable
# target_df.head()



In [10]:
clean_df=merged_df.copy()
target_df=clean_df['TSLA'].copy()
variable_df=clean_df['tweet'].copy()
holiday_adjustment(target_df, variable_df,clean_df)
new_variable.head()



NameError: name 'holiday_adjustment' is not defined

In [11]:
clean_df=merged_df.copy()

variable_df=clean_df['tweet'].copy

# weekend fix - tweets
clean_df['NO_NaN']=np.where(clean_df['TSLA'].notnull() & clean_df['tweet'].notnull(),clean_df['tweet'],'')
clean_df['NaN1']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['tweet'].notnull().shift(1),clean_df['tweet'].shift(1),'')
clean_df['NaN2']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['tweet'].notnull().shift(1),clean_df['NaN1'].shift(1),'')
clean_df['NaN3']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['tweet'].notnull().shift(1),clean_df['NaN2'].shift(1),'')
clean_df['NaN4']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['tweet'].notnull().shift(1),clean_df['NaN3'].shift(1),'')

clean_df['tweet clean']= \
    clean_df['NO_NaN'][clean_df['TSLA'].notnull()] + ' ' + \
    clean_df['NaN1'][clean_df['TSLA'].notnull()] + ' ' + \
    clean_df['NaN2'][clean_df['TSLA'].notnull()] + ' ' + \
    clean_df['NaN3'][clean_df['TSLA'].notnull()] + ' ' + \
    clean_df['NaN3'][clean_df['TSLA'].notnull()] 

# Drop intermidiate columns created
clean_df.drop(columns=['NO_NaN','NaN1','NaN2','NaN3','NaN4'],inplace=True)


# weekend fix - tweet count; create intermidiate fields that move tweet count fields to the closest next weekday 
clean_df['NO_NaN']=np.where(clean_df['TSLA'].notnull() & clean_df['tweet count'].notnull(),clean_df['tweet count'],0)
clean_df['NaN1']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['tweet count'].notnull().shift(1),clean_df['tweet count'].shift(1),0)
clean_df['NaN2']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['tweet count'].notnull().shift(1),clean_df['NaN1'].shift(1),0)
clean_df['NaN3']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['tweet count'].notnull().shift(1),clean_df['NaN2'].shift(1),0)
# not needed - in case there are 4 non market days in a row
clean_df['NaN4']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['tweet count'].notnull().shift(1),clean_df['NaN3'].shift(1),0)

# summ accross intermidiate fields
clean_df['tweet count clean']= \
    clean_df['NO_NaN'][clean_df['TSLA'].notnull()].astype(int) + \
    clean_df['NaN1'][clean_df['TSLA'].notnull()].astype(int) + \
    clean_df['NaN2'][clean_df['TSLA'].notnull()].astype(int) + \
    clean_df['NaN3'][clean_df['TSLA'].notnull()].astype(int) + \
    clean_df['NaN4'][clean_df['TSLA'].notnull()].astype(int)

# Drop intermidiate columns created
clean_df.drop(columns=['NO_NaN','NaN1','NaN2','NaN3','NaN4','tweet','tweet count'],inplace=True)







In [12]:
# weekend fix - number likes; create intermidiate fields that move tweet count fields to the closest next weekday 
clean_df['NO_NaN']=np.where(clean_df['TSLA'].notnull() & clean_df['number likes'].notnull(),clean_df['number likes'],0)
clean_df['NaN1']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['number likes'].notnull().shift(1),clean_df['number likes'].shift(1),0)
clean_df['NaN2']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['number likes'].notnull().shift(1),clean_df['NaN1'].shift(1),0)
clean_df['NaN3']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['number likes'].notnull().shift(1),clean_df['NaN2'].shift(1),0)
# not needed - in case there are 4 non market days in a row
clean_df['NaN4']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['number likes'].notnull().shift(1),clean_df['NaN3'].shift(1),0)

# summ accross intermidiate fields
clean_df['number likes clean']= \
    clean_df['NO_NaN'][clean_df['TSLA'].notnull()].astype(int) + \
    clean_df['NaN1'][clean_df['TSLA'].notnull()].astype(int) + \
    clean_df['NaN2'][clean_df['TSLA'].notnull()].astype(int) + \
    clean_df['NaN3'][clean_df['TSLA'].notnull()].astype(int) + \
    clean_df['NaN4'][clean_df['TSLA'].notnull()].astype(int)

# Drop intermidiate columns created
clean_df.drop(columns=['NO_NaN','NaN1','NaN2','NaN3','NaN4','number likes'],inplace=True)


In [13]:
# weekend fix - number likes; create intermidiate fields that move tweet count fields to the closest next weekday 
clean_df['NO_NaN']=np.where(clean_df['TSLA'].notnull() & clean_df['number replies'].notnull(),clean_df['number replies'],0)
clean_df['NaN1']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['number replies'].notnull().shift(1),clean_df['number replies'].shift(1),0)
clean_df['NaN2']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['number replies'].notnull().shift(1),clean_df['NaN1'].shift(1),0)
clean_df['NaN3']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['number replies'].notnull().shift(1),clean_df['NaN2'].shift(1),0)
# not needed - in case there are 4 non market days in a row
clean_df['NaN4']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['number replies'].notnull().shift(1),clean_df['NaN3'].shift(1),0)

# summ accross intermidiate fields
clean_df['number replies clean']= \
    clean_df['NO_NaN'][clean_df['TSLA'].notnull()].astype(int) + \
    clean_df['NaN1'][clean_df['TSLA'].notnull()].astype(int) + \
    clean_df['NaN2'][clean_df['TSLA'].notnull()].astype(int) + \
    clean_df['NaN3'][clean_df['TSLA'].notnull()].astype(int) + \
    clean_df['NaN4'][clean_df['TSLA'].notnull()].astype(int)

# Drop intermidiate columns created
clean_df.drop(columns=['NO_NaN','NaN1','NaN2','NaN3','NaN4','number replies'],inplace=True)

In [14]:
# weekend fix - number likes; create intermidiate fields that move tweet count fields to the closest next weekday 
clean_df['NO_NaN']=np.where(clean_df['TSLA'].notnull() & clean_df['number retweets'].notnull(),clean_df['number retweets'],0)
clean_df['NaN1']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['number retweets'].notnull().shift(1),clean_df['number retweets'].shift(1),0)
clean_df['NaN2']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['number retweets'].notnull().shift(1),clean_df['NaN1'].shift(1),0)
clean_df['NaN3']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['number retweets'].notnull().shift(1),clean_df['NaN2'].shift(1),0)
# not needed - in case there are 4 non market days in a row
clean_df['NaN4']=np.where(clean_df['TSLA'].shift(1).isnull() & clean_df['number retweets'].notnull().shift(1),clean_df['NaN3'].shift(1),0)

# summ accross intermidiate fields
clean_df['number retweets clean']= \
    clean_df['NO_NaN'][clean_df['TSLA'].notnull()].astype(int) + \
    clean_df['NaN1'][clean_df['TSLA'].notnull()].astype(int) + \
    clean_df['NaN2'][clean_df['TSLA'].notnull()].astype(int) + \
    clean_df['NaN3'][clean_df['TSLA'].notnull()].astype(int) + \
    clean_df['NaN4'][clean_df['TSLA'].notnull()].astype(int)

# Drop intermidiate columns created
clean_df.drop(columns=['NO_NaN','NaN1','NaN2','NaN3','NaN4','number retweets'],inplace=True)

In [15]:
# Add NaNs in empty strings
clean_df['tweet clean'].replace(r'^\s*$',np.NaN,inplace=True, regex=True)

# Drop weekends and holidays; i.e. days with no market data
clean_df.dropna(subset=['TSLA'],inplace=True)

clean_df.tail()

Unnamed: 0,TSLA,QQQ,earnings flag,tweet clean,tweet count clean,number likes clean,number replies clean,number retweets clean
2021-01-13,854.409973,316.040009,0.0,"@NASASpaceflight Sea shanty tiktok takes it to a new level with actual pirate ships @ConnectDotsToo Sure Today at SpaceX is about practicing Starship engine starts. Ship is held down by massive pins while engines are fired. Two starts completed, about to try a third. @Tesmanian_com As promised Legalize comedy @lvladimirovBG You can steal our name/logos &amp; we probably won’t sue you @arstechnica @SciGuySpace We’re just trying to get people to Mars. Help would be appreciated. @lexfridman Suicide is more than double homicide https://t.co/MvWGPs9uQE @lexfridman Absolutely Hey you … Yeah you Queen … You’re gonna make it! 💕💕 https://t.co/LrqdIrbIyd @TheOldManPar @DJSnM @MachinePix Hate to say it, but might be true",11.0,1734427.0,40100.0,213435.0
2021-01-14,845.0,314.350006,0.0,"@skorusARK Prototypes are easy, volume production is hard, positive cash flow is excruciating @Tesla Physics @Erdayastronaut Detanking &amp; inspections now. Good progress towards our “Hop in &amp; go to Mars!” goal. All three static fires completed &amp; no RUDs! @justpaulinelol @Erdayastronaut @SpaceX Wow, a lot has happened in 10 years! @OfficialJlipper Fair enough haha https://t.co/ho7yGXAS3a",7.0,559632.0,14544.0,38701.0
2021-01-15,826.159973,311.859985,0.0,"@SuperclusterHQ @w00ki33 Fallout New Texas @Breedlove22 @benmezrich Only Chuck Norris can divide by zero @Cerberu21014829 @Breedlove22 @benmezrich Good point @Breedlove22 @benmezrich The thing we call money is just an information system for labor allocation. What actually matters is making goods &amp; providing services. We should look at currencies from an information theory standpoint. Whichever has least error &amp; latency will win. Monty Python is amazing https://t.co/UJq94IWT88 @RationalEtienne @tobyliiiiiiiiii @Erdayastronaut They sure can twist the knife in that show! @tobyliiiiiiiiii @Erdayastronaut Probably wise @Erdayastronaut We’re making major improvements to ease of engine swap. Needs to be a few hours at most. @PPathole @johnkrausphotos @SpaceX Two of the engines need slight repairs, so will be switched out @johnkrausphotos @SpaceX Nice shot @realOmarAbdalah We don’t have high school internships, but please apply when you’re in college!",11.0,169339.0,9381.0,12439.0
2021-01-19,844.549988,316.410004,0.0,"@kellyreid The rate-limiting part or process in cell production is constantly changing @FrancisSuarez @CityofMiami Cars &amp; trucks stuck in traffic generate megatons of toxic gases &amp; particulate, but @boringcompany road tunnels under Miami would solve traffic &amp; be an example to the world. Spoke with @RonDeSantisFL about tunnels last week. If Governor &amp; Mayor want this done, we will do it. @RationalEtienne @OwenSparks_ @Tesla With our giant casting machines, we are literally trying to make full-size cars in the same way that toy cars are made @lexfridman @tegmark Tegmark is an exceptionally smart &amp; good human @OwenSparks_ The best manufacturing technology is in ultra high volume industries, like food &amp; beverage, some medical (eg syringes) &amp; toys Battery cell production is the fundamental rate-limiter slowing down a sustainable energy future. Very important problem. @p_ferragu Looking into this. No question that FSD should be viewed as reasonably valuable when doing a trade-in. @tobyliiiiiiiiii @Erdayastronaut For sure @Erdayastronaut Glad you’re ok @Virgin_Orbit Congratulations! @teslaownersSV This is a good one @MrBeastYT I whistle @DMC_Ryan @C_R_H_M @Tesla Cybertruck doesn’t need a garage @DMC_Ryan @Tesla It will be awesome @owenshift Good point @signalapp Your server-side code is doing too much @Wikipedia Happy birthday Wikipedia! So glad you exist. @TheOnion Guess you been watching Cobra Kai @DMC_Ryan @C_R_H_M @Tesla Cybertruck doesn’t need a garage @DMC_Ryan @Tesla It will be awesome @owenshift Good point @signalapp Your server-side code is doing too much @Wikipedia Happy birthday Wikipedia! So glad you exist. @TheOnion Guess you been watching Cobra Kai",18.0,453510.0,14856.0,19635.0
2021-01-20,850.450012,323.769989,0.0,,0.0,0.0,0.0,0.0


In [17]:
# Export clean tweets files to run NLP and SpaceX and Tesla Flags

save_csv=False

if save_csv==True:
    # Create clean data frame in the same format than the original
    clean_tweets_df=pd.DataFrame(clean_df['tweet clean'])
    clean_tweets_df.index.name='date'
    clean_tweets_df.rename(columns={'tweet clean':'tweet'},inplace=True)
    clean_tweets_df.head()

    # Save data frame in csv file
    file_name="clean_elon_tweets_vF.csv"
    output_file = Path(f"../Resources/{file_name}")
    clean_tweets_df.to_csv(f"{output_file}")

## Merging NLP dataframes

In [18]:
#Import CSV with vader sentiment score
file_name='tokens_n_vader_sentiment.csv'
file_path=Path(f"../Resources/{file_name}")
vader_sentiment_df = pd.read_csv(file_path,index_col='date',parse_dates=True, infer_datetime_format=True) # we do not do date as index just yet

# Rename columns to conform to style in final data frame
vader_sentiment_df.rename(columns={'cleaned_tweet':'cleaned tweet','Compound':'compound','Positive':'positive','Negative':'negative','Neutral':'neutral'}, inplace=True)

# Drop columns already included in 'master' data frame
vader_sentiment_df.drop(columns={'tweet', 'tokens','cleaned tweet'}, inplace=True)

# Merge NLP dataframe with rest of fields
clean_df=clean_df.join(vader_sentiment_df,how='outer')



Unnamed: 0_level_0,TSLA,QQQ,earnings flag,tweet clean,tweet count clean,number likes clean,number replies clean,number retweets clean,compound,positive,negative,neutral,Sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-01-15,826.159973,311.859985,0.0,"@SuperclusterHQ @w00ki33 Fallout New Texas @Breedlove22 @benmezrich Only Chuck Norris can divide by zero @Cerberu21014829 @Breedlove22 @benmezrich Good point @Breedlove22 @benmezrich The thing we call money is just an information system for labor allocation. What actually matters is making goods &amp; providing services. We should look at currencies from an information theory standpoint. Whichever has least error &amp; latency will win. Monty Python is amazing https://t.co/UJq94IWT88 @RationalEtienne @tobyliiiiiiiiii @Erdayastronaut They sure can twist the knife in that show! @tobyliiiiiiiiii @Erdayastronaut Probably wise @Erdayastronaut We’re making major improvements to ease of engine swap. Needs to be a few hours at most. @PPathole @johnkrausphotos @SpaceX Two of the engines need slight repairs, so will be switched out @johnkrausphotos @SpaceX Nice shot @realOmarAbdalah We don’t have high school internships, but please apply when you’re in college!",11.0,169339.0,9381.0,12439.0,0.9429,0.182,0.0,0.818,0.0
2021-01-19,844.549988,316.410004,0.0,"@kellyreid The rate-limiting part or process in cell production is constantly changing @FrancisSuarez @CityofMiami Cars &amp; trucks stuck in traffic generate megatons of toxic gases &amp; particulate, but @boringcompany road tunnels under Miami would solve traffic &amp; be an example to the world. Spoke with @RonDeSantisFL about tunnels last week. If Governor &amp; Mayor want this done, we will do it. @RationalEtienne @OwenSparks_ @Tesla With our giant casting machines, we are literally trying to make full-size cars in the same way that toy cars are made @lexfridman @tegmark Tegmark is an exceptionally smart &amp; good human @OwenSparks_ The best manufacturing technology is in ultra high volume industries, like food &amp; beverage, some medical (eg syringes) &amp; toys Battery cell production is the fundamental rate-limiter slowing down a sustainable energy future. Very important problem. @p_ferragu Looking into this. No question that FSD should be viewed as reasonably valuable when doing a trade-in. @tobyliiiiiiiiii @Erdayastronaut For sure @Erdayastronaut Glad you’re ok @Virgin_Orbit Congratulations! @teslaownersSV This is a good one @MrBeastYT I whistle @DMC_Ryan @C_R_H_M @Tesla Cybertruck doesn’t need a garage @DMC_Ryan @Tesla It will be awesome @owenshift Good point @signalapp Your server-side code is doing too much @Wikipedia Happy birthday Wikipedia! So glad you exist. @TheOnion Guess you been watching Cobra Kai @DMC_Ryan @C_R_H_M @Tesla Cybertruck doesn’t need a garage @DMC_Ryan @Tesla It will be awesome @owenshift Good point @signalapp Your server-side code is doing too much @Wikipedia Happy birthday Wikipedia! So glad you exist. @TheOnion Guess you been watching Cobra Kai",18.0,453510.0,14856.0,19635.0,,,,,
2021-01-20,850.450012,323.769989,0.0,,0.0,0.0,0.0,0.0,,,,,


## Merging SpaceX and Tesla Flags - [MINYEONG]

In [19]:
#Import CSV created by Minyeong or fancier if we use functions?
# Merge flags dataframe with rest of fields
clean_df['key word 1']=np.NaN
clean_df['key word 2']=np.NaN
clean_df['key word 3']=np.NaN
clean_df['key word 4']=np.NaN


# CARLOS: ask Jeff how to replace all 0s in vader sentiment by nulls
## Finalizing Final Data Frame 

In [25]:
# renaming fields
clean_df.rename(columns={'tweet clean':'tweet','tweet count clean':'tweet count','number likes clean':'number likes','number replies clean':'number replies','number replies clean':'number replies'},inplace=True)

# Creating Y variables
clean_df['Y_d1_pr_change_diff']=clean_df['TSLA'].pct_change()-clean_df['QQQ'].pct_change()
clean_df['Y_d5_pr_change_diff']=clean_df['TSLA'].pct_change(periods=5)-clean_df['QQQ'].pct_change(periods=5)

# Creating 1 / 0 flag for tweet made
clean_df['tweet flag']=np.where(clean_df['tweet'].notnull(),1,0) 

# When there is no tweet, make vader sentiment score of 0 null [CARLOS: JEFF]
# clean_df[['compound','positive','negative','neutral']]=np.where(clean_df['tweet'].isnull(),np.NaN)

# Re order the data frame
# clean_df=clean_df[['TSLA','QQQ','Y_d1_pr_change_diff','Y_d5_pr_change_diff','earnings flag','tweet count', 'tweet flag','compound','positive','negative','neutral','tesla flag','spacex flag']]

In [26]:
clean_df.head()

Unnamed: 0_level_0,TSLA,QQQ,earnings flag,tweet,tweet count,number likes,number replies,number retweets clean,compound,positive,negative,neutral,Sentiment,key word 1,key word 2,key word 3,key word 4,Y_d1_pr_change_diff,Y_d5_pr_change_diff,tweet flag
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2010-06-29,4.778,39.031284,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0
2010-06-30,4.766,38.437302,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.012707,,0
2010-07-01,4.392,38.329292,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,-0.075662,,0
2010-07-02,3.84,38.221321,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,-0.122866,,0
2010-07-06,3.222,38.338306,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,-0.163998,,0


In [28]:
def movecol(df, cols_to_move=[], ref_col='', place='After'):
    cols = df.columns.tolist()
    if place == 'After':
        seg1 = cols[:list(cols).index(ref_col) + 1]
        seg2 = cols_to_move
    if place == 'Before':
        seg1 = cols[:list(cols).index(ref_col)]
        seg2 = cols_to_move + [ref_col]
    seg1 = [i for i in seg1 if i not in seg2]
    seg3 = [i for i in cols if i not in seg1 + seg2]
    return(df[seg1 + seg2 + seg3])

clean_df = movecol(clean_df,cols_to_move=['Y_d1_pr_change_diff', 'Y_d5_pr_change_diff'],ref_col='QQQ',place='After')
clean_df.head()

In [None]:
# Export csv files with final data frame for machine learning
file_name="final_data_frame.csv"
output_file = Path(f"../Resources/{file_name}")
clean_df.to_csv(f"{output_file}")

# DRAFT - to be deleted 
# Charts - Team discussion Only 

In [None]:
clean_df['tweet count'].plot(figsize=(15,10), title='Number of tweets per day')

In [None]:
clean_df.plot.scatter(x='Y_d1_pr_change_diff',
                      y='tweet flag',
                      figsize=(15,10)
                      )

In [None]:
clean_df.describe()
clean_df['Y_d1_pr_change_diff'].loc[clean_df['tweet flag']==0].count()

In [None]:
# Histograms of pct change differenctial - 5 days change
x=clean_df['Y_d1_pr_change_diff'].loc[clean_df['tweet flag']==1]
y=clean_df['Y_d1_pr_change_diff'].loc[clean_df['tweet flag']==0]
bins = np.linspace(-0.2, 0.2)

pyplot.figure(figsize=(20,8))
pyplot.title('a')
pyplot.hist(x, bins, alpha=0.5, label='tweet made',weights = np.ones_like(x) / len(x))
pyplot.hist(y, bins, alpha=0.5, label='no tweets',weights = np.ones_like(y) / len(y))
pyplot.legend(loc='upper right')
pyplot.show()

In [None]:
x.count()

In [None]:
y.count()

In [None]:
x.mean()

In [None]:
y.mean()

In [None]:
# Export csv files with stock prices
# file_name="test1.csv"
# output_file = Path(f"../Resources/{file_name}")
# clean_df.to_csv(f"{output_file}")