In [41]:
import pandas as pd
import numpy as np
from datetime import datetime 
pd.set_option('display.max_columns', 1000, 'display.max_colwidth', 1000, 'display.max_rows',1000)

# Data Read "elon-musk-tweets" file

In [42]:
df1 = pd.read_csv('data/elon-musk-tweets.csv')
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13610 entries, 0 to 13609
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            13590 non-null  object
 1   username        13610 non-null  object
 2   linktotweet     13610 non-null  object
 3   tweetembedcode  13610 non-null  object
 4   createdat       13610 non-null  object
dtypes: object(5)
memory usage: 531.8+ KB


In [43]:
start = '2015-03-15'
end = '2022-03-14'

In [44]:
df1 = df1[(df1.createdat >= start) & (df1.createdat <= end)][['text','createdat']]\
    .sort_values(by='createdat', ascending=False)\
    .reset_index(drop = True)
df1.rename(columns={'text':'Content', 'createdat':'Time'}, inplace = True)

def date_time(x):
    x = str.split(x,sep='T')
    time = str.split(x[0],sep="-")
    year = int(time[0])
    mon = int(time[1])
    day = int(time[2])

    time = str.split(x[1],sep=":")
    hr = int(time[0])
    min = int(time[1])
    sec = int(time[2])
    sec = int('0')

    date = datetime(year=year,month=mon,day=day,hour=hr, minute=min, second=sec)
    
    return date

df1['Time'] = df1.Time.apply(lambda x: date_time(x))

df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13610 entries, 0 to 13609
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Content  13590 non-null  object        
 1   Time     13610 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 212.8+ KB


# Data Read "Elon Musk 2" file

In [45]:
df2 = pd.read_csv('data/Elon Musk 2.csv')
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7918 entries, 0 to 7917
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   PostID         7918 non-null   int64 
 1   Time           7918 non-null   object
 2   TweetUrl       7918 non-null   object
 3   Content        7918 non-null   object
 4   UserID         7918 non-null   int64 
 5   UserName       7918 non-null   object
 6   RetweetNum     7918 non-null   int64 
 7   LikeNum        7918 non-null   int64 
 8   ReplyNum       7918 non-null   int64 
 9   UserHandle     7918 non-null   object
 10  UserUrl        7918 non-null   object
 11  Location       112 non-null    object
 12  UserID_PostID  7918 non-null   object
 13  CurrentPage    7918 non-null   int64 
dtypes: int64(6), object(8)
memory usage: 866.2+ KB


In [46]:
from datetime import datetime
months_in_year = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
months_in_year.index('Jan')

df2 = df2[['Content','Time']].sort_values(by='Time')

def time_convert(x):
    x = str.split(x)
    year = int(x[-1])
    mon = int(months_in_year.index(x[1])+1)
    day = int(x[2])
    time = x[3].split(':')
    hr = int(time[0])
    min = int(time[1])
    sec = int(time[2])
    sec = int('0')
    date = datetime(year=year,month=mon,day=day,hour=hr, minute=min, second=sec)
    return date

df2['Time'] = df2.Time.apply(lambda x: time_convert(x))

df2 = df2[(df2.Time >= start) & (df2.Time <= end)].sort_values(by='Time',ascending=False)
df2.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7917 entries, 511 to 556
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Content  7917 non-null   object        
 1   Time     7917 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 185.6+ KB


# Combine two dataset

In [48]:
data = df1.append(df2).drop_duplicates(subset=['Time']).reset_index(drop=False)
print(("min Time: "+str(data.Time.min())))
print(("max Time: "+str(data.Time.max())))
data.to_csv("data/combined_data.csv", sep=',', index=False)
data.info()

min Time: 2017-06-19 09:32:00
max Time: 2022-03-11 19:41:00
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17347 entries, 0 to 17346
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   index    17347 non-null  int64         
 1   Content  17327 non-null  object        
 2   Time     17347 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 406.7+ KB


  data = df1.append(df2).drop_duplicates(subset=['Time']).reset_index(drop=False)


## Tweet Content Cleaning

In [436]:
df = pd.read_csv('data/combined_data.csv').iloc[:,1:]
df['Time'] = df.Time.apply(pd.to_datetime)
df = df.dropna(how='any')
df.head()

Unnamed: 0,Content,Time
0,"@haltman Little do they know, birds aren't real! https://t.co/mBPzUQMxRN",2022-03-11 19:41:00
1,@waitbutwhy What if one atom at a time in your body was switched out for another atom? There would be you with all new atoms and another you with the original atoms - Human of Theseus.,2022-03-11 00:31:00
2,@WholeMarsBlog @Erdayastronaut @CopSub This weekend,2022-03-10 05:44:00
3,"@FedorovMykhailo @OMarkarova You're welcome. We have also sent power adapters for car cigarette lighters, solar/battery packs and generators for places where electricity is not available.",2022-03-09 21:49:00
4,"@PPathole @SpaceX Optimized, fully-reusable Starship is ~150t to same reference orbit as Saturn V. In expendable mode, Starship payload would be 250t to 300t.",2022-03-09 21:41:00


In [437]:
# To clean up texts
import re
import nltk
# nltk.download() Download nltk data for first time use (download all packages)
import nltk.data
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')
nltk.download('punkt')
tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sarthakkaushik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sarthakkaushik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sarthakkaushik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [438]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def sentence_to_wordlist(sentence, remove_stopwords=False):
    # 0. remove mentions(@), Hashtag(#)
    sentence = re.sub(r'@[^\s]+', '', sentence,flags=re.MULTILINE )
    sentence = re.sub(r'#[^\s]+', '', sentence,flags=re.MULTILINE )
    sentence = re.sub(r'RT[^\s]+', '', sentence,flags=re.MULTILINE )
    # 1. drop http
    p1=re.compile(r'http?:\/\/\S+', flags=re.DOTALL)
    sentence = re.sub(p1, '', sentence)
    # 2. drop https
    p1=re.compile(r'https?:\/\/\S+', flags=re.DOTALL)
    sentence = re.sub(p1, '', sentence)
    # 3. Remove non-letters
    sentence = re.sub(r'[^\w\s]','', sentence)
    # 4. Remove all numbers
    sentence = re.sub(r'[0-9]+', '', sentence)
    # 5. Convert words to lower case and split them
    sentence = sentence.lower().split()
    # Remove Stop Words
    # sentence = [word for word in sentence if not word in stop_words]
    # 5. Stemming
    # sentence = [stemmer.stem(w) for w in sentence] 
    # 6. Lemmatizing
    sentence = [lemmatizer.lemmatize(word) for word in sentence]
    # 7. Return a list of words
    sentence_r = ''
    for word in sentence:
      sentence_r = sentence_r + ' ' + word
    return(sentence_r)

def cleanText(text):
  text = re.sub('@[A-Za-z0-9]+' , '', text) #remove the @mentions
  text = re.sub('https?:\/\/\S+' , '', text) #remove the hyperlinks
  text = re.sub(r'#', '', text) #removing #tags
  text = re.sub(r'RT[\s]+', '', text) #removing RT(ReTweets)
  return text

In [439]:
# create a new column 
df['Content1']= df.Content.apply(lambda row: sentence_to_wordlist(row))
# df[['Time','Content1','Content']].iloc[-5:,:]

# Sentiment Analysis

In [440]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def polarityScore(a, col):
    # Create a column for score
    a['score'] = a[col].apply(lambda row: calculate_polarity_scores(row))
    #split all the scores into separate columns
    dfa = pd.DataFrame(a['score'].to_list(), columns = ['neg', 'neu', 'pos', 'compound'])
    # concat the results into the table with tweets
    result = pd.concat([a, dfa], axis=1).dropna(how='any').sort_values(by='Time')

    return result

def calculate_polarity_scores(x):
    sid = SentimentIntensityAnalyzer()
    score = sid.polarity_scores(x)
    score_list = []
    for i in score.values():
        score_list.append(i)
        
    return score_list


import matplotlib.pyplot as plt
result = polarityScore(df, 'Content1')
import seaborn as sns

sns.histplot(data=result, x='compound',bins=10)

In [370]:
import seaborn as sns 
result = polarityScore(df, 'Content1')
# sns.histplot(data=result, x='compound')
result.head()

Unnamed: 0,Content,Time,Content1,score,neg,neu,pos,compound
13264,@highqualitysh1t I love the thought of a car drifting apparently endlessly through space and perhaps being discovered by an alien race millions of years in the future,2017-12-02 19:33:00,i love the thought of a car drifting apparently endlessly through space and perhaps being discovered by an alien race million of year in the future,"[0.0, 0.846, 0.154, 0.6369]",0.106,0.736,0.158,0.2263
13263,@novaspivack Asimov's Foundation books should def be part of the mission. They're amazing.,2017-12-02 22:46:00,asimov foundation book should def be part of the mission theyre amazing,"[0.0, 0.743, 0.257, 0.5859]",0.000,0.000,0.000,0.0000
13262,@novaspivack That's certainly the right way to go to store massive amounts of data for a long time,2017-12-03 00:01:00,thats certainly the right way to go to store massive amount of data for a long time,"[0.0, 0.862, 0.138, 0.34]",0.000,0.705,0.295,0.5574
13261,"To preserve the transcendent majesty &amp; specialness of The Boring Company cap, we are capping cap orders at 50,000 caps. Almost there ... https://t.co/YqjEQAfy3u",2017-12-03 19:05:00,to preserve the transcendent majesty amp specialness of the boring company cap we are capping cap order at cap almost there,"[0.103, 0.897, 0.0, -0.3182]",0.000,0.714,0.286,0.3400
13260,@harrisonlingren @JW8888888 Busted,2017-12-03 19:07:00,busted,"[0.0, 1.0, 0.0, 0.0]",0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...
4,"@PPathole @SpaceX Optimized, fully-reusable Starship is ~150t to same reference orbit as Saturn V. In expendable mode, Starship payload would be 250t to 300t.",2022-03-09 21:41:00,optimized fullyreusable starship is t to same reference orbit a saturn v in expendable mode starship payload would be t to t,"[0.0, 0.842, 0.158, 0.4588]",0.000,0.842,0.158,0.4588
3,"@FedorovMykhailo @OMarkarova You're welcome. We have also sent power adapters for car cigarette lighters, solar/battery packs and generators for places where electricity is not available.",2022-03-09 21:49:00,youre welcome we have also sent power adapter for car cigarette lighter solarbattery pack and generator for place where electricity is not available,"[0.0, 0.88, 0.12, 0.4588]",0.000,0.880,0.120,0.4588
2,@WholeMarsBlog @Erdayastronaut @CopSub This weekend,2022-03-10 05:44:00,this weekend,"[0.0, 1.0, 0.0, 0.0]",0.000,1.000,0.000,0.0000
1,@waitbutwhy What if one atom at a time in your body was switched out for another atom? There would be you with all new atoms and another you with the original atoms - Human of Theseus.,2022-03-11 00:31:00,what if one atom at a time in your body wa switched out for another atom there would be you with all new atom and another you with the original atom human of theseus,"[0.0, 0.933, 0.067, 0.3182]",0.000,0.933,0.067,0.3182


In [441]:
df_trun = df[(df.Time>= '2021-01-01') & (df.Time <='2021-01-05')].reset_index(drop=True)
# df_trun
result_trun = polarityScore(df_trun.dropna(how = 'any'), 'Content1')

In [442]:
from datetime import datetime
def timeframe(x):
    x = 1
    return x

result_trun['code'] = result_trun.Time.apply(lambda row: timeframe(row))
result_trun[['Time','code']].head()

Unnamed: 0,Time,code
9,2021-01-01 00:58:00,1
8,2021-01-02 03:20:00,1
7,2021-01-02 12:23:00,1
10,2021-01-02 14:32:00,1
6,2021-01-02 14:46:00,1


In [473]:
# Get holidays
from pandas.tseries.holiday import USFederalHolidayCalendar
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start='2016-01-01', end='2022-12-31').to_pydatetime()
holiday_date = []
for days in holidays:
    holiday_date.append(days.date())

#check weekend or US holiday
def weekendHoliday(x):
    date = x.date()
    weekday = x.weekday()
    if (date in holiday_date) | (weekday==5) | (weekday==6):
        out = 0
    else:
        out = 1
    return out

result_trun['code'] = result_trun.Time.apply(lambda row: weekendHoliday(row))
result_trun[['Time','code']]

Unnamed: 0,Time,code
9,2021-01-01 00:58:00,0
8,2021-01-02 03:20:00,0
7,2021-01-02 12:23:00,0
10,2021-01-02 14:32:00,0
6,2021-01-02 14:46:00,0
5,2021-01-02 14:51:00,0
4,2021-01-02 14:59:00,0
3,2021-01-02 15:07:00,0
2,2021-01-03 00:04:00,0
1,2021-01-04 20:26:00,1


In [457]:
df.Time.info()

<class 'pandas.core.series.Series'>
Int64Index: 17327 entries, 0 to 17346
Series name: Time
Non-Null Count  Dtype         
--------------  -----         
17327 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 786.8 KB
