In [1]:
import pandas as pd

all_tweets = pd.read_csv('TSLA-dataset-37422.csv') # dataset to train the SVM on
print(f'Length of the data frame is: {len(all_tweets)}')
print(f'Shape of the data frame is:{all_tweets.shape}')
print(f'First entries of the data frame are:{all_tweets.head()}')

Length of the data frame is: 37422
Shape of the data frame is:(37422, 4)
First entries of the data frame are:                        Date  \
0  2022-09-29 23:41:16+00:00   
1  2022-09-29 23:24:43+00:00   
2  2022-09-29 23:18:08+00:00   
3  2022-09-29 22:40:07+00:00   
4  2022-09-29 22:27:05+00:00   

                                               Tweet Stock Name Company Name  
0  Mainstream media has done an amazing job at br...       TSLA  Tesla, Inc.  
1  Tesla delivery estimates are at around 364k fr...       TSLA  Tesla, Inc.  
2  3/ Even if I include 63.0M unvested RSUs as of...       TSLA  Tesla, Inc.  
3  @RealDanODowd @WholeMarsBlog @Tesla Hahaha why...       TSLA  Tesla, Inc.  
4  @RealDanODowd @Tesla Stop trying to kill kids,...       TSLA  Tesla, Inc.  


In [2]:
# 1.1::
"""
all_tweets = all_tweets.drop(columns=['Company Name']) # no need for company name, remove it
all_tweets["sentiment_score"] = '' # add data frame for sentiment score

# make data frames for all stocks we're looking at:

stock_names = ['TSLA', 'AMZN', 'MSFT', 'TSM']

for stock in stock_names:
    stock_df = all_tweets[all_tweets['Stock Name'] == stock]
    stock_df.to_csv(f'filtered-stock-dataframes/{stock}-filtered-{len(stock_df)}.csv', index=False)
"""

'\nall_tweets = all_tweets.drop(columns=[\'Company Name\']) # no need for company name, remove it\nall_tweets["sentiment_score"] = \'\' # add data frame for sentiment score\n\n# make data frames for all stocks we\'re looking at:\n\nstock_names = [\'TSLA\', \'AMZN\', \'MSFT\', \'TSM\']\n\nfor stock in stock_names:\n    stock_df = all_tweets[all_tweets[\'Stock Name\'] == stock]\n    stock_df.to_csv(f\'filtered-stock-dataframes/{stock}-filtered-{len(stock_df)}.csv\', index=False)\n'

In [3]:
# 1.2::
from textblob import Word
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import unicodedata
import nltk
import re

nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_tweets(tweet):
    processed_tweet = tweet.lower()
    processed_tweet = re.sub(r'http\S+', '', processed_tweet)
    processed_tweet = re.sub(r'@\w+', '', tweet)
    processed_tweet.replace('[^\w\s]', '')
    processed_tweet = " ".join(word for word in processed_tweet.split() if word not in stopwords.words('english'))
    processed_tweet = " ".join(Word(word).lemmatize() for word in processed_tweet.split())
    return processed_tweet

all_tweets['Tweet'] = all_tweets['Tweet'].apply(preprocess_tweets)

nltk.downloader.download('vader_lexicon')
sentiment_analyzer = SentimentIntensityAnalyzer()

for index, row in all_tweets.T.iteritems():
    try:
        sentence_i = unicodedata.normalize('NFKD', all_tweets.loc[index, 'Tweet'])
        sentence_sentiment = sentiment_analyzer.polarity_scores(sentence_i)
        all_tweets.at[index, 'sentiment_score'] = sentence_sentiment['compound']
    except TypeError as e:
        print(f'error: {e}')
        break

print(f'First entries of the data frame with sentiment added: {all_tweets.head()}')

# convert to binary sentiments:

def assign_binary_sentiment(x):
    if x > 0.05: return 1
    elif x < 0.05: return 0
    else: return None
    
all_tweets['binary_sentiment'] = all_tweets['sentiment_score'].apply(assign_binary_sentiment)
all_tweets.to_csv(f'all_tweets_sentiment.csv', index=False) # save edited dataframe:

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tarkojuss/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tarkojuss/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/tarkojuss/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


First entries of the data frame with sentiment added:                         Date  \
0  2022-09-29 23:41:16+00:00   
1  2022-09-29 23:24:43+00:00   
2  2022-09-29 23:18:08+00:00   
3  2022-09-29 22:40:07+00:00   
4  2022-09-29 22:27:05+00:00   

                                               Tweet Stock Name Company Name  \
0  Mainstream medium done amazing job brainwashin...       TSLA  Tesla, Inc.   
1  Tesla delivery estimate around 364k analysts. ...       TSLA  Tesla, Inc.   
2  3/ Even I include 63.0M unvested RSUs 6/30, ad...       TSLA  Tesla, Inc.   
3  Hahaha still trying stop Tesla FSD bro! Get sh...       TSLA  Tesla, Inc.   
4        Stop trying kill kids, sad deranged old man       TSLA  Tesla, Inc.   

   sentiment_score  
0           0.0772  
1           0.0000  
2           0.2960  
3          -0.4559  
4          -0.8750  


In [4]:
# 1.3::

# full data frame split:

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

svm_data = pd.read_csv('all_tweets_sentiment.csv')

svm_data.dropna(subset=['binary_sentiment'], inplace=True)

y = svm_data['binary_sentiment'].values

x_train, x_test, y_train, y_test = train_test_split(svm_data['Tweet'].values, y, 
                                                    stratify=y, 
                                                    random_state=1, 
                                                    test_size=0.3, shuffle=True)


In [5]:
import numpy as np
# vectorize the full data frame:

vectorizer = CountVectorizer(binary=True, stop_words='english') # vectorize the data rame

x_train = np.where(pd.isnull(x_train), '', x_train)
x_test = np.where(pd.isnull(x_test), '', x_test)

vectorizer.fit(list(x_train) + list(x_test)) # learn a vocab

x_train_vec = vectorizer.transform(x_train) # transform documents to document-term matrix
x_test_vec = vectorizer.transform(x_test)

# pd.DataFrame(x_train_vec.toarray(), columns = vectorizer.get_feature_names())

In [6]:
# 1.3::

# train the SVM classifier:

svm = svm.SVC(kernel = 'linear', C = 1)
prob = svm.fit(x_train_vec, y_train)
y_pred_svm = svm.predict(x_test_vec)

print("Overall accuracy score for the SVC is: ", accuracy_score(y_test, y_pred_svm) * 100, '%')


Overall accuracy score for the SVC is:  88.607820432885 %


In [7]:
# 1.4::
predictions = []
svm_data['Tweet'] = np.where(pd.isnull(svm_data['Tweet']), '', svm_data['Tweet'])
for tweet in svm_data['Tweet']:
    tweet_vec = vectorizer.transform([tweet])
    prediction = svm.predict(tweet_vec)[0]
    predictions.append(prediction)

svm_data['prediction'] = predictions

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

report = classification_report(y_test, y_pred_svm)
confusion = confusion_matrix(y_test, y_pred_svm)

print(confusion)
print(report)

svm_data.to_csv(f'final.csv', index=False)

[[5070  568]
 [ 711 4878]]
              precision    recall  f1-score   support

           0       0.88      0.90      0.89      5638
           1       0.90      0.87      0.88      5589

    accuracy                           0.89     11227
   macro avg       0.89      0.89      0.89     11227
weighted avg       0.89      0.89      0.89     11227



In [8]:
final = pd.read_csv('final.csv')
prediction_counts = final['prediction'].value_counts()

print("Number of predictions of 0:", prediction_counts[0])
print("Number of predictions of 1:", prediction_counts[1])
print(f'Portion of positive over negative sentiments: {prediction_counts[1] / (prediction_counts[0] + prediction_counts[1])}')

predictions_1 = final[final['prediction'] == 1]['Tweet']

Number of predictions of 0: 18946
Number of predictions of 1: 18476
Portion of positive over negative sentiments: 0.49372027149804926


In [9]:
unique_dates = final['Date'].str[:10].unique()
dates_df = pd.DataFrame({'Date': unique_dates})

positive_counts = []
negative_counts = []
positive_percentages = []

for date in unique_dates:
    
    positive_count = len(final[(final['Date'].str[:10] == date) & (final['prediction'] == 1)])
    negative_count = len(final[(final['Date'].str[:10] == date) & (final['prediction'] == 0)])
    positive_counts.append(positive_count)
    negative_counts.append(negative_count)

    total_count = positive_count + negative_count
    positive_percentage = (positive_count / total_count) * 100 if total_count != 0 else 0

    positive_percentages.append(positive_percentage)

dates_df['positive_count'] = positive_counts
dates_df['negative_count'] = negative_counts
dates_df['positive_percentage'] = positive_percentages

dates_df.to_csv('dates.csv', index=False)

In [10]:
import pandas as pd

def final_analysis(stock):

    stonks = pd.read_csv(f'stock-data/{stock}-historical-data.csv')
    stonks['DATE'] = stonks['DATE'].str[:10]

    merged_df = stonks.merge(dates_df, left_on='DATE', right_on='Date', how='left')
    stonks['positive_percentage'] = merged_df['positive_percentage']

    merged_df.to_csv('merged.csv', index=False)
    print(f'Historical data has been merged with positive sentiment for ${stock}')

    # monthly averages for positive percentages:

    merged_df['Date'] = pd.to_datetime(merged_df['Date'])
    merged_df['month'] = merged_df['Date'].dt.to_period('M')
    monthly_avg = merged_df.groupby('month')['positive_percentage'].mean()
    monthly_avg_df = pd.DataFrame({'month': monthly_avg.index, 'mean_pos_percentage': monthly_avg.values})
    monthly_avg_df.set_index('month', inplace=True)

    monthly_avg_df.to_csv('monthly_avgs.csv', index=True)

    # weekly averages for positive percentages:

    merged_df['Date'] = pd.to_datetime(merged_df['Date'])
    merged_df['week'] = merged_df['Date'].dt.to_period('W')
    weekly_avg = merged_df.groupby('week')['positive_percentage'].mean()
    weekly_avg_df = pd.DataFrame({'week': weekly_avg.index, 'mean_pos_percentage': weekly_avg.values})
    weekly_avg_df.set_index('week', inplace=True)

    weekly_avg_df.to_csv('weekly_avgs.csv', index=True)

    # yearly averages for positive percentages:

    merged_df['Date'] = pd.to_datetime(merged_df['Date'])
    merged_df['year'] = merged_df['Date'].dt.to_period('Y')
    yearly_avg = merged_df.groupby('year')['positive_percentage'].mean()
    yearly_avg_df = pd.DataFrame({'year': yearly_avg.index, 'mean_pos_percentage': yearly_avg.values})
    yearly_avg_df.set_index('year', inplace=True)

    yearly_avg_df.to_csv('yearly_avgs.csv', index=True)

    # how many are correct on the daily timeframe:

    merged_df['Date'] = pd.to_datetime(merged_df['Date'])
    merged_df['day'] = merged_df['Date'].dt.to_period('D')
    daily_avg = merged_df.groupby('day')['positive_percentage'].mean()
    daily_avg_df = pd.DataFrame({'day': daily_avg.index, 'mean_pos_percentage': daily_avg.values})
    daily_avg_df.set_index('day', inplace=True)

    daily_avg_df.to_csv('daily_avgs.csv', index=True)

    merged_df['final_decision'] = float('nan') # add another row for final decisions

    # get final decisions:

    for index, row in merged_df.iterrows():
        if row['% CHANGE'] >= 0 and row['positive_percentage'] > 50:
            merged_df.loc[index, 'final_decision'] = 1
        elif row['% CHANGE'] < 0 and row['positive_percentage'] < 50:
            merged_df.loc[index, 'final_decision'] = 1
        else:
            merged_df.loc[index, 'final_decision'] = 0

    merged_df.to_csv(f'final_values_{stock}.csv', index=True)
    counts = merged_df['final_decision'].value_counts()

    final_df = pd.read_csv(f'final_values_{stock}.csv')
    final_df.dropna(subset=['Date'], inplace=True) # drop all empty values to get meaningful counts
    counts = final_df['final_decision'].value_counts()

    print(f'\n---\nOut of {counts[0] + counts[1]} final decisions for ${stock}, {round( ( counts[1] / (counts[0] + counts[1])) * 100, 2)}% were correct.\n---')


In [11]:
for stock in ['TSLA', 'TSM', 'AAPL']:
    final_analysis(stock)
    

Historical data has been merged with positive sentiment for $TSLA

---
Out of 252 final decisions for $TSLA, 47.62% were correct.
---
Historical data has been merged with positive sentiment for $TSM

---
Out of 252 final decisions for $TSM, 47.62% were correct.
---
Historical data has been merged with positive sentiment for $AAPL

---
Out of 252 final decisions for $AAPL, 48.41% were correct.
---
