
# Calculate daily sentiment scores with tweet metrics considered



WARNING: THIS NOTEBOOK TOOK ABOUT 2 HOURS TO RUN! You might want to skip running this and use the generated dataset to do analysis in the next notebook. <br>
<br>
It takes a csv with tweets about Bitcoin as an input. <br>
Then it calculates the sentiment of each tweet in the file, also taking into account the likes, retweets and replies each tweet has. <br>
It generates a dataset of daily sentiment scores: "date_and_score.csv". 

## Import packages

In [None]:
import re
import datetime
import time
import pandas as pd
import csv
import nltk
import nltk.corpus
import numpy as np
from nltk.corpus import stopwords
from statistics import mean
from nltk.stem.wordnet import WordNetLemmatizer
from textblob import TextBlob
# Fetch the NLTK resources this notebook relies on (sentiment lexicon,
# stop-word lists and WordNet data), in the same order as before.
for resource in ('vader_lexicon', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Single lemmatiser instance, reused by clean_text() for every tweet.
lem = WordNetLemmatizer()

from nltk.sentiment import SentimentIntensityAnalyzer

## Define helper functions

In [None]:
# One analyser instance shared by every find_sentiment() call.
sia = SentimentIntensityAnalyzer()

# Progress counters: bumped by find_sentiment() and clean_text() respectively.
analyzed_tweets = cleaned_tweets = 0

def find_sentiment(text):
    """Return the polarity scores of ``text`` as ``[neg, neu, pos, compound]``.

    Also prints a single-line progress indicator and increments the
    module-level ``analyzed_tweets`` counter after the text has been scored.
    """
    global analyzed_tweets

    # Blank the current console line, then overwrite it with the progress.
    print(" "*30, end="\r")
    print("Analyzing tweet:", analyzed_tweets + 1, end="\r")

    polarity = sia.polarity_scores(text)
    analyzed_tweets += 1
    return [polarity[key] for key in ("neg", "neu", "pos", "compound")]


def consider_tweet_metrics(text, replies, likes, retweets):  #maybe should also consider favorites
    """Score a tweet and append an engagement-weighted sentiment value.

    Returns the four values produced by ``find_sentiment(text)`` followed by a
    fifth element:

        compound * (replies + 1) * (likes + 1) * (retweets + 1) * (1 - subjectivity**4)

    Each metric is offset by one so that a count of zero does not zero out the
    whole product.
    """
    polarity = find_sentiment(text)
    # [0, 1], whereas 0 means very factual and objective and 1 means highly subjective opinion
    subjectivity = find_subjectivity(text)

    compound = polarity[-1]
    weighted = compound * (replies + 1) * (likes + 1) * (retweets + 1) * (1 - subjectivity**4)
    return polarity + [weighted]

# Noise removed from every tweet; alternatives are tried in this order at each
# position: @mentions, any character that is not a letter/digit/space/tab,
# URLs with a scheme, a leading "rt" marker, and leftover "http" fragments.
# (The original pattern had "@\[", which escaped the bracket and so never
# matched a mention: only the "@" sign was stripped and the username stayed.)
_TWEET_NOISE_RE = re.compile(
    r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
)

# English stop words, loaded on the first clean_text() call and then reused.
_ENGLISH_STOP_WORDS = None


def clean_text(text):
    """Normalise a tweet for sentiment analysis.

    Lower-cases ``text``, strips mentions, punctuation, URLs and a leading
    "rt", drops English stop words and lemmatises the remaining words.

    Also prints a single-line progress indicator and increments the
    module-level ``cleaned_tweets`` counter once the tweet has been processed.

    Args:
        text: The raw tweet text (a string).

    Returns:
        The cleaned words joined by single spaces.
    """
    global cleaned_tweets
    global _ENGLISH_STOP_WORDS

    print(" "*30, end = "\r")
    print("CLeaning tweet:", cleaned_tweets + 1, end = "\r")

    # Load the stop words once instead of on every call; a frozenset makes the
    # per-word membership test constant-time.
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(nltk.corpus.stopwords.words(['english']))

    text = _TWEET_NOISE_RE.sub("", text.lower())
    can_use = [word for word in text.split() if word not in _ENGLISH_STOP_WORDS]
    lemmatised = [lem.lemmatize(word) for word in can_use]

    cleaned_tweets += 1
    return " ".join(lemmatised)

def find_subjectivity(text):
    """Return the subjectivity value TextBlob reports for ``text``."""
    blob_sentiment = TextBlob(text).sentiment
    return blob_sentiment.subjectivity

## Register start time for timing purposes

In [None]:
# Remember when the run started; the last cell uses it to report the runtime.
script_start = time.time()
# Show the start time as a local "YYYY-MM-DD HH:MM:SS" timestamp.
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(script_start)))

## Read the csv containing tweets

In [None]:
# Input file and how to parse it.
f = "tweets.csv"
sep = ";"
nrows = 20843765

# First and last day of the daily bins built in the aggregation step.
start_date = "2016-01-01"
end_date = "2019-11-22"

# specifying the nrows parameter seems to help avoid running out of memory when reading the csv
df_full = pd.read_csv(f, sep=sep, nrows=nrows)
print(df_full.shape)

## Force data types and clean the data

In [None]:
# Engagement counts must be numeric; anything unparsable becomes NaN instead
# of raising.
for metric in ("replies", "likes", "retweets"):
    df_full[metric] = pd.to_numeric(df_full[metric], errors='coerce')

# Rows without a date or text are useless for the analysis.
df_full = df_full.dropna(subset=['timestamp', 'text'])

# Keep only rows whose timestamp *starts* with a 2016-2019 date in YYYY-MM-DD
# form. The previous `str.contains` check accepted the year pattern anywhere
# in the string, so malformed rows slipped through and were needlessly cleaned
# and scored, although their truncated timestamp could never match a daily
# bin. `na=False` also rejects non-string values.
date_filter = r"(?:2016|2017|2018|2019)-\d{2}-\d{2}"
has_valid_date = df_full['timestamp'].str.match(date_filter, na=False)
df_full = df_full[has_valid_date].sort_values(by='timestamp')

# Only leave the first 10 characters of the date field: this discards the
# time of day and keeps just the date.
df_full['timestamp'] = df_full['timestamp'].str[:10]


df = df_full
#df = df_full.sample(frac = 0.001, replace = False, random_state = 1)
print(df.shape)

## Clean the text and run sentiment analysis on all the tweets

In [None]:
# Clean the text first. Applying directly to the column avoids building a
# temporary Series for every row, which is what DataFrame.apply(axis=1) does.
df['text'] = df['text'].apply(clean_text)

# Score every remaining tweet. Each result is the list
# [neg, neu, pos, compound, weighted score] from consider_tweet_metrics().
sentiment_rows = [
    consider_tweet_metrics(text, replies, likes, retweets)
    for text, replies, likes, retweets in zip(
        df['text'], df['replies'], df['likes'], df['retweets']
    )
]
# dtype=object keeps one list per row (and gives a well-defined empty column
# when there are no tweets at all).
df['Sentiment_data'] = pd.Series(sentiment_rows, index=df.index, dtype=object)

## Aggregate sentiment analysis results to daily bins

In [None]:
# Every calendar day in the range, so that all the dates are definitely filled
# (to avoid gaps while merging with the price dataframe).
dates = pd.date_range(start=start_date, end=end_date).strftime("%Y-%m-%d")

# Unpack the per-tweet lists [neg, neu, pos, compound, weighted score] into
# plain numeric columns once, instead of calling `.str.get()` for every day.
sentiment_names = ["neg", "neu", "pos", "compound", "weighted"]
sentiment_matrix = np.array(df["Sentiment_data"].tolist(), dtype=float).reshape(
    -1, len(sentiment_names)
)
per_tweet = df[["timestamp", "replies", "likes", "retweets"]].copy()
for position, name in enumerate(sentiment_names):
    # Plain arrays are assigned positionally, so no index alignment happens.
    per_tweet[name] = sentiment_matrix[:, position]
del sentiment_matrix

# One pass over the data instead of one full scan of `df` per date.
by_day = per_tweet.groupby("timestamp")
daily_mean = by_day.mean().reindex(dates)              # NaN for days without tweets
daily_volume = by_day.size().reindex(dates, fill_value=0)

# The daily score is the mean of the *metric-weighted* value (element 4 of
# Sentiment_data). It was previously read from element 3, the plain compound
# score, which made "score" a rescaled copy of "Avg_sentiment" and ignored
# replies, likes, retweets and subjectivity altogether.
scores = daily_mean["weighted"].to_numpy()
has_scores = not np.all(np.isnan(scores))
peak = np.nanmax(np.abs(scores)) if has_scores else np.nan
# Scale to [-1, 1]; a peak of exactly 0 means every score is 0 (or NaN), so
# there is nothing to scale and dividing would only produce NaN.
normalised_scores = scores / peak if peak != 0 else scores

df_final = pd.DataFrame({
    "Date": dates.to_numpy(),
    "Avg_sentiment": daily_mean["compound"].to_numpy(),
    "Tweets_volume": daily_volume.to_numpy(),
    "Neg_sent_proportion": daily_mean["neg"].to_numpy(),
    "Neu_sent_proportion": daily_mean["neu"].to_numpy(),
    "Pos_sent_proportion": daily_mean["pos"].to_numpy(),
    "Avg_replies": daily_mean["replies"].to_numpy(),
    "Avg_likes": daily_mean["likes"].to_numpy(),
    "Avg_retweets": daily_mean["retweets"].to_numpy(),
    "score": normalised_scores,
})

## Calculate moving averages of sentiment data to get trends

In [None]:
### CALCULATE MOVING AVERAGES, THESE MIGHT OR MIGHT NOT BE USED IN THE FINAL PREDICTION MODEL

labels = ["Avg_sentiment", "Tweets_volume", "Neg_sent_proportion", "Neu_sent_proportion", "Pos_sent_proportion", "Avg_replies", "Avg_likes", "Avg_retweets", "score"]
ma_periods = [2, 7, 21]

# The base columns do not change while trend columns are added, so select
# them once rather than on every iteration.
base_columns = df_final[labels]

for period in ma_periods:
    # Trailing mean over `period` days; NaN until a full window is available.
    moving_averages = base_columns.rolling(period, min_periods=period).mean()

    # Relative deviation of each day's value from its moving average.
    trends = (base_columns / moving_averages - 1).add_prefix(f"{period}_ma_trend_")

    # Skip trends that are NaN everywhere. The result of dropna() has to be
    # assigned: the earlier bare call discarded it and was a no-op.
    trends = trends.dropna(axis=1, how='all')

    for label in trends.columns:
        df_final[label] = trends[label]

## Write results to file

In [None]:
# Persist the daily dataset, then report how long the whole run took (seconds).
df_final.to_csv('date_and_score.csv', index=False)
total_runtime = time.time() - script_start
print("Total runtime: ", total_runtime)