In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [20]:
# Import data generated from main.ipynb
# One sentiment-scored CSV per company, read in a fixed order and unpacked
# into the per-company frames used by the rest of the notebook.
amazon_df, apple_df, google_df, microsoft_df, tesla_df = map(
    pd.read_csv,
    (
        "data/sent-amazon.csv",
        "data/sent-apple.csv",
        "data/sent-google.csv",
        "data/sent-microsoft.csv",
        "data/sent-tesla.csv",
    ),
)


In [21]:
"""
Vader Sentiment
Positive sentiment: > 0.05
Negative sentiment: < -0.05
Neutral sentiment: between -0.05 and 0.05
"""
# Add Vader sentiment label (eg. positive, negative, or neutral)
def sent_label(row, threshold=0.05):
    """Classify one row as "Positive", "Negative" or "Neutral".

    The cut-offs follow the VADER convention, where both boundaries are
    inclusive: a score of exactly ``threshold`` is Positive and a score of
    exactly ``-threshold`` is Negative.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a numeric ``"Sentiment"`` entry (presumably the VADER
        compound score written by main.ipynb -- confirm against that notebook).
    threshold : float, default 0.05
        Absolute score at or beyond which a row stops being Neutral.

    Returns
    -------
    str
        ``"Positive"``, ``"Negative"`` or ``"Neutral"``. A NaN score fails
        both comparisons and is therefore labelled ``"Neutral"``.
    """
    score = row["Sentiment"]
    if score >= threshold:
        return "Positive"
    if score <= -threshold:
        return "Negative"
    return "Neutral"

# Label every row of each company's frame with its sentiment class; the new
# "sentiment_label" column is written onto the existing frames in place.
for company_frame in (amazon_df, apple_df, google_df, microsoft_df, tesla_df):
    company_frame["sentiment_label"] = company_frame.apply(sent_label, axis = 1)


In [22]:
# Function to convert datetime format to datetime64 (date only) and pivot table to counts

def date_and_sentiment(df):
    """Count sentiment labels per calendar day.

    ``df["post_date"]`` is interpreted as seconds since the Unix epoch and
    floored to the day; rows are then counted per (day, sentiment label).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``post_date`` column (numeric, or strings holding numbers) and
        a ``sentiment_label`` column. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date`` with one integer column per sentiment label.
        ``Negative``, ``Neutral`` and ``Positive`` are always present (zero
        filled), so callers can index them even when a label never occurs.
        Rows whose timestamp cannot be parsed are left out of the counts.
    """
    expected_labels = ["Negative", "Neutral", "Positive"]

    # Convert epoch seconds to datetime64 and drop the time of day.
    raw_stamps = df["post_date"]
    if pd.api.types.is_numeric_dtype(raw_stamps):
        stamps = pd.to_datetime(raw_stamps, unit='s')
    else:
        # Non-numeric column: unparseable entries become NaT instead of raising.
        as_seconds = pd.to_numeric(raw_stamps, errors='coerce')
        stamps = pd.to_datetime(as_seconds, unit='s', errors='coerce')
    days = stamps.dt.floor('D')

    # Work on a two-column copy so the caller's frame is left untouched.
    labelled = df[["sentiment_label"]].assign(date=days)

    # Group and pivot to get the count values (NaT dates are dropped by groupby).
    group_df = (
        labelled.groupby('date')['sentiment_label']
        .value_counts()
        .reset_index(name='count')
    )
    pivot_df = group_df.pivot(index='date', columns='sentiment_label', values='count')

    # Guarantee the three standard labels exist; keep any other label as well.
    absent = [label for label in expected_labels if label not in pivot_df.columns]
    if absent:
        pivot_df = pivot_df.reindex(columns=sorted([*pivot_df.columns, *absent]))
        pivot_df = pivot_df.rename_axis(columns='sentiment_label')

    return pivot_df.fillna(0).astype(int)


### Generate Preprocessed CSVs for Sentiment Counts
###### Note: The 'Negative' column values are converted to negative integer counts for plotting purposes

In [23]:
# Amazon
# Preprocessed Amazon CSV output: daily sentiment counts.
# NOTE: this cell rebinds amazon_df to the aggregated table, so re-running it
# requires re-running the load and labelling cells first.

amazon_df = date_and_sentiment(amazon_df)
amazon_df['Negative'] = -amazon_df['Negative']  # sign-flipped for plotting
amazon_df["Company"] = "Amazon"
# Write the `date` index too; with index=False the CSV lost the day each row
# of counts belongs to.
amazon_df.to_csv('./data/amzn_sent_count.csv', index=True)
amazon_df.head(2)

sentiment_label,Negative,Neutral,Positive,Company
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01,-61,28,42,Amazon
2015-01-02,-34,64,76,Amazon


In [24]:
# Apple
# Preprocessed Apple CSV output: daily sentiment counts.
# NOTE: this cell rebinds apple_df to the aggregated table, so re-running it
# requires re-running the load and labelling cells first.
apple_df = date_and_sentiment(apple_df)
apple_df['Negative'] = -apple_df['Negative']  # sign-flipped for plotting
apple_df["Company"] = "Apple"
# Write the `date` index too; with index=False the CSV lost the day each row
# of counts belongs to.
apple_df.to_csv('./data/aapl_sent_count.csv', index=True)
apple_df.head(2)

sentiment_label,Negative,Neutral,Positive,Company
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01,-42,138,119,Apple
2015-01-02,-163,399,305,Apple


In [25]:
# Google
# Preprocessed combined Google CSV output: daily sentiment counts.
# NOTE: this cell rebinds google_df to the aggregated table, so re-running it
# requires re-running the load and labelling cells first.

google_df = date_and_sentiment(google_df)
google_df['Negative'] = -google_df['Negative']  # sign-flipped for plotting
google_df["Company"] = "Google"
# Write the `date` index too; with index=False the CSV lost the day each row
# of counts belongs to.
google_df.to_csv('./data/google_sent_count.csv', index=True)
google_df.head(2)

sentiment_label,Negative,Neutral,Positive,Company
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01,-8,46,51,Google
2015-01-02,-31,112,113,Google


In [26]:
# Microsoft
# Preprocessed Microsoft CSV output: daily sentiment counts.
# NOTE: this cell rebinds microsoft_df to the aggregated table, so re-running
# it requires re-running the load and labelling cells first.

microsoft_df = date_and_sentiment(microsoft_df)
microsoft_df['Negative'] = -microsoft_df['Negative']  # sign-flipped for plotting
microsoft_df["Company"] = "Microsoft"
# Write the `date` index too; with index=False the CSV lost the day each row
# of counts belongs to.
microsoft_df.to_csv('./data/msft_sent_count.csv', index=True)
microsoft_df.head(2)

sentiment_label,Negative,Neutral,Positive,Company
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01,-8,23,23,Microsoft
2015-01-02,-17,39,51,Microsoft


In [27]:
# Tesla
# Preprocessed Tesla CSV output: daily sentiment counts.
# NOTE: this cell rebinds tesla_df to the aggregated table, so re-running it
# requires re-running the load and labelling cells first.

tesla_df = date_and_sentiment(tesla_df)
tesla_df['Negative'] = -tesla_df['Negative']  # sign-flipped for plotting
tesla_df["Company"] = "Tesla"
# Write the `date` index too; with index=False the CSV lost the day each row
# of counts belongs to.
tesla_df.to_csv('./data/tsla_sent_count.csv', index=True)
tesla_df.head(2)

sentiment_label,Negative,Neutral,Positive,Company
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01,-22,38,39,Tesla
2015-01-02,-31,85,85,Tesla


In [28]:
# all sentiments combined
# Stack the per-company daily count tables row-wise; each keeps its own
# `date` index, so dates repeat once per company.

combined_counts_df = pd.concat([amazon_df, apple_df, google_df, microsoft_df, tesla_df], axis=0)
# Write the `date` index too; with index=False the combined CSV had no dates,
# only the Company column, to tell rows apart.
combined_counts_df.to_csv('./data/combined_sent_count.csv', index=True)


In [29]:
# Display the combined table; the rendered output elides the middle rows.
combined_counts_df

sentiment_label,Negative,Neutral,Positive,Company
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01,-61,28,42,Amazon
2015-01-02,-34,64,76,Amazon
2015-01-03,-8,31,28,Amazon
2015-01-04,-10,46,29,Amazon
2015-01-05,-29,132,120,Amazon
...,...,...,...,...
2019-12-27,-300,449,556,Tesla
2019-12-28,-139,168,318,Tesla
2019-12-29,-174,168,291,Tesla
2019-12-30,-353,509,555,Tesla
