In [14]:
pip install gspread pandas oauth2client google-api-python-client google-auth-httplib2 google-auth-oauthlib

Collecting google-api-python-client
  Downloading google_api_python_client-2.125.0-py2.py3-none-any.whl (12.5 MB)
                                              0.0/12.5 MB ? eta -:--:--
                                              0.1/12.5 MB 2.6 MB/s eta 0:00:05
     --                                       0.7/12.5 MB 9.3 MB/s eta 0:00:02
     -----                                    1.6/12.5 MB 12.9 MB/s eta 0:00:01
     --------                                 2.5/12.5 MB 14.5 MB/s eta 0:00:01
     ----------                               3.3/12.5 MB 15.3 MB/s eta 0:00:01
     -------------                            4.2/12.5 MB 15.8 MB/s eta 0:00:01
     ----------------                         5.0/12.5 MB 16.1 MB/s eta 0:00:01
     ------------------                       5.9/12.5 MB 16.3 MB/s eta 0:00:01
     ---------------------                    6.7/12.5 MB 16.6 MB/s eta 0:00:01
     ------------------------                 7.5/12.5 MB 17.2 MB/s eta 0:00:01
     -----------

In [29]:
import os

import gspread
import pandas as pd
import torch
from googleapiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
from torch.nn.functional import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

In [30]:
# define scope and obtain credentials
scope = ["https://spreadsheets.google.com/feeds",'https://www.googleapis.com/auth/spreadsheets',
         "https://www.googleapis.com/auth/drive.file","https://www.googleapis.com/auth/drive"]

# input json file
# The service-account key location is read from the environment so that no
# credential path is hard-coded in the notebook.
keyfile_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", "")
if not keyfile_path:
    # Fail early with an actionable message instead of an opaque error from
    # trying to open an empty path.
    raise FileNotFoundError(
        "Set the GOOGLE_APPLICATION_CREDENTIALS environment variable to the "
        "path of the service-account JSON key file."
    )
creds = ServiceAccountCredentials.from_json_keyfile_name(keyfile_path, scope)

# authentication
client = gspread.authorize(creds)

In [31]:
# Extract all stock tweet csvs
service = build('drive', 'v3', credentials=creds)
query = "mimeType='application/vnd.google-apps.spreadsheet' and name = 'stock_tweets'"

# files().list returns one page per call; keep requesting pages until the
# response no longer carries a nextPageToken so no matching file is missed.
items = []
page_token = None
while True:
    results = service.files().list(q=query,
                                   spaces='drive',
                                   fields='nextPageToken, files(id, name)',
                                   pageToken=page_token).execute()
    items.extend(results.get('files', []))
    page_token = results.get('nextPageToken')
    if not page_token:
        break

# One DataFrame per spreadsheet; they are concatenated once after the loop
# rather than growing a frame on every iteration.
sheet_frames = []

if not items:
    print('No files found.')
else:
    for item in items:
        sh = client.open_by_key(item['id'])
        worksheet = sh.get_worksheet(0) # First worksheet
        data = worksheet.get_all_values() # Get values

        # A worksheet without a header row has nothing to load; skip it
        # instead of failing while promoting the first row to column names.
        if not data or not data[0]:
            continue

        # Store in df: the first row is the header, the remaining rows are records.
        df = pd.DataFrame(data[1:], columns=data[0])
        sheet_frames.append(df)

# Fall back to an empty frame when nothing was loaded (pd.concat rejects an empty list).
all_data_df = pd.concat(sheet_frames, ignore_index=True) if sheet_frames else pd.DataFrame()

all_data_df.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc."


In [32]:
# Import BERT model
model_name = "ahmedrachid/FinancialBERT-Sentiment-Analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Build the pipeline from the model object loaded above and bind it to the same
# device, so inference actually runs where `device` says and the weights are
# not loaded a second time from the model name.
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)
print(device)

cuda


In [33]:
def _score_tweet(text):
    """Return the sentiment pipeline's raw result for a single tweet text."""
    return sentiment_pipeline(text)

# Score tweets one at a time; every cell of the new column keeps the
# pipeline's unmodified return value for that tweet.
tweet_sentiments = all_data_df['Tweet'].apply(_score_tweet)
all_data_df['Sentiment'] = tweet_sentiments
all_data_df.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name,Sentiment
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.9954950809478..."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.9998196959495..."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.9992457628250..."
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.5450521707534..."
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.9839450716972..."


In [35]:
# Get the sentiment
def extract_label(row):
    """Return the 'label' value of the first entry in one pipeline result.

    Parameters
    ----------
    row : sequence of dict
        The value stored in one cell of the 'Sentiment' column; its first
        element must be a mapping with a 'label' key.

    Returns
    -------
    The label stored under 'label' in the first entry.
    """
    return row[0]['label']

# Map each result straight to its label; the mapped Series is assigned directly
# instead of wrapping every scalar in its own pd.Series.
all_data_df['sentiment_label'] = all_data_df['Sentiment'].apply(extract_label)
all_data_df.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name,Sentiment,sentiment_label
0,Sep-29-22 23:41,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.9954950809478...",neutral
1,Sep-29-22 23:24,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.9998196959495...",neutral
2,Sep-29-22 23:18,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.9992457628250...",neutral
3,Sep-29-22 22:40,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.5450521707534...",neutral
4,Sep-29-22 22:27,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.9839450716972...",neutral


In [34]:
# Standardise Date
# Parse the raw 'Date' values into datetimes, then render them back as text
# such as 'Sep-29-22 23:41' (abbreviated month, day, two-digit year, hour:minute).
parsed_dates = pd.to_datetime(all_data_df['Date'])
all_data_df['Date'] = parsed_dates.dt.strftime('%b-%d-%y %H:%M')
all_data_df.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name,Sentiment
0,Sep-29-22 23:41,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.9954950809478..."
1,Sep-29-22 23:24,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.9998196959495..."
2,Sep-29-22 23:18,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.9992457628250..."
3,Sep-29-22 22:40,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.5450521707534..."
4,Sep-29-22 22:27,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.9839450716972..."


In [36]:
# NOTE(review): this cell repeats the label extraction already performed by an
# earlier cell in this notebook; one of the two can be dropped.
def extract_label(row):
    """Return the 'label' value of the first entry in one pipeline result.

    Parameters
    ----------
    row : sequence of dict
        The value stored in one cell of the 'Sentiment' column; its first
        element must be a mapping with a 'label' key.

    Returns
    -------
    The label stored under 'label' in the first entry.
    """
    return row[0]['label']

# Map each result straight to its label; the mapped Series is assigned directly
# instead of wrapping every scalar in its own pd.Series.
all_data_df['sentiment_label'] = all_data_df['Sentiment'].apply(extract_label)
all_data_df.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name,Sentiment,sentiment_label
0,Sep-29-22 23:41,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.9954950809478...",neutral
1,Sep-29-22 23:24,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.9998196959495...",neutral
2,Sep-29-22 23:18,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.9992457628250...",neutral
3,Sep-29-22 22:40,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.5450521707534...",neutral
4,Sep-29-22 22:27,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.","[{'label': 'neutral', 'score': 0.9839450716972...",neutral


In [42]:
# Write the combined frame to disk, leaving the row index out of the file.
output_csv = 'all_data.csv'
all_data_df.to_csv(path_or_buf=output_csv, index=False)