In [60]:
!pip install wget --quiet
!pip install sentence-transformers --quiet
!pip install scikit-learn --quiet
import wget
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib
import numpy as np

In [63]:
# Remote location of the labelled-tweets CSV and the local filename it is cached under.
csv_file_path = "https://raw.githubusercontent.com/trishakarma/stocks-sentiment-analysis/refs/heads/main/tweets_labelled_09042020_16072020.csv"
file_path = "tweets_labelled_09042020_16072020.csv"

# Reuse the local copy when one is present; otherwise fetch it once.
if os.path.exists(file_path):
    print("File already exists in directory")
else:
    wget.download(csv_file_path, file_path)
    print("File Downloaded")

File already exists in directory


In [64]:
# Read only the "text" and "sentiment" columns from the semicolon-separated file,
# drop rows where either of them is missing, and renumber the index from zero.
df = (
    pd.read_csv(file_path, sep=";", usecols=["text", "sentiment"])
    .dropna(subset=["text", "sentiment"])
    .reset_index(drop=True)
)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,text,sentiment
0,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",positive
1,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,negative
2,Net issuance increases to fund fiscal programs...,positive
3,RT @bentboolean: How much of Amazon's traffic ...,positive
4,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,positive


In [65]:
# Create embeddings
# Load the pretrained sentence encoder by name.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every tweet text; the result is requested as a NumPy array with
# normalization of the embeddings switched on.
embeddings = model.encode(
    df["text"].to_list(),
    convert_to_numpy=True,
    normalize_embeddings=True,
)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

In [66]:
# Training
# Hold out 20% of the embedded tweets for evaluation; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    df["sentiment"],
    test_size=0.2,
    random_state=42,
)

# Fit a logistic-regression classifier on the training portion.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out portion.
print("Model performance on held-out test set:")
print(classification_report(y_test, clf.predict(X_test)))

# Persist the fitted classifier to disk. Kept as the final expression so the
# value returned by joblib.dump is displayed as the cell output.
joblib.dump(clf, "sentiment_model.pkl")

Model performance on held-out test set:
              precision    recall  f1-score   support

    negative       0.64      0.44      0.52        78
     neutral       0.61      0.56      0.58        88
    positive       0.52      0.70      0.60        94

    accuracy                           0.57       260
   macro avg       0.59      0.56      0.57       260
weighted avg       0.59      0.57      0.57       260



['sentiment_model.pkl']

In [67]:
# Run the trained classifier over every embedding (not only the held-out rows).
predicted_sentiments = clf.predict(embeddings)

# Assemble the frame to be stored: the tweet text, the classifier's prediction
# (note: this is the predicted label, not the "sentiment" column read from the
# CSV), and one float32 vector per tweet.
s2_df = pd.DataFrame(
    {
        "text": df["text"],
        "sentiment": predicted_sentiments,
        "embedding": [np.array(vec, dtype=np.float32) for vec in embeddings],
    }
)

In [70]:
%%sql
-- Start from a clean state by removing any existing sentiments database and creating it again.
DROP DATABASE IF EXISTS sentiments;
CREATE DATABASE IF NOT EXISTS sentiments;

In [71]:
%%sql
-- Select the sentiments database and rebuild the tweets table from scratch.
USE sentiments;
DROP TABLE IF EXISTS tweets;
-- Columns are the tweet text, a sentiment string, and a required 384-element vector.
-- NOTE(review) 384 presumably matches the width of the embeddings written to this table - confirm.
CREATE TABLE IF NOT EXISTS tweets (
    text TEXT,
    sentiment TEXT,
    embedding VECTOR(384) NOT NULL
);
    

In [72]:
import sqlalchemy
from sqlalchemy import create_engine

# Open a SQLAlchemy engine. connection_url is not defined in this cell and must
# already exist in the notebook session.
db_connection = create_engine(connection_url)

# Append the rows of s2_df to the sentiments.tweets table in batches of 1000,
# leaving the DataFrame index out. Kept as the final expression so the value
# returned by to_sql is displayed as the cell output.
s2_df.to_sql(name="tweets", con=db_connection, schema="sentiments", if_exists="append", index=False, chunksize=1000)

1300