In [12]:
pip install nltk scikit-learn transformers

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\yemia\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


In [2]:
# Install torch from a terminal using the command below, then restart the notebook kernel.
#pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from transformers import BertTokenizer, BertModel
import torch
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import  accuracy_score, classification_report
# Fetch the NLTK English stop-word list used for stop-word filtering below
# (nltk.download reports "already up-to-date" when a local copy exists)
nltk.download('stopwords')
# The punkt tokenizer download is left disabled; no visible cell calls a punkt-based tokenizer
#nltk.download('punkt')

# Load the training split; later cells read its 'text' and 'label' columns
data = pd.read_csv('sent_train.csv')

# Data cleaning
def clean_text(text):
    """Normalize a raw headline/tweet before stop-word removal and stemming.

    Steps, in order: remove URLs, remove ``$TICKER`` cashtags, lowercase,
    replace every run of non-word characters with a single space, and trim
    leading/trailing whitespace.

    Parameters
    ----------
    text : str or None
        Raw text. Missing values (``None`` / NaN, e.g. an empty CSV cell)
        yield an empty string; any other non-string value is converted
        with ``str()`` first.

    Returns
    -------
    str
        The cleaned, lower-cased text (possibly empty).
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings, so handle missing cells explicitly
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    # 'http\S+' already covers 'https...' links, so no separate alternative is needed
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'\$[A-Za-z]+', '', text)  # Remove stock symbols
    text = re.sub(r'\W+', ' ', text.lower())  # Remove non-word characters and convert to lowercase
    return text.strip()

# Strip URLs, cashtags and punctuation from every raw training text
data['cleaned_text'] = data['text'].map(clean_text)

# Porter stemmer and English stop-word lookup shared by the preprocessing steps
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


def _stem_content_words(sentence):
    """Drop stop words from a whitespace-split sentence and stem what remains."""
    kept_tokens = (token for token in sentence.split() if token not in stop_words)
    return ' '.join(stemmer.stem(token) for token in kept_tokens)


data['processed_text'] = data['cleaned_text'].map(_stem_content_words)

# Now, 'processed_text' column contains the preprocessed text data

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Encode the corpus in mini-batches: one forward pass over every row would have
# to hold the activations for the whole dataset in memory at the same time.
EMBED_BATCH_SIZE = 64
train_texts = data['processed_text'].tolist()
cls_chunks = []

with torch.no_grad():  # inference only, so no autograd graph is needed
    for start in range(0, len(train_texts), EMBED_BATCH_SIZE):
        # padding=True pads to the longest sequence within this batch
        inputs = tokenizer(
            train_texts[start:start + EMBED_BATCH_SIZE],
            padding=True,
            truncation=True,
            return_tensors='pt',
        )
        outputs = model(**inputs)
        # Keep the hidden state at position 0 (the [CLS] token) as the sentence vector
        cls_chunks.append(outputs.last_hidden_state[:, 0, :])

# One embedding row per input text, in the original row order
bert_embeddings = torch.cat(cls_chunks).numpy()

# Labels, aligned row-for-row with the embedding matrix
y = data['label']

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    bert_embeddings,
    y,
    test_size=0.2,
    random_state=42,
)

# Linear-kernel SVM; fit() returns the estimator itself, so build and train in one step
svm_classifier = SVC(kernel='linear', C=1.0).fit(X_train, y_train)

# Score the held-out rows and show per-class precision / recall / F1
y_pred = svm_classifier.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yemia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloading model.safetensors: 100%|██████████| 440M/440M [00:13<00:00, 33.7MB/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


              precision    recall  f1-score   support

           0       0.46      0.41      0.43       285
           1       0.55      0.43      0.49       391
           2       0.79      0.87      0.83      1233

    accuracy                           0.71      1909
   macro avg       0.60      0.57      0.58      1909
weighted avg       0.69      0.71      0.70      1909



# Evaluating Validation Data

In [3]:
# Load the validation split from disk and preview its first rows
val_data = pd.read_csv('sent_valid.csv')
val_data.head()

Unnamed: 0,text,label
0,$ALLY - Ally Financial pulls outlook https://t...,0
1,"$DELL $HPE - Dell, HPE targets trimmed on comp...",0
2,$PRTY - Moody's turns negative on Party City h...,0
3,$SAN: Deutsche Bank cuts to Hold,0
4,$SITC: Compass Point cuts to Sell,0


In [4]:
# Apply the same cleaning as for the training data
val_data['cleaned_text'] = val_data['text'].map(clean_text)

# Same stop-word filtering and stemming as for the training data
val_data['processed_text'] = val_data['cleaned_text'].map(
    lambda sentence: ' '.join(
        stemmer.stem(token) for token in sentence.split() if token not in stop_words
    )
)

# No separate word-level tokenization step: the processed strings are passed
# to the BERT tokenizer as-is in the embedding cell.

val_data.head()

Unnamed: 0,text,label,cleaned_text,processed_text
0,$ALLY - Ally Financial pulls outlook https://t...,0,ally financial pulls outlook,alli financi pull outlook
1,"$DELL $HPE - Dell, HPE targets trimmed on comp...",0,dell hpe targets trimmed on compute headwinds,dell hpe target trim comput headwind
2,$PRTY - Moody's turns negative on Party City h...,0,moody s turns negative on party city,moodi turn neg parti citi
3,$SAN: Deutsche Bank cuts to Hold,0,deutsche bank cuts to hold,deutsch bank cut hold
4,$SITC: Compass Point cuts to Sell,0,compass point cuts to sell,compass point cut sell


In [6]:
# Encode the validation texts with the same tokenizer/model, in mini-batches so
# the whole split never has to pass through BERT in a single forward call.
VAL_EMBED_BATCH_SIZE = 64
val_texts = val_data['processed_text'].tolist()
val_cls_chunks = []

with torch.no_grad():  # inference only, so no autograd graph is needed
    for start in range(0, len(val_texts), VAL_EMBED_BATCH_SIZE):
        # padding=True pads to the longest sequence within this batch
        val_inputs = tokenizer(
            val_texts[start:start + VAL_EMBED_BATCH_SIZE],
            padding=True,
            truncation=True,
            return_tensors='pt',
        )
        outputs = model(**val_inputs)
        # Keep the hidden state at position 0 (the [CLS] token) as the sentence vector
        val_cls_chunks.append(outputs.last_hidden_state[:, 0, :])

# One embedding row per validation text, in the original row order
val_bert_embeddings = torch.cat(val_cls_chunks).numpy()

# Prepare target variable
val_y = val_data['label']

# Model prediction on val data
val_y_pred = svm_classifier.predict(val_bert_embeddings)
# Model evaluation
val_accuracy = accuracy_score(val_y, val_y_pred)
print(f"Accuracy: {val_accuracy:.2f}")
print(classification_report(val_y, val_y_pred))

              precision    recall  f1-score   support

           0       0.46      0.41      0.44       347
           1       0.57      0.49      0.53       475
           2       0.80      0.86      0.83      1566

    accuracy                           0.72      2388
   macro avg       0.61      0.59      0.60      2388
weighted avg       0.71      0.72      0.71      2388

