In [18]:
pip install nltk scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.3.0-cp39-cp39-win_amd64.whl (9.3 MB)
     ---------------------------------------- 9.3/9.3 MB 19.2 MB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.3.0 threadpoolctl-3.2.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\yemia\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


In [22]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load the data
data = pd.read_csv('sent_train.csv')
data.head()


Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral https://t....,0
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0


In [7]:
# Install the stopwords resource
nltk.download('stopwords')

# Data cleaning
def clean_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\$[A-Za-z]+', '', text)  # Remove stock symbols
    text = re.sub(r'\W+', ' ', text.lower())  # Remove non-word characters and convert to lowercase
    return text.strip()

data['cleaned_text'] = data['text'].apply(clean_text)

# Remove stop words and perform stemming
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

data['processed_text'] = data['cleaned_text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split() if word not in stop_words]))

# Now, 'processed_text' column contains the preprocessed text data
data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yemia\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Unnamed: 0,text,label,cleaned_text,processed_text
0,$BYND - JPMorgan reels in expectations on Beyo...,0,jpmorgan reels in expectations on beyond meat,jpmorgan reel expect beyond meat
1,$CCL $RCL - Nomura points to bookings weakness...,0,nomura points to bookings weakness at carnival...,nomura point book weak carniv royal caribbean
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0,cemex cut at credit suisse j p morgan on weak ...,cemex cut credit suiss j p morgan weak build o...
3,$ESS: BTIG Research cuts to Neutral https://t....,0,btig research cuts to neutral,btig research cut neutral
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0,funko slides after piper jaffray pt cut,funko slide piper jaffray pt cut


In [20]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Choose the maximum number of features
X = tfidf_vectorizer.fit_transform(data['processed_text']).toarray()
y = data['label']

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Model training
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)
# Model prediction
y_pred = naive_bayes.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))


Accuracy: 0.74
              precision    recall  f1-score   support

           0       0.89      0.18      0.29       285
           1       0.75      0.38      0.50       391
           2       0.73      0.98      0.84      1233

    accuracy                           0.74      1909
   macro avg       0.79      0.51      0.55      1909
weighted avg       0.76      0.74      0.69      1909



# Evaluating Validation Data

In [24]:
# Load the data
val_data = pd.read_csv('sent_valid.csv')
val_data.head()

Unnamed: 0,text,label
0,$ALLY - Ally Financial pulls outlook https://t...,0
1,"$DELL $HPE - Dell, HPE targets trimmed on comp...",0
2,$PRTY - Moody's turns negative on Party City h...,0
3,$SAN: Deutsche Bank cuts to Hold,0
4,$SITC: Compass Point cuts to Sell,0


In [25]:
val_data['cleaned_text'] = val_data['text'].apply(clean_text)
val_data['processed_text'] = val_data['cleaned_text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split() if word not in stop_words]))

# Now, 'processed_text' column contains the preprocessed text data
val_data.head()

Unnamed: 0,text,label,cleaned_text,processed_text
0,$ALLY - Ally Financial pulls outlook https://t...,0,ally financial pulls outlook,alli financi pull outlook
1,"$DELL $HPE - Dell, HPE targets trimmed on comp...",0,dell hpe targets trimmed on compute headwinds,dell hpe target trim comput headwind
2,$PRTY - Moody's turns negative on Party City h...,0,moody s turns negative on party city,moodi turn neg parti citi
3,$SAN: Deutsche Bank cuts to Hold,0,deutsche bank cuts to hold,deutsch bank cut hold
4,$SITC: Compass Point cuts to Sell,0,compass point cuts to sell,compass point cut sell


In [26]:
## Evaluating Validation Data

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Choose the maximum number of features
val_X = tfidf_vectorizer.fit_transform(val_data['processed_text']).toarray()
val_y = val_data['label']

# Model prediction
val_y_pred = naive_bayes.predict(val_X)
# Model evaluation
val_accuracy = accuracy_score(val_y, val_y_pred)
print(f"Accuracy: {val_accuracy:.2f}")
print(classification_report(val_y, val_y_pred))

Accuracy: 0.61
              precision    recall  f1-score   support

           0       0.15      0.04      0.07       347
           1       0.27      0.11      0.16       475
           2       0.66      0.88      0.76      1566

    accuracy                           0.61      2388
   macro avg       0.36      0.35      0.33      2388
weighted avg       0.51      0.61      0.54      2388

