In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import joblib


In [2]:
# Download NLTK resources (only needs to be done once)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vallirajasekar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vallirajasekar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vallirajasekar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
df = pd.read_csv('/Users/vallirajasekar/Desktop/NLP_Challenge/Financial Sentiment Analysis/data.csv')


In [5]:
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [6]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [8]:
# Apply preprocessing to the 'text' column
df['clean_text'] = df['Sentence'].apply(preprocess_text)


In [10]:
df['Sentiment'].value_counts()

Sentiment
neutral     3130
positive    1852
negative     860
Name: count, dtype: int64

In [12]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])
y = df['Sentiment']

In [13]:
# Step 4: Data Splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
# Step 5: Model Training
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [15]:
accuracy = clf.score(X_test, y_test)
print("Model Accuracy:", accuracy)


Model Accuracy: 0.679213002566296


In [17]:
from sklearn.ensemble import RandomForestClassifier


In [22]:
clf = RandomForestClassifier(n_estimators=150, random_state=42)
clf.fit(X_train, y_train)


RandomForestClassifier(n_estimators=150, random_state=42)

In [23]:
accuracy = clf.score(X_test, y_test)
print("Model Accuracy:", accuracy)

Model Accuracy: 0.6484174508126604


In [24]:
from sklearn.svm import SVC


In [31]:
# Step 5: Model Training (SVM)
clf = SVC(kernel='linear', random_state=42,tol=0.0001)
clf.fit(X_train, y_train)


SVC(kernel='linear', random_state=42, tol=0.0001)

In [32]:
# Step 6: Model Evaluation
accuracy = clf.score(X_test, y_test)
print("Model Accuracy:", accuracy)


Model Accuracy: 0.7108639863130881


In [33]:
# Step 7: Save the trained model using joblib
joblib.dump(clf, 'svm_classifier.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("Model and vectorizer saved successfully.")

Model and vectorizer saved successfully.
