1.	Download the Kaggle Financial Sentiment Data

In [None]:
# s1: data collection - kaggle financial sentiment data

# the dataset 'data.csv' is already downloaded using the professor's link,
# and it's uploaded to google drive in the same folder as the jupyter notebook.
# let's load it and check the first few rows to make sure it's ready for use.

import pandas as pd

# load the dataset from google drive
file_path = '/content/drive/MyDrive/HW02/data.csv'  # update this path if necessary
df = pd.read_csv(file_path)

# display the first few rows to verify the data is loaded properly
df.head()  # quick check to make sure we're good to go


In [1]:
from google.colab import drive
drive.mount('/content/drive')


ModuleNotFoundError: No module named 'google'

2. Use a randomized sample of 80% data for training, and the rest 20% for testing

In [None]:
# s2: split the data into training and testing sets (80% training, 20% testing)

from sklearn.model_selection import train_test_split

# X is the input data (the sentences) and y is the target labels (sentiments)
X = df['Sentence']  # sentences (input)
y = df['Sentiment']  # sentiment labels (output)

# split the data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# check the size of the splits to ensure it's correct
print(f"Training data size: {len(X_train)}")
print(f"Testing data size: {len(X_test)}")


3.Build a linearSVC classifier using unigrams. You can decide on the other vectorization options.

a.	Report the top 20 positive features and negative features.

b.	Report the f1 and accuracy results.

c.	Examine up to 25 FP and FN errors and report linguistic patterns.


In [None]:
# s3: build a LinearSVC classifier using unigrams

# load necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# define file path for the dataset in google drive
file_path = '/content/drive/MyDrive/HW02/data.csv'  # file is already accessible

# load the dataset
df = pd.read_csv(file_path)

# separate features (X) and labels (y)
X = df['Sentence']  # input text
y = df['Sentiment']  # sentiment labels

# split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# convert text into numerical format using tf-idf (unigrams only)
vectorizer = TfidfVectorizer(ngram_range=(1, 1))  # unigrams only
X_train_tfidf = vectorizer.fit_transform(X_train)  # fit and transform training data
X_test_tfidf = vectorizer.transform(X_test)  # transform test data (same vectorizer)

# train the LinearSVC model
svc = LinearSVC(random_state=42, dual=False)  # linear support vector classifier, dual=False for efficiency
svc.fit(X_train_tfidf, y_train)  # train on labeled data

# extract feature names
feature_names = np.array(vectorizer.get_feature_names_out())  # get all feature names

# get the mean absolute value of the coefficients (for feature importance)
coefficients = np.mean(np.abs(svc.coef_), axis=0)

# get the top 20 features that contribute most to positive/negative classification
top_positive_idx = coefficients.argsort()[-20:][::-1]  # highest positive weights
top_negative_idx = coefficients.argsort()[:20]  # lowest (most negative) weights

# get the actual words
top_positive_features = feature_names[top_positive_idx]
top_negative_features = feature_names[top_negative_idx]

# s3a: visualize top 20 positive & negative features (black/gray bars)
plt.figure(figsize=(12, 6))

# positive features bar chart (black bars)
plt.subplot(1, 2, 1)
plt.barh(top_positive_features[::-1], coefficients[top_positive_idx][::-1], color='black')
plt.xlabel("importance score")
plt.title("top 20 positive features")
plt.gca().invert_yaxis()

# negative features bar chart (gray bars)
plt.subplot(1, 2, 2)
plt.barh(top_negative_features[::-1], coefficients[top_negative_idx][::-1], color='gray')
plt.xlabel("importance score")
plt.title("top 20 negative features")
plt.gca().invert_yaxis()

plt.tight_layout()
plt.show()

# s3b: model performance (using a table, no visualization)
y_pred = svc.predict(X_test_tfidf)  # make predictions on test data

# compute accuracy and f1 score
accuracy = accuracy_score(y_test, y_pred)  # percentage of correct predictions
report = classification_report(y_test, y_pred, output_dict=True)  # full classification report
f1_score = report['accuracy']  # extract accuracy score

# display model performance as a table
evaluation_df = pd.DataFrame({
    "metric": ["accuracy", "f1 score"],
    "score": [accuracy, f1_score]
})
print("\nmodel performance metrics:")
print(evaluation_df)

# s3c: analyze false positives (FP) and false negatives (FN)
false_positives = []  # predicted positive, but should not be
false_negatives = []  # predicted negative, but should not be

# iterate through test data and collect errors
for i in range(len(y_test)):
    if y_pred[i] == 'positive' and y_test.iloc[i] != 'positive':
        false_positives.append((X_test.iloc[i], y_test.iloc[i], y_pred[i]))
    elif y_pred[i] == 'negative' and y_test.iloc[i] != 'negative':
        false_negatives.append((X_test.iloc[i], y_test.iloc[i], y_pred[i]))

# limit to first 25 examples of each
num_fp_display = min(25, len(false_positives))
num_fn_display = min(25, len(false_negatives))

# create dataframe for fp & fn errors
fp_fn_df = pd.DataFrame({
    "sentence": [fp[0] for fp in false_positives[:num_fp_display]] + [fn[0] for fn in false_negatives[:num_fn_display]],
    "true label": [fp[1] for fp in false_positives[:num_fp_display]] + [fn[1] for fn in false_negatives[:num_fn_display]],
    "predicted label": [fp[2] for fp in false_positives[:num_fp_display]] + [fn[2] for fn in false_negatives[:num_fn_display]]
})

print("\nfalse positives & false negatives:")
print(fp_fn_df)


# s4: build a logistic regression classifier using fasttext embeddings

In [None]:
# s1: install fasttext, load dataset, and initialize embeddings

# install fasttext (if not already installed)
!pip install fasttext

# load necessary libraries
import pandas as pd
import numpy as np
import fasttext.util  # for loading pre-trained fasttext embeddings
from sklearn.model_selection import train_test_split

# define file path for the dataset in google drive
file_path = '/content/drive/MyDrive/HW02/data.csv'  # file is already accessible

# load dataset
df = pd.read_csv(file_path)

# separate features (X) and labels (y)
X = df['Sentence']  # input text, raw sentences
y = df['Sentiment']  # sentiment labels (positive, negative, neutral)

# split data into 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# download and load pre-trained fasttext embeddings (english, 300-dimensional vectors)
fasttext.util.download_model('en', if_exists='ignore')  # download only if not already available
ft = fasttext.load_model('cc.en.300.bin')  # load fasttext model

# fasttext embeddings are now ready to be used for sentence vectorization
print("dataset loaded, fasttext embeddings initialized")
