In [1]:
# Load the autoreload extension
%load_ext autoreload

# Set autoreload to automatically reload all modules
%autoreload 2

In [2]:
import sys
from pathlib import Path

# Add the src directory to sys.path
sys.path.append(str(Path().resolve() / "src"))

# Import util
import util

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Straakh\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
import torch
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np

In [4]:
DATA_PATH = "../data/truthseeker.csv"
df = pd.read_csv(DATA_PATH)
# Comment on on actual execution. For development, this small sample should run quickly on the CPU
df = df.sample(frac=0.01,  random_state=27)
df.drop(columns=["Unnamed: 0"], inplace=True)
df = df[~df["5_label_majority_answer"].isin(["NO MAJORITY", "Unrelated"])]

clean_cols = ["statement", "tweet"]
df[clean_cols] = df[clean_cols].applymap(util.clean_text)

print(df.shape)
df.head(2)

(1123, 8)


  df[clean_cols] = df[clean_cols].applymap(util.clean_text)


Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,3_label_majority_answer
58262,April Hunt,Unlike marijuana medical cannabis oil cannot g...,True,1.0,"medical canabis, cannot, high",You cannot open a medical cannabis dispensary...,Agree,Agree
40146,Louis Jacobson,There are more words in the IRS code than the...,True,1.0,"More words, IRS code, Bible",There are more words in the IRS code than the...,Mostly Disagree,Disagree


In [None]:
# Aggregating tweet-level features for each statement
aggregated_features = []
for statement, group in df.groupby("statement"):
    aggregated_sentiment, aggregated_embedding = util.extract_features(group["tweet"].tolist())
    aggregated_features.append([aggregated_sentiment, aggregated_embedding])

# Convert aggregated features into a new DataFrame
aggregated_df = pd.DataFrame(aggregated_features, columns=["Avg_Sentiment", "Avg_Embedding"])

# Flatten the embeddings into columns (assuming embedding size is 768)
embedding_cols = [f"Embed_{i}" for i in range(768)]
embedding_df = pd.DataFrame(np.vstack(aggregated_df["Avg_Embedding"].to_numpy()), columns=embedding_cols)

# Extract statement-level embeddings
statement_embeddings = util.get_statement_embeddings(df["statement"].unique())

In [10]:
# Combine all features (sentiment, tweet embeddings, statement embeddings)
X = pd.concat([
    aggregated_df[["Avg_Sentiment"]],
    embedding_df,
    pd.DataFrame(statement_embeddings, columns=[f"Stmt_Embed_{i}" for i in range(768)])
], axis=1)

# Define the true/false labels for statements
y = df.groupby("statement")["target"].first()

# Train a classifier to predict the truth of a statement
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Evaluate the model
y_pred = classifier.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

       False       0.74      0.61      0.67        41
        True       0.69      0.80      0.74        45

    accuracy                           0.71        86
   macro avg       0.71      0.70      0.70        86
weighted avg       0.71      0.71      0.71        86

