In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sentence_transformers import SentenceTransformer
import numpy as np

In [None]:
# Initialize a small, fast BERT model suitable for local machines
# This relates to the "NLP techniques" mentioned in your repo docs [3]
bert_model = SentenceTransformer('all-MiniLM-L6-v2') 

In [2]:
df = pd.read_csv('cleaned_liar_dataset.csv')

In [3]:

def map_labels(label):
    # Normalize text to lowercase to avoid mismatches
    label = str(label).lower() 
    if label in ['true', 'mostly-true', 'half-true']:
        return 1 # REAL
    else:
        return 0 # FAKE (Includes 'false', 'barely-true', 'pants-fire')


In [4]:
df['binary_label'] = df['label'].apply(map_labels) 

print("New Label Counts:")
print(df['binary_label'].value_counts())

New Label Counts:
binary_label
1    714
0    553
Name: count, dtype: int64


In [5]:
df.head()

Unnamed: 0,id,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,target,binary_label
0,11972.json,true,Building a wall on the U.S.-Mexico border will...,immigration,rick-perry,Governor,Texas,republican,30,30,42,23,18,Radio interview,1,1
1,11685.json,false,Wisconsin is on pace to double the number of l...,jobs,katrina-shankland,State representative,Wisconsin,democrat,2,1,0,0,0,a news conference,0,0
2,11096.json,false,Says John McCain has done nothing to help the ...,"military,veterans,voting-record",donald-trump,President-Elect,New York,republican,63,114,51,37,61,comments on ABC's This Week.,0,0
3,5209.json,half-true,Suzanne Bonamici supports a plan that will cut...,"medicare,message-machine-2012,campaign-adverti...",rob-cornilles,consultant,Oregon,republican,1,1,3,1,1,a radio show,1,1
4,9524.json,pants-fire,When asked by a reporter whether hes at the ce...,"campaign-finance,legal-issues,campaign-adverti...",state-democratic-party-wisconsin,,Wisconsin,democrat,5,7,2,2,7,a web video,0,0


In [6]:
X = df['statement'].astype(str) # The claim/statement
y = df['binary_label']            # The truth rating (True/False/Pants-fire)


In [7]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_vectorized = vectorizer.fit_transform(X)

In [8]:
# Split into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

In [9]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [10]:
# Train the model
print("Training Random Forest...")
rf_model.fit(X_train, y_train)


Training Random Forest...


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
# Make predictions
y_pred = rf_model.predict(X_test)


In [12]:
# Calculate Accuracy and F1 Score
accuracy = accuracy_score(y_test, y_pred)
# 'weighted' handles multi-class labels (e.g., mostly-true, half-true, false)
f1 = f1_score(y_test, y_pred, average='weighted') 

In [13]:

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.5591
F1 Score: 0.4960
