In [52]:
import pandas as pd

# from annoy import AnnoyIndex
# from concurrent.futures import ThreadPoolExecutor, as_completed
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

**Import Data from CSV**

In [72]:
# Location of the labelled news dataset, relative to the notebook's working directory.
csv_file_path = 'datasets/fakenewsnet.csv'

print("Loading data from CSV file...")
df = pd.read_csv(csv_file_path)

# Quick sanity check of what was loaded: dimensions first, then the column names.
print(df.shape)
print("COLUMNS", list(df.columns))

# Display the last column of the frame as the cell's output.
last_column = df.columns[-1]
df[last_column]

Loading data from CSV file...
(20800, 4)
COLUMNS ['id', 'title', 'text', 'label']


0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64

**Data Preprocessing**

In [54]:
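# NOTE: this imputation step is currently disabled (every statement below is commented out),
# so this cell does not fill any null values in `df`.
# print("Imputing null values and further preprocessing...")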

# null_imputation_dict = { 
#     'id': 'None',
#     'title': 'None',
#     'text': 'None',
#     'label': 'None'
#     }

# df = df.fillna(value=null_imputation_dict)

Imputing null values and further preprocessing...


**Encode Article to Embedding Vectors (Feature Extraction)** 

In [55]:
print("Encoding articles into vectors...")
# Load a pre-trained multilingual model
# model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Hand encode() a plain list of strings instead of the raw pandas Series:
#  - a Series is indexed by label, so positional access inside the encoder only
#    works while the index happens to be 0..n-1;
#  - missing titles (if any) are NaN floats rather than text, so they are replaced
#    with an empty string explicitly instead of relying on how NaN gets stringified.
# Ref: https://www.sbert.net/docs/package_reference/SentenceTransformer.html
titles = df['title'].fillna('').astype(str).tolist()
embeddings = model.encode(titles, batch_size=64, show_progress_bar=True)

# One embedding row per article is required for the join below to be meaningful.
assert len(embeddings) == len(df), "encoder returned a different number of rows than df"

print("Converting article embeddings into a DataFrame object...")
# Convert embeddings to a dataframe. Reusing df's index keeps every embedding row
# attached to its own article: pd.concat(axis=1) aligns on index labels, so a
# fresh 0..n-1 index would misalign rows if df had been filtered or re-indexed.
embeddings_df = pd.DataFrame(embeddings, index=df.index)

# Join the original dataframe with the embeddings dataframe
df_article_embeddings = pd.concat([df[['id', 'label']], embeddings_df], axis=1)
print(df_article_embeddings)


Encoding articles into vectors...


Batches: 100%|██████████| 650/650 [02:07<00:00,  5.10it/s]


Converting article embeddings into a DataFrame object...
          id  label         0         1         2         3         4  \
0          0      1 -0.299883  0.164904 -0.102591 -0.143365  0.221248   
1          1      0  0.327236  0.058065 -0.173350  0.170811  0.351340   
2          2      1  0.091013  0.065344 -0.208350  0.016016  0.161568   
3          3      1 -0.110348  0.173647  0.106913 -0.238552  0.296870   
4          4      1 -0.228860 -0.077871 -0.293955  0.102327 -0.180181   
...      ...    ...       ...       ...       ...       ...       ...   
20795  20795      0  0.246922  0.546452  0.049918  0.033385  0.303243   
20796  20796      0 -0.041486 -0.261805 -0.047190 -0.160997  0.056983   
20797  20797      0 -0.184222 -0.546217  0.222024 -0.001448 -0.267883   
20798  20798      1 -0.142349 -0.226946 -0.025508  0.033776 -0.280920   
20799  20799      1  0.260994  0.156142 -0.400506 -0.539525  0.395133   

              5         6         7  ...       374       375      

**Prepare Training and Test Data**

In [56]:
# # Get index-articleID mapping to be used later
# idx_articleID_mapping = df_article_embeddings['articleID'].to_dict()

# Target: the class label of every article.
y = df_article_embeddings['label']

# Features: every remaining column once the identifier and the target are removed,
# i.e. the embedding dimensions.
X = df_article_embeddings.drop(columns=['id', 'label'])

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


**Select and Train Model**

Using Logistic Regression for Classification

In [57]:
from sklearn.linear_model import LogisticRegression

# Build the classifier and fit it on the training split in a single expression
# (fit() returns the estimator itself), allowing up to 1000 solver iterations.
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Show the fitted estimator as the cell's output.
lr_model

Using KNN to find similar articles

In [58]:
# Index every article embedding (the full X, not only the training split) so that
# look-ups return 8 neighbours per query by default; fit() returns the estimator itself.
knn_model = NearestNeighbors(n_neighbors=8, algorithm='ball_tree').fit(X)

# Show the fitted index as the cell's output.
knn_model

**Test Model**

In [59]:
from sklearn.metrics import accuracy_score, classification_report

# Run the trained classifier on the held-out split.
y_pred = lr_model.predict(X_test)

# Overall share of correctly labelled test articles.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision / recall / f1 breakdown.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

Accuracy: 0.9105769230769231
              precision    recall  f1-score   support

           0       0.93      0.89      0.91      2132
           1       0.89      0.93      0.91      2028

    accuracy                           0.91      4160
   macro avg       0.91      0.91      0.91      4160
weighted avg       0.91      0.91      0.91      4160



**Classify New Article**

In [69]:
# Example headline of an unseen article; it is embedded and classified in the following cells.
new_article_text = "US Helicopters are flying over South-East Asia"

In [70]:
# Reuse the encoder that produced the training embeddings (`model`, loaded in the
# encoding cell) instead of loading a second copy under a separately hard-coded name.
# This avoids a redundant model load and guarantees the new article is embedded in
# exactly the same vector space the classifier and the neighbour index were fitted on.
s_model = model

# Obtain the embedding: encode() is given a one-item batch, so take its only row.
new_article_embedding = s_model.encode([new_article_text])[0]

def classify_article(article_embedding, lr_model):
    """Return the label predicted for a single article embedding.

    Parameters
    ----------
    article_embedding
        Embedding of one article; it is wrapped in a one-item batch before
        being passed to the model.
    lr_model
        Any object exposing a ``predict(batch)`` method.

    Returns
    -------
    The first element of ``lr_model.predict`` for that one-item batch.
    """
    single_item_batch = [article_embedding]
    labels = lr_model.predict(single_item_batch)
    return labels[0]

# Text shown for each numeric class; any value not listed here is printed unchanged.
_DISPLAY_NAMES = {1: 'FALSE', 0: 'TRUE'}

# Example usage assuming lr_model is your trained LogisticRegression classifier
raw_label = classify_article(new_article_embedding, lr_model)
predicted_label = _DISPLAY_NAMES.get(raw_label, raw_label)

print(f"The article is predicted to be: {predicted_label}")



The article is predicted to be: FALSE


In [73]:
from sklearn.neighbors import NearestNeighbors

# Query the already-fitted neighbour index (knn_model) with the new article's embedding.
# No neighbour count is passed here, so the number of results is whatever knn_model
# was configured with when it was created.
distances, indices = knn_model.kneighbors([new_article_embedding])

# Only one query was submitted, so its neighbours are in the first row of `indices`;
# they are used as row positions into the original dataframe.
neighbour_positions = indices[0]
similar_articles_info = df.iloc[neighbour_positions][['title', 'label']]


def _is_fake(row):
    """Return whether a neighbour row carries the fake label (1 or 'FALSE')."""
    return row['label'] == 1 or row['label'] == 'FALSE'


def _print_rows(rows):
    """Print one 'label | Title: ...' line per row."""
    for row in rows:
        print(f"{row['label']} | Title: {row['title']}")


# Split the neighbours into two groups, keeping their original order within each group.
neighbour_rows = [row for _, row in similar_articles_info.iterrows()]
result_fake = [row for row in neighbour_rows if _is_fake(row)]
result_real = [row for row in neighbour_rows if not _is_fake(row)]

# Real neighbours first, then a blank separator, then the fake ones.
_print_rows(result_real)

print("\n")

_print_rows(result_fake)


0 | Title: Airbag Propellant Bound for Takata Factory Detonates en Route - The New York Times
0 | Title: Two Helicopters, FBI Bomb Squad Called Out At MILO Event - Breitbart
0 | Title: U.S. Flying Bombers Above Disputed South China Sea Irks ’Vigilant’ Beijing
0 | Title: U.S. Starts Deploying Thaad Antimissile System in South Korea, After North’s Tests - The New York Times


1 | Title: Drones de EE.UU. abaten a un líder de Al Qaeda en Afganistán - RT
1 | Title: Tourist helicopter crashes into house in Sochi. Video
1 | Title: ISIS shoots down Russian helicopter near Palmyra (VIDEO+PHOTOS)
1 | Title: La Corée du Nord annonce avoir envoyé un missile balistique de félicitation à Donald Trump >> Le Gorafi


**2/4**
- Dataset: FakeNewsNet
- Embedding: SentenceTransformer, only on title
- Classification: Logistic Regression
- Similar articles: KNN

Accuracy: 0.9105769230769231
              precision    recall  f1-score   support

        REAL       0.93      0.89      0.91      2132
        FAKE       0.89      0.93      0.91      2028

    accuracy                           0.91      4160
   macro avg       0.91      0.91      0.91      4160
weighted avg       0.91      0.91      0.91      4160

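Comments: An article can sometimes be classified as fake even though all of its nearest neighbours are real, because the Logistic Regression prediction and the KNN look-up are computed independently from the same embedding.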