In [1]:
import pandas as pd
import string
import nltk
# nltk.download('punkt')
import yake
from yake import KeywordExtractor
from nltk.tokenize import word_tokenize
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split


In [2]:
# Read the fake-news and real-news CSV files into two separate frames
fake_df, true_df = (pd.read_csv(csv_name) for csv_name in ('Fake.csv', 'True.csv'))

In [3]:
# Preprocess and tokenize the dataset.
# Tag each source frame before merging: 0 = fake, 1 = true.
fake_df['label'] = 0
true_df['label'] = 1

# ignore_index=True gives the merged frame a fresh, unique 0..n-1 index.
# Without it each source frame keeps its own row labels, so the same label can
# appear twice and a label lookup such as news_df['text'][i] returns several rows.
news_df = pd.concat([fake_df, true_df], ignore_index=True)

# Build the punctuation-removal table once rather than once per article.
punctuation_table = str.maketrans('', '', string.punctuation)

# Lowercase -> strip ASCII punctuation -> split into a list of word tokens.
# Only string.punctuation is removed; other symbols are left in place.
news_df['text'] = (
    news_df['text']
    .str.lower()
    .str.translate(punctuation_table)
    .apply(word_tokenize)
)
news_df.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,"[donald, trump, just, couldn, t, wish, all, am...",News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,"[house, intelligence, committee, chairman, dev...",News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"[on, friday, it, was, revealed, that, former, ...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"[on, christmas, day, donald, trump, announced,...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,"[pope, francis, used, his, annual, christmas, ...",News,"December 25, 2017",0


### Keyword Extraction using YAKE

In [None]:
# NOTE(review): this whole cell is commented out and has no execution count, so
# nothing below runs and news_df never gets a 'top_5_keywords' column.
# NOTE(review): news_df['text'][i] is a label-based lookup; if news_df's index
# contains duplicate labels (e.g. a concat without ignore_index) it returns
# several rows instead of one token list — confirm before re-enabling.
# # Define the YAKE keyword extractor
# custom_kw_extractor = yake.KeywordExtractor(lan="en", n=3, dedupLim=0.9, top=5, features=None)
# # Extract the top 5 keywords for each news article
# top_5_keywords = []
# for i in range(news_df.shape[0]):
#     # Extract the keywords for the current news article
#     keywords = custom_kw_extractor.extract_keywords(' '.join([str(token) if isinstance(token, list) else token for token in news_df['text'][i]]))
#     #keywords = custom_kw_extractor.extract_keywords(' '.join(map(str, news_df['text'][i])))
#     #keywords = custom_kw_extractor.extract_keywords(' '.join(news_df['text'][i]))
#     top_keywords = [keyword[0] for keyword in keywords]
#     top_5_keywords.append(top_keywords)

# # Add the top 5 keywords to the dataframe
# news_df['top_5_keywords'] = top_5_keywords

# # Print the top 5 keywords for the first news article
# print(news_df['top_5_keywords'][0])


### CAR algorithm (Context Analysis for Retrieval) 

In [4]:
# Keyword-vote baseline: an article is labelled true (1) only when it contains
# strictly more "true" keywords than "fake" keywords; otherwise it is fake (0).
true_keywords = ['true', 'fact', 'accurate', 'authentic']
fake_keywords = ['fake', 'false', 'untrue', 'misleading']

# One predicted label per article, in the row order of news_df
labels = []
for article in news_df['text']:
    # NOTE(review): assumes each article is a list of tokens, so .count()
    # matches whole tokens only — confirm the preprocessing cell has run.
    true_count = sum(article.count(keyword) for keyword in true_keywords)
    fake_count = sum(article.count(keyword) for keyword in fake_keywords)

    # Ties, including articles that contain none of the keywords, count as fake
    labels.append(1 if true_count > fake_count else 0)

# .assign returns a new frame, so the predictions are NOT written into news_df
# (a plain `car_algo_news_df = news_df` would alias and mutate the shared frame).
car_algo_news_df = news_df.assign(prediction=labels)

# Accuracy: percentage of rows whose prediction equals the ground-truth label
count = len(labels)
correct = sum(
    actual == predicted
    for actual, predicted in zip(car_algo_news_df['label'], labels)
)
print("Accuracy using CAR Method: ", correct/count * 100)

# Confusion matrix of ground-truth labels against keyword-vote predictions
cm = confusion_matrix(car_algo_news_df['label'], car_algo_news_df['prediction'])
print("Confusion Matrix: ")
print(cm)

Accuracy using CAR Method:  43.00859726491158
Confusion Matrix: 
[[18250  5231]
 [20357  1060]]


### Naive Bayes Classifier

In [5]:
# Rebuild the corpus from the source frames rather than reusing the tokenized
# news_df: fake_df/true_df still hold the untokenized article text (only
# news_df['text'] was reassigned during preprocessing), and the vectorizer below
# is fed that text directly. The 'label' columns were added to both frames in
# the preprocessing cell, so that cell must have run first.
# ignore_index=True keeps the merged index unique; the split below is positional,
# so it selects the same rows either way.
news_df = pd.concat([fake_df, true_df], ignore_index=True)

# Stratified 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    news_df['text'],
    news_df['label'],
    stratify=news_df['label'],
    test_size=0.3,
    random_state=42,
)

# Bag-of-words counts; min_df=2 drops terms seen in fewer than two training documents.
# The vocabulary is fitted on the training split only and reused for the test split.
vectorizer = CountVectorizer(min_df=2)
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Train a Multinomial Naive Bayes model on the training counts
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_counts, y_train)

# Predict the labels for the test set
y_pred = nb_classifier.predict(X_test_counts)

### Naive Bayes Evaluation

In [6]:
# Score the Naive Bayes predictions against the held-out test labels.
# Every scorer is called as (ground truth, predictions).
cm = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report the scalar metrics to two decimals, followed by the raw confusion matrix
report_lines = [
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-score: {f1:.2f}",
    "Confusion Matrix: ",
]
print("\n".join(report_lines))
print(cm)


Accuracy: 0.95
Precision: 0.95
Recall: 0.96
F1-score: 0.95
Confusion Matrix: 
[[6702  343]
 [ 286 6139]]
