In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors  

In [2]:
# Read the article dataset from disk, then report its dimensions and column
# names before displaying the first record as a quick sanity check.
csv_file_path = 'datasets/fakenewsnet.csv'
df = pd.read_csv(csv_file_path)
print(df.shape)
print("COLUMNS", list(df.columns))
df.iloc[0]

(20800, 4)
COLUMNS ['id', 'title', 'text', 'label']


id                                                       0
title    House Dem Aide: We Didn’t Even See Comey’s Let...
text     House Dem Aide: We Didn’t Even See Comey’s Let...
label                                                    1
Name: 0, dtype: object

In [3]:
# Imputing null values
# Only the free-text columns receive a placeholder. 'id' and 'label' are left
# untouched on purpose: writing the string 'None' into them would turn a
# numeric column into a mixed-type object column and silently add a bogus
# third value to 'label', which downstream label-based filtering and scoring
# would then treat as real data. A missing label stays NaN so it is visible.
null_imputation_dict = {
    'title': 'None',
    'text': 'None',
}
df = df.fillna(value=null_imputation_dict)



In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
import numpy as np

# TF-IDF representation of the article bodies. English stop words are removed
# and max_df=0.3 drops terms that occur in more than 30% of the documents.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.3)
tfidf_matrix = vectorizer.fit_transform(df['text'])

# Vocabulary terms (aligned with the matrix columns) and the distinct labels.
features = vectorizer.get_feature_names_out()
labels = df['label'].unique()

# Chi-squared statistic of every term against the label column; element [0]
# of chi2's result is the statistic array.
# The original author assumes 'label' is 1 for fake news and 0 for real news.
chi2score = chi2(tfidf_matrix, df['label'])[0]

# Pair each term with its statistic and rank from most to least indicative.
scores = zip(features, chi2score)
sorted_scores = sorted(scores, key=lambda pair: pair[1], reverse=True)

# Show the 20 highest-scoring terms.
for feature, score in sorted_scores[:20]:
    print(feature, score)


mr 664.7311855839888
hillary 181.71362666986047
ms 171.98143100749388
clinton 121.47749412679153
breitbart 105.15389595366786
fbi 98.01118258834883
_____ 82.1605771274719
mrs 77.62665019596261
october 72.33414447609013
anti 57.80350303356132
emails 54.99174970732008
briefing 46.99698288174957
election 42.27699463332836
www 41.92520136465439
2017 41.572101852348936
november 41.38800794530661
podesta 41.262979235125485
http 40.01525862843738
wikileaks 37.73136808724635
sunday 36.902286266765685


In [9]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Fetch the NLTK resources this cell relies on: the 'punkt' tokenizer data and
# the 'stopwords' corpus.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' for word_tokenize -
# confirm against the installed version.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Build the English stopword lookup once. The previous version called
# stopwords.words('english') inside the filter, so the full list was rebuilt
# and scanned linearly for every single token; a frozenset gives constant-time
# membership tests and is created a single time.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def clean_and_tokenize(text):
    """Tokenize ``text`` and return its lower-cased, non-stopword alphabetic tokens.

    Args:
        text: The string to tokenize (passed straight to ``word_tokenize``).

    Returns:
        A list of lower-cased tokens, in their original order, keeping only
        tokens made up entirely of alphabetic characters and excluding English
        stopwords.
    """
    tokens = word_tokenize(text)
    # Drop punctuation / numeric tokens, then normalise case.
    words = [token.lower() for token in tokens if token.isalpha()]
    # Stopword check happens after lower-casing, so capitalised forms such as
    # "The" are removed as well.
    return [word for word in words if word not in _ENGLISH_STOPWORDS]


print("Cleaning and tokenizing text...")
# Tokenize every article body once and store the token lists next to the text.
df['cleaned_text'] = df['text'].apply(clean_and_tokenize)

# Flatten the per-article token lists of each class in one linear pass.
# Series.sum() on a column of lists concatenates them pairwise, re-copying the
# growing list at every row (quadratic), and yields 0 for an empty selection,
# which would then make Counter(...) raise.
fake_words = [word for words in df.loc[df['label'] == 1, 'cleaned_text'] for word in words]
real_words = [word for words in df.loc[df['label'] == 0, 'cleaned_text'] for word in words]
print("counting word frequencies...")
# Count word frequencies
fake_word_counts = Counter(fake_words)
real_word_counts = Counter(real_words)

# Keep words whose raw count in fake news exceeds their count in real news.
# A Counter returns 0 for a word it has never seen, so words absent from real
# news are compared against 0.
fake_indicator_words = {
    word: freq
    for word, freq in fake_word_counts.items()
    if freq > real_word_counts[word]
}

# Rank by how much more often a word occurs in fake news than in real news;
# each entry still carries the fake-news count as its value.
sorted_fake_indicators = sorted(
    fake_indicator_words.items(),
    key=lambda item: item[1] - real_word_counts[item[0]],
    reverse=True,
)
print(sorted_fake_indicators[:20])

[nltk_data] Downloading package punkt to /Users/newswav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/newswav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cleaning and tokenizing text...


KeyboardInterrupt: 