In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import pickle

# Load the four datasets

# For True.csv and Fake.csv, we'll use the 'text' column (if available), otherwise fallback to 'title'
true_df = pd.read_csv('News_Dataset/True.csv')
fake_df = pd.read_csv('News_Dataset/Fake.csv')

# For Real2.csv and Fake2.csv, they only contain 'title' so we'll use that
real2_df = pd.read_csv('addtional_training_data/Real2.csv')
fake2_df = pd.read_csv('addtional_training_data/Fake2.csv')

# Inspect which columns to use for each dataframe
print('True.csv columns:', true_df.columns.tolist())
print('Fake.csv columns:', fake_df.columns.tolist())
print('Real2.csv columns:', real2_df.columns.tolist())
print('Fake2.csv columns:', fake2_df.columns.tolist())

# Define a function to extract text content from a dataframe
# Priority: if 'text' column exists and is not empty, use it; else use 'title'
def extract_text(df):
    if 'text' in df.columns:
        # Fill NaN with empty strings
        return df['text'].fillna('')
    else:
        return df['title'].fillna('')

# Extract the texts
true_text = extract_text(true_df)
fake_text = extract_text(fake_df)
real2_text = extract_text(real2_df)
fake2_text = extract_text(fake2_df)

# Create labels for each dataset: 1 for true, 0 for fake
true_text = true_text.copy()
true_text = true_text.str.strip()

fake_text = fake_text.copy()
fake_text = fake_text.str.strip()

real2_text = real2_text.copy()
real2_text = real2_text.str.strip()

fake2_text = fake2_text.copy()
fake2_text = fake2_text.str.strip()

# Create DataFrames with text and label
# True news: from True.csv and Real2.csv => label 1
# Fake news: from Fake.csv and Fake2.csv => label 0

df_true = pd.DataFrame({'text': pd.concat([true_text, real2_text], ignore_index=True), 'label': 1})
df_fake = pd.DataFrame({'text': pd.concat([fake_text, fake2_text], ignore_index=True), 'label': 0})

# Combine into a single DataFrame
df_full = pd.concat([df_true, df_fake], ignore_index=True)

# Drop any rows where text is empty
df_full = df_full[df_full['text'].str.strip() != '']

df_full.reset_index(drop=True, inplace=True)

print('Combined dataset shape:', df_full.shape)
print(df_full.head(3))

# Now we build the model pipeline using CountVectorizer with bigrams and trigrams
# We set ngram_range=(2,3) to focus on bi- and trigrams

model_pipeline_full = Pipeline([
    ('vect', CountVectorizer(ngram_range=(2,3))),
    ('clf', LogisticRegression(max_iter=1000))
])

# Split the data into training and testing sets
X = df_full['text']
y = df_full['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model_pipeline_full.fit(X_train, y_train)

# Predict on test set
y_pred = model_pipeline_full.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy on test set:', acc)
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Save the model
with open('webpage_truth_model.pkl', 'wb') as model_file:
    pickle.dump(model_pipeline_full, model_file)
print('Model saved as webpage_truth_model.pkl')

# Define a prediction function that also flags if the text is likely not a news article

def predict_webpage(text, model):
    """
    Predicts the truthfulness of a webpage text.
    Returns a tuple: (prediction, confidence, category)
    where prediction: 0 for false, 1 for true,
    confidence: probability associated with the predicted class,
    and category: 'News article' if prediction confidence is above threshold, else 'Possibly not a news article'
    """
    pred = model.predict([text])[0]
    proba = model.predict_proba([text])[0]
    confidence = proba[1] if pred == 1 else proba[0]
    # If maximum probability is below 0.6, flag as possibly not a news article
    if max(proba) < 0.6:
        category = 'Possibly not a news article'
    else:
        category = 'News article'
    return pred, confidence, category

# Test the prediction function
sample_text = "Breaking news: The government has announced a new policy today affecting millions. Experts say the move will transform the economy and lower taxes. More updates to follow."
pred, conf, cat = predict_webpage(sample_text, model_pipeline_full)
print('\
Prediction for sample webpage:')
print('Truth prediction:', 'True' if pred==1 else 'False')
print('Confidence:', conf)
print('Category:', cat)

True.csv columns: ['title', 'text', 'subject', 'date']
Fake.csv columns: ['title', 'text', 'subject', 'date']
Real2.csv columns: ['id', 'news_url', 'title', 'tweet_ids']
Fake2.csv columns: ['id', 'news_url', 'title', 'tweet_ids']
Combined dataset shape: (45323, 2)
                                                text  label
0  WASHINGTON (Reuters) - The head of a conservat...      1
1  WASHINGTON (Reuters) - Transgender people will...      1
2  WASHINGTON (Reuters) - The special counsel inv...      1
Accuracy on test set: 0.9817981246552675
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      4659
           1       0.99      0.98      0.98      4406

    accuracy                           0.98      9065
   macro avg       0.98      0.98      0.98      9065
weighted avg       0.98      0.98      0.98      9065

Model saved as webpage_truth_model.pkl
Prediction for sample webpage:
Truth prediction: True
Confidence:

In [13]:
# Let's examine the structure of each file again
import pandas as pd

# Check True.csv
print("True.csv structure:")
true_df = pd.read_csv("News_Dataset/True.csv")
print("Columns:", true_df.columns.tolist())
print("Shape:", true_df.shape)
print("Sample data:")
print(true_df.head(2))
print("\
" + "-"*50 + "\
")

# Check Fake.csv
print("Fake.csv structure:")
fake_df = pd.read_csv("News_Dataset/Fake.csv")
print("Columns:", fake_df.columns.tolist())
print("Shape:", fake_df.shape)
print("Sample data:")
print(fake_df.head(2))
print("\
" + "-"*50 + "\
")

# Check Real2.csv
print("Real2.csv structure:")
real2_df = pd.read_csv("addtional_training_data/Real2.csv")
print("Columns:", real2_df.columns.tolist())
print("Shape:", real2_df.shape)
print("Sample data:")
print(real2_df.head(2))
print("\
" + "-"*50 + "\
")

# Check Fake2.csv
print("Fake2.csv structure:")
fake2_df = pd.read_csv("addtional_training_data/Fake2.csv")
print("Columns:", fake2_df.columns.tolist())
print("Shape:", fake2_df.shape)
print("Sample data:")
print(fake2_df.head(2))

# Let's also check the structure of our combined dataset
print("\
" + "-"*50 + "\
")
print("Combined dataset structure:")
# Load the saved model to get access to the vectorizer vocabulary
import pickle
with open('webpage_truth_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Get information about the vectorizer
vectorizer = model.named_steps['vect']
print("Number of features (n-grams):", len(vectorizer.get_feature_names_out()))
print("Sample features (first 10):", vectorizer.get_feature_names_out()[:10])

# Get information about the classifier
classifier = model.named_steps['clf']
print("Classifier type:", type(classifier).__name__)
print("Number of classes:", len(classifier.classes_))
print("Classes:", classifier.classes_) 

True.csv structure:
Columns: ['title', 'text', 'subject', 'date']
Shape: (21417, 4)
Sample data:
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   

                 date  
0  December 31, 2017   
1  December 29, 2017   
--------------------------------------------------
Fake.csv structure:
Columns: ['title', 'text', 'subject', 'date']
Shape: (23481, 4)
Sample data:
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  Hou