In [21]:
 import pandas as pd  
 from sklearn.model_selection import train_test_split  
 from sklearn.feature_extraction.text import TfidfVectorizer  
 from sklearn.linear_model import LogisticRegression  
 from sklearn.pipeline import Pipeline  
 from sklearn.metrics import accuracy_score, classification_report  
   
 # Read the cleaned headline corpus (UTF-8 CSV) into a DataFrame.
 df = pd.read_csv("News_Dataset/cleaned_news_headlines.csv", encoding="utf-8")

 # Quick visual sanity check of the first rows.
 print("Data Head:")
 print(df.head())

 # Features are the headline text in 'title'; the target is the binary
 # fake/real indicator in 'label'.
 X, y = df["title"], df["label"]

 # Hold out 3% of the rows for evaluation; the fixed seed keeps the split
 # reproducible across runs.
 X_train, X_test, y_train, y_test = train_test_split(
     X,
     y,
     test_size=0.03,
     random_state=25,
 )

 # Two-stage pipeline: TF-IDF features (English stop words removed) feeding
 # a logistic-regression classifier with default settings.
 model_pipeline = Pipeline(
     steps=[
         ("tfidf", TfidfVectorizer(stop_words="english")),
         ("logreg", LogisticRegression()),
     ]
 )

 # Fit the vectorizer and the classifier on the training portion only.
 model_pipeline.fit(X_train, y_train)
 print("Model training completed.")

 # Score the held-out headlines.
 y_pred = model_pipeline.predict(X_test)

 # Report overall accuracy followed by the per-class breakdown.
 acc = accuracy_score(y_test, y_pred)
 print("Accuracy on test set: " + str(acc))
 print("Classification Report:")
 print(classification_report(y_test, y_pred))

Data Head:
                                               title  label
0  german greens want last nuclear weapons withdr...      1
1  comedy gold on detroit news “willy” dumps his ...      0
2  trump will do everything to avoid nuclear war ...      1
3  altleft plans to hijack president trump’s az r...      0
4  fortyfour venezuelan activists released from p...      1
Model training completed.
Accuracy on test set: 0.9345637583892618
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.91      0.93       564
           1       0.93      0.95      0.94       628

    accuracy                           0.93      1192
   macro avg       0.94      0.93      0.93      1192
weighted avg       0.93      0.93      0.93      1192



In [25]:
import pickle

# Serialize the fitted pipeline (vectorizer + classifier) to disk so it can
# be reloaded later without retraining.
with open('news_fake_real_model.pkl', 'wb') as pickle_out:
    pickle.dump(model_pipeline, pickle_out)

# Confirm where the artifact was written.
print('Model saved as news_fake_real_model.pkl')

# Let's also create a simple function to demonstrate how to use the model for prediction
def predict_headline(headline, model):
    """
    Predict if a headline is fake or real news.

    Args:
        headline (str): The news headline text.
        model: A fitted classifier (e.g. the trained model pipeline) that
            provides ``predict`` and ``predict_proba``. If it also exposes
            ``classes_``, that attribute is used to find the probability
            column belonging to the predicted label.

    Returns:
        prediction: The label emitted by ``model.predict`` (for the model
            trained in this notebook: 0 for fake, 1 for real).
        probability (float): The model's probability for that predicted
            label.
    """
    # Both calls receive a one-element batch; [0] unwraps the single result.
    prediction = model.predict([headline])[0]
    proba = model.predict_proba([headline])[0]

    # Locate the probability column of the predicted label through the
    # model's own class ordering instead of assuming the labels are exactly
    # 0 and 1. This keeps the reported confidence correct for string labels,
    # other integer encodings, and multi-class models.
    classes = getattr(model, "classes_", None)
    if classes is None:
        # No class ordering available: keep the binary 0/1 convention.
        column = 1 if prediction == 1 else 0
    else:
        column = list(classes).index(prediction)

    probability = proba[column]

    return prediction, probability

# A handful of hand-written headlines to exercise the prediction helper.
test_headlines = [
    "Breaking: President signs new climate change bill",
    "Aliens confirmed to be living among us, government admits",
    "Stock market reaches record high amid economic recovery",
]

# Header line for the demo output.
print("Testing model with example headlines:")

divider = "-" * 50
for sample in test_headlines:
    outcome, confidence = predict_headline(sample, model_pipeline)
    # Label 1 is reported as "Real"; anything else is reported as "Fake".
    if outcome == 1:
        verdict = "Real"
    else:
        verdict = "Fake"
    print(f"Headline: {sample}")
    print(f"Prediction: {verdict} (confidence: {confidence:.2f})")
    print(divider)

Model saved as news_fake_real_model.pkl
Testing model with example headlines:
Headline: Breaking: President signs new climate change bill
Prediction: Fake (confidence: 0.89)
--------------------------------------------------
Headline: Aliens confirmed to be living among us, government admits
Prediction: Fake (confidence: 0.77)
--------------------------------------------------
Headline: Stock market reaches record high amid economic recovery
Prediction: Real (confidence: 0.85)
--------------------------------------------------
