# Import Data from local

In [33]:
import pandas as pd
true_df = pd.read_csv(r"C:\Users\suriya\Downloads\archive (18)\True.csv")
fake_df = pd.read_csv(r"C:\Users\suriya\Downloads\archive (18)\Fake.csv")
true_df['label'] = 1
fake_df['label'] = 0
df = pd.concat([true_df, fake_df], ignore_index=True)
print(true_df.columns)
print(fake_df.columns)

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')
Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')


# Data Preprocessing

In [34]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
def clean_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    text = text.strip()
    return text
df['text'] = df['text'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\suriya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Feature Extraction

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000, stop_words=stopwords.words('english'))
X = tfidf.fit_transform(df['text']).toarray()
y = df['label']

# Model Selection And Training

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Model Evaluation

In [37]:
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.989532293986637
Confusion Matrix:
[[4588   62]
 [  32 4298]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4650
           1       0.99      0.99      0.99      4330

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



# Hyperparameter Tuning

In [38]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear']
}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)
class_report_best = classification_report(y_test, y_pred_best)
print(f"Best Parameters: {best_params}")
print(f"Accuracy with Best Model: {accuracy_best}")
print("Confusion Matrix with Best Model:")
print(conf_matrix_best)
print("Classification Report with Best Model:")
print(class_report_best)

Best Parameters: {'C': 100, 'solver': 'liblinear'}
Accuracy with Best Model: 0.9953229398663697
Confusion Matrix with Best Model:
[[4627   23]
 [  19 4311]]
Classification Report with Best Model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4650
           1       0.99      1.00      1.00      4330

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



# Model Saving

In [39]:
import joblib
joblib.dump(best_model, 'fake_news_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

# Predict the example text

In [53]:
import joblib
import re
from sklearn.feature_extraction.text import TfidfVectorizer
best_model = joblib.load('fake_news_model.pkl')
tfidf = joblib.load('tfidf_vectorizer.pkl')

## Label 0 is fake 
## Label 1 is Real

In [55]:
example_text = "AUSTIN, Texas/WASHINGTON (Reuters) - U.S. Republican Representative Blake Farenthold said he would not seek re-election in November, denying allegations of sexual harassment by former staffers but admitting he allowed an unprofessional culture to flourish in his Capitol Hill office. The 55-year-old congressman from Corpus Christi, Texas, made the announcement on Thursday, a week after the House Ethics Committee said it was investigating him over allegations of sexual harassment, discrimination and retaliation involving a former female staff member. The committee said it was also looking into whether Farenthold had made inappropriate statements to other members of his staff. In a videotaped statement on his campaign’s Facebook page, Farenthold said he was a political novice unprepared for his new responsibilities when he came to Washington for his first term in 2011. “I had no idea how to run a congressional office, and as a result, I allowed a workplace culture to take root in my office that was too permissive and decidedly unprofessional,” he said. Politico reported last week that the congressional Office of Compliance had paid $84,000 from a public fund on behalf of Farenthold for a sexual harassment claim. In 2014, his former communications director, Lauren Greene, filed a lawsuit accusing him of creating a hostile work environment, gender discrimination and retaliation, court documents showed. The two reached a confidential mediated agreement in 2015, according to a statement from Farenthold’s office that denied any wrongdoing by him. Reuters has been unable to verify the allegations against Farenthold, who said on Thursday that the charges were false.  “This issue has become a political distraction,” he said. “Quite simply, my constituents deserve better.” House of Representatives Speaker Paul Ryan called the allegations disconcerting, including reports outlining “unacceptable behaviors.”  “I think he’s made the right decision to retire,” Ryan said. Congress is reviewing its workplace policies on sexual harassment after a number of lawmakers have been accused of sexual misconduct in recent weeks amid a wave of such allegations against powerful men in entertainment, politics and the media.  Last week, Democratic Representative John Conyers and Republican Representative Trent Franks resigned, while Democratic Senator Al Franken said he would be stepping down in the coming weeks. " 
cleaned_example_text = clean_text(example_text)
text_vectorized = tfidf.transform([cleaned_example_text]).toarray()
prediction = best_model.predict(text_vectorized)
print(f"Prediction for the example text: {prediction[0]}")

Prediction for the example text: 1
